性能分析方法¶
带宽估算¶
// Bandwidth estimation for the reduce-scatter (RS) and all-gather (AG) phases.
// Expects nranks, total_data_bytes, rs_time_us and ag_time_us to be in scope;
// the times are in microseconds.

// RS phase data volume: (nranks - 1) / nranks of total_data_bytes.
// The casts force the product to be evaluated in size_t even if the operands
// are declared as int, so large payloads do not overflow a 32-bit intermediate.
size_t rs_bytes = static_cast<size_t>(nranks - 1) * total_data_bytes / static_cast<size_t>(nranks);
// AG phase data volume: same amount as the RS phase.
size_t ag_bytes = rs_bytes;
// Achieved bandwidth per phase in GB/s: bytes / seconds / 1e9.
float rs_bw_gbps = static_cast<float>(rs_bytes / (rs_time_us * 1e-6) / 1e9);
float ag_bw_gbps = static_cast<float>(ag_bytes / (ag_time_us * 1e-6) / 1e9);
// Effective bandwidth over both phases combined; this is the value compared
// against the peak below.
float actual_bw = static_cast<float>((rs_bytes + ag_bytes) / ((rs_time_us + ag_time_us) * 1e-6) / 1e9);
// Theoretical peak bandwidth (example: intra-node HCCS, ~30 GB/s per link).
float peak_bw_gbps = 30.0f;
float utilization = actual_bw / peak_bw_gbps * 100.0f;
printf("BW utilization: %.1f%%\n", utilization);
性能报告模板¶
/// @brief Prints a compute/communication overlap report to stdout.
///
/// The communication time is estimated as (sequential - compute) when a
/// sequential baseline is supplied. Without one it falls back to
/// (pipelined - compute); in that case speedup and overlap cannot be derived
/// (the expressions collapse to 1.00x and 0% by construction), so they are
/// reported as "n/a" instead of a misleading constant.
///
/// @param comp_us    Compute-only time in microseconds.
/// @param pipe_us    Pipelined (overlapped) time in microseconds.
/// @param data_bytes Total payload size in bytes.
/// @param nranks     Number of ranks; the bandwidth lines print "n/a" when <= 0.
/// @param seq_us     Sequential (compute followed by communication) time in
///                   microseconds. Values <= 0 mean "not measured".
void PrintPerfReport(float comp_us, float pipe_us, size_t data_bytes, int nranks,
                     float seq_us = 0.0f)
{
    const bool has_seq = seq_us > 0.0f;
    const float comm_est_us = (has_seq ? seq_us : pipe_us) - comp_us;

    // Each phase moves (nranks - 1) / nranks of the payload. Evaluate in size_t
    // and avoid the division by zero / wrap-around for nranks <= 0.
    size_t rs_bytes = 0;
    if (nranks > 0) {
        rs_bytes = static_cast<size_t>(nranks - 1) * data_bytes / static_cast<size_t>(nranks);
    }
    const size_t ag_bytes = rs_bytes;

    printf("=== Performance Report ===\n");
    printf("Compute-only: %.1f us\n", comp_us);
    printf("Pipelined: %.1f us\n", pipe_us);
    printf("Comm estimate: %.1f us\n", comm_est_us);

    // Speedup of the pipelined run over the sequential baseline.
    if (has_seq && pipe_us > 0.0f) {
        printf("Speedup: %.2fx\n", seq_us / pipe_us);
    } else {
        printf("Speedup: n/a\n");
    }

    // Overlap: share of the shorter phase that is hidden behind the longer one.
    const float shorter_us = std::min(comp_us, comm_est_us);
    if (has_seq && shorter_us > 0.0f) {
        const float longer_us = std::max(comp_us, comm_est_us);
        const float overlap_pct = (1.0f - (pipe_us - longer_us) / shorter_us) * 100.0f;
        printf("Overlap: %.1f%%\n", overlap_pct);
    } else {
        printf("Overlap: n/a\n");
    }

    // Bandwidth: the communication estimate is split evenly between RS and AG.
    if (nranks > 0 && comm_est_us > 0.0f) {
        const double phase_seconds = comm_est_us * 0.5 * 1e-6;
        const float rs_bw = static_cast<float>(rs_bytes / phase_seconds / 1e9);
        const float ag_bw = static_cast<float>(ag_bytes / phase_seconds / 1e9);
        printf("RS bandwidth: %.1f GB/s\n", rs_bw);
        printf("AG bandwidth: %.1f GB/s\n", ag_bw);
    } else {
        printf("RS bandwidth: n/a\n");
        printf("AG bandwidth: n/a\n");
    }
}
Profiling 方法¶
Host 侧 Event 计时¶
// Host-side event timing of one pipelined run: compute and communication are
// launched on separate streams and the interval between `start` and `end` is
// read back. Return codes are not checked in this snippet.
aclrtEvent start, end, commDone;
aclrtCreateEvent(&start);
aclrtCreateEvent(&end);
aclrtCreateEvent(&commDone);
aclrtRecordEvent(start, computeStream);
launchCompute(..., computeStream);
launchComm(..., commStream);
// Join the two streams before closing the timed window: `end` lives on
// computeStream, so without this wait it would fire as soon as the compute
// work finishes and any communication still running on commStream would be
// left out of the measurement.
aclrtRecordEvent(commDone, commStream);
aclrtStreamWaitEvent(computeStream, commDone);
aclrtRecordEvent(end, computeStream);
// Block the host until both streams have drained so `end` has completed
// before the elapsed time is queried.
aclrtSynchronizeStream(commStream);
aclrtSynchronizeStream(computeStream);
float total_ms = 0.0f;
aclrtEventElapsedTime(&total_ms, start, end);
// `start` and `end` are left alive for reuse; the join event is no longer needed.
aclrtDestroyEvent(commDone);
Compute-only Baseline¶
// Compute-only baseline: time launchComputeOnly by itself on computeStream,
// collecting one sample per iteration for COMPUTE_ONLY_ITERS iterations.
for (int i = 0; i < COMPUTE_ONLY_ITERS; i++) {
// Bracket the launch with two events recorded on the same stream.
aclrtRecordEvent(start, computeStream);
launchComputeOnly(..., computeStream);
aclrtRecordEvent(end, computeStream);
// Block the host until the stream has drained, so `end` has completed
// before the elapsed time is queried.
aclrtSynchronizeStream(computeStream);
float ms;
// Elapsed time between the two events for this iteration.
aclrtEventElapsedTime(&ms, start, end);
comp_times.push_back(ms);
}
// NOTE: despite the variable name, this stores the result of median() over
// the collected samples, not an arithmetic mean.
float avg_comp = median(comp_times);
Sequential Baseline¶
// Sequential baseline: compute followed by communication on a single stream,
// timed with the `start`/`end` event pair.
aclrtRecordEvent(start, stream);
launchCompute(..., stream);
// Host waits for the compute work to finish before the communication is
// launched.
// NOTE(review): this host-side wait sits inside the timed window, so its
// cost is included in seq_ms. Both launches target the same stream — confirm
// whether the explicit synchronization is required here.
aclrtSynchronizeStream(stream);
launchComm(..., stream);
aclrtRecordEvent(end, stream);
// Drain the stream so `end` has completed before the elapsed time is read.
aclrtSynchronizeStream(stream);
float seq_ms;
aclrtEventElapsedTime(&seq_ms, start, end);
性能迭代策略¶
1. 建立 baseline(compute-only + sequential)
2. 测量 pipelined 性能
3. 计算 speedup(= sequential 耗时 / pipelined 耗时)和 overlap%
4. 如果 overlap < 80%:
a. 检查通信是否太早开始(队列空转)
b. 检查通信是否太晚开始(Tile 太大)
c. 检查 Block 负载均衡
5. 如果带宽利用率 < 60%:
a. 增大 Tile 以减少传输次数
b. 使用乒乓双缓冲
c. 检查数据对齐
6. 重复优化迭代
msprof 集成¶
对于更深入的 profiling,可使用 msprof 采集硬件 timeline:
# Collect the kernel execution timeline; results are written to ./prof_data
msprof --output=./prof_data --application="mpirun -np 8 ./my_operator"
# Analyze / export the collected results
# NOTE(review): the msprof CLI reference describes --export as an on/off
# switch (e.g. --export=on) — confirm that "timeline" is accepted by the
# installed CANN version.
msprof --export=timeline --output=./prof_data
msprof 可展示各 AICore 的 MTE2/MTE3/Cube/Vec 管道占用率,帮助定位重叠空洞。