Multi-core Programming¶
This document introduces PTO multi-core parallel programming techniques, helping developers fully utilize Ascend's multi-core architecture for high-performance operators.
Contents¶
- 1. Multi-core Architecture Overview
- 2. SPMD Programming Pattern
- 3. MPMD Programming Pattern
- 4. Load Balancing
- 5. Inter-core Communication
- 6. Performance Optimization
1. Multi-core Architecture Overview¶
1.1 Ascend Multi-core Architecture¶
Hardware Configuration: - A2/A3: 24 AI Cores - A5: More cores (varies by model)
Architecture Features:
┌─────────────────────────────────┐
│ Host CPU │
└────────────┬────────────────────┘
│
┌────────┴────────┐
│ NPU Device │
│ ┌───┬───┬───┐ │
│ │C0 │C1 │...│ │ AI Cores
│ └───┴───┴───┘ │
│ ┌───────────┐ │
│ │ GM │ │ Global Memory
│ └───────────┘ │
└─────────────────┘
Core Features: - Each core executes independently - Shared global memory (GM) - Independent L1 cache - Inter-core communication via GM
1.2 Parallel Programming Models¶
Two Main Patterns:
| Pattern | Features | Use Case |
|---|---|---|
| SPMD | All cores run same code | Regular data parallelism |
| MPMD | Different cores run different code | Pipeline, producer-consumer |
2. SPMD Programming Pattern¶
2.1 Basic Concept¶
SPMD (Single Program, Multiple Data):
- All cores execute the same program
- Distinguish different data blocks via block_idx
- Most commonly used parallel pattern
2.2 Basic Example¶
Vector Addition:
__global__ __aicore__ void VecAddKernel(__gm__ float* out,
__gm__ const float* in0,
__gm__ const float* in1,
uint32_t totalLength) {
// Get current core ID
int block_idx = get_block_idx();
int block_num = get_block_num();
// Calculate data range for current core
int elements_per_block = (totalLength + block_num - 1) / block_num;
int start = block_idx * elements_per_block;
int end = min(start + elements_per_block, totalLength);
// Process current block
for (int i = start; i < end; i += TILE_SIZE) {
int size = min(TILE_SIZE, end - i);
using TileT = Tile<TileType::Vec, float, 8, 256>;
TileT a, b, c;
TLOAD(a, GlobalTensor(in0 + i));
TLOAD(b, GlobalTensor(in1 + i));
TADD(c, a, b);
TSTORE(GlobalTensor(out + i), c);
}
}
2.3 2D Data Partitioning¶
Matrix Multiplication Example:
__global__ __aicore__ void MatMulKernel(__gm__ float* C,
__gm__ const float* A,
__gm__ const float* B,
int M, int K, int N) {
// Get core ID
int block_idx = get_block_idx();
// 2D partitioning: M and N dimensions
int blocks_m = (M + TILE_M - 1) / TILE_M;
int blocks_n = (N + TILE_N - 1) / TILE_N;
int block_m = block_idx / blocks_n;
int block_n = block_idx % blocks_n;
// Calculate matrix block for current core
int m_start = block_m * TILE_M;
int n_start = block_n * TILE_N;
// Ensure no out-of-bounds
if (m_start >= M || n_start >= N) return;
int m_size = min(TILE_M, M - m_start);
int n_size = min(TILE_N, N - n_start);
// Execute matrix multiplication
TileAcc acc;
TFILL(acc, 0);
for (int k = 0; k < K; k += TILE_K) {
int k_size = min(TILE_K, K - k);
TLOAD(tileA, A[m_start:m_start+m_size, k:k+k_size]);
TLOAD(tileB, B[k:k+k_size, n_start:n_start+n_size]);
TMATMUL_ACC(acc, tileA, tileB);
}
TSTORE(C[m_start:m_start+m_size, n_start:n_start+n_size], acc);
}
3. MPMD Programming Pattern¶
3.1 Basic Concept¶
MPMD (Multiple Program, Multiple Data): - Different cores execute different programs - Suitable for pipeline and producer-consumer patterns - Requires inter-core synchronization
3.2 Task Dispatch Pattern¶
Method 1: Single Entry + Switch
__global__ __aicore__ void MPMDKernel(__gm__ float* out,
__gm__ const float* in,
uint32_t task_id) {
switch (task_id) {
case 0:
ProducerTask(out, in);
break;
case 1:
ConsumerTask(out, in);
break;
case 2:
ProcessorTask(out, in);
break;
default:
break;
}
}
3.3 Pipeline Pattern¶
Three-stage Pipeline:
__global__ __aicore__ void PipelineKernel(__gm__ float* out,
__gm__ const float* in,
uint32_t stage_id) {
switch (stage_id) {
case 0: // Stage 1: Load
for (int i = 0; i < N; i++) {
TLOAD(buffer1[i], in[i]);
signal_stage2();
}
break;
case 1: // Stage 2: Compute
for (int i = 0; i < N; i++) {
wait_stage1();
TCOMPUTE(buffer2[i], buffer1[i]);
signal_stage3();
}
break;
case 2: // Stage 3: Store
for (int i = 0; i < N; i++) {
wait_stage2();
TSTORE(out[i], buffer2[i]);
}
break;
}
}
4. Load Balancing¶
4.1 Static Load Balancing¶
Uniform Partitioning:
// Method 1: Simple division
int elements_per_block = totalLength / block_num;
int start = block_idx * elements_per_block;
int end = (block_idx == block_num - 1) ?
totalLength : start + elements_per_block;
// Method 2: Ceiling division
int elements_per_block = (totalLength + block_num - 1) / block_num;
int start = block_idx * elements_per_block;
int end = min(start + elements_per_block, totalLength);
4.2 Load Imbalance Detection¶
Detection Method:
// Record execution time for each core
#ifdef PROFILE
auto start = GetTime();
// Execute task
process_block(block_idx);
auto end = GetTime();
execution_times[block_idx] = end - start;
#endif
// Analyze load balance
float max_time = *max_element(execution_times.begin(),
execution_times.end());
float min_time = *min_element(execution_times.begin(),
execution_times.end());
float imbalance = (max_time - min_time) / max_time;
if (imbalance > 0.2) {
printf("Warning: Load imbalance detected: %.2f%%\n",
imbalance * 100);
}
5. Inter-core Communication¶
5.1 Communication via Global Memory¶
Basic Pattern:
// Core 0: Write data
__global__ __aicore__ void Writer(__gm__ float* shared_buffer) {
if (get_block_idx() == 0) {
TLOAD(tile, local_data);
TSTORE(shared_buffer, tile);
// Set flag indicating data is ready
shared_buffer[FLAG_OFFSET] = 1;
}
}
// Core 1: Read data
__global__ __aicore__ void Reader(__gm__ float* shared_buffer) {
if (get_block_idx() == 1) {
// Wait for data ready
while (shared_buffer[FLAG_OFFSET] != 1) {
// Spin wait
}
TLOAD(tile, shared_buffer);
process(tile);
}
}
5.2 Synchronization with Atomic Operations¶
Counter Synchronization:
__gm__ atomic<int> counter = 0;
__global__ __aicore__ void SyncKernel(...) {
// Each core increments counter after completing work
process_local_work();
counter.fetch_add(1);
// Wait for all cores to complete
while (counter.load() < block_num) {
// Spin wait
}
// Continue to next stage
next_stage_work();
}
6. Performance Optimization¶
6.1 Reduce Inter-core Communication¶
Strategy 1: Increase Data Block Size
// Bad: Frequent communication
for (int i = 0; i < N; i++) {
process_small_block(i);
sync_with_other_cores(); // Sync every time
}
// Good: Batch processing
for (int i = 0; i < N; i += BATCH_SIZE) {
process_large_block(i, BATCH_SIZE);
sync_with_other_cores(); // Batch sync
}
Strategy 2: Localize Computation
// Make each core complete work independently
__global__ __aicore__ void LocalizedKernel(...) {
int block_idx = get_block_idx();
// Each core processes complete subproblem
// No need to communicate with other cores
process_independent_subproblem(block_idx);
}
6.2 Optimize Data Partitioning¶
Consider Data Locality:
// 2D matrix: Partition by blocks rather than rows/columns
// Good: Each core accesses contiguous memory blocks
for (int bm = 0; bm < blocks_m; bm++) {
for (int bn = 0; bn < blocks_n; bn++) {
int block_id = bm * blocks_n + bn;
if (block_id == get_block_idx()) {
process_block(bm, bn);
}
}
}
6.3 Avoid False Sharing¶
Problem:
// Bad: Multiple cores write to adjacent locations
__gm__ float results[NUM_CORES];
__global__ __aicore__ void BadKernel(...) {
int idx = get_block_idx();
results[idx] = compute(); // May cause cache line conflicts
}
Solution:
// Good: Use padding to avoid false sharing
constexpr int CACHE_LINE_SIZE = 64;
constexpr int PADDING = CACHE_LINE_SIZE / sizeof(float);
__gm__ float results[NUM_CORES * PADDING];
__global__ __aicore__ void GoodKernel(...) {
int idx = get_block_idx();
results[idx * PADDING] = compute(); // Avoid cache line conflicts
}