lightspeed_fa/fa_v2_pipelined.cu at master · ziereis/lightspeed_fa · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
#include "common.cuh"
#include "utils.cuh"
#include <cfloat>
#include <cmath>
#include <cuda.h>

// N-stage pipelined scaled dot-product attention kernel
// Q, K, V: [B, H, N, D] in row-major
// O: [B, H, N, D] output
template <int BR, int BC, int D, int NUM_THREADS, int STAGES = 2>
__global__ __launch_bounds__(NUM_THREADS, 2) void sdpa_kernel(
    const bf16 *__restrict__ Q, const bf16 *__restrict__ K,
    const bf16 *__restrict__ V, bf16 *__restrict__ O, int N, int H, int B) {
  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
  constexpr int DIST_BR = NUM_WARPS * MMA_SIZE;
  const int widx = threadIdx.x / WARP_SIZE;
  int tile_idx = blockIdx.x;
  int h = blockIdx.y;
  int b = blockIdx.z;

  const bf16 *Q_bh = Q + b * H * N * D + h * N * D;
  const bf16 *K_bh = K + b * H * N * D + h * N * D;
  const bf16 *V_bh = V + b * H * N * D + h * N * D;
  bf16 *O_bh = O + b * H * N * D + h * N * D;

  const bf16 *Q_i = Q_bh + tile_idx * BR * D;
  bf16 *O_i = O_bh + tile_idx * BR * D;

  __shared__ bf16 KV_smem[STAGES][BC * D * 2];

  bf16 *Q_s = KV_smem[0];

  ld_global_shmem<D, BR, D, NUM_WARPS, true>(Q_i, Q_s, D);
  cp_async_commit();
  cp_async_wait<0>();
  __syncthreads();

  bf16 Q_frag[BR / DIST_BR][D / MMA_SIZE][8];
  load_frag<D, DIST_BR, MMA_SIZE, false, true>(Q_s + widx * MMA_SIZE * D,
                                               Q_frag);
  __syncthreads();

  float O_frag[BR / DIST_BR][D / MMA_SIZE][8] = {0.0f};

  constexpr int MMA_TILES_M = BR / DIST_BR;
  constexpr int ROWS_PER_MMA = 2;
  constexpr int ROWS_PER_THREAD = MMA_TILES_M * ROWS_PER_MMA;

  float m_i[ROWS_PER_THREAD];
  float l_i[ROWS_PER_THREAD];
  for (int r = 0; r < ROWS_PER_THREAD; r++) {
    m_i[r] = -FLT_MAX;
    l_i[r] = 0.0f;
  }

  float scale = 1.0f / sqrtf(D);
  int num_tiles = N / BC;

#pragma unroll
  for (int s = 0; s < STAGES - 1 && s < num_tiles; s++) {
    bf16 *K_s = KV_smem[s];
    bf16 *V_s = KV_smem[s] + BC * D;
    ld_global_shmem<D, BC, D, NUM_WARPS, true>(K_bh + s * BC * D, K_s, D);
    ld_global_shmem<D, BC, D, NUM_WARPS, true>(V_bh + s * BC * D, V_s, D);
    cp_async_commit();
  }

  for (int tile = 0; tile < num_tiles; tile++) {
    int buf = tile % STAGES;
    bf16 *K_s = KV_smem[buf];
    bf16 *V_s = KV_smem[buf] + BC * D;

    cp_async_wait<STAGES - 2>();
    __syncthreads();

    int prefetch_tile = tile + STAGES - 1;
    if (prefetch_tile < num_tiles) {
      int prefetch_buf = prefetch_tile % STAGES;
      bf16 *K_prefetch = KV_smem[prefetch_buf];
      bf16 *V_prefetch = KV_smem[prefetch_buf] + BC * D;
      ld_global_shmem<D, BC, D, NUM_WARPS, true>(K_bh + prefetch_tile * BC * D,
                                                 K_prefetch, D);
      ld_global_shmem<D, BC, D, NUM_WARPS, true>(V_bh + prefetch_tile * BC * D,
                                                 V_prefetch, D);
      cp_async_commit();
    }

    // S_ij = (Q_i @ K_j^T) * scale
    bf16 K_frag[BC / MMA_SIZE][D / MMA_SIZE][8];
    load_frag<D, MMA_SIZE, MMA_SIZE, false, true>(K_s, K_frag);
    float S_frag[BR / DIST_BR][BC / MMA_SIZE][8] = {0};
    mma_transpose_b(Q_frag, K_frag, S_frag);

#pragma unroll
    for (int row = 0; row < BR / DIST_BR; row++) {
#pragma unroll
      for (int col = 0; col < BC / MMA_SIZE; col++) {
#pragma unroll
        for (int elem = 0; elem < 8; elem++) {
          S_frag[row][col][elem] *= scale;
        }
      }
    }

    // m_ij = max(S_ij, axis=1)
    float local_rowmax[ROWS_PER_THREAD];
#pragma unroll
    for (int r = 0; r < ROWS_PER_THREAD; r++) {
      local_rowmax[r] = -FLT_MAX;
    }

    frag_reduce(S_frag, local_rowmax,
                [](float a, float b) { return max(a, b); });

#pragma unroll
    for (int r = 0; r < ROWS_PER_THREAD; r++) {
      local_rowmax[r] = max(local_rowmax[r], m_i[r]);
    }

    float alpha[ROWS_PER_THREAD];
#pragma unroll
    for (int r = 0; r < ROWS_PER_THREAD; r++) {
      alpha[r] = __expf(m_i[r] - local_rowmax[r]);
    }

#pragma unroll
    for (int r = 0; r < ROWS_PER_THREAD; r++) {
      m_i[r] = local_rowmax[r];
    }

    // P_ij = exp(S_ij - m_i_new)
    frag_row_apply(S_frag, local_rowmax,
                   [](float a, float b) { return __expf(a - b); });

    // Compute row sums for normalization
    float (&l_ij)[ROWS_PER_THREAD] = local_rowmax;
#pragma unroll
    for (int r = 0; r < ROWS_PER_THREAD; r++) {
      l_ij[r] = 0.0f;
    }
    frag_reduce(S_frag, l_ij, [](float a, float b) { return a + b; });

#pragma unroll
    for (int r = 0; r < ROWS_PER_THREAD; r++) {
      l_i[r] = alpha[r] * l_i[r] + l_ij[r];
    }

    // Scale previous O accumulator
    frag_row_apply(O_frag, alpha, [](float a, float b) { return a * b; });

    // O_i += P_ij @ V_j
    bf16 P_frag[BR / DIST_BR][BC / MMA_SIZE][8];
    frag_f32_to_bf16(S_frag, P_frag);
    bf16 V_frag[BC / MMA_SIZE][D / MMA_SIZE][8];
    load_frag<D, MMA_SIZE, MMA_SIZE, true, true>(V_s, V_frag);
    mma(P_frag, V_frag, O_frag);
  }
  __syncthreads();

  frag_row_apply(O_frag, l_i, [](float a, float b) { return a / b; });

  bf16 *O_s = KV_smem[0];
  store_frag_bf16<D, DIST_BR, MMA_SIZE, false>(O_frag,
                                               O_s + widx * MMA_SIZE * D);
  __syncthreads();
  store_shmem_global<D, BR, D, NUM_WARPS, false>(O_s, O_i, D);
}

template <int BR, int BC, int D, int NUM_THREADS, int STAGES = 2>
void sdpa(bf16 *Q, bf16 *K, bf16 *V, bf16 *O, int N, int H, int B) {
  int num_seq_blocks = (N + BR - 1) / BR;
  dim3 grid(num_seq_blocks, H, B);
  sdpa_kernel<BR, BC, D, NUM_THREADS, STAGES>
      <<<grid, NUM_THREADS>>>(Q, K, V, O, N, H, B);
}

template <int BR, int BC, int D, int NUM_THREADS, int STAGES = 2>
float benchmark_sdpa(bf16 *Q, bf16 *K, bf16 *V, bf16 *O, int N, int H, int B,
                     int warmup_iters = 10, int bench_iters = 100) {
  for (int i = 0; i < warmup_iters; i++) {
    sdpa<BR, BC, D, NUM_THREADS, STAGES>(Q, K, V, O, N, H, B);
  }
  cudaDeviceSynchronize();

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start);
  for (int i = 0; i < bench_iters; i++) {
    sdpa<BR, BC, D, NUM_THREADS, STAGES>(Q, K, V, O, N, H, B);
  }
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);

  float ms_total = 0;
  cudaEventElapsedTime(&ms_total, start, stop);
  float ms_per_iter = ms_total / bench_iters;

  cudaEventDestroy(start);
  cudaEventDestroy(stop);

  // Attention: Q @ K^T (2*B*H*N*N*D) + P @ V (2*B*H*N*N*D) = 4*B*H*N^2*D
  double flops = 4.0 * B * H * (double)N * N * D;
  double tflops = (flops / (ms_per_iter / 1000.0)) / 1e12;

  return tflops;
}

int main() {
  constexpr int BATCH = 2;
  constexpr int HEADS = 8;
  constexpr int SEQ_LEN = 4096;
  constexpr int HEAD_DIM = 64;

  constexpr int B_R = 64;
  constexpr int B_C = 64;
  constexpr int NUM_THREADS = 128;
  constexpr int STAGES = 2;

  printf("Config: B=%d, H=%d, N=%d, D=%d\n", BATCH, HEADS, SEQ_LEN, HEAD_DIM);
  printf("Kernel: B_R=%d, B_C=%d, NUM_THREADS=%d, STAGES=%d\n\n", B_R, B_C,
         NUM_THREADS, STAGES);

  size_t num_elements = BATCH * HEADS * SEQ_LEN * HEAD_DIM;
  bf16 *d_Q = load_bin_to_device_bf16("Q.bin", num_elements);
  bf16 *d_K = load_bin_to_device_bf16("K.bin", num_elements);
  bf16 *d_V = load_bin_to_device_bf16("V.bin", num_elements);

  bf16 *d_O;
  cudaMalloc(&d_O, num_elements * sizeof(bf16));
  cudaMemset(d_O, 0, num_elements * sizeof(bf16));

  sdpa<B_R, B_C, HEAD_DIM, NUM_THREADS, STAGES>(d_Q, d_K, d_V, d_O, SEQ_LEN,
                                                HEADS, BATCH);
  cudaDeviceSynchronize();

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
  }

  bool passed = compare_with_reference_bin(d_O, "out.bin", num_elements, 1e-2f,
                                           false, SEQ_LEN, HEAD_DIM);

  float tflops = benchmark_sdpa<B_R, B_C, HEAD_DIM, NUM_THREADS, STAGES>(
      d_Q, d_K, d_V, d_O, SEQ_LEN, HEADS, BATCH, 10, 100);
  printf("Performance: %.2f TFLOPS\n", tflops);

  // Cleanup
  cudaFree(d_Q);
  cudaFree(d_K);
  cudaFree(d_V);
  cudaFree(d_O);

  return passed ? 0 : 1;
}