Fix eb5 big tensor bug: widen 32-bit index/size arithmetic to 64-bit in the fused quant ops

wanghuancoder · wanghuancoder · commit ebc6cfbca365 · 2025-11-11T02:52:05.000Z
diff --git a/slm/model_zoo/gpt-3/external_ops/fused_quanted_ops/fused_spaq.cu b/slm/model_zoo/gpt-3/external_ops/fused_quanted_ops/fused_spaq.cu
@@ -182,8 +182,8 @@ __global__ void FusedSPAQKernel(const phi::bfloat16 *__restrict__ Xin,
                                 const float *__restrict__ prob,
                                 phi::float8_e4m3fn *__restrict__ out,
                                 float *__restrict__ scales,
-                                const int rows,
-                                const int cols) {
+                                const int64_t rows,
+                                const int64_t cols) {
   // Configure shared memory
   __shared__ float smem_tile[256];  // Shared memory for activation values
   __shared__ float warp_max[2][4];  // Shared memory for warp maxima (2 quant
@@ -192,12 +192,12 @@ __global__ void FusedSPAQKernel(const phi::bfloat16 *__restrict__ Xin,
       quant_block_amax[2];  // Shared memory for quant block maxima
 
   const __nv_bfloat16 *X = reinterpret_cast<const __nv_bfloat16 *>(Xin);
-  const int x_offset = threadIdx.x;
+  const uint32_t x_offset = threadIdx.x;
   const int quant_block_idx =
       threadIdx.x / 128;  // 0 or 1, two quant blocks per block
-  const int in_y_idx = blockIdx.y;
-  const int in_x_idx = blockIdx.x * blockDim.x + x_offset;
-  const int src_idx = in_y_idx * cols + in_x_idx;
+  const int64_t in_y_idx = blockIdx.y;
+  const int64_t in_x_idx = static_cast<uint64_t>(blockIdx.x) * blockDim.x + x_offset;
+  const int64_t src_idx = in_y_idx * cols + in_x_idx;
 
   // Load data and compute swiGLU activation
   if (in_x_idx < cols / 2) [[likely]] {
@@ -255,7 +255,7 @@ __global__ void FusedSPAQKernel(const phi::bfloat16 *__restrict__ Xin,
 
   // Phase 3: Compute scales and quantize the outputs
   const float block_max_float = (float)quant_block_amax[quant_block_idx];
-  const int scale_stride = (cols / 2 + 127) / 128;
+  const int64_t scale_stride = (cols / 2 + 127) / 128;
 
   float scale = ComputeScale<float, __nv_fp8_e4m3, using_pow2_scaling>(
       block_max_float, 0.0f);
@@ -265,8 +265,8 @@ __global__ void FusedSPAQKernel(const phi::bfloat16 *__restrict__ Xin,
   float output_scaled_fp32 = smem_tile[x_offset] * scale;
 
 
-  const int g_output_y_offset = in_y_idx;
-  const int g_output_x_offset = in_x_idx;
+  const int64_t g_output_y_offset = in_y_idx;
+  const int64_t g_output_x_offset = in_x_idx;
 
   // Write output and scales
   if (g_output_y_offset < rows && g_output_x_offset < cols / 2) {
@@ -284,8 +284,8 @@ void dispatch_fused_spaq(const paddle::Tensor &X,
                          const paddle::optional<paddle::Tensor> &prob,
                          paddle::Tensor &out,
                          paddle::Tensor &scale,
-                         const int rows,
-                         const int cols,
+                         const int64_t rows,
+                         const int64_t cols,
                          const bool &using_pow2_scaling,
                          const bool &with_prob) {
   constexpr int thread_per_block = 256;
@@ -297,8 +297,8 @@ void dispatch_fused_spaq(const paddle::Tensor &X,
     // 1x128 vector Each block handles several sub-row (numel = 4 x blockDim.x)
     // of input vector
     block.x = thread_per_block;
-    constexpr int vec_numel = 4;
-    const int scale_cols = scale.shape().back();
+    constexpr int64_t vec_numel = 4;
+    const int64_t scale_cols = scale.shape().back();
     DISPATCH_BOOL(
         using_pow2_scaling,
         k_using_pow2_scaling,
diff --git a/slm/model_zoo/gpt-3/external_ops/fused_quanted_ops/fused_stack_transpose_quant.cu b/slm/model_zoo/gpt-3/external_ops/fused_quanted_ops/fused_stack_transpose_quant.cu
@@ -183,7 +183,7 @@ __global__ void __launch_bounds__(1024)
     for (int j = 0; j < 4; j++) {
       float input_fp32 = static_cast<float>(input[i][j]);
       float output_scaled = input_fp32 * scale_inv;
-      shm[threadIdx.x * 4 + j][i * 32 + threadIdx.y] =
+      shm[static_cast<size_t>(threadIdx.x) * 4 + j][i * 32 + threadIdx.y] =
           static_cast<OutT>(output_scaled);
     }
   }
@@ -193,13 +193,13 @@ __global__ void __launch_bounds__(1024)
   for (size_t i = 0; i < 4; i++) {
     size_t idx_n = blockIdx.z;
     size_t idx_k = block_x * 128 + threadIdx.y + i * 32;
-    size_t idx_m = block_y * 128 + threadIdx.x * 4;
+    size_t idx_m = block_y * 128 + static_cast<size_t>(threadIdx.x) * 4;
     size_t idx = (idx_n * K + idx_k) * M + idx_m;
 
     using StoreT = VecType<OutT, 4>;
     StoreT data;
     for (int j = 0; j < 4; j++) {
-      data[j] = shm[i * 32 + threadIdx.y][threadIdx.x * 4 + j];
+      data[j] = shm[i * 32 + threadIdx.y][static_cast<size_t>(threadIdx.x) * 4 + j];
     }
     *reinterpret_cast<StoreT*>(out + idx) = data;
   }
diff --git a/slm/model_zoo/gpt-3/external_ops/fused_quanted_ops/fused_swiglu_probs_bwd.cu b/slm/model_zoo/gpt-3/external_ops/fused_quanted_ops/fused_swiglu_probs_bwd.cu
@@ -49,9 +49,9 @@ __global__ void SwigluProbsGradKernel(
     BFloat16* do1,                // [seq_len*topk, moe_intermediate_size*2]
     float* probs_grad,            // [seq_len*topk, 1]
     BFloat16* o2_s,               // [seq_len*topk, moe_intermediate_size]
-    int moe_intermediate_size) {
-  const int row_idx = blockIdx.x;
-  const int tid = threadIdx.x;
+    int64_t moe_intermediate_size) {
+  const int64_t row_idx = blockIdx.x;
+  const int64_t tid = threadIdx.x;
 
   const BFloat16* o1_row = o1 + row_idx * moe_intermediate_size * 2;
   const BFloat16* do2_s_row = do2_s + row_idx * moe_intermediate_size;
@@ -64,7 +64,7 @@ __global__ void SwigluProbsGradKernel(
 
   float local_probs_grad = 0.0f;
 
-  for (int i = tid; i < moe_intermediate_size; i += blockDim.x) {
+  for (int64_t i = tid; i < moe_intermediate_size; i += blockDim.x) {
     float lhs = static_cast<float>(o1_row[i]);
     float rhs = static_cast<float>(o1_row[i + moe_intermediate_size]);
 
@@ -185,7 +185,7 @@ __global__ void SwigluProbsGradKernelVec4(
     BFloat16* do1,                // [seq_len*topk, moe_intermediate_size*2]
     float* probs_grad,            // [seq_len*topk, 1]
     BFloat16* o2_s,               // [seq_len*topk, moe_intermediate_size]
-    int moe_intermediate_size) {
+    int64_t moe_intermediate_size) {
   constexpr int numel_per_thread = 4;
   constexpr int k_warp_size = 32;
   const int64_t row_idx = blockIdx.x;
@@ -210,7 +210,7 @@ __global__ void SwigluProbsGradKernelVec4(
 
   float local_probs_grad = 0.0f;
 
-  const int vec_numel = (int64_t)moe_intermediate_size / numel_per_thread;
+  const int64_t vec_numel = (int64_t)moe_intermediate_size / numel_per_thread;
   for (int64_t i = tid; i < vec_numel; i += blockDim.x) {
     float4 lhs_vec4 = load_and_cast_float4(o1_row_left_half_vec4 + i);
     float4 rhs_vec4 = load_and_cast_float4(o1_row_right_half_vec4 + i);
@@ -262,13 +262,13 @@ std::vector<paddle::Tensor> SwigluProbsGradCUDABackward(
     const paddle::Tensor& unzipped_probs,
     bool inplace) {
   auto o1_dims = o1.dims();
-  int o1_outer_dim = 1;
+  int64_t o1_outer_dim = 1;
   for (int i = 0; i < o1_dims.size() - 1; i++) {
     o1_outer_dim *= o1_dims[i];
   }
 
-  const int moe_intermediate_size_2 = o1_dims[o1_dims.size() - 1];
-  const int moe_intermediate_size = moe_intermediate_size_2 / 2;
+  const int64_t moe_intermediate_size_2 = o1_dims[o1_dims.size() - 1];
+  const int64_t moe_intermediate_size = moe_intermediate_size_2 / 2;
 
   auto do1 = inplace ? o1 : paddle::empty_like(o1);
   auto probs_grad =
diff --git a/slm/model_zoo/gpt-3/external_ops/fused_quanted_ops/fused_transpose_split_quant.cu b/slm/model_zoo/gpt-3/external_ops/fused_quanted_ops/fused_transpose_split_quant.cu
@@ -14,8 +14,8 @@ __device__ void BlockLoad(const phi::bfloat16* X,
                           __nv_bfloat16 input[4][4],
                           size_t K) {
   for (size_t i = 0; i < 4; i++) {
-    size_t off_m = blockIdx.x * 128 + threadIdx.y + i * 32;
-    size_t off_k = blockIdx.y * 128 + threadIdx.x * VecSize;
+    size_t off_m = static_cast<size_t>(blockIdx.x) * 128 + threadIdx.y + i * 32;
+    size_t off_k = static_cast<size_t>(blockIdx.y) * 128 + threadIdx.x * VecSize;
     size_t offset = off_m * K + off_k;
 
     for (size_t j = 0; j < 4; j += VecSize) {
@@ -45,15 +45,15 @@ __device__ void BlockColumnMax(const __nv_bfloat16 input[4][4],
 
   // Reduce [(32), 32, 4] => [32, 4]
   for (int i = 0; i < 4; i++) {
-    shm[threadIdx.y * 128 + i * 32 + threadIdx.x] = warp_max[i];
+    shm[static_cast<size_t>(threadIdx.y) * 128 + i * 32 + threadIdx.x] = warp_max[i];
   }
   __syncthreads();
   for (int offset = 16; offset > 0; offset /= 2) {
     if (threadIdx.y < offset) {
       for (int i = 0; i < 4; i++) {
-        shm[threadIdx.y * 128 + i * 32 + threadIdx.x] =
-            __hmax(shm[threadIdx.y * 128 + i * 32 + threadIdx.x],
-                   shm[(threadIdx.y + offset) * 128 + i * 32 + threadIdx.x]);
+        shm[static_cast<size_t>(threadIdx.y) * 128 + i * 32 + threadIdx.x] =
+            __hmax(shm[static_cast<size_t>(threadIdx.y) * 128 + i * 32 + threadIdx.x],
+                   shm[(static_cast<size_t>(threadIdx.y) + offset) * 128 + i * 32 + threadIdx.x]);
       }
     }
     __syncthreads();
@@ -79,7 +79,7 @@ __device__ void BlockStoreScale(float* scale,
   }
   if (threadIdx.y == 0) {
     size_t idx_m = blockIdx.x - off_m / 128;
-    size_t off_k = blockIdx.y * 128 + threadIdx.x * VecSize;
+    size_t off_k = static_cast<size_t>(blockIdx.y) * 128 + threadIdx.x * VecSize;
     size_t offset = idx_m * K + off_k;
 
     for (size_t j = 0; j < 4; j += VecSize) {
@@ -103,15 +103,15 @@ __device__ void BlockStoreOut(OutT* out,
                               const OutT shm[128][129],
                               size_t K) {
   for (size_t i = 0; i < 4; i++) {
-    size_t idx_m = blockIdx.x * 128 + threadIdx.x * 4;
-    size_t idx_k = blockIdx.y * 128 + threadIdx.y + i * 32;
+    size_t idx_m = static_cast<size_t>(blockIdx.x) * 128 + threadIdx.x * 4;
+    size_t idx_k = static_cast<size_t>(blockIdx.y) * 128 + threadIdx.y + i * 32;
     size_t idx = idx_k * cur_tokens + (idx_m - off_m);
 
     if (idx_k < K) {
       using StoreT = VecType<OutT, VecSize>;
       StoreT data;
       for (int j = 0; j < VecSize; j++) {
-        data[j] = shm[i * 32 + threadIdx.y][threadIdx.x * 4 + j];
+        data[j] = shm[i * 32 + threadIdx.y][static_cast<size_t>(threadIdx.x) * 4 + j];
       }
       *reinterpret_cast<StoreT*>(out + idx) = data;
     }
@@ -123,7 +123,7 @@ __device__ std::pair<size_t, size_t> GetExpertIdx(int64_t* tokens_per_expert,
   __shared__ size_t expert_idx_, off_m_;
 
   if (threadIdx.x == 0 && threadIdx.y == 0) {
-    size_t idx_m = blockIdx.x * 128;
+    size_t idx_m = static_cast<size_t>(blockIdx.x) * 128;
     size_t off_m = 0, next_off_m = 0;
     size_t expert_idx;
     for (expert_idx = 0; expert_idx < num_experts; expert_idx++) {
@@ -176,7 +176,7 @@ __global__ void __launch_bounds__(1024)
       for (int k = 0; k < VecSize; k++) {
         float input_fp32 = static_cast<float>(input[i][j + k]);
         float output_scaled = input_fp32 * scale_inv[j + k];
-        shm[threadIdx.x * VecSize + j * 32 + k][i * 32 + threadIdx.y] =
+        shm[static_cast<size_t>(threadIdx.x) * VecSize + j * 32 + k][i * 32 + threadIdx.y] =
             static_cast<OutT>(output_scaled);
       }
     }

Original file line number	Diff line number	Diff line change
`@@ -183,7 +183,7 @@ __global__ void __launch_bounds__(1024)`
`183`	`183`	`for (int j = 0; j < 4; j++) {`
`184`	`184`	`float input_fp32 = static_cast<float>(input[i][j]);`
`185`	`185`	`float output_scaled = input_fp32 * scale_inv;`
`186`		`- shm[threadIdx.x * 4 + j][i * 32 + threadIdx.y] =`
	`186`	`+ shm[static_cast<size_t>(threadIdx.x) * 4 + j][i * 32 + threadIdx.y] =`
`187`	`187`	`static_cast<OutT>(output_scaled);`
`188`	`188`	`}`
`189`	`189`	`}`
`@@ -193,13 +193,13 @@ __global__ void __launch_bounds__(1024)`
`193`	`193`	`for (size_t i = 0; i < 4; i++) {`
`194`	`194`	`size_t idx_n = blockIdx.z;`
`195`	`195`	`size_t idx_k = block_x * 128 + threadIdx.y + i * 32;`
`196`		`- size_t idx_m = block_y * 128 + threadIdx.x * 4;`
	`196`	`+ size_t idx_m = block_y * 128 + static_cast<size_t>(threadIdx.x) * 4;`
`197`	`197`	`size_t idx = (idx_n * K + idx_k) * M + idx_m;`
`198`	`198`
`199`	`199`	`using StoreT = VecType<OutT, 4>;`
`200`	`200`	`StoreT data;`
`201`	`201`	`for (int j = 0; j < 4; j++) {`
`202`		`- data[j] = shm[i * 32 + threadIdx.y][threadIdx.x * 4 + j];`
	`202`	`+ data[j] = shm[i * 32 + threadIdx.y][static_cast<size_t>(threadIdx.x) * 4 + j];`
`203`	`203`	`}`
`204`	`204`	`*reinterpret_cast<StoreT*>(out + idx) = data;`
`205`	`205`	`}`