refine: add Apache-2.0 license headers and wrap over-long lines in the fused_quanted_ops CUDA sources

wanghuancoder · wanghuancoder · commit 315aef4a8305 · 2025-11-11T03:04:22.000Z
diff --git a/slm/model_zoo/gpt-3/external_ops/fused_quanted_ops/fused_spaq.cu b/slm/model_zoo/gpt-3/external_ops/fused_quanted_ops/fused_spaq.cu
@@ -1,3 +1,17 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include "quant_utils.h"
 
 #define LAUNCH_FUSED_SPAQ(__using_pow2_scaling, __with_prob)          \
@@ -196,7 +210,8 @@ __global__ void FusedSPAQKernel(const phi::bfloat16 *__restrict__ Xin,
   const int quant_block_idx =
       threadIdx.x / 128;  // 0 or 1, two quant blocks per block
   const int64_t in_y_idx = blockIdx.y;
-  const int64_t in_x_idx = static_cast<uint64_t>(blockIdx.x) * blockDim.x + x_offset;
+  const int64_t in_x_idx =
+      static_cast<uint64_t>(blockIdx.x) * blockDim.x + x_offset;
   const int64_t src_idx = in_y_idx * cols + in_x_idx;
 
   // Load data and compute swiGLU activation
diff --git a/slm/model_zoo/gpt-3/external_ops/fused_quanted_ops/fused_stack_transpose_quant.cu b/slm/model_zoo/gpt-3/external_ops/fused_quanted_ops/fused_stack_transpose_quant.cu
@@ -1,3 +1,17 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include "quant_utils.h"
 
 template <typename T, int VecSize>
@@ -199,7 +213,8 @@ __global__ void __launch_bounds__(1024)
     using StoreT = VecType<OutT, 4>;
     StoreT data;
     for (int j = 0; j < 4; j++) {
-      data[j] = shm[i * 32 + threadIdx.y][static_cast<size_t>(threadIdx.x) * 4 + j];
+      data[j] =
+          shm[i * 32 + threadIdx.y][static_cast<size_t>(threadIdx.x) * 4 + j];
     }
     *reinterpret_cast<StoreT*>(out + idx) = data;
   }
diff --git a/slm/model_zoo/gpt-3/external_ops/fused_quanted_ops/fused_transpose_split_quant.cu b/slm/model_zoo/gpt-3/external_ops/fused_quanted_ops/fused_transpose_split_quant.cu
@@ -1,3 +1,17 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include "quant_utils.h"
 
 template <typename T, int VecSize>
@@ -15,7 +29,8 @@ __device__ void BlockLoad(const phi::bfloat16* X,
                           size_t K) {
   for (size_t i = 0; i < 4; i++) {
     size_t off_m = static_cast<size_t>(blockIdx.x) * 128 + threadIdx.y + i * 32;
-    size_t off_k = static_cast<size_t>(blockIdx.y) * 128 + threadIdx.x * VecSize;
+    size_t off_k =
+        static_cast<size_t>(blockIdx.y) * 128 + threadIdx.x * VecSize;
     size_t offset = off_m * K + off_k;
 
     for (size_t j = 0; j < 4; j += VecSize) {
@@ -45,15 +60,18 @@ __device__ void BlockColumnMax(const __nv_bfloat16 input[4][4],
 
   // Reduce [(32), 32, 4] => [32, 4]
   for (int i = 0; i < 4; i++) {
-    shm[static_cast<size_t>(threadIdx.y) * 128 + i * 32 + threadIdx.x] = warp_max[i];
+    shm[static_cast<size_t>(threadIdx.y) * 128 + i * 32 + threadIdx.x] =
+        warp_max[i];
   }
   __syncthreads();
   for (int offset = 16; offset > 0; offset /= 2) {
     if (threadIdx.y < offset) {
       for (int i = 0; i < 4; i++) {
         shm[static_cast<size_t>(threadIdx.y) * 128 + i * 32 + threadIdx.x] =
-            __hmax(shm[static_cast<size_t>(threadIdx.y) * 128 + i * 32 + threadIdx.x],
-                   shm[(static_cast<size_t>(threadIdx.y) + offset) * 128 + i * 32 + threadIdx.x]);
+            __hmax(shm[static_cast<size_t>(threadIdx.y) * 128 + i * 32 +
+                       threadIdx.x],
+                   shm[(static_cast<size_t>(threadIdx.y) + offset) * 128 +
+                       i * 32 + threadIdx.x]);
       }
     }
     __syncthreads();
@@ -79,7 +97,8 @@ __device__ void BlockStoreScale(float* scale,
   }
   if (threadIdx.y == 0) {
     size_t idx_m = blockIdx.x - off_m / 128;
-    size_t off_k = static_cast<size_t>(blockIdx.y) * 128 + threadIdx.x * VecSize;
+    size_t off_k =
+        static_cast<size_t>(blockIdx.y) * 128 + threadIdx.x * VecSize;
     size_t offset = idx_m * K + off_k;
 
     for (size_t j = 0; j < 4; j += VecSize) {
@@ -111,7 +130,8 @@ __device__ void BlockStoreOut(OutT* out,
       using StoreT = VecType<OutT, VecSize>;
       StoreT data;
       for (int j = 0; j < VecSize; j++) {
-        data[j] = shm[i * 32 + threadIdx.y][static_cast<size_t>(threadIdx.x) * 4 + j];
+        data[j] =
+            shm[i * 32 + threadIdx.y][static_cast<size_t>(threadIdx.x) * 4 + j];
       }
       *reinterpret_cast<StoreT*>(out + idx) = data;
     }
@@ -176,8 +196,8 @@ __global__ void __launch_bounds__(1024)
       for (int k = 0; k < VecSize; k++) {
         float input_fp32 = static_cast<float>(input[i][j + k]);
         float output_scaled = input_fp32 * scale_inv[j + k];
-        shm[static_cast<size_t>(threadIdx.x) * VecSize + j * 32 + k][i * 32 + threadIdx.y] =
-            static_cast<OutT>(output_scaled);
+        shm[static_cast<size_t>(threadIdx.x) * VecSize + j * 32 + k]
+           [i * 32 + threadIdx.y] = static_cast<OutT>(output_scaled);
       }
     }
   }