Skip to content

Commit ef6649a

Browse files
authored
[Optimize] Optimize tensorwise fp8 performance (PaddlePaddle#2729)
* [Optimize] Optimize tensorwise fp8 performance
1 parent 1b54a28 commit ef6649a

File tree

6 files changed

+318
-88
lines changed

6 files changed

+318
-88
lines changed

custom_ops/gpu_ops/cpp_extensions.cc

Lines changed: 44 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,28 @@ std::vector<paddle::Tensor> NoauxTc(
468468
int topk,
469469
float routed_scaling_factor);
470470

471+
paddle::Tensor cutlass_fp8_fp8_half_gemm_func(
472+
const paddle::Tensor& x,
473+
const paddle::Tensor& y,
474+
const paddle::optional<paddle::Tensor>& bias,
475+
bool trans_x,
476+
bool trans_y,
477+
float scale, // only support per-tensor quantization
478+
std::string output_dtype,
479+
std::string activation_type);
480+
481+
paddle::Tensor MoeFusedHadamardQuantFp8Func(
482+
const paddle::Tensor &input,
483+
const paddle::Tensor &scale,
484+
const paddle::Tensor &topk_ids,
485+
const int top_k,
486+
const int intermediate_size,
487+
const bool tiled);
488+
489+
paddle::Tensor FusedHadamardQuantFp8Func(
490+
const paddle::Tensor &input,
491+
const float scale);
492+
471493
PYBIND11_MODULE(fastdeploy_ops, m) {
472494

473495
m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"),
@@ -697,38 +719,21 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
697719
"text_image_gather_scatter function");
698720

699721
m.def("count_tokens_per_expert_func", &count_tokens_per_expert_func);
722+
700723
m.def("tritonmoe_preprocess_func", &tritonmoe_preprocess_kernel);
701724

702725
m.def("MoeWna16MarlinGemmApi", &MoeWna16MarlinGemmApi,
703-
py::arg("a"),
704-
py::arg("c_or_none"),
705-
py::arg("b_q_weight"),
706-
py::arg("b_scales"),
707-
py::arg("global_scale_or_none"),
708-
py::arg("b_zeros_or_none"),
709-
py::arg("g_idx_or_none"),
710-
py::arg("perm_or_none"),
711-
py::arg("workspace"),
712-
py::arg("sorted_token_ids"),
713-
py::arg("expert_ids"),
714-
py::arg("num_tokens_post_padded"),
715-
py::arg("topk_weights"),
716-
py::arg("moe_block_size"),
717-
py::arg("top_k"),
718-
py::arg("mul_topk_weights"),
719-
py::arg("is_ep"),
720-
py::arg("b_q_type_str"),
721-
py::arg("size_m"),
722-
py::arg("size_n"),
723-
py::arg("size_k"),
724-
py::arg("is_k_full"),
725-
py::arg("use_atomic_add"),
726-
py::arg("use_fp32_reduce"),
727-
py::arg("is_zp_float"));
726+
py::arg("a"), py::arg("c_or_none"), py::arg("b_q_weight"),
727+
py::arg("b_scales"), py::arg("global_scale_or_none"), py::arg("b_zeros_or_none"),
728+
py::arg("g_idx_or_none"), py::arg("perm_or_none"), py::arg("workspace"), py::arg("sorted_token_ids"),
729+
py::arg("expert_ids"), py::arg("num_tokens_post_padded"), py::arg("topk_weights"), py::arg("moe_block_size"),
730+
py::arg("top_k"), py::arg("mul_topk_weights"), py::arg("is_ep"), py::arg("b_q_type_str"),
731+
py::arg("size_m"), py::arg("size_n"), py::arg("size_k"), py::arg("is_k_full"), py::arg("use_atomic_add"),
732+
py::arg("use_fp32_reduce"), py::arg("is_zp_float"));
733+
728734
m.def("get_position_ids_and_mask_encoder_batch", &GetPositionIdsAndMaskEncoderBatch,
729735
"get_position_ids_and_mask_encoder_batch function");
730736

731-
732737
/**
733738
* cutlass_scaled_mm.cu
734739
* cutlass_scaled_mm
@@ -753,6 +758,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
753758
m.def("dynamic_per_token_scaled_fp8_quant", &DynamicPerTokenScaledFp8Quant,
754759
"dynamic_per_token_scaled_fp8_quant function",
755760
py::arg("out"), py::arg("input"), py::arg("scales"), py::arg("scale_ub"));
761+
756762
m.def("decode_mla_write_cache", &DecodeMLAWriteCacheKernel, "decode_mla_write_cache function");
757763

758764
m.def("prefill_mla_write_cache", &PrefillMLAWriteCacheKernel, "prefill_mla_write_cache function");
@@ -762,4 +768,16 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
762768
m.def("multi_head_latent_attention", &MultiHeadLatentAttention, "multi_head_latent_attention function");
763769

764770
m.def("noaux_tc",&NoauxTc, "noaux_tc for Deepseekv3 MoE compute");
771+
772+
m.def("cutlass_fp8_fp8_half_gemm_fused", &cutlass_fp8_fp8_half_gemm_func,
773+
py::arg("x"), py::arg("y"), py::arg("bias"), py::arg("transpose_x"),
774+
py::arg("transpose_y"), py::arg("scale"), py::arg("output_dtype"),
775+
py::arg("activation_type"), "cutlass_fp8_fp8_half_gemm_fused function");
776+
777+
m.def("moe_fused_hadamard_quant_fp8", &MoeFusedHadamardQuantFp8Func,
778+
py::arg("input"), py::arg("scale"), py::arg("topk_ids"),
779+
py::arg("top_k"), py::arg("intermediate_size"), py::arg("tiled"), "moe_fused_hadamard_quant_fp8 function");
780+
781+
m.def("fused_hadamard_quant_fp8", &FusedHadamardQuantFp8Func,
782+
py::arg("input"), py::arg("scale"), "fused_hadamard_quant_fp8 function");
765783
}

custom_ops/gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_gemm.cu

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
#include "fp8_fp8_half_cuda_core_gemm.h"
2020

2121

22-
std::vector<paddle::Tensor> cutlass_fp8_fp8_half_gemm(
22+
paddle::Tensor cutlass_fp8_fp8_half_gemm_func(
2323
const paddle::Tensor& x,
2424
const paddle::Tensor& y,
2525
const paddle::optional<paddle::Tensor>& bias,
@@ -142,7 +142,7 @@ std::vector<paddle::Tensor> cutlass_fp8_fp8_half_gemm(
142142
{
143143
if(output_dtype == "bfloat16") {
144144
cuda_core_gemm_launcher<__nv_fp8_e4m3, __nv_bfloat16>(params);
145-
145+
146146
} else {
147147
cuda_core_gemm_launcher<__nv_fp8_e4m3, half>(params);
148148
}
@@ -174,7 +174,21 @@ std::vector<paddle::Tensor> cutlass_fp8_fp8_half_gemm(
174174
fuse_gemm_config};
175175
fp8_fp8_gemm_scale_bias_act(params);
176176
}
177-
return {out};
177+
return out;
178+
}
179+
180+
std::vector<paddle::Tensor> cutlass_fp8_fp8_half_gemm(
181+
const paddle::Tensor& x,
182+
const paddle::Tensor& y,
183+
const paddle::optional<paddle::Tensor>& bias,
184+
bool trans_x,
185+
bool trans_y,
186+
float scale, // only support per-tensor quantization
187+
std::string output_dtype,
188+
std::string activation_type) {
189+
return {cutlass_fp8_fp8_half_gemm_func(
190+
x, y, bias, trans_x, trans_y, scale,
191+
output_dtype, activation_type)};
178192
}
179193

180194
std::vector<std::vector<int64_t>> CutlassFp8Fp8HalfGemmFusedInferShape(
custom_ops/gpu_ops/fused_hadamard_quant_fp8.cu

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include <fcntl.h>
16+
#include <stdio.h>
17+
#include <stdlib.h>
18+
#include <string.h>
19+
#include <sys/mman.h>
20+
#include <sys/stat.h>
21+
#include <sys/types.h>
22+
#include <unistd.h>
23+
#include <algorithm>
24+
#include "helper.h"
25+
26+
__device__ __forceinline__ void hadamard32_warp(__nv_bfloat16& x) {
27+
int lane_id = threadIdx.x % 32;
28+
#pragma unroll
29+
for (int step = 0; step < 5; ++step) {
30+
const int lane_mask = 1 << step;
31+
const __nv_bfloat16 sign = (lane_id & lane_mask) ? -1.f : 1.f;
32+
__nv_bfloat16 x_val_other = __shfl_xor_sync(0xffffffff, x, lane_mask);
33+
x = sign * x + x_val_other;
34+
}
35+
}
36+
37+
__global__ void MoeFusedHadamardQuantFp8Kernel(
38+
const __nv_bfloat16* __restrict__ input,
39+
const float* __restrict__ scale,
40+
const int64_t* __restrict__ topk_ids,
41+
__nv_fp8_e4m3* out,
42+
const int top_k,
43+
const int intermediate_size,
44+
const int64_t numel
45+
) {
46+
int64_t out_idx = blockIdx.x * blockDim.x + threadIdx.x;
47+
if (out_idx >= numel) return;
48+
49+
int64_t token_idx = out_idx / (top_k * intermediate_size);
50+
int64_t topk_idx = (out_idx / intermediate_size) % top_k;
51+
int64_t inter_idx = out_idx % intermediate_size;
52+
53+
int64_t input_idx = token_idx * intermediate_size + inter_idx;
54+
if (input_idx >= numel / top_k) return;
55+
56+
int64_t expert_id = topk_ids[token_idx * top_k + topk_idx];
57+
float scale_value = scale[expert_id];
58+
59+
__nv_bfloat16 x = input[input_idx];
60+
hadamard32_warp(x);
61+
62+
float x_fp32 = __bfloat162float(x);
63+
float quantized = x_fp32 / scale_value;
64+
out[out_idx] = static_cast<__nv_fp8_e4m3>(quantized);
65+
}
66+
67+
__global__ void MoeFusedHadamardQuantFp8TiledKernel(
68+
const __nv_bfloat16* __restrict__ input,
69+
const float* __restrict__ scale,
70+
const int64_t* __restrict__ topk_ids,
71+
__nv_fp8_e4m3* out,
72+
const int top_k,
73+
const int intermediate_size,
74+
const int64_t numel
75+
) {
76+
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
77+
if (idx >= numel) return;
78+
79+
int64_t token_idx = idx / intermediate_size;
80+
int64_t expert_id = topk_ids[token_idx];
81+
float scale_value = scale[expert_id];
82+
83+
__nv_bfloat16 x = input[idx];
84+
hadamard32_warp(x);
85+
86+
float x_fp32 = __bfloat162float(x);
87+
float quantized = x_fp32 / scale_value;
88+
out[idx] = static_cast<__nv_fp8_e4m3>(quantized);
89+
}
90+
91+
std::vector<paddle::Tensor> MoeFusedHadamardQuantFp8(
92+
const paddle::Tensor &input,
93+
const paddle::Tensor &scale,
94+
const paddle::Tensor &topk_ids,
95+
const int top_k,
96+
const int intermediate_size,
97+
const bool tiled) {
98+
int64_t numel = input.numel();
99+
if (!tiled) numel *= top_k;
100+
paddle::Tensor out = GetEmptyTensor(
101+
{numel / intermediate_size, intermediate_size},
102+
paddle::DataType::FLOAT8_E4M3FN,
103+
input.place());
104+
constexpr int64_t thread_per_block = 256;
105+
int64_t block_per_grid = (numel + thread_per_block - 1) / thread_per_block;
106+
auto stream = input.stream();
107+
if (tiled) {
108+
MoeFusedHadamardQuantFp8TiledKernel<<<block_per_grid, thread_per_block, 0, stream>>>(
109+
reinterpret_cast<const __nv_bfloat16*>(input.data<paddle::bfloat16>()),
110+
scale.data<float>(),
111+
topk_ids.data<int64_t>(),
112+
reinterpret_cast<__nv_fp8_e4m3*>(out.mutable_data<phi::dtype::float8_e4m3fn>()),
113+
top_k,
114+
intermediate_size,
115+
numel
116+
);
117+
} else {
118+
MoeFusedHadamardQuantFp8Kernel<<<block_per_grid, thread_per_block, 0, stream>>>(
119+
reinterpret_cast<const __nv_bfloat16*>(input.data<phi::dtype::bfloat16>()),
120+
scale.data<float>(),
121+
topk_ids.data<int64_t>(),
122+
reinterpret_cast<__nv_fp8_e4m3*>(out.mutable_data<phi::dtype::float8_e4m3fn>()),
123+
top_k,
124+
intermediate_size,
125+
numel
126+
);
127+
}
128+
return {out};
129+
}
130+
131+
PD_BUILD_STATIC_OP(moe_fused_hadamard_quant_fp8)
132+
.Inputs({"input", "scale", "topk_ids"})
133+
.Outputs({"output"})
134+
.Attrs({"top_k: int",
135+
"intermediate_size: int",
136+
"tiled: bool"})
137+
.SetKernelFn(PD_KERNEL(MoeFusedHadamardQuantFp8));
138+
139+
140+
paddle::Tensor MoeFusedHadamardQuantFp8Func(
141+
const paddle::Tensor &input,
142+
const paddle::Tensor &scale,
143+
const paddle::Tensor &topk_ids,
144+
const int top_k,
145+
const int intermediate_size,
146+
const bool tiled) {
147+
return MoeFusedHadamardQuantFp8(input, scale, topk_ids, top_k, intermediate_size, tiled)[0];
148+
}
149+
150+
151+
__global__ void FusedHadamardQuantFp8Kernel(
152+
const __nv_bfloat16* __restrict__ input,
153+
__nv_fp8_e4m3* out,
154+
const float scale,
155+
const int64_t numel) {
156+
int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
157+
if (idx >= numel) return;
158+
159+
__nv_bfloat16 x = input[idx];
160+
hadamard32_warp(x);
161+
162+
float x_fp32 = __bfloat162float(x);
163+
float quantized = x_fp32 / scale;
164+
out[idx] = static_cast<__nv_fp8_e4m3>(quantized);
165+
}
166+
167+
std::vector<paddle::Tensor> FusedHadamardQuantFp8(
168+
const paddle::Tensor &input,
169+
const float scale) {
170+
int64_t numel = input.numel();
171+
paddle::Tensor out = GetEmptyTensor(
172+
input.dims(),
173+
paddle::DataType::FLOAT8_E4M3FN,
174+
input.place());
175+
constexpr int64_t thread_per_block = 256;
176+
int64_t block_per_grid = (numel + thread_per_block - 1) / thread_per_block;
177+
auto stream = input.stream();
178+
FusedHadamardQuantFp8Kernel<<<block_per_grid, thread_per_block, 0, stream>>>(
179+
reinterpret_cast<const __nv_bfloat16*>(input.data<paddle::bfloat16>()),
180+
reinterpret_cast<__nv_fp8_e4m3*>(out.mutable_data<phi::dtype::float8_e4m3fn>()),
181+
scale,
182+
numel
183+
);
184+
return {out};
185+
}
186+
187+
PD_BUILD_STATIC_OP(fused_hadamard_quant_fp8)
188+
.Inputs({"input"})
189+
.Outputs({"output"})
190+
.Attrs({"scale: float"})
191+
.SetKernelFn(PD_KERNEL(FusedHadamardQuantFp8));
192+
193+
194+
paddle::Tensor FusedHadamardQuantFp8Func(
195+
const paddle::Tensor &input,
196+
const float scale) {
197+
return FusedHadamardQuantFp8(input, scale)[0];
198+
}

custom_ops/setup_ops.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,7 @@ def find_end_files(directory, end_str):
442442
"gpu_ops/scaled_gemm_f8_i4_f16_weight_quantize.cu",
443443
"gpu_ops/cutlass_kernels/cutlass_heuristic.cu",
444444
"gpu_ops/cutlass_kernels/cutlass_preprocessors.cu",
445+
"gpu_ops/fused_hadamard_quant_fp8.cu"
445446
]
446447

447448
sources += find_end_files(fp8_auto_gen_directory, ".cu")

0 commit comments

Comments (0)