Skip to content

Commit dd270fc

Browse files
momochen authored and facebook-github-bot committed
Update MSLK Triton FP8 row quantization kernel to match CUDA arithmetic and delete the C++ quantize_fp8_per_row kernel (#224)
Summary: The Triton kernel _kernel_quantize_fp8_row used different arithmetic than the CUDA quantize_fp8_per_row — reciprocal-scale-multiply (a * (MAX_FP8 / amax)) vs true-division (a / (amax / MAX_FP8)). This caused bitwise discrepancies at FP8 rounding boundaries. This diff converges the Triton kernel to match CUDA's arithmetic exactly, then removes the now-redundant C++ kernel. A discrepancy test (fp8_quantize_discrepancy_test.py) was run before being removed; it showed 100% bitwise parity between the CUDA and Triton implementations. Differential Revision: D96502922
1 parent 133d5fc commit dd270fc

File tree

7 files changed

+15
-596
lines changed

7 files changed

+15
-596
lines changed

ci/scripts/mslk_build.bash

Lines changed: 0 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -597,7 +597,6 @@ __verify_library_symbols () {
597597
mslk::gemm::f8f8bf16_rowwise
598598
mslk::kv_cache::rope_qkv_decoding
599599
mslk::moe::index_shuffling_torch
600-
mslk::quantize::quantize_fp8_per_row
601600
)
602601
fi
603602

csrc/quantize/quantize.cu

Lines changed: 0 additions & 178 deletions
Original file line number · Diff line number · Diff line change
@@ -884,184 +884,6 @@ __global__ void computeFP8QuantizeScaleRowwise(
884884
}
885885
}
886886

887-
template <typename SCALE, typename T_OUT, typename T_S, typename T_IN>
888-
void invokeComputeScalesAndQuantizeMatrix(
889-
T_OUT* output,
890-
T_S* quant_ptr,
891-
const T_IN* input,
892-
const int64_t numel,
893-
const int64_t lda,
894-
const float* scale_ub,
895-
bool stochastic_rounding,
896-
const c10::cuda::CUDAStream stream) {
897-
dim3 grid(numel / lda);
898-
#ifdef USE_ROCM
899-
bool use_shmem = true;
900-
#else
901-
bool use_shmem = false;
902-
#endif
903-
auto const shmem_size = lda * sizeof(T_IN);
904-
if (shmem_size >= (48 << 10)) {
905-
cudaError_t ret;
906-
#ifndef USE_ROCM
907-
if (stochastic_rounding) {
908-
ret = cudaFuncSetAttribute(
909-
dynamicQuantizeMatrixRowwiseStoc<SCALE, T_OUT, T_S, T_IN>,
910-
cudaFuncAttributeMaxDynamicSharedMemorySize,
911-
shmem_size);
912-
} else {
913-
ret = cudaFuncSetAttribute(
914-
dynamicQuantizeMatrixRowwise<SCALE, T_OUT, T_S, T_IN>,
915-
cudaFuncAttributeMaxDynamicSharedMemorySize,
916-
shmem_size);
917-
}
918-
use_shmem = ret == cudaSuccess;
919-
#else
920-
use_shmem = false;
921-
#endif
922-
}
923-
if (use_shmem) {
924-
dim3 block(std::min((lda + 31) / 32 * 32, static_cast<int64_t>(1024)));
925-
926-
if (stochastic_rounding) {
927-
at::PhiloxCudaState rng_engine_inputs;
928-
auto gen = at::cuda::detail::getDefaultCUDAGenerator();
929-
std::lock_guard<std::mutex> lock(gen.mutex());
930-
rng_engine_inputs =
931-
at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_cuda_state(4);
932-
933-
MSLK_LAUNCH_KERNEL(
934-
(dynamicQuantizeMatrixRowwiseStoc<SCALE, T_OUT, T_S, T_IN>),
935-
grid,
936-
block,
937-
shmem_size,
938-
stream,
939-
output,
940-
quant_ptr,
941-
input,
942-
numel,
943-
lda,
944-
scale_ub,
945-
rng_engine_inputs);
946-
} else {
947-
MSLK_LAUNCH_KERNEL(
948-
(dynamicQuantizeMatrixRowwise<SCALE, T_OUT, T_S, T_IN>),
949-
grid,
950-
block,
951-
shmem_size,
952-
stream,
953-
output,
954-
quant_ptr,
955-
input,
956-
numel,
957-
lda,
958-
scale_ub);
959-
}
960-
} else {
961-
dim3 block(CTA_SIZE);
962-
MSLK_LAUNCH_KERNEL(
963-
(computeFP8QuantizeScaleRowwise<SCALE, T_S, T_IN>),
964-
grid,
965-
block,
966-
0,
967-
stream,
968-
quant_ptr,
969-
input,
970-
numel,
971-
lda,
972-
scale_ub);
973-
invokeQuantizeMatrixRowwise(
974-
output, quant_ptr, input, numel, lda, stochastic_rounding, stream);
975-
}
976-
}
977-
978-
std::vector<at::Tensor> quantize_fp8_per_row(
979-
at::Tensor input,
980-
std::optional<at::Tensor> bs, // batch size
981-
std::optional<at::Tensor> scale_ub, // scale upperbound
982-
std::optional<c10::ScalarType> output_dtype, // Quantization type
983-
bool stochastic_rounding) {
984-
TORCH_CHECK(
985-
input.dim() >= 2,
986-
"Invalid dim. The dim of input should be greater than or equal to 2");
987-
TORCH_CHECK(
988-
input.scalar_type() == torch::kBFloat16 ||
989-
input.scalar_type() == torch::kFloat ||
990-
input.scalar_type() == torch::kHalf,
991-
"Invalid datatype. input must be BF16, FP16 or FP32");
992-
TORCH_CHECK(
993-
!stochastic_rounding || input.size(-1) % 4 == 0,
994-
"input row dim must be 4's multiple when stochastic_rounding is True");
995-
// Default data type is f8_e4m3fn.
996-
c10::ScalarType quantization_type = torch_fp8_e4m3;
997-
if (output_dtype.has_value()) {
998-
TORCH_CHECK(
999-
(output_dtype.value() == torch_fp8_e4m3 ||
1000-
output_dtype.value() == torch_fp8_e5m2),
1001-
"Invalid output type, must be e4m3 or e5m2.");
1002-
quantization_type = output_dtype.value();
1003-
}
1004-
std::vector<long int> quantized_input_shape;
1005-
for (int i = 0; i < input.dim(); i++)
1006-
quantized_input_shape.push_back(input.size(i));
1007-
std::vector<int64_t> scale_shape;
1008-
for (int i = 0; i < input.dim() - 1; i++)
1009-
scale_shape.push_back(input.size(i));
1010-
1011-
input = input.cuda();
1012-
at::Tensor quantized_input = torch::empty(
1013-
quantized_input_shape,
1014-
torch::dtype(quantization_type)
1015-
.device(torch::kCUDA, at::cuda::current_device())
1016-
.requires_grad(false));
1017-
at::Tensor scales = torch::empty(
1018-
scale_shape,
1019-
torch::dtype(torch::kFloat32)
1020-
.device(torch::kCUDA, at::cuda::current_device())
1021-
.requires_grad(false));
1022-
1023-
if (input.numel() == 0) {
1024-
return std::vector<at::Tensor>{quantized_input, scales};
1025-
}
1026-
1027-
// Templatize implementation based on output type.
1028-
if (quantization_type == torch_fp8_e4m3) {
1029-
auto* const quantized_input_ptr =
1030-
reinterpret_cast<__nv_fp8_e4m3*>(quantized_input.data_ptr());
1031-
const auto stream = at::cuda::getCurrentCUDAStream();
1032-
invokeComputeScalesAndQuantizeMatrix<FP8_E4M3_MAX>(
1033-
quantized_input_ptr,
1034-
reinterpret_cast<float*>(scales.data_ptr()),
1035-
reinterpret_cast<const __nv_bfloat16*>(input.data_ptr()),
1036-
input.numel(),
1037-
input.size(-1),
1038-
scale_ub.has_value()
1039-
? reinterpret_cast<float*>(scale_ub.value().data_ptr())
1040-
: nullptr,
1041-
stochastic_rounding,
1042-
stream);
1043-
1044-
return std::vector<at::Tensor>{quantized_input, scales};
1045-
} else {
1046-
auto* const quantized_input_ptr =
1047-
reinterpret_cast<__nv_fp8_e5m2*>(quantized_input.data_ptr());
1048-
const auto stream = at::cuda::getCurrentCUDAStream();
1049-
invokeComputeScalesAndQuantizeMatrix<FP8_E5M2_MAX>(
1050-
quantized_input_ptr,
1051-
reinterpret_cast<float*>(scales.data_ptr()),
1052-
reinterpret_cast<const __nv_bfloat16*>(input.data_ptr()),
1053-
input.numel(),
1054-
input.size(-1),
1055-
scale_ub.has_value()
1056-
? reinterpret_cast<float*>(scale_ub.value().data_ptr())
1057-
: nullptr,
1058-
stochastic_rounding,
1059-
stream);
1060-
1061-
return std::vector<at::Tensor>{quantized_input, scales};
1062-
}
1063-
}
1064-
1065887
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12080)
1066888

1067889
#ifdef __CUDA_ARCH__

csrc/quantize/quantize_ops.cpp

Lines changed: 0 additions & 26 deletions
Original file line number · Diff line number · Diff line change
@@ -23,9 +23,6 @@ TORCH_LIBRARY_FRAGMENT(mslk, m) {
2323

2424
m.def(
2525
"quantize_fp8_per_tensor(Tensor input, Tensor? bs=None, Tensor? scale_ub=None, bool stochastic_rounding=False) -> Tensor[]");
26-
m.def(
27-
"quantize_fp8_per_row(Tensor input, Tensor? bs=None, Tensor? scale_ub=None, ScalarType? output_dtype=None, bool stochastic_rounding = False) -> Tensor[] ");
28-
2926
m.def(
3027
"get_fp8_per_tensor_scale(Tensor input, Tensor? bs=None, Tensor? scale_ub=None) -> Tensor");
3128

@@ -41,7 +38,6 @@ TORCH_LIBRARY_FRAGMENT(mslk, m) {
4138

4239
TORCH_LIBRARY_IMPL(mslk, CUDA, m) {
4340
DISPATCH_TO_CUDA("quantize_fp8_per_tensor", quantize_fp8_per_tensor);
44-
DISPATCH_TO_CUDA("quantize_fp8_per_row", quantize_fp8_per_row);
4541
DISPATCH_TO_CUDA("per_tensor_quantize_i8", per_tensor_quantize_i8);
4642
DISPATCH_TO_CUDA(
4743
"per_tensor_dynamic_quantize_i8", per_tensor_dynamic_quantize_i8);
@@ -74,30 +70,8 @@ std::vector<at::Tensor> quantize_fp8_per_tensor_meta(
7470
return {Y, scale};
7571
}
7672

77-
std::vector<at::Tensor> quantize_fp8_per_row_meta(
78-
at::Tensor input,
79-
std::optional<at::Tensor> /* bs */,
80-
std::optional<at::Tensor> /* scale_ub */,
81-
std::optional<c10::ScalarType> /* output_dtype */,
82-
bool /* stochastic_rounding */) {
83-
int dims = input.dim();
84-
TORCH_CHECK(dims == 2 || dims == 3, "The dim of input should be 2 or 3");
85-
at::Tensor Y = at::empty_like(input, input.options().dtype(torch_fp8_e4m3));
86-
at::Tensor scale;
87-
if (dims == 2) {
88-
const at::SymInt M = input.sym_size(0);
89-
scale = at::empty_symint({M}, input.options().dtype(at::kFloat));
90-
} else {
91-
const at::SymInt B = input.sym_size(0);
92-
const at::SymInt M = input.sym_size(1);
93-
scale = at::empty_symint({B, M}, input.options().dtype(at::kFloat));
94-
}
95-
return {Y, scale};
96-
}
97-
9873
TORCH_LIBRARY_IMPL(mslk, Meta, m) {
9974
DISPATCH_TO_META("quantize_fp8_per_tensor", quantize_fp8_per_tensor_meta);
100-
DISPATCH_TO_META("quantize_fp8_per_row", quantize_fp8_per_row_meta);
10175
}
10276

10377
} // namespace mslk::quantize

include/mslk/quantize/quantize.h

Lines changed: 0 additions & 7 deletions
Original file line number · Diff line number · Diff line change
@@ -21,13 +21,6 @@ std::vector<at::Tensor> quantize_fp8_per_tensor(
2121
std::optional<at::Tensor> scale_ub, // scale upperbound
2222
const bool stochastic_rounding); // whether apply stochastic rounding
2323

24-
std::vector<at::Tensor> quantize_fp8_per_row(
25-
at::Tensor input,
26-
std::optional<at::Tensor> bs, // batch size
27-
std::optional<at::Tensor> scale_ub, // scale upperbound
28-
std::optional<c10::ScalarType> output_dtype, // output dtype
29-
bool stochastic_rounding); // whether apply stochastic rounding
30-
3124
at::Tensor quantize_fp8_per_tensor_fixed_scale(
3225
at::Tensor input,
3326
at::Tensor scale,

mslk/quantize/triton/fp8_quantize.py

Lines changed: 14 additions & 14 deletions
Original file line number · Diff line number · Diff line change
@@ -155,11 +155,11 @@ def _kernel_quantize_fp8_row(
155155
else:
156156
cur_max = tl.maximum(cur_max, EPS)
157157

158-
# Scale and quantize.
159-
a_scale = MAX_FP8 / cur_max
160-
tl.store(A_scale + pid, 1.0 / a_scale)
158+
# Scale and quantize
159+
scale = tl.div_rn(cur_max, MAX_FP8)
160+
tl.store(A_scale + pid, scale)
161161

162-
a_fp8 = a * a_scale
162+
a_fp8 = tl.div_rn(a.to(tl.float32), scale)
163163
a_fp8 = tl.clamp(a_fp8, -MAX_FP8, MAX_FP8).to(TL_FP8_DTYPE)
164164
tl.store(
165165
A_fp8 + a_fp8_offset_base + n_offset * stride_ok,
@@ -188,9 +188,9 @@ def _kernel_quantize_fp8_row(
188188
else:
189189
cur_max = tl.maximum(cur_max, EPS)
190190

191-
# Scale and quantize.
192-
a_scale = MAX_FP8 / cur_max
193-
tl.store(A_scale + pid, 1.0 / a_scale)
191+
# Scale and quantize
192+
scale = tl.div_rn(cur_max, MAX_FP8)
193+
tl.store(A_scale + pid, scale)
194194

195195
# Write quantized values for the first K elements (from A), and pad the rest with zeros up to K_fp8
196196
n_offset = tl.arange(0, BLOCK_SIZE)
@@ -202,7 +202,7 @@ def _kernel_quantize_fp8_row(
202202
other=0.0,
203203
)
204204
# For elements >= K, a will be 0
205-
a_fp8 = a * a_scale
205+
a_fp8 = tl.div_rn(a.to(tl.float32), scale)
206206
# Clamp A to fp8 range to make sure there's no overflow.
207207
# This is required for AMD. Nvidia's default saturation
208208
# handles it, but it's nice to have anyway.
@@ -222,7 +222,7 @@ def triton_quantize_fp8_row(
222222
scale_ub: Optional[torch.Tensor] = None,
223223
zero_start_index_M: Optional[torch.Tensor] = None,
224224
align_rows_to: Optional[int] = None,
225-
eps_opt: Optional[float] = None,
225+
eps_opt: Optional[float] = 1.0 / 512.0,
226226
) -> tuple[torch.Tensor, torch.Tensor]:
227227
"""
228228
Call the triton quantize fp8 row kernel to quantize a tensor to fp8 with row-wise scalings.
@@ -232,10 +232,10 @@ def triton_quantize_fp8_row(
232232
scale_ub (Tensor): Maximum allowed value for scale.
233233
zero_start_index_M (Tensor): Indicates number of nonzero elements in each row.
234234
align_rows_to: Pad rows to align to this value. Useful for downstream kernels accepting specific sizes (e.g., multiple of 16)
235-
eps_opt: Lower bound for amax. If provided, amax will be clamped to this value.
235+
eps_opt: Lower bound for amax (default 1/512 to match CUDA min_scaling_factor).
236236
Returns:
237237
torch.Tensor: fp8 scaled tensor.
238-
torch.Tensor: reciprocal scale tensor per row.
238+
torch.Tensor: scale tensor per row (scale = amax / MAX_FP8).
239239
"""
240240
if scale_ub is not None and scale_ub.device != a.device:
241241
raise Exception("'scale_ub' must be on the same device as 'a'")
@@ -693,7 +693,7 @@ def quantize_fp8_row(
693693
use_triton: bool = True,
694694
output_device: Optional[torch.device] = None,
695695
align_rows_to: Optional[int] = None,
696-
eps_opt: Optional[float] = None,
696+
eps_opt: Optional[float] = 1.0 / 512.0,
697697
) -> tuple[torch.Tensor, torch.Tensor]:
698698
"""
699699
Quantize a to fp8 with row-wise scalings and optionally move to output device.
@@ -705,10 +705,10 @@ def quantize_fp8_row(
705705
use_triton (bool): Whether to use triton kernel or pytorch.
706706
output_device (torch.device): Device to optionally move the scaled tensors to.
707707
align_rows_to: Pad rows to align to this value. Useful for downstream kernels accepting specific sizes (e.g., multiple of 16)
708-
eps_opt: Lower bound for amax. If amax is below this value, it will be clamped to this value.
708+
eps_opt: Lower bound for amax (default 1/512 to match CUDA min_scaling_factor).
709709
Returns:
710710
torch.Tensor: fp8 scaled tensor.
711-
torch.Tensor: The reciprocal scale tensor per row.
711+
torch.Tensor: scale tensor per row (scale = amax / MAX_FP8).
712712
"""
713713

714714
if a.device == torch.device("cpu"):

test/gemm/gemm_test.py

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -553,7 +553,7 @@ def f(
553553
def f(
554554
x: torch.Tensor, w: torch.Tensor, bias: Optional[torch.Tensor]
555555
) -> torch.Tensor:
556-
xq, x_scale = torch.ops.mslk.quantize_fp8_per_row(x, output_dtype=QType)
556+
xq, x_scale = quantize_fp8_row(x)
557557
wq, w_scale = quantize_fp8_row(w)
558558
if UseTriton and torch.version.cuda:
559559
zq = matmul_fp8_row(xq, wq, x_scale, w_scale)

0 commit comments

Comments (0)