@@ -1175,7 +1175,7 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(vec_t<T, VEC_SIZE>& vec, float SFScaleV
11751175// ============================== Quant Device Function ==============================
11761176template <typename T, typename PackedType, int ELTS_PER_THREAD>
11771177inline __device__ void quant_fp8(PackedVec<PackedType, T> packedAccum, void* quantOutPtr,
1178-                                 float* outputScale, uint32_t threadOffset) {
1178+                                 float invOutputScale, uint32_t threadOffset) {
11791179  static_assert(ELTS_PER_THREAD == 8 || ELTS_PER_THREAD == 4, "ELTS_PER_THREAD must be 8 or 4");
11801180  using QuantizedPackedType = std::conditional_t<ELTS_PER_THREAD == 8, float2, float>;
11811181
@@ -1184,7 +1184,7 @@ inline __device__ void quant_fp8(PackedVec<PackedType, T> packedAccum, void* qua
11841184#pragma unroll
11851185  for (int i = 0; i < ELTS_PER_THREAD; i++) {
11861186    quantizedAccum.elements[i] =
1187-        __nv_fp8_e4m3(toFloat<T>(packedAccum.elements[i]) * (*outputScale));
1187+        __nv_fp8_e4m3(toFloat<T>(packedAccum.elements[i]) * invOutputScale);
11881188  }
11891189  reinterpret_cast<QuantizedPackedType*>(&quantOut[threadOffset])[0] = quantizedAccum.packed;
11901190}
@@ -1373,7 +1373,8 @@ __global__ void __launch_bounds__(config::kMaxBlockSize) oneshotAllreduceFusionK
13731373 }
13741374
13751375    if constexpr (QType == QuantType::kFP8) {
1376-      quant::quant_fp8<T, PackedType, kELTS_PER_THREAD>(packedAccum, quantOutPtr, outputScale,
1376+      float invOutputScale = 1.0f / (*outputScale);  // We need to apply inv_scale to the output
1377+      quant::quant_fp8<T, PackedType, kELTS_PER_THREAD>(packedAccum, quantOutPtr, invOutputScale,
13771378                                                        threadOffset);
13781379    }
13791380#if CUDA_VERSION >= 12080
@@ -1805,7 +1806,8 @@ __global__ __launch_bounds__(config::kMaxBlockSize) void rmsNormLamport_fusion(
18051806    *reinterpret_cast<float4*>(&outputNorm[blockLoadOffset + threadLoadOffset]) = rOut.packed;
18061807  }
18071808  if constexpr (QType == QuantType::kFP8) {
1808-    quant::quant_fp8<T, float4, kELTS_PER_LOAD>(rOut, quantOut, outputScale,
1809+    float invOutputScale = 1.0f / (*outputScale);
1810+    quant::quant_fp8<T, float4, kELTS_PER_LOAD>(rOut, quantOut, invOutputScale,
18091811                                                blockLoadOffset + threadLoadOffset);
18101812  }
18111813#if CUDA_VERSION >= 12080
0 commit comments