FP4 Triton kernel bug fix (pytorch#4181)

Tianyu Liang · facebook-github-bot · commit 11d80de8d46d · 2025-05-28T14:44:14.000-07:00
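Summary:
X-link: facebookresearch/FBGEMM#1259

Fix a loop iteration index calculation bug in the Triton kernel `_kernel_quantize_mx4_unpack`: at the end of each loop iteration, `output_offset` now advances by `GROUP_LOAD * GROUP_SIZE // 2` instead of `GROUP_LOAD * GROUP_SIZE`.

Reviewed By: q10, jiawenliu64, jianyuh

Differential Revision: D75269590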
diff --git a/fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py b/fbgemm_gpu/experimental/gemm/triton_gemm/fp4_quantize.py
@@ -290,7 +290,7 @@ def _kernel_quantize_mx4_unpack(
         # Update offsets so we work on the next block.
         input_offset += GROUP_LOAD * GROUP_SIZE
         exp_offset += GROUP_LOAD
-        output_offset += GROUP_LOAD * GROUP_SIZE
+        output_offset += GROUP_LOAD * GROUP_SIZE // 2
 
 
 def triton_quantize_mx4_unpack(