
Commit 45b2a8f

Tianyu Liang authored and facebook-github-bot committed
Replace torch quantization implementation with Triton version (#4217)
Summary:
Pull Request resolved: #4217

X-link: facebookresearch/FBGEMM#1293

Same as title

Reviewed By: jiawenliu64

Differential Revision: D75645953

fbshipit-source-id: d5e96df648a6c66d05d599218193e294b3aa3ca8
1 parent 83a537f commit 45b2a8f

File tree

fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py
fbgemm_gpu/experimental/gen_ai/gen_ai/quantize.py

2 files changed: +11 -211 lines

fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py

Lines changed: 11 additions & 7 deletions
@@ -13,6 +13,11 @@
 
 import torch
 import triton  # @manual=//triton:triton
+
+from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
+    triton_quantize_mx4_unpack,
+)
+
 from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import (
     matmul_fp8_block,
     matmul_fp8_row,
@@ -27,7 +32,6 @@
 )
 from fbgemm_gpu.experimental.gen_ai.quantize import (
     quantize_int4_preshuffle,
-    scale_mxfp4_quant,
     scale_nvfp4_quant,
 )
 
@@ -2056,8 +2060,8 @@ class MXFP4Gemm(QuantizeOpBase):
     """
 
     def quantize(self, x, w):
-        xq, x_scale = scale_mxfp4_quant(x)
-        wq, w_scale = scale_mxfp4_quant(w)
+        xq, x_scale = triton_quantize_mx4_unpack(x)
+        wq, w_scale = triton_quantize_mx4_unpack(w)
         return xq, wq, x_scale, w_scale
 
     def compute(self, xq, wq, x_scale, w_scale):
@@ -2088,11 +2092,11 @@ class MXFP4GroupedGemm(QuantizeOpBase):
     """
 
     def preprocess(self, x, w):
-        wq, w_scale = zip(*[scale_mxfp4_quant(i) for i in w])
+        wq, w_scale = zip(*[triton_quantize_mx4_unpack(i) for i in w])
         return x, wq, w_scale
 
     def quantize(self, x, wq, w_scale):
-        xq, x_scale = zip(*[scale_mxfp4_quant(i) for i in x])
+        xq, x_scale = zip(*[triton_quantize_mx4_unpack(i) for i in x])
         return xq, wq, x_scale, w_scale
 
     def compute(self, xq, wq, x_scale, w_scale):
@@ -2191,13 +2195,13 @@ class MXFP4StackedGroupedGemm(QuantizeOpBase):
     def preprocess(self, x, w):
         m_values = [i.shape[0] for i in x]
         m_sizes = torch.tensor(m_values).to(dtype=torch.int64, device=x[0].device)
-        wq, w_scale = zip(*[scale_mxfp4_quant(i) for i in w])
+        wq, w_scale = zip(*[triton_quantize_mx4_unpack(i) for i in w])
         wq = torch.stack(wq, dim=0).contiguous()
         w_scale = torch.stack(w_scale, dim=0).contiguous()
         return x, wq, w_scale, m_sizes
 
     def quantize(self, x, wq, w_scale, m_sizes):
-        xq, x_scale = zip(*[scale_mxfp4_quant(i) for i in x])
+        xq, x_scale = zip(*[triton_quantize_mx4_unpack(i) for i in x])
         xq = torch.stack(xq, dim=0).contiguous()
         x_scale = torch.stack(x_scale, dim=0).contiguous()
         xq = xq.view(-1, xq.shape[-1])
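Note: as the hunks above show, triton_quantize_mx4_unpack is used as a drop-in replacement for the removed scale_mxfp4_quant, taking a single tensor and returning the packed FP4 data together with its block scales. A minimal sketch of the call pattern the benchmark now relies on (the shapes and bfloat16 dtype below are illustrative assumptions, not part of this commit):

    import torch

    from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
        triton_quantize_mx4_unpack,
    )

    # Hypothetical activation/weight pair on a GPU; shapes are arbitrary.
    x = torch.randn(128, 256, device="cuda", dtype=torch.bfloat16)
    w = torch.randn(512, 256, device="cuda", dtype=torch.bfloat16)

    # Each call returns (quantized FP4 tensor, scale tensor),
    # mirroring how MXFP4Gemm.quantize consumes it above.
    xq, x_scale = triton_quantize_mx4_unpack(x)
    wq, w_scale = triton_quantize_mx4_unpack(w)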

fbgemm_gpu/experimental/gen_ai/gen_ai/quantize.py

Lines changed: 0 additions & 204 deletions
@@ -216,207 +216,3 @@ def round_up(x: int, y: int) -> int:
     torch.ops.fbgemm.scaled_fp4_quant(output, input, output_scale, input_global_scale)
     output_scale = output_scale.view(torch.float8_e4m3fn)
     return output, output_scale
-
-
-def _fp32_to_fp4_unpacked(x: torch.Tensor, ebits: int, mbits: int) -> torch.Tensor:
-    """Converts a float32 tensor to a unpacked float4 tensor.
-    Args:
-        x (torch.Tensor): The input float32 tensor.
-        ebits (int): The number of bits in the exponent.
-        mbits (int): The number of bits in the mantissa.
-    Returns:
-        torch.Tensor: The resulting unpacked float4 tensor.
-    """
-
-    def _n_ones(n: int) -> int:
-        return (1 << n) - 1
-
-    EBITS_F32, MBITS_F32 = 8, 23
-    F32_EXP_BIAS = _n_ones(EBITS_F32 - 1)
-
-    assert x.dtype == torch.float
-    assert 1 + ebits + mbits <= 8
-
-    # calculate constants
-    exp_bias = _n_ones(ebits - 1)
-    max_int = _n_ones(ebits + mbits)
-    sign_mask = 1 << (ebits + mbits)
-
-    magic_adder = _n_ones(MBITS_F32 - mbits - 1)
-
-    # all E bits and M bits are 1s
-    max_normal = 2 ** (_n_ones(ebits) - exp_bias) * (_n_ones(mbits + 1) / (2**mbits))
-
-    # E bits = 1, M bits = 0
-    min_normal = 2 ** (1 - exp_bias)
-
-    denorm_exp = (
-        # exp bias conversion between formats
-        (F32_EXP_BIAS - exp_bias)
-        # mantissa length difference between formats
-        + (MBITS_F32 - mbits)
-        # add one to encoded exponent for denormalized numbers
-        + 1
-    )
-    denorm_mask_int = denorm_exp << MBITS_F32
-
-    # reinterpret int32 as float32
-    denorm_mask_float = torch.tensor(denorm_mask_int, dtype=torch.int32).view(
-        torch.float32
-    )
-
-    # save the sign
-    # Note that we have torch.uint32, but some ops like cpu bit shifts
-    # do not work on it. So, we stay in int32.
-    x = x.view(torch.int32)
-    sign = x & 0x80000000
-
-    # set everything to positive, will add sign back at the end
-    x = x ^ sign
-    x = x.view(torch.float)
-
-    # rewrite saturate/denorm/norm branches without explicit data dependent
-    # control flow, to be more compiler friendly
-    saturate_mask = x >= max_normal
-    denormal_mask = torch.logical_and(torch.logical_not(saturate_mask), x < min_normal)
-    normal_mask = torch.logical_not(torch.logical_or(saturate_mask, denormal_mask))
-
-    denormal_x = x + denorm_mask_float
-    denormal_x = denormal_x.view(torch.int32)
-    denormal_x -= denorm_mask_int
-    denormal_x = denormal_x.to(torch.uint8)
-
-    normal_x = x.view(torch.int32)
-    # resulting mantissa is odd
-    mant_odd = (normal_x >> (MBITS_F32 - mbits)) & 1
-    # update exponent, rounding bias part 1
-    val_to_add = ((exp_bias - F32_EXP_BIAS) << MBITS_F32) + magic_adder
-    normal_x += val_to_add
-    # rounding bias part 2
-    normal_x += mant_odd
-    # take the bits!
-    normal_x = normal_x >> (MBITS_F32 - mbits)
-    normal_x = normal_x.to(torch.uint8)
-
-    x = torch.full_like(x, max_int, dtype=torch.uint8)
-    x = torch.where(denormal_mask, denormal_x, x)
-    x = torch.where(normal_mask, normal_x, x)
-
-    # add sign back
-    sign_lp = sign >> (MBITS_F32 + EBITS_F32 - mbits - ebits)
-    sign_lp = sign_lp.to(torch.uint8)
-    # Right shift of a negative signed integer can fill the least significant
-    # bits with either 1s or 0s, depending on the implementation. Since PyTorch
-    # doesn't have an uint32 dtype, we mask out these bits to get just the
-    # f4 sign bit
-    sign_lp = sign_lp & sign_mask
-    x = x | sign_lp
-
-    return x.to(torch.uint8)
-
-
-def _to_blocked(x: torch.Tensor) -> torch.Tensor:
-    """Converts a tensor to the blocked layout.
-    Args:
-        x (torch.Tensor): The input tensor in non-blocked layout.
-    Returns:
-        torch.Tensor: The output tensor in the blocked layout.
-    """
-
-    def ceil_div(a: int, b: int) -> int:
-        return (a + b - 1) // b
-
-    rows, cols = x.shape
-    n_row_blocks = ceil_div(rows, 128)
-    n_col_blocks = ceil_div(cols, 4)
-
-    # Calculate the padded shape
-    padded_rows = n_row_blocks * 128
-    padded_cols = n_col_blocks * 4
-
-    padded = x
-    if (rows, cols) != (padded_rows, padded_cols):
-        padded = torch.zeros(
-            (padded_rows, padded_cols),
-            device=x.device,
-            dtype=x.dtype,
-        )
-        padded[:rows, :cols] = x
-
-    # Rearrange the blocks
-    blocks = padded.view(n_row_blocks, 128, n_col_blocks, 4).permute(0, 2, 1, 3)
-    rearranged = blocks.reshape(-1, 4, 32, 4).transpose(1, 2).reshape(-1, 32, 16)
-
-    return rearranged.flatten()
-
-
-# This PyTorch version refers to https://github.com/pytorch/ao/blob/v0.10.0/torchao/prototype/mx_formats/mx_tensor.py#L146
-def scale_mxfp4_quant(
-    x: torch.Tensor, block_size: int = 32
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    """
-    Quantize input tensor to FP4 and return quantized tensor and scale.
-    Args:
-        x (torch.Tensor): The input tensor to be quantized to FP4
-        block_size (int): The block size to use for quantization. Default is 32.
-    Returns:
-        xq (torch.Tensor): Quantized FP4 output tensor
-        scale (torch.Tensor): Scale E8M0 tensor
-    """
-
-    F4_E2M1_MAX = 6.0
-    E8M0_EXPONENT_BIAS = 127
-    EBITS_F4_E2M1, MBITS_F4_E2M1 = 2, 1
-
-    # calculate the scale in e8m0 format
-    orig_shape = x.shape
-    x = x.reshape(-1, block_size)
-
-    # find max value of the data
-    # Note: this only implements the `minimally supported` version of
-    # https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
-    # section 6.3.
-    max_abs = torch.amax(torch.abs(x), 1)
-    max_pos = F4_E2M1_MAX
-
-    descale = max_abs / max_pos
-    scale = torch.where(
-        torch.isnan(descale),
-        0xFF,  # Handle biased exponent for nan
-        # NOTE: descale < (torch.finfo(torch.float32).smallest_normal / 2) is handled through clamping
-        (
-            torch.clamp(
-                torch.ceil(torch.log2(descale)),
-                min=-E8M0_EXPONENT_BIAS,
-                max=E8M0_EXPONENT_BIAS,
-            )
-            + E8M0_EXPONENT_BIAS
-        ).to(torch.uint8),
-    )
-
-    descale_fp = torch.where(
-        scale == 0,
-        1.0,
-        torch.exp2(E8M0_EXPONENT_BIAS - scale.to(torch.float32)),
-    )
-
-    # scale and saturated cast the data elements to max of target dtype
-    xq = torch.clamp(x * descale_fp.unsqueeze(1), min=-1 * max_pos, max=max_pos)
-
-    xq = xq.reshape(orig_shape)
-    xq = _fp32_to_fp4_unpacked(xq, EBITS_F4_E2M1, MBITS_F4_E2M1)
-    orig_shape = [*orig_shape[:-1], orig_shape[-1] // 2]
-
-    shape = xq.shape
-    assert shape[-1] % 2 == 0
-    xq = xq.contiguous().view(-1)
-    xq = (xq[::2] << 4 | xq[1::2]).view((*shape[:-1], shape[-1] // 2))
-
-    target_numel = scale.numel() * block_size / 2
-    assert target_numel == xq.numel(), f"{target_numel} != {xq.numel()}"
-
-    scale = scale.view(torch.float8_e8m0fnu)
-    scale = scale.view(orig_shape[0], -1)
-    scale = _to_blocked(scale)
-
-    return xq, scale
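For reference, the core of the removed PyTorch path is the per-block E8M0 scale computation shown in the deleted scale_mxfp4_quant above. A rough sketch of that math on a single block, simplified from the deleted code (no NaN handling, no zero-scale special case, no FP4 bit packing, no blocked scale layout; the block values are made up, and the Triton kernel may differ in details):

    import torch

    F4_E2M1_MAX = 6.0          # largest magnitude representable in FP4 (E2M1)
    E8M0_EXPONENT_BIAS = 127   # exponent bias of the shared E8M0 scale

    block = torch.randn(32, dtype=torch.float32)  # one block_size=32 group

    # Biased exponent of ceil(log2(max|x| / 6.0)), clamped to the E8M0 range.
    descale = block.abs().max() / F4_E2M1_MAX
    scale_e8m0 = (
        torch.clamp(
            torch.ceil(torch.log2(descale)),
            min=-E8M0_EXPONENT_BIAS,
            max=E8M0_EXPONENT_BIAS,
        )
        + E8M0_EXPONENT_BIAS
    ).to(torch.uint8)

    # Rescale and saturate the block into the FP4 range before element quantization.
    descale_fp = torch.exp2(E8M0_EXPONENT_BIAS - scale_e8m0.to(torch.float32))
    scaled = torch.clamp(block * descale_fp, min=-F4_E2M1_MAX, max=F4_E2M1_MAX)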

0 commit comments
