Commit 890438a
Match QAT prepare and convert numerics exactly
**Summary:** Previously, `Int8DynActInt4QATQuantizer` had slightly diverging numerics between the prepare and convert steps. This is because the prepare step uses quantization primitives shared with AQT (specifically `quantize_affine` and `dequantize_affine`), while the convert step relies on old ops from the `torch.ops.quantized_decomposed` namespace. The divergence is negligible for small models, but the quantization errors begin to compound for larger models with many linear layers. More specifically, the divergence occurs in three places during activation quantization:

1. **Choose qparams.** The prepare step casts the qparams to `torch.float32`, whereas the convert step casts the scales to `torch.float64` and the zero points to `torch.int64`.

2. **Quantize.** The prepare step rounds before adding the zero points and uses torch functions, while the convert step adds the zero points before rounding and uses torch tensor methods.

   ```
   # prepare
   x = torch.clamp(
       torch.round(x * (1.0 / scale)) + zero_point,
       qmin,
       qmax,
   )

   # convert
   x = (
       x.mul(1.0 / scale)
       .add(zero_point)
       .round()
       .clamp(qmin, qmax)
       .to(quantize_dtype)
   )
   ```

3. **Dequantize.** The prepare step casts to `torch.int32` before subtracting the zero points, and casts back to the original dtype before multiplying by the scale. The convert step only casts at the very end.

   ```
   # prepare
   x = x.to(torch.int32) - zero_point.to(torch.int32)
   x = x.to(orig_dtype)
   x = x * scale

   # convert
   x = x - zero_point
   x = x * scale
   x = x.to(orig_dtype)
   ```

This commit makes the convert path use the same torchao quantization primitives as the prepare path, thereby resolving the three differences above. Now the prepare and convert steps match exactly in terms of numerics over many trials.

**Test Plan:**

python test/quantization/test_qat.py -k test_fake_quantize_per_token_vs_convert
python test/quantization/test_qat.py -k test_qat_8da4w_prepare_vs_convert
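As an aside on divergence point 1 above, the dtype in which the qparams are computed is enough to perturb the scales on its own. The following standalone sketch is not from the codebase and uses a simplified min/max scale formula purely for illustration:

```python
import torch

# Simplified stand-in for choose_qparams: scale = (max - min) / (qmax - qmin).
# Only meant to show that float32 vs float64 qparam math yields different scales.
torch.manual_seed(0)
x = torch.randn(16, 2048)
qmin, qmax = -128, 127
min_val = x.amin(dim=-1, keepdim=True)
max_val = x.amax(dim=-1, keepdim=True)

scale_fp32 = (max_val - min_val) / (qmax - qmin)                    # prepare-like: float32
scale_fp64 = (max_val.double() - min_val.double()) / (qmax - qmin)  # convert-like: float64

# The scales differ by a tiny amount, and such differences can compound
# across many linear layers, as described in the summary above.
print("max |scale_fp32 - scale_fp64|:", (scale_fp32.double() - scale_fp64).abs().max().item())
```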
1 parent dfbd681 · commit 890438a

File tree

3 files changed: +109 −24 lines

test/quantization/test_qat.py (+71)

```diff
@@ -133,6 +133,18 @@ def forward(self, x):
         return x
 
 
+class M4(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear = torch.nn.Linear(512, 256, bias=False).to(torch.float)
+
+    def example_inputs(self):
+        return (torch.randn(1, 512).to(torch.float),)
+
+    def forward(self, x):
+        return self.linear(x)
+
+
 class ModelWithLinearBias(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -1389,6 +1401,65 @@ def test_qat_linear_bias(self):
         example_inputs = m.example_inputs()
         m(*example_inputs)
 
+    @unittest.skipIf(
+        not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
+    )
+    def test_fake_quantize_per_token_vs_convert(self):
+        """
+        Test that the following produce the exact same numerics:
+        1. FakeQuantizer with asymmetric per_token config
+        2. torchao.quantization.utils.per_token_dynamic_quant
+        """
+        from torchao.quantization.utils import per_token_dynamic_quant
+
+        torch.manual_seed(self.SEED)
+        x = torch.randn(1, 235, 2048)
+        config = FakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
+        fake_quantizer = FakeQuantizer(config)
+        fake_quantizer_out = fake_quantizer(x)
+        baseline_out = per_token_dynamic_quant(x)
+        torch.testing.assert_close(fake_quantizer_out, baseline_out, atol=0, rtol=0)
+
+    @unittest.skipIf(
+        not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
+    )
+    def test_qat_8da4w_prepare_vs_convert(self):
+        """
+        Test that the prepare and convert steps of Int8DynActInt4QATQuantizer produce
+        numerics that match exactly over N trials.
+        """
+        from torchao.quantization.qat import Int8DynActInt4WeightQATQuantizer
+        from torchao.quantization.utils import compute_error
+
+        num_trials = 1000
+        group_size = 16
+        non_inf_sqnr = []
+
+        for seed in range(self.SEED, self.SEED + num_trials):
+            torch.manual_seed(seed)
+            m = M4()
+            torch.manual_seed(seed)
+            x = m.example_inputs()
+
+            quantizer = Int8DynActInt4WeightQATQuantizer(groupsize=group_size)
+            prepared = quantizer.prepare(m)
+            prepared_out = prepared(*x)
+            converted = quantizer.convert(prepared)
+            converted_out = converted(*x)
+            sqnr = compute_error(prepared_out, converted_out).item()
+            if sqnr != float("inf"):
+                non_inf_sqnr.append(sqnr)
+
+        avg_sqnr = (
+            sum(non_inf_sqnr) / len(non_inf_sqnr) if len(non_inf_sqnr) > 0 else -1
+        )
+        fail_message = "%s/%s trials did not match exactly, average sqnr = %s" % (
+            len(non_inf_sqnr),
+            num_trials,
+            avg_sqnr,
+        )
+        self.assertEqual(len(non_inf_sqnr), 0, fail_message)
+
 
 if __name__ == "__main__":
     unittest.main()
```
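For context on the assertion in `test_qat_8da4w_prepare_vs_convert`: `compute_error` returns an SQNR that is infinite when the two tensors are bit-identical, so counting the finite values counts the trials where prepare and convert disagreed. A minimal sketch of that behavior, with illustrative tensors only:

```python
import torch
from torchao.quantization.utils import compute_error

a = torch.randn(4, 8)

# Identical tensors -> zero error power -> SQNR of +inf, so the trial is
# treated as an exact match and excluded from non_inf_sqnr.
print(compute_error(a, a))

# Any perturbation -> finite SQNR, so the trial would count as a mismatch.
print(compute_error(a, a + 1e-3))
```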

torchao/_executorch_ops.py (+2)

```diff
@@ -5,6 +5,8 @@
 # LICENSE file in the root directory of this source tree.
 import torch
 
+# TODO: delete these ops
+
 
 def _quantized_decomposed_quantize_per_channel_group_wrapper(*args, **kwargs):
     """
```
torchao/quantization/utils.py (+36 −24)

```diff
@@ -539,36 +539,48 @@ def group_quantize_tensor_symmetric(
     return w_int8, scales, zeros
 
 
-def per_token_dynamic_quant(input: torch.Tensor) -> torch.Tensor:
-    orig_dtype = input.dtype
-    # TODO: we may need to make the choose_qparams op configurable
-    from torchao._executorch_ops import (
-        _quantized_decomposed_choose_qparams_per_token_asymmetric_wrapper,
-    )
-
-    (
-        scales,
-        zero_points,
-    ) = _quantized_decomposed_choose_qparams_per_token_asymmetric_wrapper(
-        input, torch.int8
-    )
-
-    # TODO: get these from torch.int8
+def per_token_dynamic_quant(
+    input: torch.Tensor,
+    scale_dtype: torch.dtype = torch.float32,
+    zero_point_dtype: torch.dtype = torch.float32,
+) -> torch.Tensor:
+    mapping_type = MappingType.ASYMMETRIC
+    block_size = _get_per_token_block_size(input)
     quant_min = -128
     quant_max = 127
-    from torchao._executorch_ops import _quantized_decomposed_quantize_per_token_wrapper
+    quant_dtype = torch.int8
+    output_dtype = input.dtype
 
-    input = _quantized_decomposed_quantize_per_token_wrapper(
-        input, scales, zero_points, quant_min, quant_max, torch.int8
+    scales, zero_points = choose_qparams_affine(
+        input,
+        mapping_type,
+        block_size,
+        quant_dtype,
+        quant_min,
+        quant_max,
+        scale_dtype=scale_dtype,
+        zero_point_dtype=zero_point_dtype,
     )
-    from torchao._executorch_ops import (
-        _quantized_decomposed_dequantize_per_token_wrapper,
+    q = quantize_affine(
+        input,
+        block_size,
+        scales,
+        zero_points,
+        quant_dtype,
+        quant_min,
+        quant_max,
     )
-
-    input = _quantized_decomposed_dequantize_per_token_wrapper(
-        input, scales, zero_points, quant_min, quant_max, torch.int8, orig_dtype
+    dq = dequantize_affine(
+        q,
+        block_size,
+        scales,
+        zero_points,
+        quant_dtype,
+        quant_min,
+        quant_max,
+        output_dtype=output_dtype,
    )
-    return input.to(orig_dtype)
+    return dq
 
 
 def recommended_inductor_config_setter():
```
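For completeness, a small usage sketch of the updated `per_token_dynamic_quant` (input shape chosen arbitrarily); the new `scale_dtype` and `zero_point_dtype` keywords default to `torch.float32`, so the helper now matches the QAT prepare numerics by default:

```python
import torch
from torchao.quantization.utils import per_token_dynamic_quant

# Any floating-point activation tensor; the last dimension holds each token's features.
x = torch.randn(2, 128, 512)

# Fake-quantize per token through the choose_qparams_affine -> quantize_affine ->
# dequantize_affine chain, using float32 scales and zero points by default.
y = per_token_dynamic_quant(x)
print(y.shape, y.dtype)  # same shape and dtype as the input
```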
