Skip to content

Commit ab4061e

Browse files
add weight-only int8 QAT scheme and update tests for torchao 0.15.0 (#3859)
* add int8 weight-only QAT scheme, add test, fix tests for current torchao version
* [pre-commit.ci] auto fixes from pre-commit.com hooks — for more information, see https://pre-commit.ci
* change quantization to PerAxis
* lambda =/
* [pre-commit.ci] auto fixes from pre-commit.com hooks — for more information, see https://pre-commit.ci
* add torchao messages, remove group_size from int8
* [pre-commit.ci] auto fixes from pre-commit.com hooks — for more information, see https://pre-commit.ci
* raise exception on missing torchao
* [pre-commit.ci] auto fixes from pre-commit.com hooks — for more information, see https://pre-commit.ci
* touch up the torchao imports
* [pre-commit.ci] auto fixes from pre-commit.com hooks — for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent ca0ecf1 commit ab4061e

File tree

2 files changed

+87
-35
lines changed

2 files changed

+87
-35
lines changed

tests/utils/test_qat.py

Lines changed: 42 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,19 @@
44

55
import pytest
66
import torch
7-
from torchao.quantization.qat import FakeQuantizedLinear
8-
from torchao.quantization.qat.fake_quantizer import (
9-
FakeQuantizerBase,
10-
Float8FakeQuantizer,
11-
Int4WeightPreshuffledFakeQuantizer,
12-
)
7+
8+
try:
9+
from torchao.quantization.qat import FakeQuantizedLinear
10+
from torchao.quantization.qat.fake_quantizer import (
11+
FakeQuantizerBase,
12+
Float8FakeQuantizer,
13+
Int4WeightFakeQuantizer,
14+
IntxFakeQuantizer,
15+
)
16+
except ImportError:
17+
print(
18+
"Missing torchao import, please install or upgrade torchao with: pip install 'torchao>=0.15.0'"
19+
)
1320

1421

1522
class _CountingFakeQuantizer(torch.nn.Module):
@@ -49,22 +56,29 @@ def _test_linear_is_fake_quantized(linear: torch.nn.Linear, qat_scheme: str):
4956
"""
5057
Verify that the given linear contains fake quantizers according to the `qat_scheme`.
5158
"""
59+
weight_only = False
5260
if qat_scheme == "fp8-int4":
5361
act_fq_class = Float8FakeQuantizer
54-
weight_fq_class = Int4WeightPreshuffledFakeQuantizer
62+
weight_fq_class = Int4WeightFakeQuantizer
5563
min_in_features = 128
5664
elif qat_scheme == "fp8-fp8":
5765
act_fq_class = Float8FakeQuantizer
5866
weight_fq_class = Float8FakeQuantizer
5967
min_in_features = -1
68+
elif qat_scheme == "int8":
69+
act_fq_class = None
70+
weight_fq_class = IntxFakeQuantizer
71+
min_in_features = 128
72+
weight_only = True
6073
else:
6174
raise ValueError(f"Unknown qat_scheme: {qat_scheme}")
6275

6376
# Check base layer activations and weights
6477
base_layer = getattr(linear, "base_layer", linear)
6578
if base_layer.in_features >= min_in_features:
6679
assert isinstance(base_layer, FakeQuantizedLinear)
67-
assert isinstance(base_layer.activation_fake_quantizer, act_fq_class)
80+
if not weight_only:
81+
assert isinstance(base_layer.activation_fake_quantizer, act_fq_class)
6882
assert isinstance(base_layer.weight_fake_quantizer, weight_fq_class)
6983

7084
# Check lora A and B (only for full_finetuning=False)
@@ -73,22 +87,26 @@ def _test_linear_is_fake_quantized(linear: torch.nn.Linear, qat_scheme: str):
7387
lora_B = linear.lora_B.default
7488
if lora_A.in_features >= min_in_features:
7589
assert isinstance(lora_A, FakeQuantizedLinear)
76-
assert isinstance(lora_A.activation_fake_quantizer, act_fq_class)
90+
if not weight_only:
91+
assert isinstance(lora_A.activation_fake_quantizer, act_fq_class)
7792
assert isinstance(lora_A.weight_fake_quantizer, weight_fq_class)
7893
if lora_B.in_features >= min_in_features:
7994
assert isinstance(lora_B, FakeQuantizedLinear)
80-
assert isinstance(lora_B.activation_fake_quantizer, act_fq_class)
95+
if not weight_only:
96+
assert isinstance(lora_B.activation_fake_quantizer, act_fq_class)
8197
assert isinstance(lora_B.weight_fake_quantizer, weight_fq_class)
8298

8399

84100
def _test_fake_quantizers_are_called(
85101
model: torch.nn.Module,
86102
example_inputs: Dict,
87103
full_finetuning: bool,
104+
qat_scheme: str,
88105
):
89106
"""
90107
Verify that the fake quantizers are actually called when the model is called.
91108
"""
109+
weight_only = qat_scheme == "int8"
92110

93111
def _swap_fake_quantizers(model: torch.nn.Module):
94112
for name, child in model.named_children():
@@ -99,20 +117,23 @@ def _assert_fake_quantizers_are_called(model: torch.nn.Module):
99117
for name, child in model.named_children():
100118
if full_finetuning:
101119
if isinstance(child, FakeQuantizedLinear):
102-
assert child.activation_fake_quantizer.count == 1
120+
if not weight_only:
121+
assert child.activation_fake_quantizer.count == 1
103122
assert child.weight_fake_quantizer.count == 1
104123
else:
105124
# For LoRA, we only fake quantize the input activations once per block:
106125
# For self_attn, we only fake quantize the q_proj's input activations
107126
# For mlp, we only fake quantize the gate_proj's input activations
108127
if name == "self_attn":
109128
base_layer = child.q_proj.base_layer
110-
assert hasattr(base_layer, "activation_fake_quantizer")
111-
assert base_layer.activation_fake_quantizer.count == 1
129+
if not weight_only:
130+
assert hasattr(base_layer, "activation_fake_quantizer")
131+
assert base_layer.activation_fake_quantizer.count == 1
112132
elif name == "mlp":
113133
base_layer = child.gate_proj.base_layer
114-
assert hasattr(base_layer, "activation_fake_quantizer")
115-
assert base_layer.activation_fake_quantizer.count == 1
134+
if not weight_only:
135+
assert hasattr(base_layer, "activation_fake_quantizer")
136+
assert base_layer.activation_fake_quantizer.count == 1
116137
elif isinstance(child, FakeQuantizedLinear):
117138
# Weight fake quantizers should always be called
118139
assert child.weight_fake_quantizer.count == 1
@@ -124,7 +145,7 @@ def _assert_fake_quantizers_are_called(model: torch.nn.Module):
124145
model.apply(_assert_fake_quantizers_are_called)
125146

126147

127-
def _test_model_fake_quantize(qat_scheme: bool, full_finetuning: bool):
148+
def _test_model_fake_quantize(qat_scheme: str, full_finetuning: bool):
128149
"""
129150
Test that all linear layers in the model are fake quantized according to the `qat_scheme`.
130151
"""
@@ -141,16 +162,16 @@ def _test_model_fake_quantize(qat_scheme: bool, full_finetuning: bool):
141162
_test_linear_is_fake_quantized(layer.mlp.up_proj, qat_scheme)
142163
_test_linear_is_fake_quantized(layer.mlp.down_proj, qat_scheme)
143164
inputs = tokenizer("How are you?", return_tensors = "pt")
144-
_test_fake_quantizers_are_called(model, inputs, full_finetuning)
165+
_test_fake_quantizers_are_called(model, inputs, full_finetuning, qat_scheme)
145166

146167

147168
# TODO: there are bad interactions across tests right now, need to figure out
148169
# how to disable model caching before re-enabling this test
149-
@pytest.mark.parametrize("qat_scheme", ["fp8-int4", "fp8-fp8"])
150-
def _test_full_model_fake_quantize(qat_scheme: bool):
170+
@pytest.mark.parametrize("qat_scheme", ["fp8-int4", "fp8-fp8", "int8"])
171+
def _test_full_model_fake_quantize(qat_scheme: str):
151172
_test_model_fake_quantize(qat_scheme, full_finetuning = True)
152173

153174

154-
@pytest.mark.parametrize("qat_scheme", ["fp8-int4", "fp8-fp8"])
155-
def test_lora_model_fake_quantize(qat_scheme: bool):
175+
@pytest.mark.parametrize("qat_scheme", ["fp8-int4", "fp8-fp8", "int8"])
176+
def test_lora_model_fake_quantize(qat_scheme: str):
156177
_test_model_fake_quantize(qat_scheme, full_finetuning = False)

unsloth/models/_utils.py

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,8 @@
175175
# Stop "Special tokens have been added in the vocabulary, ..."
176176
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.CRITICAL + 1)
177177

178+
TORCHAO_MSG = "Error: torchao not found, please install with `pip install torchao`"
179+
178180

179181
# Ignore logging messages
180182
class HideLoggingMessage(logging.Filter):
@@ -2211,9 +2213,12 @@ def _prepare_model_for_qat(
22112213
QAT can be optionally combined with LoRA fine-tuning to for additional throughput improvement.
22122214
For more details: https://dev-discuss.pytorch.org/t/speeding-up-qat-by-1-89x-with-lora/2700
22132215
"""
2214-
from torchao.quantization import PerRow, quantize_
2215-
from torchao.quantization.granularity import PerGroup, PerAxis
2216-
from torchao.quantization.qat import QATConfig
2216+
try:
2217+
from torchao.quantization import PerRow, quantize_
2218+
from torchao.quantization.granularity import PerGroup, PerAxis
2219+
from torchao.quantization.qat import QATConfig
2220+
except ImportError:
2221+
raise ImportError(TORCHAO_MSG)
22172222

22182223
# Gemma3 models have issues with int8 embedding quantization due to their
22192224
# large vocabulary size (262144). Auto-switch to int4 weight-only instead.
@@ -2230,8 +2235,10 @@ def _prepare_model_for_qat(
22302235
if not isinstance(qat_scheme, TorchAOConfig):
22312236
torchao_config: Optional[TorchAOConfig] = None
22322237
if qat_scheme == "fp8-int4":
2233-
from torchao.quantization import Float8DynamicActivationInt4WeightConfig
2234-
2238+
try:
2239+
from torchao.quantization import Float8DynamicActivationInt4WeightConfig
2240+
except ImportError:
2241+
raise ImportError(TORCHAO_MSG)
22352242
group_size = 128
22362243
base_config = Float8DynamicActivationInt4WeightConfig()
22372244
filter_fn = (
@@ -2243,20 +2250,26 @@ def _prepare_model_for_qat(
22432250
base_config_and_filter_fns = [(base_config, filter_fn)],
22442251
)
22452252
elif qat_scheme == "fp8-fp8":
2246-
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
2247-
2253+
try:
2254+
from torchao.quantization import (
2255+
Float8DynamicActivationFloat8WeightConfig,
2256+
)
2257+
except ImportError:
2258+
raise ImportError(TORCHAO_MSG)
22482259
base_config = Float8DynamicActivationFloat8WeightConfig(
22492260
granularity = PerRow()
22502261
)
22512262
torchao_config = TorchAOConfig(
22522263
qat_scheme = qat_scheme, base_config_and_filter_fns = [(base_config, None)]
22532264
)
22542265
elif qat_scheme == "int8-int4":
2255-
from torchao.quantization import (
2256-
Int8DynamicActivationIntxWeightConfig,
2257-
IntxWeightOnlyConfig,
2258-
)
2259-
2266+
try:
2267+
from torchao.quantization import (
2268+
Int8DynamicActivationIntxWeightConfig,
2269+
IntxWeightOnlyConfig,
2270+
)
2271+
except ImportError:
2272+
raise ImportError(TORCHAO_MSG)
22602273
torchao_config = TorchAOConfig(
22612274
qat_scheme = qat_scheme,
22622275
base_config_and_filter_fns = [
@@ -2276,8 +2289,10 @@ def _prepare_model_for_qat(
22762289
prequantization_transform = _untie_input_output_embeddings,
22772290
)
22782291
elif qat_scheme == "int4":
2279-
from torchao.quantization import Int4WeightOnlyConfig
2280-
2292+
try:
2293+
from torchao.quantization import Int4WeightOnlyConfig
2294+
except ImportError:
2295+
raise ImportError(TORCHAO_MSG)
22812296
group_size = 128
22822297
base_config = Int4WeightOnlyConfig(group_size = group_size)
22832298
filter_fn = (
@@ -2288,6 +2303,22 @@ def _prepare_model_for_qat(
22882303
qat_scheme = qat_scheme,
22892304
base_config_and_filter_fns = [(base_config, filter_fn)],
22902305
)
2306+
elif qat_scheme == "int8":
2307+
try:
2308+
from torchao.quantization import IntxWeightOnlyConfig
2309+
from torchao.quantization.granularity import PerAxis
2310+
except ImportError:
2311+
raise ImportError(TORCHAO_MSG)
2312+
2313+
base_config = IntxWeightOnlyConfig(
2314+
weight_dtype = torch.int8,
2315+
granularity = PerAxis(0),
2316+
)
2317+
filter_fn = lambda m, _: isinstance(m, torch.nn.Linear)
2318+
torchao_config = TorchAOConfig(
2319+
qat_scheme = qat_scheme,
2320+
base_config_and_filter_fns = [(base_config, filter_fn)],
2321+
)
22912322
else:
22922323
raise ValueError(f"Unexpected QAT scheme {qat_scheme}")
22932324
assert torchao_config is not None, f"TorchAOConfig was not set for {qat_scheme}"

0 commit comments

Comments (0)