Conformance: nncf.quantize_pt2e and OpenVINOQuantizer support

daniil-lyakhov · daniil-lyakhov · commit 0c628c45ad78 · 2025-05-08T17:48:04.000+02:00
Disable gradient computation during the TorchFX model validation

Quantization params are forwarded to quantize_pt2e/OpenVINOQuantizer
diff --git a/nncf/quantization/algorithms/min_max/backend.py b/nncf/quantization/algorithms/min_max/backend.py
@@ -174,6 +174,7 @@ def create_quantizer_insertion_command(
         :param target_point: Target location for the quantizer insertion.
         :param quantizer_config: QuantizerConfig instance for the current layer.
         :param parameters: FakeQuantizeParameters to calculate activation quantization parameters.
+        :param extra_params: Additional backend-specific parameters to initiate a quantizer insertion command.
         :return: Backend-specific Command for the quantizer insertion operation.
         """
 
@@ -193,6 +194,7 @@ def create_unified_scales_quantizers_insertion_commands(
         :param target_points: List of target locations for the quantizers insertion.
         :param quantizer_config: QuantizerConfig instance for the current layer.
         :param parameters: FakeQuantizeParameters to calculate activation quantization parameters.
+        :param extra_params: Additional backend-specific parameters to initiate a quantizer insertion command.
         :return: List of backend-specific Commands
             for the quantizers with unified scales insertion operations.
         """
diff --git a/tests/post_training/data/ptq_reference_data.yaml b/tests/post_training/data/ptq_reference_data.yaml
@@ -43,6 +43,13 @@ torchvision/resnet18_backend_FX_TORCH:
     error_message: "Openvino Model Files Not Found!"
     message: "Issue-166847"
 torchvision/resnet18_backend_CUDA_FX_TORCH:
+torchvision/resnet18_backend_OV_QUANTIZER_NNCF:
+  metric_value: 0.6946
+torchvision/resnet18_backend_OV_QUANTIZER_AO:
+  metric_value: 0.6946
+torchvision/resnet18_backend_X86_QUANTIZER_NNCF:
+  metric_value: 0.6946
+torchvision/resnet18_backend_X86_QUANTIZER_AO:
   metric_value: 0.6946
   exception_xfail_reason:
     type: "FileNotFoundError"
@@ -66,6 +73,14 @@ torchvision/mobilenet_v3_small_BC_backend_CUDA_FX_TORCH:
     type: "FileNotFoundError"
     error_message: "Openvino Model Files Not Found!"
     message: "Issue-166847"
+torchvision/mobilenet_v3_small_BC_backend_OV_QUANTIZER_NNCF:
+  metric_value: 0.6679
+torchvision/mobilenet_v3_small_BC_backend_OV_QUANTIZER_AO:
+  metric_value: 0.6679
+torchvision/mobilenet_v3_small_BC_backend_X86_QUANTIZER_NNCF:
+  metric_value: 0.6679
+torchvision/mobilenet_v3_small_BC_backend_X86_QUANTIZER_AO:
+  metric_value: 0.6679
 torchvision/vit_b_16_backend_FP32:
   metric_value: 0.8107
 torchvision/vit_b_16_backend_OV:
@@ -77,6 +92,13 @@ torchvision/vit_b_16_backend_FX_TORCH:
     error_message: "Openvino Model Files Not Found!"
     message: "Issue-166847"
 torchvision/vit_b_16_backend_CUDA_FX_TORCH:
+torchvision/vit_b_16_backend_OV_QUANTIZER_NNCF:
+  metric_value: 0.80922
+torchvision/vit_b_16_backend_OV_QUANTIZER_AO:
+  metric_value: 0.80922
+torchvision/vit_b_16_backend_X86_QUANTIZER_NNCF:
+  metric_value: 0.80922
+torchvision/vit_b_16_backend_X86_QUANTIZER_AO:
   metric_value: 0.80922
   exception_xfail_reason:
     type: "FileNotFoundError"
@@ -93,6 +115,13 @@ torchvision/swin_v2_s_backend_FX_TORCH:
     error_message: "Openvino Model Files Not Found!"
     message: "Issue-166847"
 torchvision/swin_v2_s_backend_CUDA_FX_TORCH:
+torchvision/swin_v2_s_backend_OV_QUANTIZER_NNCF:
+  metric_value: 0.8360
+torchvision/swin_v2_s_backend_OV_QUANTIZER_AO:
+  metric_value: 0.8360
+torchvision/swin_v2_s_backend_X86_QUANTIZER_NNCF:
+  metric_value: 0.8360
+torchvision/swin_v2_s_backend_X86_QUANTIZER_AO:
   metric_value: 0.8360
   exception_xfail_reason:
     type: "FileNotFoundError"
diff --git a/tests/post_training/model_scope.py b/tests/post_training/model_scope.py
@@ -23,6 +23,7 @@
 from nncf.quantization.advanced_parameters import AdvancedScaleEstimationParameters
 from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters
 from tests.post_training.pipelines.base import ALL_PTQ_BACKENDS
+from tests.post_training.pipelines.base import FX_BACKENDS
 from tests.post_training.pipelines.base import NNCF_PTQ_BACKENDS
 from tests.post_training.pipelines.base import BackendType
 from tests.post_training.pipelines.causal_language_model import CausalLMHF
@@ -107,7 +108,7 @@
             "fast_bias_correction": False,
             "preset": QuantizationPreset.MIXED,
         },
-        "backends": [BackendType.FX_TORCH, BackendType.CUDA_FX_TORCH, BackendType.OV, BackendType.ONNX],
+        "backends": FX_BACKENDS + [BackendType.OV, BackendType.ONNX],
         "batch_size": 128,
     },
     {
@@ -118,7 +119,7 @@
             "model_type": ModelType.TRANSFORMER,
             "advanced_parameters": AdvancedQuantizationParameters(smooth_quant_alpha=0.15),
         },
-        "backends": [BackendType.FX_TORCH, BackendType.CUDA_FX_TORCH, BackendType.OV],
+        "backends": FX_BACKENDS + [BackendType.OV],
         "batch_size": 1,
     },
     {
@@ -129,7 +130,7 @@
             "model_type": ModelType.TRANSFORMER,
             "advanced_parameters": AdvancedQuantizationParameters(smooth_quant_alpha=0.5),
         },
-        "backends": [BackendType.FX_TORCH, BackendType.CUDA_FX_TORCH, BackendType.OV],
+        "backends": FX_BACKENDS + [BackendType.OV],
         "batch_size": 1,
     },
     # Timm models
diff --git a/tests/post_training/pipelines/base.py b/tests/post_training/pipelines/base.py
@@ -57,6 +57,10 @@ class BackendType(Enum):
     CUDA_TORCH = "CUDA_TORCH"
     FX_TORCH = "FX_TORCH"
     CUDA_FX_TORCH = "CUDA_FX_TORCH"
+    OV_QUANTIZER_NNCF = "OV_QUANTIZER_NNCF"
+    OV_QUANTIZER_AO = "OV_QUANTIZER_AO"
+    X86_QUANTIZER_NNCF = "X86_QUANTIZER_NNCF"
+    X86_QUANTIZER_AO = "X86_QUANTIZER_AO"
     ONNX = "ONNX"
     OV = "OV"
     OPTIMUM = "OPTIMUM"
@@ -65,7 +69,14 @@ class BackendType(Enum):
 NNCF_PTQ_BACKENDS = [BackendType.TORCH, BackendType.CUDA_TORCH, BackendType.ONNX, BackendType.OV]
 ALL_PTQ_BACKENDS = NNCF_PTQ_BACKENDS
 PT_BACKENDS = [BackendType.TORCH, BackendType.CUDA_TORCH]
-FX_BACKENDS = [BackendType.FX_TORCH, BackendType.CUDA_FX_TORCH]
+FX_BACKENDS = [
+    BackendType.FX_TORCH,
+    BackendType.CUDA_FX_TORCH,
+    BackendType.OV_QUANTIZER_NNCF,
+    BackendType.OV_QUANTIZER_AO,
+    BackendType.X86_QUANTIZER_NNCF,
+    BackendType.X86_QUANTIZER_AO,
+]
 OV_BACKENDS = [BackendType.OV, BackendType.OPTIMUM]
 
 LIMIT_LENGTH_OF_STATUS = 120
diff --git a/tests/post_training/pipelines/image_classification_base.py b/tests/post_training/pipelines/image_classification_base.py
@@ -12,16 +12,30 @@
 import copy
 import os
 
+os.environ["TORCHINDUCTOR_FREEZING"] = "1"
+
+from itertools import islice
+
 import numpy as np
 import openvino as ov
 import torch
 from sklearn.metrics import accuracy_score
+from torch.ao.quantization.quantize_pt2e import convert_pt2e
+from torch.ao.quantization.quantize_pt2e import prepare_pt2e
+from torch.ao.quantization.quantizer.quantizer import Quantizer as TorchAOQuantizer
+from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer
+from torch.ao.quantization.quantizer.x86_inductor_quantizer import get_default_x86_inductor_quantization_config
 from torchvision import datasets
 
 import nncf
+from nncf import AdvancedQuantizationParameters
 from nncf.common.logging.track_progress import track
+from nncf.experimental.torch.fx import OpenVINOQuantizer
+from nncf.experimental.torch.fx import quantize_pt2e
+from nncf.torch import disable_patching
 from tests.post_training.pipelines.base import DEFAULT_VAL_THREADS
 from tests.post_training.pipelines.base import FX_BACKENDS
+from tests.post_training.pipelines.base import BackendType
 from tests.post_training.pipelines.base import PTQTestPipeline
 
 
@@ -75,7 +89,17 @@ def process_result(request, userdata):
     def _validate_torch_compile(
         self, val_loader: torch.utils.data.DataLoader, predictions: np.ndarray, references: np.ndarray
     ):
-        compiled_model = torch.compile(self.compressed_model.cpu(), backend="openvino", options={"aot_autograd": True})
+        if self.backend in [
+            BackendType.FX_TORCH,
+            BackendType.CUDA_FX_TORCH,
+            BackendType.OV_QUANTIZER_AO,
+            BackendType.OV_QUANTIZER_NNCF,
+        ]:
+            compiled_model = torch.compile(
+                self.compressed_model.cpu(), backend="openvino", options={"aot_autograd": True}
+            )
+        else:
+            compiled_model = torch.compile(self.compressed_model)
         for i, (images, target) in enumerate(val_loader):
             # W/A for memory leaks when using torch DataLoader and OpenVINO
             pred = compiled_model(images)
@@ -103,3 +127,119 @@ def _validate(self) -> None:
 
         self.run_info.metric_name = "Acc@1"
         self.run_info.metric_value = acc_top1
+        return []
+
+    def _compress_torch_ao(self, quantizer: TorchAOQuantizer) -> None:
+        """
+        Quantize self.model with the torch.ao PT2E flow: prepare_pt2e, calibration, convert_pt2e.
+
+        The converted model is stored in self.compressed_model.
+
+        :param quantizer: Quantizer instance passed to prepare_pt2e.
+        """
+        with torch.no_grad(), disable_patching():
+            prepared_model = prepare_pt2e(self.model, quantizer)
+            # Calibrate on at most subset_size samples (300 when compression_params does not set it).
+            subset_size = self.compression_params.get("subset_size", 300)
+            for data in islice(self.calibration_dataset.get_inference_data(), subset_size):
+                prepared_model(data)
+            self.compressed_model = convert_pt2e(prepared_model)
+
+    def _compress_nncf_pt2e(self, quantizer: TorchAOQuantizer) -> None:
+        """
+        Quantize self.model with quantize_pt2e and store the result in self.compressed_model.
+
+        Supported entries of self.compression_params and of its "advanced_parameters" value
+        are forwarded to quantize_pt2e as keyword arguments.
+
+        :param quantizer: Quantizer instance passed to quantize_pt2e.
+        """
+        pt2e_kwargs = {
+            key: self.compression_params[key]
+            for key in ("subset_size", "fast_bias_correction")
+            if key in self.compression_params
+        }
+
+        advanced_parameters: AdvancedQuantizationParameters = self.compression_params.get(
+            "advanced_parameters", AdvancedQuantizationParameters()
+        )
+
+        # Copy before editing: the advanced parameters object comes from compression_params,
+        # so assigning to its smooth_quant_alphas in place would change that stored config.
+        sq_params = copy.copy(advanced_parameters.smooth_quant_alphas)
+        sq_alpha = advanced_parameters.smooth_quant_alpha
+        if sq_alpha is not None:
+            if sq_alpha < 0:
+                sq_params.convolution = -1
+                sq_params.matmul = -1
+            else:
+                sq_params.matmul = sq_alpha
+        pt2e_kwargs["smooth_quant_params"] = sq_params
+        pt2e_kwargs["bias_correction_params"] = advanced_parameters.bias_correction_params
+        pt2e_kwargs["activations_range_estimator_params"] = advanced_parameters.activations_range_estimator_params
+        pt2e_kwargs["weights_range_estimator_params"] = advanced_parameters.weights_range_estimator_params
+
+        # SmoothQuant is enabled only when model_type is nncf.ModelType.TRANSFORMER.
+        smooth_quant = self.compression_params.get("model_type") == nncf.ModelType.TRANSFORMER
+
+        with disable_patching(), torch.no_grad():
+            self.compressed_model = quantize_pt2e(
+                self.model,
+                quantizer,
+                self.calibration_dataset,
+                smooth_quant=smooth_quant,
+                fold_quantize=False,
+                **pt2e_kwargs,
+            )
+
+    def _compress(self) -> None:
+        """
+        Quantize self.model.
+
+        Backends outside FX_BACKENDS use the parent implementation unchanged, FX_TORCH and
+        CUDA_FX_TORCH use it with patching and gradients disabled, the *_QUANTIZER_NNCF backends
+        go through quantize_pt2e and the remaining *_QUANTIZER_AO backends through torch.ao PT2E.
+        """
+        if self.backend not in FX_BACKENDS:
+            super()._compress()
+            return
+
+        if self.backend in [BackendType.FX_TORCH, BackendType.CUDA_FX_TORCH]:
+            with disable_patching(), torch.no_grad():
+                super()._compress()
+            return
+
+        quantizer = self._build_quantizer()
+
+        if self.backend in [BackendType.OV_QUANTIZER_NNCF, BackendType.X86_QUANTIZER_NNCF]:
+            self._compress_nncf_pt2e(quantizer)
+        else:
+            self._compress_torch_ao(quantizer)
+
+    def _build_quantizer(self) -> TorchAOQuantizer:
+        """
+        Create the quantizer for the current backend.
+
+        :return: X86InductorQuantizer with the default x86 inductor config for the X86_QUANTIZER_*
+            backends, otherwise an OpenVINOQuantizer configured from self.compression_params.
+        """
+        if self.backend in [BackendType.X86_QUANTIZER_AO, BackendType.X86_QUANTIZER_NNCF]:
+            quantizer = X86InductorQuantizer()
+            quantizer.set_global(get_default_x86_inductor_quantization_config())
+            return quantizer
+
+        quantizer_kwargs = {
+            key: self.compression_params[key]
+            for key in ("mode", "preset", "target_device", "model_type", "ignored_scope")
+            if key in self.compression_params
+        }
+        advanced_parameters: AdvancedQuantizationParameters = self.compression_params.get(
+            "advanced_parameters", AdvancedQuantizationParameters()
+        )
+        quantizer_kwargs["overflow_fix"] = advanced_parameters.overflow_fix
+        quantizer_kwargs["quantize_outputs"] = advanced_parameters.quantize_outputs
+        quantizer_kwargs["activations_quantization_params"] = advanced_parameters.activations_quantization_params
+        quantizer_kwargs["weights_quantization_params"] = advanced_parameters.weights_quantization_params
+        quantizer_kwargs["quantizer_propagation_rule"] = advanced_parameters.quantizer_propagation_rule
+
+        return OpenVINOQuantizer(**quantizer_kwargs)