
Introduce a minimal input size restriction for optimized weights compression #3492


Draft · wants to merge 3 commits into base: develop
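In short, the PR gates the optimized OpenVINO weight compression kernels on input size: `_can_run_optimized` now receives the tensor itself instead of only its backend and requires more than `MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000` elements. A minimal, self-contained sketch of that gating, matching the weight_lowering.py diff below (the `FakeTensor` class and the string backend names are illustrative stand-ins, not NNCF's real `Tensor`/`TensorBackend` types):

```python
import os
from dataclasses import dataclass

import numpy as np

# Threshold introduced by this PR (see the weight_lowering.py diff below).
MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000


@dataclass
class FakeTensor:
    """Illustrative stand-in for nncf.tensor.Tensor: only what the check needs."""

    data: np.ndarray
    backend: str = "numpy"  # the real code compares against TensorBackend.ov / TensorBackend.numpy

    @property
    def size(self) -> int:
        return int(self.data.size)  # total number of elements


def can_run_optimized(weight: FakeTensor) -> bool:
    """Sketch of the new gating: use optimized OV compression only for large inputs."""
    return (
        weight.backend in ("numpy", "ov")
        and weight.size > MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION
        and os.environ.get("NNCF_DISABLE_OPTIMIZED_COMPRESSION") is None
    )


small = FakeTensor(np.zeros((128, 1), dtype=np.float32))      # 128 elements
large = FakeTensor(np.zeros((4096, 4096), dtype=np.float32))  # ~16.8M elements

print(can_run_optimized(small))  # False: too small, the reference numpy path is used
print(can_run_optimized(large))  # True: large enough for the optimized OV kernels
```

In the actual code, the check additionally verifies that OpenVINO is importable before returning True.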
32 changes: 13 additions & 19 deletions nncf/quantization/algorithms/weight_compression/gptq.py
@@ -20,7 +20,6 @@
from nncf.common.tensor_statistics.statistic_point import StatisticPointsContainer
from nncf.common.utils.backend import BackendType
from nncf.common.utils.backend import get_backend
- from nncf.common.utils.helpers import set_env_variable
from nncf.parameters import CompressWeightsMode
from nncf.quantization.algorithms.layerwise.engine import LayerwiseEngine
from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend
@@ -285,24 +284,19 @@ def _quantize_weights(
scales.append(scale)
zero_points.append(zero_point)

- with set_env_variable("NNCF_DISABLE_OPTIMIZED_COMPRESSION", "1"):
-     # Because of the fact that compression if performed per-column, weight size is very small and
-     # optimized OV compression performs worse than numpy compression.
-     # TODO(nikita-savelyevv): Remove this workaround by introducing logic that will control whether to
-     # execute optimized compression based on input size.
-     if block_compression_config.mode == CompressWeightsMode.NF4:
-         quantized_col = float_quantize_dequantize_weight(
-             fns.unsqueeze(weight_col, 1),
-             block_compression_config,
-             precomputed_scale=scales[-1],
-         )
-     else:
-         quantized_col = integer_quantize_dequantize_weight(
-             fns.unsqueeze(weight_col, 1),
-             block_compression_config,
-             precomputed_scale=scales[-1],
-             precomputed_zero_point=zero_points[-1],
-         )
+ if block_compression_config.mode == CompressWeightsMode.NF4:
+     quantized_col = float_quantize_dequantize_weight(
+         fns.unsqueeze(weight_col, 1),
+         block_compression_config,
+         precomputed_scale=scales[-1],
+     )
+ else:
+     quantized_col = integer_quantize_dequantize_weight(
+         fns.unsqueeze(weight_col, 1),
+         block_compression_config,
+         precomputed_scale=scales[-1],
+         precomputed_zero_point=zero_points[-1],
+     )
quantized_col = fns.flatten(quantized_col)
quantized_block[:, i] = quantized_col
loss_block[:, i] = (weight_col - quantized_col) ** 2 / hessian_diag_val**2
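With the size threshold in place, the GPTQ-specific workaround above becomes unnecessary: quantization there runs one column of a block at a time, so the input is a `(rows, 1)` slice produced by `fns.unsqueeze(weight_col, 1)`, and whenever that slice is at or below the threshold the reference numpy path is selected automatically. A quick illustration with made-up dimensions (the row count is hypothetical, not taken from the PR):

```python
# Hypothetical example: a per-column slice as produced by fns.unsqueeze(weight_col, 1).
rows = 4096                 # illustrative column length
per_column_size = rows * 1  # total elements in the (rows, 1) slice
threshold = 10000           # MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION

# Below the threshold, so the numpy implementation is chosen without any env variable.
print(per_column_size > threshold)  # False
```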
nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -72,6 +72,9 @@
)


+ MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000


@dataclass
class CompressedWeight:
"""
@@ -194,7 +197,7 @@ def do_float_quantization(
weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size)

# Optimized implementation
- if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight.backend):
+ if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight):
from nncf.openvino.optimized_functions import do_float_quantization as do_float_quantization_ov

return do_float_quantization_ov(weight, config, reduction_axes, precomputed_scale)
@@ -243,7 +246,7 @@ def float_quantize_dequantize_weight(
# TODO(nikita-savelyevv): add support for f4e2m1 once ticket 164851 is resolved

# Optimized implementation
- if _can_run_optimized(weight.backend):
+ if _can_run_optimized(weight):
from nncf.openvino.optimized_functions import (
float_quantize_dequantize_weight as float_quantize_dequantize_weight_ov,
)
@@ -320,7 +323,7 @@ def get_integer_quantization_error(
:return: The quantity characterizing the error of integer quantization.
"""
# Optimized implementation
- if _can_run_optimized(weight.backend):
+ if _can_run_optimized(weight):
from nncf.openvino.optimized_functions import (
get_integer_quantization_error as get_integer_quantization_error_ov,
)
@@ -446,7 +449,7 @@ def do_integer_quantization(
weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size)

# Optimized implementation
- if _can_run_optimized(weight.backend):
+ if _can_run_optimized(weight):
from nncf.openvino.optimized_functions import do_integer_quantization as do_integer_quantization_ov

return do_integer_quantization_ov(weight, config, reduction_axes, precomputed_scale, precomputed_zero_point)
@@ -495,7 +498,7 @@ def integer_quantize_dequantize_weight(
(and zero point).
"""
# Optimized implementation
- if _can_run_optimized(weight.backend):
+ if _can_run_optimized(weight):
from nncf.openvino.optimized_functions import (
integer_quantize_dequantize_weight as integer_quantize_dequantize_weight_ov,
)
@@ -608,9 +611,10 @@ def _calculate_integer_quantized_weight(
return compressed_weights


- def _can_run_optimized(input_backend: TensorBackend) -> bool:
+ def _can_run_optimized(inp: Tensor) -> bool:
if (
- input_backend in [TensorBackend.ov, TensorBackend.numpy]
+ inp.backend in [TensorBackend.ov, TensorBackend.numpy]
+ and inp.size > MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION
and os.environ.get("NNCF_DISABLE_OPTIMIZED_COMPRESSION") is None
):
if is_openvino_available():
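Note that the comparison is strictly greater-than: an input of exactly 10,000 elements still takes the numpy path. This is the boundary exercised by the parametrized test added at the end of this PR, where `(MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // 4, 4)` is expected to stay on the numpy path and `(MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // 4 + 1, 4)` to reach the optimized one. A quick boundary check with plain numpy arrays (illustrative only):

```python
import numpy as np

MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000  # value from the diff above

at_threshold = np.zeros((MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // 4, 4))         # 10000 elements
above_threshold = np.zeros((MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // 4 + 1, 4))  # 10004 elements

print(at_threshold.size > MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION)     # False -> numpy path
print(above_threshold.size > MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION)  # True  -> optimized path (if OV is available)
```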
(weights compression test module, path not shown in this view)
@@ -44,6 +44,7 @@
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA
from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend
+ from nncf.quantization.algorithms.weight_compression.weight_lowering import MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION
from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_nf4_quantized_weight
from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_normalized_weight
from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization
@@ -1493,7 +1494,8 @@ def test_compression_with_transposed_activations(kwargs):
)
@pytest.mark.parametrize("disabled", [False, True])
def test_disabled_optimized_compression(disabled):
- model = LMLinearModel(input_shape=[1, 24, 5000]).ov_model
+ hidden_dim = (MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // LMLinearModel.OUTPUT_DIM) + 1
+ model = LMLinearModel(input_shape=[1, 24, hidden_dim]).ov_model

def run_compression():
compress_weights(model, mode=CompressWeightsMode.INT8)
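The updated test sizes the model so that its weight just crosses the threshold: with `hidden_dim = MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // LMLinearModel.OUTPUT_DIM + 1`, the weight has `OUTPUT_DIM * hidden_dim > MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION` elements for any positive `OUTPUT_DIM`, so the optimized path is eligible and the `NNCF_DISABLE_OPTIMIZED_COMPRESSION` switch is what the test actually exercises. A quick sanity check with a hypothetical output dimension (the real value of `LMLinearModel.OUTPUT_DIM` is not shown in this diff):

```python
MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000
OUTPUT_DIM = 32  # hypothetical value, for illustration only

hidden_dim = MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // OUTPUT_DIM + 1
weight_size = OUTPUT_DIM * hidden_dim  # 32 * 313 = 10016 elements

print(weight_size > MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION)  # True for any positive OUTPUT_DIM
```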
(weight lowering test module, path not shown in this view)
@@ -29,6 +29,7 @@
from nncf.openvino.cpu_info import is_arm_cpu
from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+ from nncf.quantization.algorithms.weight_compression.weight_lowering import MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION
from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization
from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization
from nncf.quantization.algorithms.weight_compression.weight_lowering import float_quantize_dequantize_weight
@@ -109,9 +110,46 @@ def openvino_available(available: bool):
import nncf.common.utils.backend

original_openvino_available_value = nncf.common.utils.backend._OPENVINO_AVAILABLE
+ original_min_size_value = (
+     nncf.quantization.algorithms.weight_compression.weight_lowering.MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION
+ )

nncf.common.utils.backend._OPENVINO_AVAILABLE = available
+ nncf.quantization.algorithms.weight_compression.weight_lowering.MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 0

yield

nncf.common.utils.backend._OPENVINO_AVAILABLE = original_openvino_available_value
+ nncf.quantization.algorithms.weight_compression.weight_lowering.MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = (
+     original_min_size_value
+ )


+ @pytest.mark.parametrize(
+     "weight_shape,is_disabled",
+     [
+         ((MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // 4, 4), True),
+         ((MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // 4 + 1, 4), False),
+     ],
+ )
+ @pytest.mark.parametrize("quantization_task", [QuantizationTask.Q, QuantizationTask.Q_DQ, QuantizationTask.Q_DQ_RQ])
+ def test_optimized_compression_is_disabled(weight_shape, is_disabled, quantization_task):
+     weight = get_random_float_tensor(weight_shape, TensorDataType.float32, TensorBackend.numpy)
+     config = WeightCompressionConfig(CompressWeightsMode.INT8_ASYM)
+
+     fn_to_call, fn_to_patch = _get_compression_fn_from_quantization_task(quantization_task, config)
+     patch_path = f"nncf.openvino.optimized_functions.{fn_to_patch.__name__}"
+     with patch(patch_path, side_effect=fn_to_patch) as mock:
+         kwargs = {}
+         if quantization_task == QuantizationTask.Q_DQ_RQ:
+             kwargs["return_compressed_weight"] = True
+
+         fn_to_call(weight, config, reduction_axes=1)
+
+         if is_disabled:
+             mock.assert_not_called()
+         else:
+             mock.assert_called_once()


@pytest.mark.xfail(