diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py
index baf54c41e87..1de6f549851 100644
--- a/nncf/quantization/algorithms/weight_compression/gptq.py
+++ b/nncf/quantization/algorithms/weight_compression/gptq.py
@@ -20,7 +20,6 @@
 from nncf.common.tensor_statistics.statistic_point import StatisticPointsContainer
 from nncf.common.utils.backend import BackendType
 from nncf.common.utils.backend import get_backend
-from nncf.common.utils.helpers import set_env_variable
 from nncf.parameters import CompressWeightsMode
 from nncf.quantization.algorithms.layerwise.engine import LayerwiseEngine
 from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend
@@ -285,24 +284,19 @@ def _quantize_weights(
                     scales.append(scale)
                     zero_points.append(zero_point)
 
-                with set_env_variable("NNCF_DISABLE_OPTIMIZED_COMPRESSION", "1"):
-                    # Because of the fact that compression if performed per-column, weight size is very small and
-                    # optimized OV compression performs worse than numpy compression.
-                    # TODO(nikita-savelyevv): Remove this workaround by introducing logic that will control whether to
-                    # execute optimized compression based on input size.
-                    if block_compression_config.mode == CompressWeightsMode.NF4:
-                        quantized_col = float_quantize_dequantize_weight(
-                            fns.unsqueeze(weight_col, 1),
-                            block_compression_config,
-                            precomputed_scale=scales[-1],
-                        )
-                    else:
-                        quantized_col = integer_quantize_dequantize_weight(
-                            fns.unsqueeze(weight_col, 1),
-                            block_compression_config,
-                            precomputed_scale=scales[-1],
-                            precomputed_zero_point=zero_points[-1],
-                        )
+                if block_compression_config.mode == CompressWeightsMode.NF4:
+                    quantized_col = float_quantize_dequantize_weight(
+                        fns.unsqueeze(weight_col, 1),
+                        block_compression_config,
+                        precomputed_scale=scales[-1],
+                    )
+                else:
+                    quantized_col = integer_quantize_dequantize_weight(
+                        fns.unsqueeze(weight_col, 1),
+                        block_compression_config,
+                        precomputed_scale=scales[-1],
+                        precomputed_zero_point=zero_points[-1],
+                    )
                 quantized_col = fns.flatten(quantized_col)
                 quantized_block[:, i] = quantized_col
                 loss_block[:, i] = (weight_col - quantized_col) ** 2 / hessian_diag_val**2
diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
index bcce9308469..eab6fe75389 100644
--- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py
+++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -72,6 +72,9 @@
 )
 
 
+MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000
+
+
 @dataclass
 class CompressedWeight:
     """
@@ -194,7 +197,7 @@ def do_float_quantization(
         weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size)
 
     # Optimized implementation
-    if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight.backend):
+    if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight):
         from nncf.openvino.optimized_functions import do_float_quantization as do_float_quantization_ov
 
         return do_float_quantization_ov(weight, config, reduction_axes, precomputed_scale)
@@ -243,7 +246,7 @@ def float_quantize_dequantize_weight(
     # TODO(nikita-savelyevv): add support for f4e2m1 once ticket 164851 is resolved
 
     # Optimized implementation
-    if _can_run_optimized(weight.backend):
+    if _can_run_optimized(weight):
         from nncf.openvino.optimized_functions import (
             float_quantize_dequantize_weight as float_quantize_dequantize_weight_ov,
         )
@@ -320,7 +323,7 @@ def get_integer_quantization_error(
    :return: The quantity characterizing the error of integer quantization.
    """
     # Optimized implementation
-    if _can_run_optimized(weight.backend):
+    if _can_run_optimized(weight):
         from nncf.openvino.optimized_functions import (
             get_integer_quantization_error as get_integer_quantization_error_ov,
         )
@@ -446,7 +449,7 @@ def do_integer_quantization(
         weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size)
 
     # Optimized implementation
-    if _can_run_optimized(weight.backend):
+    if _can_run_optimized(weight):
         from nncf.openvino.optimized_functions import do_integer_quantization as do_integer_quantization_ov
 
         return do_integer_quantization_ov(weight, config, reduction_axes, precomputed_scale, precomputed_zero_point)
@@ -495,7 +498,7 @@ def integer_quantize_dequantize_weight(
         (and zero point).
     """
     # Optimized implementation
-    if _can_run_optimized(weight.backend):
+    if _can_run_optimized(weight):
         from nncf.openvino.optimized_functions import (
             integer_quantize_dequantize_weight as integer_quantize_dequantize_weight_ov,
         )
@@ -608,9 +611,10 @@ def _calculate_integer_quantized_weight(
     return compressed_weights
 
 
-def _can_run_optimized(input_backend: TensorBackend) -> bool:
+def _can_run_optimized(inp: Tensor) -> bool:
     if (
-        input_backend in [TensorBackend.ov, TensorBackend.numpy]
+        inp.backend in [TensorBackend.ov, TensorBackend.numpy]
+        and inp.size > MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION
         and os.environ.get("NNCF_DISABLE_OPTIMIZED_COMPRESSION") is None
     ):
         if is_openvino_available():
diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py
index 03a5585ed93..2e15dc9a0c5 100644
--- a/tests/openvino/native/quantization/test_weights_compression.py
+++ b/tests/openvino/native/quantization/test_weights_compression.py
@@ -44,6 +44,7 @@
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
 from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA
 from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend
+from nncf.quantization.algorithms.weight_compression.weight_lowering import MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION
 from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_nf4_quantized_weight
 from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_normalized_weight
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization
@@ -1493,7 +1494,8 @@ def test_compression_with_transposed_activations(kwargs):
 )
 @pytest.mark.parametrize("disabled", [False, True])
 def test_disabled_optimized_compression(disabled):
-    model = LMLinearModel(input_shape=[1, 24, 5000]).ov_model
+    hidden_dim = (MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // LMLinearModel.OUTPUT_DIM) + 1
+    model = LMLinearModel(input_shape=[1, 24, hidden_dim]).ov_model
 
     def run_compression():
         compress_weights(model, mode=CompressWeightsMode.INT8)
diff --git a/tests/openvino/optimized_functions/test_compression_functions.py b/tests/openvino/optimized_functions/test_compression_functions.py
index 658b9d9ada6..b63af18f67b 100644
--- a/tests/openvino/optimized_functions/test_compression_functions.py
+++ b/tests/openvino/optimized_functions/test_compression_functions.py
@@ -29,6 +29,7 @@
 from nncf.openvino.cpu_info import is_arm_cpu
 from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+from nncf.quantization.algorithms.weight_compression.weight_lowering import MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization
 from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization
 from nncf.quantization.algorithms.weight_compression.weight_lowering import float_quantize_dequantize_weight
@@ -109,9 +110,46 @@ def openvino_available(available: bool):
     import nncf.common.utils.backend
 
     original_openvino_available_value = nncf.common.utils.backend._OPENVINO_AVAILABLE
+    original_min_size_value = (
+        nncf.quantization.algorithms.weight_compression.weight_lowering.MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION
+    )
+
     nncf.common.utils.backend._OPENVINO_AVAILABLE = available
+    nncf.quantization.algorithms.weight_compression.weight_lowering.MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 0
+
     yield
+
     nncf.common.utils.backend._OPENVINO_AVAILABLE = original_openvino_available_value
+    nncf.quantization.algorithms.weight_compression.weight_lowering.MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = (
+        original_min_size_value
+    )
+
+
+@pytest.mark.parametrize(
+    "weight_shape,is_disabled",
+    [
+        ((MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // 4, 4), True),
+        ((MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // 4 + 1, 4), False),
+    ],
+)
+@pytest.mark.parametrize("quantization_task", [QuantizationTask.Q, QuantizationTask.Q_DQ, QuantizationTask.Q_DQ_RQ])
+def test_optimized_compression_is_disabled(weight_shape, is_disabled, quantization_task):
+    weight = get_random_float_tensor(weight_shape, TensorDataType.float32, TensorBackend.numpy)
+    config = WeightCompressionConfig(CompressWeightsMode.INT8_ASYM)
+
+    fn_to_call, fn_to_patch = _get_compression_fn_from_quantization_task(quantization_task, config)
+    patch_path = f"nncf.openvino.optimized_functions.{fn_to_patch.__name__}"
+    with patch(patch_path, side_effect=fn_to_patch) as mock:
+        kwargs = {}
+        if quantization_task == QuantizationTask.Q_DQ_RQ:
+            kwargs["return_compressed_weight"] = True
+
+        fn_to_call(weight, config, reduction_axes=1)
+
+        if is_disabled:
+            mock.assert_not_called()
+        else:
+            mock.assert_called_once()
 
 
 @pytest.mark.xfail(