
Introduce a minimal input size restriction for optimized weights compression #3492


Draft · wants to merge 3 commits into base: develop
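In short, the PR gates the optimized OpenVINO weight compression kernels on input size: `_can_run_optimized` now receives the tensor itself instead of only its backend and requires more than `MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000` elements. A minimal, self-contained sketch of that gating, matching the weight_lowering.py diff below (the `FakeTensor` class and the string backend names are illustrative stand-ins, not NNCF's real `Tensor`/`TensorBackend` types):

```python
import os
from dataclasses import dataclass

import numpy as np

# Threshold introduced by this PR (see the weight_lowering.py diff below).
MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000


@dataclass
class FakeTensor:
    """Illustrative stand-in for nncf.tensor.Tensor: only what the check needs."""

    data: np.ndarray
    backend: str = "numpy"  # the real code compares against TensorBackend.ov / TensorBackend.numpy

    @property
    def size(self) -> int:
        return int(self.data.size)  # total number of elements


def can_run_optimized(weight: FakeTensor) -> bool:
    """Sketch of the new gating: use optimized OV compression only for large inputs."""
    return (
        weight.backend in ("numpy", "ov")
        and weight.size > MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION
        and os.environ.get("NNCF_DISABLE_OPTIMIZED_COMPRESSION") is None
    )


small = FakeTensor(np.zeros((128, 1), dtype=np.float32))      # 128 elements
large = FakeTensor(np.zeros((4096, 4096), dtype=np.float32))  # ~16.8M elements

print(can_run_optimized(small))  # False: too small, the reference numpy path is used
print(can_run_optimized(large))  # True: large enough for the optimized OV kernels
```

In the actual code, the check additionally verifies that OpenVINO is importable before returning True.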
32 changes: 13 additions & 19 deletions nncf/quantization/algorithms/weight_compression/gptq.py
@@ -20,7 +20,6 @@
from nncf.common.tensor_statistics.statistic_point import StatisticPointsContainer
from nncf.common.utils.backend import BackendType
from nncf.common.utils.backend import get_backend
- from nncf.common.utils.helpers import set_env_variable
from nncf.parameters import CompressWeightsMode
from nncf.quantization.algorithms.layerwise.engine import LayerwiseEngine
from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend
@@ -285,24 +284,19 @@ def _quantize_weights(
scales.append(scale)
zero_points.append(zero_point)

- with set_env_variable("NNCF_DISABLE_OPTIMIZED_COMPRESSION", "1"):
-     # Because of the fact that compression if performed per-column, weight size is very small and
-     # optimized OV compression performs worse than numpy compression.
-     # TODO(nikita-savelyevv): Remove this workaround by introducing logic that will control whether to
-     # execute optimized compression based on input size.
-     if block_compression_config.mode == CompressWeightsMode.NF4:
-         quantized_col = float_quantize_dequantize_weight(
-             fns.unsqueeze(weight_col, 1),
-             block_compression_config,
-             precomputed_scale=scales[-1],
-         )
-     else:
-         quantized_col = integer_quantize_dequantize_weight(
-             fns.unsqueeze(weight_col, 1),
-             block_compression_config,
-             precomputed_scale=scales[-1],
-             precomputed_zero_point=zero_points[-1],
-         )
+ if block_compression_config.mode == CompressWeightsMode.NF4:
+     quantized_col = float_quantize_dequantize_weight(
+         fns.unsqueeze(weight_col, 1),
+         block_compression_config,
+         precomputed_scale=scales[-1],
+     )
+ else:
+     quantized_col = integer_quantize_dequantize_weight(
+         fns.unsqueeze(weight_col, 1),
+         block_compression_config,
+         precomputed_scale=scales[-1],
+         precomputed_zero_point=zero_points[-1],
+     )
quantized_col = fns.flatten(quantized_col)
quantized_block[:, i] = quantized_col
loss_block[:, i] = (weight_col - quantized_col) ** 2 / hessian_diag_val**2
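With the size threshold in place, the GPTQ-specific workaround above becomes unnecessary: quantization there runs one column of a block at a time, so the input is a `(rows, 1)` slice produced by `fns.unsqueeze(weight_col, 1)`, and whenever that slice is at or below the threshold the reference numpy path is selected automatically. A quick illustration with made-up dimensions (the row count is hypothetical, not taken from the PR):

```python
# Hypothetical example: a per-column slice as produced by fns.unsqueeze(weight_col, 1).
rows = 4096                 # illustrative column length
per_column_size = rows * 1  # total elements in the (rows, 1) slice
threshold = 10000           # MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION

# Below the threshold, so the numpy implementation is chosen without any env variable.
print(per_column_size > threshold)  # False
```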
nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -72,6 +72,9 @@
)


+ MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000


@dataclass
class CompressedWeight:
"""
@@ -194,7 +197,7 @@ def do_float_quantization(
weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size)

# Optimized implementation
- if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight.backend):
+ if config.mode == CompressWeightsMode.NF4 and _can_run_optimized(weight):
from nncf.openvino.optimized_functions import do_float_quantization as do_float_quantization_ov

return do_float_quantization_ov(weight, config, reduction_axes, precomputed_scale)
@@ -243,7 +246,7 @@ def float_quantize_dequantize_weight(
# TODO(nikita-savelyevv): add support for f4e2m1 once ticket 164851 is resolved

# Optimized implementation
- if _can_run_optimized(weight.backend):
+ if _can_run_optimized(weight):
from nncf.openvino.optimized_functions import (
float_quantize_dequantize_weight as float_quantize_dequantize_weight_ov,
)
@@ -320,7 +323,7 @@ def get_integer_quantization_error(
:return: The quantity characterizing the error of integer quantization.
"""
# Optimized implementation
- if _can_run_optimized(weight.backend):
+ if _can_run_optimized(weight):
from nncf.openvino.optimized_functions import (
get_integer_quantization_error as get_integer_quantization_error_ov,
)
@@ -446,7 +449,7 @@ def do_integer_quantization(
weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size)

# Optimized implementation
- if _can_run_optimized(weight.backend):
+ if _can_run_optimized(weight):
from nncf.openvino.optimized_functions import do_integer_quantization as do_integer_quantization_ov

return do_integer_quantization_ov(weight, config, reduction_axes, precomputed_scale, precomputed_zero_point)
@@ -495,7 +498,7 @@ def integer_quantize_dequantize_weight(
(and zero point).
"""
# Optimized implementation
- if _can_run_optimized(weight.backend):
+ if _can_run_optimized(weight):
from nncf.openvino.optimized_functions import (
integer_quantize_dequantize_weight as integer_quantize_dequantize_weight_ov,
)
@@ -608,9 +611,10 @@ def _calculate_integer_quantized_weight(
return compressed_weights


- def _can_run_optimized(input_backend: TensorBackend) -> bool:
+ def _can_run_optimized(inp: Tensor) -> bool:
if (
- input_backend in [TensorBackend.ov, TensorBackend.numpy]
+ inp.backend in [TensorBackend.ov, TensorBackend.numpy]
+ and inp.size > MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION
and os.environ.get("NNCF_DISABLE_OPTIMIZED_COMPRESSION") is None
):
if is_openvino_available():
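Note that the comparison is strictly greater-than: an input of exactly 10,000 elements still takes the numpy path. This is the boundary exercised by the parametrized test added at the end of this PR, where `(MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // 4, 4)` is expected to stay on the numpy path and `(MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // 4 + 1, 4)` to reach the optimized one. A quick boundary check with plain numpy arrays (illustrative only):

```python
import numpy as np

MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000  # value from the diff above

at_threshold = np.zeros((MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // 4, 4))         # 10000 elements
above_threshold = np.zeros((MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // 4 + 1, 4))  # 10004 elements

print(at_threshold.size > MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION)     # False -> numpy path
print(above_threshold.size > MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION)  # True  -> optimized path (if OV is available)
```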
(weights compression test module, path not shown in this view)
@@ -44,6 +44,7 @@
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA
from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend
+ from nncf.quantization.algorithms.weight_compression.weight_lowering import MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION
from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_nf4_quantized_weight
from nncf.quantization.algorithms.weight_compression.weight_lowering import _calculate_normalized_weight
from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization
@@ -1493,7 +1494,8 @@ def test_compression_with_transposed_activations(kwargs):
)
@pytest.mark.parametrize("disabled", [False, True])
def test_disabled_optimized_compression(disabled):
- model = LMLinearModel(input_shape=[1, 24, 5000]).ov_model
+ hidden_dim = (MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // LMLinearModel.OUTPUT_DIM) + 1
+ model = LMLinearModel(input_shape=[1, 24, hidden_dim]).ov_model

def run_compression():
compress_weights(model, mode=CompressWeightsMode.INT8)
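The updated test sizes the model so that its weight just crosses the threshold: with `hidden_dim = MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // LMLinearModel.OUTPUT_DIM + 1`, the weight has `OUTPUT_DIM * hidden_dim > MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION` elements for any positive `OUTPUT_DIM`, so the optimized path is eligible and the `NNCF_DISABLE_OPTIMIZED_COMPRESSION` switch is what the test actually exercises. A quick sanity check with a hypothetical output dimension (the real value of `LMLinearModel.OUTPUT_DIM` is not shown in this diff):

```python
MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 10000
OUTPUT_DIM = 32  # hypothetical value, for illustration only

hidden_dim = MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // OUTPUT_DIM + 1
weight_size = OUTPUT_DIM * hidden_dim  # 32 * 313 = 10016 elements

print(weight_size > MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION)  # True for any positive OUTPUT_DIM
```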
(weight lowering test module, path not shown in this view)
@@ -29,6 +29,7 @@
from nncf.openvino.cpu_info import is_arm_cpu
from nncf.openvino.graph.node_utils import get_const_value_as_ov_tensor
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
+ from nncf.quantization.algorithms.weight_compression.weight_lowering import MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION
from nncf.quantization.algorithms.weight_compression.weight_lowering import do_float_quantization
from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization
from nncf.quantization.algorithms.weight_compression.weight_lowering import float_quantize_dequantize_weight
@@ -109,9 +110,46 @@ def openvino_available(available: bool):
import nncf.common.utils.backend

original_openvino_available_value = nncf.common.utils.backend._OPENVINO_AVAILABLE
+ original_min_size_value = (
+     nncf.quantization.algorithms.weight_compression.weight_lowering.MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION
+ )

nncf.common.utils.backend._OPENVINO_AVAILABLE = available
+ nncf.quantization.algorithms.weight_compression.weight_lowering.MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = 0

yield

nncf.common.utils.backend._OPENVINO_AVAILABLE = original_openvino_available_value
+ nncf.quantization.algorithms.weight_compression.weight_lowering.MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION = (
+     original_min_size_value
+ )


+ @pytest.mark.parametrize(
+     "weight_shape,is_disabled",
+     [
+         ((MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // 4, 4), True),
+         ((MIN_INPUT_SIZE_FOR_OPTIMIZED_COMPRESSION // 4 + 1, 4), False),
+     ],
+ )
+ @pytest.mark.parametrize("quantization_task", [QuantizationTask.Q, QuantizationTask.Q_DQ, QuantizationTask.Q_DQ_RQ])
+ def test_optimized_compression_is_disabled(weight_shape, is_disabled, quantization_task):
+     weight = get_random_float_tensor(weight_shape, TensorDataType.float32, TensorBackend.numpy)
+     config = WeightCompressionConfig(CompressWeightsMode.INT8_ASYM)
+
+     fn_to_call, fn_to_patch = _get_compression_fn_from_quantization_task(quantization_task, config)
+     patch_path = f"nncf.openvino.optimized_functions.{fn_to_patch.__name__}"
+     with patch(patch_path, side_effect=fn_to_patch) as mock:
+         kwargs = {}
+         if quantization_task == QuantizationTask.Q_DQ_RQ:
+             kwargs["return_compressed_weight"] = True
+
+         fn_to_call(weight, config, reduction_axes=1)
+
+         if is_disabled:
+             mock.assert_not_called()
+         else:
+             mock.assert_called_once()


@pytest.mark.xfail(