diff --git a/nncf/quantization/advanced_parameters.py b/nncf/quantization/advanced_parameters.py
index a957e8a6d16..35fe8d7c08a 100644
--- a/nncf/quantization/advanced_parameters.py
+++ b/nncf/quantization/advanced_parameters.py
@@ -276,6 +276,8 @@ class AdvancedAWQParameters:
     :type alpha_max: float
     :param steps: The number of the steps in grid search.
     :type steps: int
+    :param prefer_data_aware: Determines whether to use activations to calculate scales if activations are available.
+    :type prefer_data_aware: bool
     """

     subset_size: int = 32
@@ -283,6 +285,7 @@ class AdvancedAWQParameters:
     alpha_min: float = 0.0
     alpha_max: float = 1.0
     steps: int = 100
+    prefer_data_aware: bool = True


 @api()
diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py
index f33fac549ed..b4e96c0e509 100644
--- a/nncf/quantization/algorithms/weight_compression/algorithm.py
+++ b/nncf/quantization/algorithms/weight_compression/algorithm.py
@@ -302,6 +302,7 @@ def __init__(
                 awq_params.alpha_min,
                 awq_params.alpha_max,
                 awq_params.steps,
+                awq_params.prefer_data_aware,
             )
         if self._gptq:
             gptq_params = self._advanced_parameters.gptq_params
@@ -323,7 +324,12 @@ def __init__(
         self._data_aware_mixed_precision = (
             self._sensitivity_metric != SensitivityMetric.WEIGHT_QUANTIZATION_ERROR and self._ratio != 1.0
         )
-        self._data_aware_compression = self._awq or self._scale_estimation or self._lora_correction or self._gptq
+        self._data_aware_compression = (
+            (self._awq and self._advanced_parameters.awq_params.prefer_data_aware)
+            or self._scale_estimation
+            or self._lora_correction
+            or self._gptq
+        )

     @property
     def available_backends(self) -> List[BackendType]:
@@ -542,7 +548,7 @@ def apply(
         nodes_to_compress = self.get_nodes_to_compress(graph)

         statistics = None
-        if self._data_aware_mixed_precision or self._data_aware_compression:
+        if (self._data_aware_mixed_precision or self._data_aware_compression) and dataset:
             matmul_nodes_to_compress = [
                 node for node in nodes_to_compress if node.metatype in self._backend_entity.matmul_metatypes
             ]
diff --git a/nncf/quantization/algorithms/weight_compression/awq.py b/nncf/quantization/algorithms/weight_compression/awq.py
index c38a853680c..de36c50f357 100644
--- a/nncf/quantization/algorithms/weight_compression/awq.py
+++ b/nncf/quantization/algorithms/weight_compression/awq.py
@@ -66,6 +66,7 @@ def __init__(
         alpha_min: float = 0.0,
         alpha_max: float = 1.0,
         steps: int = 100,
+        prefer_data_aware: bool = True,
     ):
         """
         :param subset_size: The number of samples for AWQ.
@@ -73,6 +74,7 @@ def __init__(
         :param alpha_min: Minimum value of smoothness parameter for grid search.
         :param alpha_max: Maximal value of smoothness parameter for grid search.
         :param steps: The number of the steps in grid search.
+        :param prefer_data_aware: Determines whether to use activations to calculate scales.
""" super().__init__() self._subset_size = subset_size @@ -80,6 +82,7 @@ def __init__( self._alpha_min = alpha_min self._alpha_max = alpha_max self._steps = steps + self._prefer_data_aware = prefer_data_aware self._backend_entity = None self._patterns = None self._scale_per_target_node = {} @@ -118,7 +121,7 @@ def apply( graph: NNCFGraph, all_weight_params: List[WeightCompressionParameters], nodes_to_compress: List[NNCFNode], - statistics: Dict[str, WCTensorStatistic], + statistics: Optional[Dict[str, WCTensorStatistic]] = None, wc_backend_entity: Optional[WeightCompressionAlgoBackend] = None, ) -> TModel: """ @@ -132,156 +135,41 @@ def apply( :return: A resulting model. """ self._set_backend_entity(model, wc_backend_entity) - matches = [] - inference_nncf_graph = transform_to_inference_graph(deepcopy(graph), [], [], [], []) - nx_graph = inference_nncf_graph.get_nx_graph_copy() - for pattern_graph in self._patterns.values(): - matches.extend(find_subgraphs_matching_pattern(nx_graph, pattern_graph(), strict=False)) - - if len(matches) == 0: - nncf_logger.info("No matching patterns were found for applying AWQ algorithm, it will be skipped.") + awq_data = self._get_awq_data(graph, all_weight_params, nodes_to_compress) + if len(awq_data) == 0: return model transformation_layout = TransformationLayout() model_transformer = ModelTransformerFactory.create(model, inplace=True) - awq_data = {} - name_mapping = {wp.weight_name: idx for idx, wp in enumerate(all_weight_params)} - - for match in matches: - nncf_node = graph.get_node_by_key(match[-1]) - if not self._backend_entity.is_node_with_weights(nncf_node, graph): - continue - - target_node_names = [] - for weight_op_friendly_name, _ in self._backend_entity.get_weight_names_and_port_ids(nncf_node, graph): - target_node_names.append(weight_op_friendly_name) - - # skip node if it is in IgnoredScope or should not be compressed - if target_node_names[-1] not in name_mapping: - continue - - weight_params = all_weight_params[name_mapping[target_node_names[-1]]] + is_data_free = statistics is None or not self._prefer_data_aware - if weight_params.compression_config.num_bits != 4: - continue - target_node = nodes_to_compress[name_mapping[target_node_names[-1]]] + description = "Applying data-free AWQ" if is_data_free else "Applying data-aware AWQ" - # avoid matching different patterns for the same node - if target_node.node_name in awq_data: - continue - - nncf_node = graph.get_node_by_key(match[0]) - - if self._backend_entity.is_node_with_weights(nncf_node, graph): # pattern MatMul->Multiply->MatMul - merge_node_names = [] - for weight_op_friendly_name, _ in self._backend_entity.get_weight_names_and_port_ids(nncf_node, graph): - merge_node_names.append(weight_op_friendly_name) - merge_node = nodes_to_compress[name_mapping[merge_node_names[-1]]] - else: # pattern Act->MatMul or Act->Multiply->MatMul - merge_node = nncf_node - - awq_data[target_node.node_name] = AWQCompressionInfo(weight_params, target_node, merge_node) - - alpha_step = (self._alpha_max - self._alpha_min) / self._steps - - for k, awq_data_item in track(awq_data.items(), description="Applying AWQ"): + for k, awq_data_item in track(awq_data.items(), description=description): wp = awq_data_item.weight_params - target_node = awq_data_item.target_node merge_node = awq_data_item.merge_node weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph) if len(weight_data) != 1: # not supported by the algorithm continue - nncf_logger.debug(f"Apply AWQ for: 
{wp.node_with_weight.node_name}") + nncf_logger.debug(f"{description} for: {wp.node_with_weight.node_name}") _, weight_port_id = weight_data[0] - - config = wp.compression_config - - s, X = process_stats(statistics[k], self._subset_size) - s = s.astype(TensorDataType.float32) - X = X.astype(TensorDataType.float32) - - top_k = max(int(s.shape[0] * self._percent_to_apply), 1) - topk_idxs = fns.argsort(-s)[:top_k] - - group_size = config.group_size - if group_size == -1: - group_size = s.shape[0] - - groups_to_correct = set() - for idx in topk_idxs: - groups_to_correct.add(idx.data // group_size) - - groups_to_correct = list(groups_to_correct) - weight = self._backend_entity.get_weight( wp.node_with_weight, weight_port_id, model, graph ) # get_const_value(wp.weight_node) weight_dtype = weight.dtype weight = weight.astype(TensorDataType.float32) - assert isinstance(wp.reduction_axes, tuple) and len(wp.reduction_axes) == 1 - reduction_axis = wp.reduction_axes[0] - - if reduction_axis == 0: - weight = fns.transpose(weight) - reduction_axis = 1 - - shape_vector = fns.mean(X, axis=1) - scale = fns.ones_like(shape_vector) - - awq_config = deepcopy(config) - awq_config.group_size = -1 - - for gi in groups_to_correct: - offset = gi * group_size - gscale = s[offset : offset + group_size] - - a_min = fns.astype(fns.quantile(gscale, 0.1), TensorDataType.float32) - a_max = 1e2 - gscale = fns.clip(gscale, a_min=a_min, a_max=a_max) - - gweight = weight[:, offset : offset + group_size] - gacts = X[offset : offset + group_size, :] - - fp32_out = fns.matmul(gweight, gacts) - min_diff = fns.max(fns.abs(fp32_out)) - best_scale = None - - alpha = self._alpha_min - for _ in range(self._steps): - cur_scale = gscale**alpha - weights_to_fake_quantize = gweight * cur_scale - if config.mode == CompressWeightsMode.NF4: - g_c_scale = calculate_nf4_scale(weights_to_fake_quantize, reduction_axis) - g_compressed_weighs = do_nf4_quantization(weights_to_fake_quantize, g_c_scale) - g_decompressed_weighs = do_nf4_dequantization(g_compressed_weighs, g_c_scale) - else: - g_decompressed_weighs = quantize_dequantize_weight( - weights_to_fake_quantize, awq_config, reduction_axis - ) - sacts = gacts / fns.unsqueeze(cur_scale, 1) - - cur_out = fns.matmul(g_decompressed_weighs, sacts) - cur_diff = fns.mean(fns.abs(cur_out - fp32_out)) - if cur_diff < min_diff: - min_diff = cur_diff - best_scale = cur_scale - alpha += alpha_step - - if best_scale is not None: - scale.data[offset : offset + group_size] = best_scale.data - - a_scale = scale - w_scale = scale - if wp.reduction_axes[0] == 0: - w_scale = fns.unsqueeze(w_scale, 1) - a_scale = fns.unsqueeze(1.0 / a_scale, 0) + + if is_data_free: + scale = self._data_free_step(weight) else: - w_scale = fns.unsqueeze(w_scale, 0) - a_scale = fns.unsqueeze(1.0 / a_scale, 1) + scale = self._data_aware_step(wp, weight, statistics[k]) + + w_scale = fns.unsqueeze(scale, 1 - wp.reduction_axes[0]) + a_scale = fns.unsqueeze(1.0 / scale, wp.reduction_axes[0]) scaled_weight = (weight * w_scale).astype(weight_dtype) self._backend_entity.set_weight(wp.node_with_weight, weight_port_id, model, graph, scaled_weight) @@ -309,7 +197,148 @@ def apply( return transformed_model + def _data_aware_step(self, wp, weight, statistics): + alpha_step = (self._alpha_max - self._alpha_min) / self._steps + config = wp.compression_config + s, X = process_stats(statistics, self._subset_size) + s = s.astype(TensorDataType.float32) + X = X.astype(TensorDataType.float32) + + top_k = max(int(s.shape[0] * self._percent_to_apply), 
+        top_k = max(int(s.shape[0] * self._percent_to_apply), 1)
+        topk_idxs = fns.argsort(-s)[:top_k]
+
+        group_size = config.group_size
+        if group_size == -1:
+            group_size = s.shape[0]
+
+        groups_to_correct = set()
+        for idx in topk_idxs:
+            groups_to_correct.add(idx.data // group_size)
+
+        groups_to_correct = list(groups_to_correct)
+
+        assert isinstance(wp.reduction_axes, tuple) and len(wp.reduction_axes) == 1
+        reduction_axis = wp.reduction_axes[0]
+
+        if reduction_axis == 0:
+            weight = fns.transpose(weight)
+            reduction_axis = 1
+
+        shape_vector = fns.mean(X, axis=1)
+        scale = fns.ones_like(shape_vector)
+
+        awq_config = deepcopy(config)
+        awq_config.group_size = -1
+
+        for gi in groups_to_correct:
+            offset = gi * group_size
+            gscale = s[offset : offset + group_size]
+
+            a_min = fns.astype(fns.quantile(gscale, 0.1), TensorDataType.float32)
+            a_max = 1e2
+            gscale = fns.clip(gscale, a_min=a_min, a_max=a_max)
+
+            gweight = weight[:, offset : offset + group_size]
+            gacts = X[offset : offset + group_size, :]
+
+            fp32_out = fns.matmul(gweight, gacts)
+            min_diff = fns.max(fns.abs(fp32_out))
+            best_scale = None
+
+            alpha = self._alpha_min
+            for _ in range(self._steps):
+                cur_scale = gscale**alpha
+                weights_to_fake_quantize = gweight * cur_scale
+                if config.mode == CompressWeightsMode.NF4:
+                    g_c_scale = calculate_nf4_scale(weights_to_fake_quantize, reduction_axis)
+                    g_compressed_weighs = do_nf4_quantization(weights_to_fake_quantize, g_c_scale)
+                    g_decompressed_weighs = do_nf4_dequantization(g_compressed_weighs, g_c_scale)
+                else:
+                    g_decompressed_weighs = quantize_dequantize_weight(
+                        weights_to_fake_quantize, awq_config, reduction_axis
+                    )
+                sacts = gacts / fns.unsqueeze(cur_scale, 1)
+
+                cur_out = fns.matmul(g_decompressed_weighs, sacts)
+                cur_diff = fns.mean(fns.abs(cur_out - fp32_out))
+                if cur_diff < min_diff:
+                    min_diff = cur_diff
+                    best_scale = cur_scale
+                alpha += alpha_step
+
+            if best_scale is not None:
+                scale.data[offset : offset + group_size] = best_scale.data
+
+        return scale
+
+    def _data_free_step(self, weight):
+        eps = fns.finfo(weight).eps
+        scale = fns.maximum(fns.mean(fns.abs(weight), axis=0), eps)
+        return 1 / scale
+
+    def _get_awq_data(
+        self, graph: NNCFGraph, all_weight_params: List[WeightCompressionParameters], nodes_to_compress: List[NNCFNode]
+    ) -> Dict[str, AWQCompressionInfo]:
+        """
+        Finds AWQ patterns in the graph and returns them.
+        :param graph: Model graph.
+        :param all_weight_params: List of all weight parameters.
+        :param nodes_to_compress: List of nodes for processing.
+        :return: A dict with node names and matched AWQ patterns.
+ """ + matches = [] + inference_nncf_graph = transform_to_inference_graph(deepcopy(graph), [], [], [], []) + nx_graph = inference_nncf_graph.get_nx_graph_copy() + for pattern_graph in self._patterns.values(): + matches.extend(find_subgraphs_matching_pattern(nx_graph, pattern_graph(), strict=False)) + + if len(matches) == 0: + nncf_logger.info("No matching patterns were found for applying AWQ algorithm, it will be skipped.") + return {} + + awq_data = {} + name_mapping = {wp.weight_name: idx for idx, wp in enumerate(all_weight_params)} + + for match in matches: + nncf_node = graph.get_node_by_key(match[-1]) + if not self._backend_entity.is_node_with_weights(nncf_node, graph): + continue + + target_node_names = [] + for weight_op_friendly_name, _ in self._backend_entity.get_weight_names_and_port_ids(nncf_node, graph): + target_node_names.append(weight_op_friendly_name) + + # skip node if it is in IgnoredScope or should not be compressed + if target_node_names[-1] not in name_mapping: + continue + + weight_params = all_weight_params[name_mapping[target_node_names[-1]]] + + if weight_params.compression_config.num_bits != 4: + continue + target_node = nodes_to_compress[name_mapping[target_node_names[-1]]] + + # avoid matching different patterns for the same node + if target_node.node_name in awq_data: + continue + + nncf_node = graph.get_node_by_key(match[0]) + + if self._backend_entity.is_node_with_weights(nncf_node, graph): # pattern MatMul->Multiply->MatMul + merge_node_names = [] + for weight_op_friendly_name, _ in self._backend_entity.get_weight_names_and_port_ids(nncf_node, graph): + merge_node_names.append(weight_op_friendly_name) + merge_node = nodes_to_compress[name_mapping[merge_node_names[-1]]] + else: # pattern Act->MatMul or Act->Multiply->MatMul + merge_node = nncf_node + + awq_data[target_node.node_name] = AWQCompressionInfo(weight_params, target_node, merge_node) + return awq_data + def update_statistics(self, statistics): + if statistics is None: + return statistics + # Multiply activations by the computed scales for node_name, scale in self._scale_per_target_node.items(): for mean_stat in statistics[node_name].mean_values: diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py index 3eb38d79fb9..81794b16fcd 100644 --- a/nncf/quantization/quantize_model.py +++ b/nncf/quantization/quantize_model.py @@ -601,13 +601,12 @@ def compress_weights( elif backend == BackendType.OPENVINO: from nncf.openvino.quantization.quantize_model import compress_weights_impl as ov_compress_weights_impl - if any((awq, scale_estimation, gptq, lora_correction)) and ( - dataset is None or mode == CompressWeightsMode.E2M1 - ): - msg = ( - "Scale estimation, AWQ, GPTQ or Lora Correction algorithm is defined, " - "but dataset is None or mode is E2M1." - ) + if any((scale_estimation, gptq, lora_correction)) and dataset is None: + msg = "Scale estimation, GPTQ or Lora Correction algorithm is defined, but dataset is None." + raise nncf.ParameterNotSupportedError(msg) + + if any((awq, scale_estimation, gptq, lora_correction)) and mode == CompressWeightsMode.E2M1: + msg = "AWQ, Scale estimation, GPTQ or Lora Correction algorithm is defined, but mode is E2M1." 
             raise nncf.ParameterNotSupportedError(msg)

         if gptq and lora_correction:
diff --git a/tests/cross_fw/test_templates/template_test_weights_compression.py b/tests/cross_fw/test_templates/template_test_weights_compression.py
index 42852fcdb49..36565e918a7 100644
--- a/tests/cross_fw/test_templates/template_test_weights_compression.py
+++ b/tests/cross_fw/test_templates/template_test_weights_compression.py
@@ -23,6 +23,8 @@
 from nncf.data.dataset import Dataset
 from nncf.errors import InvalidGroupSizeError
 from nncf.quantization import compress_weights
+from nncf.quantization.advanced_parameters import AdvancedAWQParameters as AWQParams
+from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as CompressionParams
 from nncf.quantization.algorithms.weight_compression.awq import AWQ
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
 from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA
@@ -124,6 +126,13 @@ def get_not_supported_algorithms() -> List[str]:
         Returns a list of not supported weight compression algorithms.
         """

+    @staticmethod
+    @abstractmethod
+    def supports_data_free() -> bool:
+        """
+        Returns True if data-free compression is supported, False otherwise.
+        """
+
     @pytest.mark.parametrize(
         ("mode", "all_layers", "ratio", "ref_ids"),
         (
@@ -359,3 +368,38 @@ def test_error_message_for_invalid_group_size(self, algorithm):
             name_list = [name.strip('"') for name in names[0].split(",")]

             compress_weights(**kwargs, ignored_scope=IgnoredScope(names=name_list))
+
+    @pytest.mark.parametrize("dataset", [None, np.ones([1, 8, 8], dtype=np.float32)])
+    @pytest.mark.parametrize("prefer_data_aware", [True, False])
+    def test_data_free_awq(self, dataset, prefer_data_aware, mocker):
+        if dataset is None and not self.supports_data_free():
+            pytest.skip("Data-free AWQ is not supported by this backend")
+
+        n_layers = 8
+        n_awq_target = n_layers - 1  # first MatMul is always int8
+        model = self.get_awq_act_model(True, n_layers)
+
+        if dataset is not None:
+            dataset = Dataset([self.to_tensor(dataset)])
+
+        fn_name = "_data_free_step" if dataset is None or not prefer_data_aware else "_data_aware_step"
+
+        collect_spy = mocker.spy(AWQ, fn_name)
+
+        compressed_model = compress_weights(
+            model,
+            mode=CompressWeightsMode.INT4_ASYM,
+            ratio=1.0,
+            group_size=-1,
+            dataset=dataset,
+            awq=True,
+            advanced_parameters=CompressionParams(
+                awq_params=AWQParams(
+                    prefer_data_aware=prefer_data_aware,
+                )
+            ),
+        )
+
+        n_awq = self.get_num_multiply_from_awq(compressed_model)
+        assert n_awq == n_awq_target
+        assert collect_spy.call_count == n_awq, f"{fn_name} should be called {n_awq_target} times."
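Note: the data-free branch exercised by the template test above reduces to a per-input-channel scale computed from the weight alone, mirroring `_data_free_step` (1 / max(mean(|W|, axis=0), eps)). The following NumPy sketch is illustrative only (it is not the NNCF tensor API) and shows the identity the rescaling relies on: scaling the weight columns by the scale and the activations by its inverse (the Multiply inserted at the merge node) leaves the MatMul output unchanged, while the rescaled weight is easier to quantize.

import numpy as np

rng = np.random.default_rng(0)
W = rng.standard_normal((16, 8)).astype(np.float32)  # [out_channels, in_channels]
X = rng.standard_normal((8, 4)).astype(np.float32)   # [in_channels, samples]

# Data-free AWQ scale: one value per input channel, derived from the weight only.
eps = np.finfo(np.float32).eps
scale = 1.0 / np.maximum(np.mean(np.abs(W), axis=0), eps)

# Weight columns scaled by `scale`, activations by `1 / scale`: the product is preserved.
assert np.allclose((W * scale) @ (X / scale[:, None]), W @ X, atol=1e-5)
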
diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py
index b5b4d6fc5c9..cc8ee20cbc6 100644
--- a/tests/openvino/native/quantization/test_weights_compression.py
+++ b/tests/openvino/native/quantization/test_weights_compression.py
@@ -701,7 +701,6 @@ def test_raise_error_with_unsupported_params_for_e2m1(algo):
     "algo",
     (
         "lora_correction",
-        "awq",
         "scale_estimation",
         "gptq",
     ),
@@ -1568,6 +1567,10 @@ def check_weights(model: ov.Model, ref_ids: List[int]) -> None:
     def get_not_supported_algorithms() -> List[str]:
         return []

+    @staticmethod
+    def supports_data_free() -> bool:
+        return True
+
     @staticmethod
     def get_scale_estimation_ref():
         return np.array(
diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py
index a014d8e613c..d781b61e37c 100644
--- a/tests/torch/ptq/test_weights_compression.py
+++ b/tests/torch/ptq/test_weights_compression.py
@@ -491,6 +491,10 @@ def check_weights(model: torch.nn.Module, ref_ids: List[int]) -> None:
     def get_not_supported_algorithms() -> List[str]:
         return ["lora_correction", "gptq"]

+    @staticmethod
+    def supports_data_free() -> bool:
+        return False
+
     @staticmethod
     def get_scale_estimation_ref():
         return torch.tensor(
diff --git a/tests/torch2/function_hook/quantization/test_weights_compression.py b/tests/torch2/function_hook/quantization/test_weights_compression.py
index 5c983f44932..d7ae6eb6d05 100644
--- a/tests/torch2/function_hook/quantization/test_weights_compression.py
+++ b/tests/torch2/function_hook/quantization/test_weights_compression.py
@@ -411,6 +411,10 @@ def check_weights(model: torch.nn.Module, ref_ids: List[int]) -> None:
     def get_not_supported_algorithms() -> List[str]:
         return ["lora_correction", "gptq"]

+    @staticmethod
+    def supports_data_free() -> bool:
+        return False
+
     @staticmethod
     def get_scale_estimation_ref():
         return torch.tensor(
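For reference, the new flag is reachable from the public compress_weights API through the advanced parameters, exactly as the template test wires it. A minimal usage sketch under the assumption that the caller has already created a supported model object (model loading is omitted here):

import nncf
from nncf.quantization.advanced_parameters import AdvancedAWQParameters, AdvancedCompressionParameters

# With prefer_data_aware=False (or when no dataset is passed at all), AWQ falls back to the
# data-free scale computed from the weights instead of collecting activation statistics.
compressed_model = nncf.compress_weights(
    model,  # assumption: an ov.Model (or other supported model) prepared by the caller
    mode=nncf.CompressWeightsMode.INT4_ASYM,
    awq=True,
    advanced_parameters=AdvancedCompressionParameters(
        awq_params=AdvancedAWQParameters(prefer_data_aware=False),
    ),
)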