Aanuf/data free awq #3315

Open
wants to merge 30 commits into base: develop
Commits
30 commits
488cacc
Support scale estimation inside GPTQ
alexsu52 Jun 10, 2024
ee64877
fix for INT4_ASYM
alexsu52 Sep 4, 2024
f22e411
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Sep 23, 2024
51b4d7b
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Sep 26, 2024
f66cd1e
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Sep 30, 2024
7ce5a53
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Oct 2, 2024
f74d156
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Nov 11, 2024
5288c79
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Nov 11, 2024
1becf15
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Nov 14, 2024
047d7d9
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Dec 10, 2024
c0c7e57
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Dec 16, 2024
b74dea1
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Dec 27, 2024
26a9a77
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Jan 7, 2025
25fcc2c
Merge remote-tracking branch 'upstream/develop' into develop
andreyanufr Feb 25, 2025
f6f4693
Data-free AWQ prototype.
andreyanufr Feb 25, 2025
19a64ac
Data free AWQ.
andreyanufr Feb 26, 2025
bf215d5
Fixed style.
andreyanufr Feb 26, 2025
566ebe7
Fixed shape of data item int test.
andreyanufr Feb 27, 2025
70e47c8
Fixed test case for E2M1.
andreyanufr Feb 27, 2025
6b3310b
Enable awq by default.
andreyanufr Mar 4, 2025
c13437f
Resolved merge conflict.
andreyanufr Apr 9, 2025
519727f
Return AWQ default value.
andreyanufr Apr 9, 2025
10920a2
Fixed debug code.
andreyanufr Apr 9, 2025
35f6a64
Added parameter to define usage of data-free AWQ.
andreyanufr Apr 16, 2025
83c7867
Added test for data-free AWQ.
andreyanufr Apr 16, 2025
a17b896
Fixed merge conflict.
andreyanufr Apr 16, 2025
516fec7
Changed type hint.
andreyanufr Apr 17, 2025
48e7f1c
Added data-free AWQ test for the PyTorch case.
andreyanufr Apr 17, 2025
57c3e4a
Applied suggestion.
andreyanufr Apr 17, 2025
cd9e4c1
Gave the parameter a clearer name.
andreyanufr Apr 17, 2025
3 changes: 3 additions & 0 deletions nncf/quantization/advanced_parameters.py
@@ -276,13 +276,16 @@ class AdvancedAWQParameters:
:type alpha_max: float
:param steps: The number of the steps in grid search.
:type steps: int
:param prefer_data_aware: Determines whether to use activations to calculate scales if activations are presented.
:type prefer_data_aware: bool
"""

subset_size: int = 32
percent_to_apply: float = 0.002
alpha_min: float = 0.0
alpha_max: float = 1.0
steps: int = 100
prefer_data_aware: bool = True
Comment on lines +279 to +288
Contributor

Suggested change
:param prefer_data_aware: Determines whether to use activations to calculate scales if activations are presented.
:type prefer_data_aware: bool
"""
subset_size: int = 32
percent_to_apply: float = 0.002
alpha_min: float = 0.0
alpha_max: float = 1.0
steps: int = 100
prefer_data_aware: bool = True
:param use_data_aware_scaling: Whether to use activation data for scale calculation when available.
:type use_data_aware_scaling: bool
"""
subset_size: int = 32
percent_to_apply: float = 0.002
alpha_min: float = 0.0
alpha_max: float = 1.0
steps: int = 100
use_data_aware_scaling: bool = True

What do you think about this?



@api()
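
For reference, a minimal usage sketch assuming this PR: the new prefer_data_aware flag is passed via AdvancedCompressionParameters.awq_params, and setting it to False requests the data-free scale computation so no calibration dataset is needed. The import path mirrors the file changed above, and the model argument is a placeholder for an OpenVINO or PyTorch model:

import nncf
from nncf.quantization.advanced_parameters import AdvancedAWQParameters, AdvancedCompressionParameters

def compress_int4_awq_data_free(model):
    # AWQ without activations: scales are derived from the weights themselves.
    return nncf.compress_weights(
        model,
        mode=nncf.CompressWeightsMode.INT4_ASYM,
        awq=True,
        advanced_parameters=AdvancedCompressionParameters(
            awq_params=AdvancedAWQParameters(prefer_data_aware=False),
        ),
    )

When prefer_data_aware is left at its default (True) and a dataset is passed, AWQ keeps the existing data-aware grid search.
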
10 changes: 8 additions & 2 deletions nncf/quantization/algorithms/weight_compression/algorithm.py
@@ -302,6 +302,7 @@ def __init__(
awq_params.alpha_min,
awq_params.alpha_max,
awq_params.steps,
awq_params.prefer_data_aware,
)
if self._gptq:
gptq_params = self._advanced_parameters.gptq_params
@@ -323,7 +324,12 @@ def __init__(
self._data_aware_mixed_precision = (
self._sensitivity_metric != SensitivityMetric.WEIGHT_QUANTIZATION_ERROR and self._ratio != 1.0
)
self._data_aware_compression = self._awq or self._scale_estimation or self._lora_correction or self._gptq
self._data_aware_compression = (
(self._awq and self._advanced_parameters.awq_params.prefer_data_aware)
or self._scale_estimation
or self._lora_correction
or self._gptq
)

@property
def available_backends(self) -> List[BackendType]:
@@ -542,7 +548,7 @@ def apply(
nodes_to_compress = self.get_nodes_to_compress(graph)

statistics = None
if self._data_aware_mixed_precision or self._data_aware_compression:
if (self._data_aware_mixed_precision or self._data_aware_compression) and dataset:
Collaborator

Could you please redefine self._data_aware_compression as follows:

self._data_aware_compression = (self._awq and self._advanced_parameters.awq_params.is_data_aware) or self._scale_estimation or self._lora_correction or self._gptq

Then we can roll back this if statement to its original form.

Suggested change
if (self._data_aware_mixed_precision or self._data_aware_compression) and dataset:
if self._data_aware_mixed_precision or self._data_aware_compression:

Collaborator Author

Done

Collaborator

I have to change my original suggestion here 🙂 After the recent changes (is_data_aware -> prefer_data_aware) I think it actually makes more sense to define

self._data_aware_compression = self._scale_estimation or self._lora_correction or self._gptq

Because otherwise self._data_aware_compression can be True even though data-aware compression is never actually applied: this happens when self._awq is True but no dataset is provided.

And then we can do:

Suggested change
if (self._data_aware_mixed_precision or self._data_aware_compression) and dataset:
data_aware_awq = dataset and self._awq and self._advanced_parameters.awq_params.prefer_data_aware
if self._data_aware_mixed_precision or self._data_aware_compression or data_aware_awq:

matmul_nodes_to_compress = [
node for node in nodes_to_compress if node.metatype in self._backend_entity.matmul_metatypes
]
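
To summarize the review thread above, here is a short sketch of the statistics-collection decision in the form the reviewer proposes. The attribute names mirror the diff, but this is an illustrative restatement rather than the code in the PR:

def should_collect_statistics(
    dataset,
    awq: bool,
    prefer_data_aware: bool,
    scale_estimation: bool,
    lora_correction: bool,
    gptq: bool,
    data_aware_mixed_precision: bool,
) -> bool:
    # These algorithms always consume activation statistics.
    data_aware_compression = scale_estimation or lora_correction or gptq
    # AWQ consumes statistics only when a dataset is given and data-aware scales are preferred.
    data_aware_awq = dataset is not None and awq and prefer_data_aware
    return data_aware_mixed_precision or data_aware_compression or data_aware_awq

With this split, the explicit "and dataset" guard disappears from the if statement in apply(), since the dataset check is folded into data_aware_awq.
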
287 changes: 158 additions & 129 deletions nncf/quantization/algorithms/weight_compression/awq.py
@@ -66,20 +66,23 @@ def __init__(
alpha_min: float = 0.0,
alpha_max: float = 1.0,
steps: int = 100,
prefer_data_aware: bool = True,
):
"""
:param subset_size: The number of samples for AWQ.
:param percent_to_apply: The percent of outliers for correction.
:param alpha_min: Minimum value of smoothness parameter for grid search.
:param alpha_max: Maximal value of smoothness parameter for grid search.
:param steps: The number of the steps in grid search.
:param prefer_data_aware: Determines whether to use activations to calculate scales.
"""
super().__init__()
self._subset_size = subset_size
self._percent_to_apply = percent_to_apply
self._alpha_min = alpha_min
self._alpha_max = alpha_max
self._steps = steps
self._prefer_data_aware = prefer_data_aware
self._backend_entity = None
self._patterns = None
self._scale_per_target_node = {}
@@ -118,7 +121,7 @@ def apply(
graph: NNCFGraph,
all_weight_params: List[WeightCompressionParameters],
nodes_to_compress: List[NNCFNode],
statistics: Dict[str, WCTensorStatistic],
statistics: Optional[Dict[str, WCTensorStatistic]] = None,
wc_backend_entity: Optional[WeightCompressionAlgoBackend] = None,
) -> TModel:
"""
@@ -132,156 +135,41 @@ def apply(
:return: A resulting model.
"""
self._set_backend_entity(model, wc_backend_entity)
matches = []

inference_nncf_graph = transform_to_inference_graph(deepcopy(graph), [], [], [], [])
nx_graph = inference_nncf_graph.get_nx_graph_copy()
for pattern_graph in self._patterns.values():
matches.extend(find_subgraphs_matching_pattern(nx_graph, pattern_graph(), strict=False))

if len(matches) == 0:
nncf_logger.info("No matching patterns were found for applying AWQ algorithm, it will be skipped.")
awq_data = self._get_awq_data(graph, all_weight_params, nodes_to_compress)
if len(awq_data) == 0:
return model

transformation_layout = TransformationLayout()
model_transformer = ModelTransformerFactory.create(model, inplace=True)

awq_data = {}
name_mapping = {wp.weight_name: idx for idx, wp in enumerate(all_weight_params)}

for match in matches:
nncf_node = graph.get_node_by_key(match[-1])
if not self._backend_entity.is_node_with_weights(nncf_node, graph):
continue

target_node_names = []
for weight_op_friendly_name, _ in self._backend_entity.get_weight_names_and_port_ids(nncf_node, graph):
target_node_names.append(weight_op_friendly_name)

# skip node if it is in IgnoredScope or should not be compressed
if target_node_names[-1] not in name_mapping:
continue

weight_params = all_weight_params[name_mapping[target_node_names[-1]]]
is_data_free = statistics is None or not self._prefer_data_aware

if weight_params.compression_config.num_bits != 4:
continue
target_node = nodes_to_compress[name_mapping[target_node_names[-1]]]
description = "Applying data-free AWQ" if is_data_free else "Applying data-aware AWQ"

# avoid matching different patterns for the same node
if target_node.node_name in awq_data:
continue

nncf_node = graph.get_node_by_key(match[0])

if self._backend_entity.is_node_with_weights(nncf_node, graph): # pattern MatMul->Multiply->MatMul
merge_node_names = []
for weight_op_friendly_name, _ in self._backend_entity.get_weight_names_and_port_ids(nncf_node, graph):
merge_node_names.append(weight_op_friendly_name)
merge_node = nodes_to_compress[name_mapping[merge_node_names[-1]]]
else: # pattern Act->MatMul or Act->Multiply->MatMul
merge_node = nncf_node

awq_data[target_node.node_name] = AWQCompressionInfo(weight_params, target_node, merge_node)

alpha_step = (self._alpha_max - self._alpha_min) / self._steps

for k, awq_data_item in track(awq_data.items(), description="Applying AWQ"):
for k, awq_data_item in track(awq_data.items(), description=description):
wp = awq_data_item.weight_params
target_node = awq_data_item.target_node
merge_node = awq_data_item.merge_node
weight_data = self._backend_entity.get_weight_names_and_port_ids(wp.node_with_weight, graph)
if len(weight_data) != 1: # not supported by the algorithm
continue

nncf_logger.debug(f"Apply AWQ for: {wp.node_with_weight.node_name}")
nncf_logger.debug(f"{description} for: {wp.node_with_weight.node_name}")

_, weight_port_id = weight_data[0]

config = wp.compression_config

s, X = process_stats(statistics[k], self._subset_size)
s = s.astype(TensorDataType.float32)
X = X.astype(TensorDataType.float32)

top_k = max(int(s.shape[0] * self._percent_to_apply), 1)
topk_idxs = fns.argsort(-s)[:top_k]

group_size = config.group_size
if group_size == -1:
group_size = s.shape[0]

groups_to_correct = set()
for idx in topk_idxs:
groups_to_correct.add(idx.data // group_size)

groups_to_correct = list(groups_to_correct)

weight = self._backend_entity.get_weight(
wp.node_with_weight, weight_port_id, model, graph
) # get_const_value(wp.weight_node)
weight_dtype = weight.dtype
weight = weight.astype(TensorDataType.float32)
assert isinstance(wp.reduction_axes, tuple) and len(wp.reduction_axes) == 1
reduction_axis = wp.reduction_axes[0]

if reduction_axis == 0:
weight = fns.transpose(weight)
reduction_axis = 1

shape_vector = fns.mean(X, axis=1)
scale = fns.ones_like(shape_vector)

awq_config = deepcopy(config)
awq_config.group_size = -1

for gi in groups_to_correct:
offset = gi * group_size
gscale = s[offset : offset + group_size]

a_min = fns.astype(fns.quantile(gscale, 0.1), TensorDataType.float32)
a_max = 1e2
gscale = fns.clip(gscale, a_min=a_min, a_max=a_max)

gweight = weight[:, offset : offset + group_size]
gacts = X[offset : offset + group_size, :]

fp32_out = fns.matmul(gweight, gacts)
min_diff = fns.max(fns.abs(fp32_out))
best_scale = None

alpha = self._alpha_min
for _ in range(self._steps):
cur_scale = gscale**alpha
weights_to_fake_quantize = gweight * cur_scale
if config.mode == CompressWeightsMode.NF4:
g_c_scale = calculate_nf4_scale(weights_to_fake_quantize, reduction_axis)
g_compressed_weighs = do_nf4_quantization(weights_to_fake_quantize, g_c_scale)
g_decompressed_weighs = do_nf4_dequantization(g_compressed_weighs, g_c_scale)
else:
g_decompressed_weighs = quantize_dequantize_weight(
weights_to_fake_quantize, awq_config, reduction_axis
)
sacts = gacts / fns.unsqueeze(cur_scale, 1)

cur_out = fns.matmul(g_decompressed_weighs, sacts)
cur_diff = fns.mean(fns.abs(cur_out - fp32_out))
if cur_diff < min_diff:
min_diff = cur_diff
best_scale = cur_scale
alpha += alpha_step

if best_scale is not None:
scale.data[offset : offset + group_size] = best_scale.data

a_scale = scale
w_scale = scale
if wp.reduction_axes[0] == 0:
w_scale = fns.unsqueeze(w_scale, 1)
a_scale = fns.unsqueeze(1.0 / a_scale, 0)

if is_data_free:
scale = self._data_free_step(weight)
else:
w_scale = fns.unsqueeze(w_scale, 0)
a_scale = fns.unsqueeze(1.0 / a_scale, 1)
scale = self._data_aware_step(wp, weight, statistics[k])

w_scale = fns.unsqueeze(scale, 1 - wp.reduction_axes[0])
a_scale = fns.unsqueeze(1.0 / scale, wp.reduction_axes[0])

scaled_weight = (weight * w_scale).astype(weight_dtype)
self._backend_entity.set_weight(wp.node_with_weight, weight_port_id, model, graph, scaled_weight)
@@ -309,7 +197,148 @@ def apply(

return transformed_model

def _data_aware_step(self, wp, weight, statistics):
alpha_step = (self._alpha_max - self._alpha_min) / self._steps
config = wp.compression_config
s, X = process_stats(statistics, self._subset_size)
s = s.astype(TensorDataType.float32)
X = X.astype(TensorDataType.float32)

top_k = max(int(s.shape[0] * self._percent_to_apply), 1)
topk_idxs = fns.argsort(-s)[:top_k]

group_size = config.group_size
if group_size == -1:
group_size = s.shape[0]

groups_to_correct = set()
for idx in topk_idxs:
groups_to_correct.add(idx.data // group_size)

groups_to_correct = list(groups_to_correct)

assert isinstance(wp.reduction_axes, tuple) and len(wp.reduction_axes) == 1
reduction_axis = wp.reduction_axes[0]

if reduction_axis == 0:
weight = fns.transpose(weight)
reduction_axis = 1

shape_vector = fns.mean(X, axis=1)
scale = fns.ones_like(shape_vector)

awq_config = deepcopy(config)
awq_config.group_size = -1

for gi in groups_to_correct:
offset = gi * group_size
gscale = s[offset : offset + group_size]

a_min = fns.astype(fns.quantile(gscale, 0.1), TensorDataType.float32)
a_max = 1e2
gscale = fns.clip(gscale, a_min=a_min, a_max=a_max)

gweight = weight[:, offset : offset + group_size]
gacts = X[offset : offset + group_size, :]

fp32_out = fns.matmul(gweight, gacts)
min_diff = fns.max(fns.abs(fp32_out))
best_scale = None

alpha = self._alpha_min
for _ in range(self._steps):
cur_scale = gscale**alpha
weights_to_fake_quantize = gweight * cur_scale
if config.mode == CompressWeightsMode.NF4:
g_c_scale = calculate_nf4_scale(weights_to_fake_quantize, reduction_axis)
g_compressed_weighs = do_nf4_quantization(weights_to_fake_quantize, g_c_scale)
g_decompressed_weighs = do_nf4_dequantization(g_compressed_weighs, g_c_scale)
else:
g_decompressed_weighs = quantize_dequantize_weight(
weights_to_fake_quantize, awq_config, reduction_axis
)
sacts = gacts / fns.unsqueeze(cur_scale, 1)

cur_out = fns.matmul(g_decompressed_weighs, sacts)
cur_diff = fns.mean(fns.abs(cur_out - fp32_out))
if cur_diff < min_diff:
min_diff = cur_diff
best_scale = cur_scale
alpha += alpha_step

if best_scale is not None:
scale.data[offset : offset + group_size] = best_scale.data

return scale
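
# In effect, the grid search above scales each selected channel group's weights up by
# s**alpha and the matching activations down by the same factor, keeping the float
# product W @ X unchanged, and keeps the alpha in [alpha_min, alpha_max] that minimizes
# mean(|dequant(quant(W * s**alpha)) @ (X / s**alpha) - W @ X|) as that group's AWQ scale.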

def _data_free_step(self, weight):
eps = fns.finfo(weight).eps
scale = fns.maximum(fns.mean(fns.abs(weight), axis=0), eps)
return 1 / scale
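
# The data-free scale normalizes each input channel of the weight by its mean absolute
# value; apply() then multiplies the weight by this scale and folds its inverse into the
# preceding activations, so the MatMul output is unchanged in float. A minimal numpy
# sketch of the same computation, assuming the weight is laid out as
# [out_features, in_features]:
#
#     import numpy as np
#     w = np.random.randn(8, 16).astype(np.float32)
#     s = np.maximum(np.abs(w).mean(axis=0), np.finfo(np.float32).eps)
#     awq_scale = 1.0 / s  # matches the value returned by _data_free_step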

def _get_awq_data(
self, graph: NNCFGraph, all_weight_params: List[WeightCompressionParameters], nodes_to_compress: List[NNCFNode]
) -> Dict[str, AWQCompressionInfo]:
"""
Finds awq patterns in graph and returns it.
:param graph: Model graph.
:param all_weight_params: List of all weight parameters.
:param nodes_to_compress: List of nodes for processing.
:return: A dict with node names and matched AWQ patterns.
"""
matches = []
inference_nncf_graph = transform_to_inference_graph(deepcopy(graph), [], [], [], [])
nx_graph = inference_nncf_graph.get_nx_graph_copy()
for pattern_graph in self._patterns.values():
matches.extend(find_subgraphs_matching_pattern(nx_graph, pattern_graph(), strict=False))

if len(matches) == 0:
nncf_logger.info("No matching patterns were found for applying AWQ algorithm, it will be skipped.")
return {}

awq_data = {}
name_mapping = {wp.weight_name: idx for idx, wp in enumerate(all_weight_params)}

for match in matches:
nncf_node = graph.get_node_by_key(match[-1])
if not self._backend_entity.is_node_with_weights(nncf_node, graph):
continue

target_node_names = []
for weight_op_friendly_name, _ in self._backend_entity.get_weight_names_and_port_ids(nncf_node, graph):
target_node_names.append(weight_op_friendly_name)

# skip node if it is in IgnoredScope or should not be compressed
if target_node_names[-1] not in name_mapping:
continue

weight_params = all_weight_params[name_mapping[target_node_names[-1]]]

if weight_params.compression_config.num_bits != 4:
continue
target_node = nodes_to_compress[name_mapping[target_node_names[-1]]]

# avoid matching different patterns for the same node
if target_node.node_name in awq_data:
continue

nncf_node = graph.get_node_by_key(match[0])

if self._backend_entity.is_node_with_weights(nncf_node, graph): # pattern MatMul->Multiply->MatMul
merge_node_names = []
for weight_op_friendly_name, _ in self._backend_entity.get_weight_names_and_port_ids(nncf_node, graph):
merge_node_names.append(weight_op_friendly_name)
merge_node = nodes_to_compress[name_mapping[merge_node_names[-1]]]
else: # pattern Act->MatMul or Act->Multiply->MatMul
merge_node = nncf_node

awq_data[target_node.node_name] = AWQCompressionInfo(weight_params, target_node, merge_node)
return awq_data

def update_statistics(self, statistics):
if statistics is None:
return statistics

# Multiply activations by the computed scales
for node_name, scale in self._scale_per_target_node.items():
for mean_stat in statistics[node_name].mean_values: