Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 6 additions & 13 deletions src/llmcompressor/entrypoints/model_free/lifecycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,15 @@
apply_calibration_status,
freeze_module_quantization,
initialize_observer,
update_weight_global_scale,
update_weight_zp_scale,
observe,
update_qparams,
)
from llmcompressor.observers.helpers import flatten_for_calibration

__all__ = [
"initialize_quantized_linear",
"validate_weight_for_quantization",
"calibrate_global_scale",
"calibrate_scale_zp",
"calibrate_weight",
]


Expand Down Expand Up @@ -49,15 +48,9 @@ def initialize_quantized_linear(
return module


def calibrate_global_scale(module: torch.nn.Linear):
def calibrate_weight(module: torch.nn.Linear):
initialize_observer(module, "weight")
apply_calibration_status(module)
update_weight_global_scale(module)
freeze_module_quantization(module)


def calibrate_scale_zp(module: torch.nn.Linear):
initialize_observer(module, "weight")
apply_calibration_status(module)
update_weight_zp_scale(module)
observe(module, base_name="weight")
update_qparams(module, base_name="weight")
freeze_module_quantization(module)
35 changes: 21 additions & 14 deletions src/llmcompressor/entrypoints/model_free/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,22 @@
from torch.nn import Module

from llmcompressor.entrypoints.model_free.lifecycle import (
calibrate_global_scale,
calibrate_scale_zp,
calibrate_weight,
initialize_quantized_linear,
validate_weight_for_quantization,
)
from llmcompressor.entrypoints.model_free.microscale import (
get_fused_names,
is_microscale_scheme,
)
from llmcompressor.modifiers.quantization.calibration import (
apply_calibration_status,
freeze_module_quantization,
initialize_observer,
observe,
update_qparams,
)
from llmcompressor.observers import Observer

__all__ = [
"validate_file",
Expand Down Expand Up @@ -99,7 +106,7 @@ def process_file(
module = initialize_quantized_linear(tensors[name], scheme, device)

# 2. calibrate weight qparams
calibrate_scale_zp(module)
calibrate_weight(module)

# 3. compress module using qparams
compress_module(module)
Expand Down Expand Up @@ -175,14 +182,16 @@ def process_file_microscale_scheme(
# 1. initialize module with qparams (on device)
module = initialize_quantized_linear(tensors[name], scheme, device)

# 2. calibrate global scale; delay scale/zp for fused modules
calibrate_global_scale(module)
# gather fused modules for later processing
if name in fused_name_to_fused_index:
fused_index = fused_name_to_fused_index[name]
fused_modules[fused_index][name] = module
initialize_observer(module, "weight")
apply_calibration_status(module)
continue

calibrate_scale_zp(module)
# 2. get module qparams
calibrate_weight(module)

# 3. compress module using qparams
compress_module(module)
Expand All @@ -195,22 +204,20 @@ def process_file_microscale_scheme(

# Compress fused modules with shared global scale
for named_modules in fused_modules.values():
# 2.1. compute fused global scale across all members of the fused set
global_scales = [m.weight_global_scale for m in named_modules.values()]
fused_global_scale = torch.min(torch.cat(global_scales, dim=0))
# 2. fuse observers, observe weights, and get qparams
Observer.fuse([mod.weight_observer for mod in named_modules.values()])
observe(named_modules.values(), base_name="weight")
update_qparams(named_modules.values(), base_name="weight")

for name, module in named_modules.items():
module_name, _ = name.rsplit(".", 1)
module.weight_global_scale.data.copy_(fused_global_scale)

# 2.2. finish calibration with fused global scale
calibrate_scale_zp(module)
freeze_module_quantization(module)
Comment on lines 206 to +213
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Fused observer cleanup — dangling references after freeze.

After freeze_module_quantization(module) at Line 213 deletes each module's weight_observer, the remaining fused observers in other modules still hold references to the deleted observer via their _fused_observers list (populated by Observer.fuse). In this code path, update_qparams on the fused set has already completed before any freeze, so no incorrect results will occur. However, if the logic is ever refactored to recompute qparams after partial freezing, the stale references could surface as subtle bugs (e.g., referencing a detached observer's statistics).

Consider either clearing _fused_observers in Observer.detach/freeze_module_quantization, or documenting that freezing must only happen after all fused peers have completed qparam computation.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/llmcompressor/entrypoints/model_free/process.py` around lines 206 - 213:
after freeze_module_quantization(module) deletes a module's weight_observer,
the other fused observers still hold stale references to it in their
_fused_observers lists (created by Observer.fuse), which can surface as subtle
bugs if qparams are ever recomputed. Fix this by making the cleanup mutual:
update freeze_module_quantization (or Observer.detach) to also drop the deleted
observer from every peer's _fused_observers list (or clear the peer lists once
the whole fused set is frozen). After freeze_module_quantization has been
called on the modules in fused_modules, no _fused_observers list should still
reference a deleted weight_observer instance, so that future calls to
update_qparams/observe cannot see stale observers.


# 3. compress module using microscale qparams
compress_module(module)

# 4. save compressed data (on cpu)
del tensors[name]
module_name, _ = name.rsplit(".", 1)
prefix = module_name + "."
for key, value in module.state_dict(prefix=prefix).items():
tensors[key] = value.to("cpu")
Expand Down
18 changes: 7 additions & 11 deletions src/llmcompressor/modifiers/gptq/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,8 @@
make_empty_hessian,
quantize_weight,
)
from llmcompressor.modifiers.quantization.calibration import update_weight_global_scale
from llmcompressor.modifiers.quantization.calibration import observe
from llmcompressor.modifiers.quantization.quantization import QuantizationMixin
from llmcompressor.modifiers.utils import update_fused_layer_weight_global_scales
from llmcompressor.sentinel import Sentinel
from llmcompressor.utils.metric_logging import CompressionLogger

Expand Down Expand Up @@ -202,13 +201,6 @@ def on_start(self, state: State, event: Event, **kwargs):
self.register_hook(module, self.calibrate_module, "forward")
added_hook = True

# Optionally generate global scales if using TENSOR_GROUP quantization
for _, module in named_modules:
update_weight_global_scale(module)

for module in state.model.modules():
update_fused_layer_weight_global_scales(module)

if not added_hook:
raise ValueError(
"GPTQModifier requires a weight quantization config be specified by "
Expand All @@ -221,11 +213,15 @@ def on_event(self, state: State, event: Event, **kwargs):
self.on_start(state, None)

if event.type_ == EventType.SEQUENTIAL_EPOCH_END:
QuantizationMixin.sync_activation_observers(self, state.model)
self.sync_obs_act_stats(state.model)
self.update_activation_qparams(state.model)
observe(self._num_samples.keys(), base_name="weight")
self.compress_modules()

if event.type_ == EventType.CALIBRATION_EPOCH_END:
QuantizationMixin.sync_activation_observers(self, state.model)
self.sync_obs_act_stats(state.model)
self.update_activation_qparams(state.model)
observe(self._num_samples.keys(), base_name="weight")
self.compress_modules()

if not self.ended_:
Expand Down
26 changes: 9 additions & 17 deletions src/llmcompressor/modifiers/gptq/gptq_quantize.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from loguru import logger

from llmcompressor.modifiers.utils import SPARSITY_THRESHOLD
from llmcompressor.observers.base import Observer
from llmcompressor.pytorch.utils.helpers import tensor_sparsity

GPTQ_PRECISION = torch.float32
Expand Down Expand Up @@ -85,31 +84,17 @@ def quantize_weight(
"""
strategy = quant_args.strategy
actorder = quant_args.actorder
global_scale = getattr(module, "weight_global_scale", None)
final_shape = module.weight.shape
final_dtype = module.weight.dtype
W = module.weight.clone()
H = hessian

# create observer for calculating quantization parameters
observer = Observer.load_from_registry(
quant_args.observer if quant_args.observer else "memoryless_minmax",
base_name="weight",
args=quant_args,
module=module,
)
observer = module.weight_observer

# standardize shape and dtype
match module:
case torch.nn.Conv2d():
W = W.flatten(1)
case transformers.Conv1D():
W.transpose_(0, 1)
W = W.to(dtype=GPTQ_PRECISION)
num_rows = W.shape[0]
Comment on lines 86 to 95
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Confirm whether Conv1D / Conv2d are still routed through quantize_weight
rg -nP -C3 '\b(Conv1D|Conv2d)\b' src/llmcompressor/modifiers/gptq/
rg -nP -C3 'quantize_weight\s*\(' --type=py
# Look for any guards that exclude Conv layers before calling quantize_weight
rg -nP -C3 'isinstance\([^)]+,\s*(transformers\.)?Conv1D' --type=py

Repository: vllm-project/llm-compressor

Length of output: 8173


🏁 Script executed:

# Read the quantize_weight function from start to verify shape handling
sed -n '68,160p' src/llmcompressor/modifiers/gptq/gptq_quantize.py | cat -n

Repository: vllm-project/llm-compressor

Length of output: 4064


🏁 Script executed:

sed -n '38,66p' src/llmcompressor/modifiers/gptq/gptq_quantize.py | cat -n

Repository: vllm-project/llm-compressor

Length of output: 1055


🏁 Script executed:

# Check sparsegpt_sparsify.py to see how it handles shape standardization
sed -n '80,110p' src/llmcompressor/modifiers/pruning/sparsegpt/sgpt_sparsify.py | cat -n

Repository: vllm-project/llm-compressor

Length of output: 1372


🏁 Script executed:

# Check if there are any guards/filters that prevent Conv2d/Conv1D from reaching quantize_weight
rg -B5 'quantize_weight\(' src/llmcompressor/modifiers/gptq/base.py | head -40

Repository: vllm-project/llm-compressor

Length of output: 328


🏁 Script executed:

# Get more context around the quantize_weight call in base.py
sed -n '290,320p' src/llmcompressor/modifiers/gptq/base.py | cat -n

Repository: vllm-project/llm-compressor

Length of output: 1592


🏁 Script executed:

# Check the logic for which modules get quantized - look for module type checks
rg -B10 'compress_module_list\(' src/llmcompressor/modifiers/gptq/base.py | head -60

Repository: vllm-project/llm-compressor

Length of output: 1011


🏁 Script executed:

# Find where accumulate_hessian is called to see module filtering
rg -B5 'accumulate_hessian\(' src/llmcompressor/modifiers/gptq/ | head -80

Repository: vllm-project/llm-compressor

Length of output: 1103


🏁 Script executed:

# Check weight_observer implementation to verify global_scale key
rg -A10 'class.*Observer' src/llmcompressor/modifiers/gptq/ | head -80

Repository: vllm-project/llm-compressor

Length of output: 53


🏁 Script executed:

# Search for weight_observer setup and get_qparams implementation
rg 'weight_observer' src/llmcompressor/modifiers/gptq/ -A3 | head -50

Repository: vllm-project/llm-compressor

Length of output: 371


🏁 Script executed:

# Find where weight_observer is created
rg 'weight_observer\s*=' src/llmcompressor/ -B3 -A3 | head -100

Repository: vllm-project/llm-compressor

Length of output: 53


🏁 Script executed:

# Search for observer setup in the quantization scheme
rg 'Observer' src/llmcompressor/modifiers/gptq/ --type=py -l

Repository: vllm-project/llm-compressor

Length of output: 53


🏁 Script executed:

# Find where get_qparams is defined
rg 'def get_qparams' src/llmcompressor/ --type=py -B2 -A10 | head -100

Repository: vllm-project/llm-compressor

Length of output: 916


🏁 Script executed:

# Find compute_qparams_from_statistics to verify it returns global_scale
rg 'def compute_qparams_from_statistics' src/llmcompressor/ --type=py -A30 | head -100

Repository: vllm-project/llm-compressor

Length of output: 2580


🏁 Script executed:

# Check test to see if Conv2d is tested with GPTQ
cat -n src/llmcompressor/modifiers/gptq/test_gptq_quantize.py | head -100

Repository: vllm-project/llm-compressor

Length of output: 158


🏁 Script executed:

# Find test files for GPTQ
find . -name '*test*gptq*' -o -name '*gptq*test*' 2>/dev/null | head -20

Repository: vllm-project/llm-compressor

Length of output: 191


🏁 Script executed:

# Check test_gptq_quantize.py to see what module types are tested
cat -n tests/llmcompressor/modifiers/gptq/test_gptq_quantize.py | head -80

Repository: vllm-project/llm-compressor

Length of output: 1422


Restore upfront shape standardization for Conv1D/Conv2d weights.

This change removes the upfront Conv2d.flatten(1) and Conv1D.transpose_(0, 1) operations that convert weights to 2-D before quantization, but still applies the inverse Conv1D.transpose_(0, 1) at the end (line 268). This causes shape mismatches in the quantization loop:

  • Conv1D: Weight remains (in, out) so W.shape[1] = out, but H.shape[0] = in from accumulate_hessian. Column indexing in the GPTQ loop will be misaligned.
  • Conv2d: Weight is 4-D (out, in, kH, kW), but line 101 executes W[:, dead] = 0 expecting 2-D, and num_columns = W.shape[1] gives incorrect results.

Compare with sparsegpt_sparsify.py (lines 89-93), which preserves this standardization:

if isinstance(module, torch.nn.Conv2d):
    W = W.flatten(1)
elif isinstance(module, transformers.Conv1D):
    W.transpose_(0, 1)

Add equivalent guards at the start of quantize_weight before line 88 where num_rows and num_columns are calculated, with matching inverse operations before reshaping at line 269.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/llmcompressor/modifiers/gptq/gptq_quantize.py` around lines 86 - 95:
the quantize_weight routine is missing upfront shape standardization for
torch.nn.Conv2d and transformers.Conv1D, which causes mismatched indexing.
Modify quantize_weight so that, before W is converted to GPTQ_PRECISION and
num_rows/num_columns are computed, it calls W = W.flatten(1) when module is
torch.nn.Conv2d and W.transpose_(0, 1) when module is transformers.Conv1D.
Then apply the inverse transforms (transpose back for Conv1D, reshape back to
the original 4-D shape for Conv2d) just before the final reshape/assignment
step, around where the current inverse Conv1D.transpose_(0, 1) is applied.
All column/row operations in the GPTQ loop then operate on a 2-D (out x in)
matrix.

num_columns = W.shape[1]

scale, zero_point = observer(W)
# handle g_idx and activation ordering
if strategy in (QuantizationStrategy.GROUP, QuantizationStrategy.TENSOR_GROUP):
# mapping from column index to group index
Expand All @@ -121,14 +106,21 @@ def quantize_weight(
if actorder == ActivationOrdering.GROUP:
W, H, perm = _apply_activation_ordering(W, H)
# actually need scale/zp for permuted weight for this format
scale, zero_point = observer(W)
observer(W)
# use identity g_idx (invert permutation later)

elif actorder == ActivationOrdering.WEIGHT:
# permute weights and g_idx
W, H, perm = _apply_activation_ordering(W, H)
g_idx = g_idx[perm]

qparams = observer.get_qparams()
scale, zero_point, global_scale = (
qparams["scale"],
qparams["zero_point"],
qparams["global_scale"],
)

# sparsity mask
sparsity = tensor_sparsity(W)
preserve_zeros = sparsity >= SPARSITY_THRESHOLD
Expand Down
Loading
Loading