vllm-project · HDCharles · Apr 25, 2026 · Apr 25, 2026 · coderabbitai · Apr 25, 2026
@@ -8,16 +8,15 @@
     apply_calibration_status,
     freeze_module_quantization,
     initialize_observer,
-    update_weight_global_scale,
-    update_weight_zp_scale,
+    observe,
+    update_qparams,
 )
 from llmcompressor.observers.helpers import flatten_for_calibration
 
 __all__ = [
     "initialize_quantized_linear",
     "validate_weight_for_quantization",
-    "calibrate_global_scale",
-    "calibrate_scale_zp",
+    "calibrate_weight",
 ]
 
 
@@ -49,15 +48,9 @@ def initialize_quantized_linear(
     return module
 
 
-def calibrate_global_scale(module: torch.nn.Linear):
+def calibrate_weight(module: torch.nn.Linear):
     initialize_observer(module, "weight")
     apply_calibration_status(module)
-    update_weight_global_scale(module)
-    freeze_module_quantization(module)
-
-
-def calibrate_scale_zp(module: torch.nn.Linear):
-    initialize_observer(module, "weight")
-    apply_calibration_status(module)
-    update_weight_zp_scale(module)
+    observe(module, base_name="weight")
+    update_qparams(module, base_name="weight")
     freeze_module_quantization(module)
@@ -16,15 +16,22 @@
 from torch.nn import Module
 
 from llmcompressor.entrypoints.model_free.lifecycle import (
-    calibrate_global_scale,
-    calibrate_scale_zp,
+    calibrate_weight,
     initialize_quantized_linear,
     validate_weight_for_quantization,
 )
 from llmcompressor.entrypoints.model_free.microscale import (
     get_fused_names,
     is_microscale_scheme,
 )
+from llmcompressor.modifiers.quantization.calibration import (
+    apply_calibration_status,
+    freeze_module_quantization,
+    initialize_observer,
+    observe,
+    update_qparams,
+)
+from llmcompressor.observers import Observer
 
 __all__ = [
     "validate_file",
@@ -99,7 +106,7 @@ def process_file(
         module = initialize_quantized_linear(tensors[name], scheme, device)
 
         # 2. calibrate weight qparams
-        calibrate_scale_zp(module)
+        calibrate_weight(module)
 
         # 3. compress module using qparams
         compress_module(module)
@@ -175,14 +182,16 @@ def process_file_microscale_scheme(
         # 1. initialize module with qparams (on device)
         module = initialize_quantized_linear(tensors[name], scheme, device)
 
-        # 2. calibrate global scale; delay scale/zp for fused modules
-        calibrate_global_scale(module)
+        # defer fused modules: set up observer now, calibrate after fusing
         if name in fused_name_to_fused_index:
             fused_index = fused_name_to_fused_index[name]
             fused_modules[fused_index][name] = module
+            initialize_observer(module, "weight")
+            apply_calibration_status(module)
             continue
 
-        calibrate_scale_zp(module)
+        # 2. calibrate weight qparams
+        calibrate_weight(module)
 
         # 3. compress module using qparams
         compress_module(module)
@@ -195,22 +204,20 @@ def process_file_microscale_scheme(
 
     # Compress fused modules with shared global scale
     for named_modules in fused_modules.values():
-        # 2.1. compute fused global scale across all members of the fused set
-        global_scales = [m.weight_global_scale for m in named_modules.values()]
-        fused_global_scale = torch.min(torch.cat(global_scales, dim=0))
+        # 2. fuse observers, observe weights, and get qparams
+        Observer.fuse([mod.weight_observer for mod in named_modules.values()])
+        observe(named_modules.values(), base_name="weight")
+        update_qparams(named_modules.values(), base_name="weight")
 
         for name, module in named_modules.items():
-            module_name, _ = name.rsplit(".", 1)
-            module.weight_global_scale.data.copy_(fused_global_scale)
-
-            # 2.2. finish calibration with fused global scale
-            calibrate_scale_zp(module)
+            freeze_module_quantization(module)
 
             # 3. compress module using microscale qparams
             compress_module(module)
 
             # 4. save compressed data (on cpu)
             del tensors[name]
+            module_name, _ = name.rsplit(".", 1)
             prefix = module_name + "."
             for key, value in module.state_dict(prefix=prefix).items():
                 tensors[key] = value.to("cpu")

diff --git a/src/llmcompressor/modifiers/gptq/base.py b/src/llmcompressor/modifiers/gptq/base.py
@@ -28,9 +28,8 @@
     make_empty_hessian,
     quantize_weight,
 )
-from llmcompressor.modifiers.quantization.calibration import update_weight_global_scale
+from llmcompressor.modifiers.quantization.calibration import observe
 from llmcompressor.modifiers.quantization.quantization import QuantizationMixin
-from llmcompressor.modifiers.utils import update_fused_layer_weight_global_scales
 from llmcompressor.sentinel import Sentinel
 from llmcompressor.utils.metric_logging import CompressionLogger
 
@@ -202,13 +201,6 @@ def on_start(self, state: State, event: Event, **kwargs):
                     self.register_hook(module, self.calibrate_module, "forward")
                     added_hook = True
 
-        # Optionally generate global scales if using TENSOR_GROUP quantization
-        for _, module in named_modules:
-            update_weight_global_scale(module)
-
-        for module in state.model.modules():
-            update_fused_layer_weight_global_scales(module)
-
         if not added_hook:
             raise ValueError(
                 "GPTQModifier requires a weight quantization config be specified by "
@@ -221,11 +213,15 @@ def on_event(self, state: State, event: Event, **kwargs):
                 self.on_start(state, None)
 
         if event.type_ == EventType.SEQUENTIAL_EPOCH_END:
-            QuantizationMixin.sync_activation_observers(self, state.model)
+            self.sync_obs_act_stats(state.model)
+            self.update_activation_qparams(state.model)
+            observe(self._num_samples.keys(), base_name="weight")
             self.compress_modules()
 
         if event.type_ == EventType.CALIBRATION_EPOCH_END:
-            QuantizationMixin.sync_activation_observers(self, state.model)
+            self.sync_obs_act_stats(state.model)
+            self.update_activation_qparams(state.model)
+            observe(self._num_samples.keys(), base_name="weight")
             self.compress_modules()
 
             if not self.ended_:

diff --git a/src/llmcompressor/modifiers/gptq/gptq_quantize.py b/src/llmcompressor/modifiers/gptq/gptq_quantize.py
@@ -12,7 +12,6 @@
 from loguru import logger
 
 from llmcompressor.modifiers.utils import SPARSITY_THRESHOLD
-from llmcompressor.observers.base import Observer
 from llmcompressor.pytorch.utils.helpers import tensor_sparsity
 
 GPTQ_PRECISION = torch.float32
@@ -85,31 +84,17 @@ def quantize_weight(
     """
     strategy = quant_args.strategy
     actorder = quant_args.actorder
-    global_scale = getattr(module, "weight_global_scale", None)
     final_shape = module.weight.shape
     final_dtype = module.weight.dtype
     W = module.weight.clone()
     H = hessian
 
-    # create observer for calculating quantization parameters
-    observer = Observer.load_from_registry(
-        quant_args.observer if quant_args.observer else "memoryless_minmax",
-        base_name="weight",
-        args=quant_args,
-        module=module,
-    )
+    observer = module.weight_observer
 
-    # standardize shape and dtype
-    match module:
-        case torch.nn.Conv2d():
-            W = W.flatten(1)
-        case transformers.Conv1D():
-            W.transpose_(0, 1)
     W = W.to(dtype=GPTQ_PRECISION)
     num_rows = W.shape[0]
     num_columns = W.shape[1]
 
-    scale, zero_point = observer(W)
     # handle g_idx and activation ordering
     if strategy in (QuantizationStrategy.GROUP, QuantizationStrategy.TENSOR_GROUP):
         # mapping from column index to group index
@@ -121,14 +106,21 @@ def quantize_weight(
         if actorder == ActivationOrdering.GROUP:
             W, H, perm = _apply_activation_ordering(W, H)
             # actually need scale/zp for permuted weight for this format
-            scale, zero_point = observer(W)
+            observer(W)
             # use identity g_idx (invert permutation later)
 
         elif actorder == ActivationOrdering.WEIGHT:
             # permute weights and g_idx
             W, H, perm = _apply_activation_ordering(W, H)
             g_idx = g_idx[perm]
 
+    qparams = observer.get_qparams()
+    scale, zero_point, global_scale = (
+        qparams["scale"],
+        qparams["zero_point"],
+        qparams["global_scale"],
+    )
+
     # sparsity mask
     sparsity = tensor_sparsity(W)
     preserve_zeros = sparsity >= SPARSITY_THRESHOLD