Skip to content

Commit 316114a

Browse files
committed
refactor: always disable quantization during calibration, re-enable for propagation
- pipeline.py: remove disable_qac / DISABLE_QAC_MODIFIERS conditional logic; quantization is now unconditionally disabled during the calibration pass and re-enabled during the propagation pass so downstream subgraphs receive quantized inputs
- quantization/base.py: remove erroneous disable_quantization call from on_start; control now lives entirely in the pipeline layer
- observers/base.py: move update_offload_parameter to a top-level import
- calibration.py: fix hook docstrings to accurately describe stats-only behavior

Signed-off-by: dqzhengAP <dqzheng1996@gmail.com>
1 parent 26c29ad commit 316114a

File tree

4 files changed

+25
-42
lines changed

4 files changed

+25
-42
lines changed

src/llmcompressor/modifiers/quantization/calibration.py

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -195,10 +195,6 @@ def calibrate_activations(
195195
# min/max stats but do NOT write scale/zero_point yet.
196196
# Qparams are written once at epoch end via flush_activation_qparams.
197197
if stats_only:
198-
# Deferred mode: accumulate global min/max into the observer's
199-
# _deferred_min / _deferred_max. Works for ALL observer types,
200-
# including MemorylessMinMaxObserver which has no past_min_vals.
201-
# Qparams are written once at epoch end via flush_activation_qparams.
202198
observer = getattr(module, f"{base_name}_observer", None)
203199
if observer is not None:
204200
observer.update_deferred_stats(value)
@@ -215,21 +211,20 @@ def calibrate_activations(
215211

216212
def calibrate_input_hook(module: Module, args: Any):
217213
"""
218-
Hook to calibrate input activations.
219-
Accumulates running min/max statistics in the observer without computing
220-
scale/zero_point. Qparams are computed once at epoch end via
221-
flush_activation_qparams (deferred mode).
214+
Hook to accumulate input activation statistics (min/max) in the observer.
215+
Scale and zero_point are not written here; they are computed once per subgraph
216+
at epoch end via flush_activation_qparams.
222217
"""
223218
args = args[0] if isinstance(args, tuple) else args
224219
calibrate_activations(module, value=args, base_name="input", stats_only=True)
225220

226221

227222
def calibrate_output_hook(module: Module, _args: Any, output: torch.Tensor):
228223
"""
229-
Hook to calibrate output activations.
230-
Accumulates running min/max statistics only (deferred qparam mode).
231-
Qparams are computed at epoch end; forward_quantize is skipped during
232-
calibration batches since quantization is disabled in the sequential pipeline.
224+
Hook to accumulate output activation statistics (min/max) in the observer.
225+
Scale and zero_point are not written here; they are computed once per subgraph
226+
at epoch end via flush_activation_qparams.
227+
Note: forward_quantize is intentionally absent — hooks only collect statistics.
233228
"""
234229
calibrate_activations(
235230
module,

src/llmcompressor/modifiers/quantization/quantization/base.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,8 @@ def on_initialize(self, state: State, **kwargs) -> bool:
6767
def on_start(self, state: State, event: Event, **kwargs):
6868
"""
6969
Begin calibrating activations and weights. Calibrate weights only once on start.
70-
Quantization is kept DISABLED during calibration batches so that forward passes
71-
run in fp32. Activation qparams are computed once per subgraph at
72-
SEQUENTIAL_EPOCH_END via flush_activation_qparams (deferred mode).
70+
Activation qparams are computed once per subgraph at SEQUENTIAL_EPOCH_END via
71+
flush_activation_qparams, rather than per batch.
7372
"""
7473
self.started_ = True
7574
QuantizationMixin.start_calibration(self, state.model)
@@ -94,21 +93,14 @@ def on_start(self, state: State, event: Event, **kwargs):
9493
for _, module in tqdm.tqdm(named_modules, desc="Calibrating weights"):
9594
update_weight_zp_scale(module)
9695

97-
# Disable quantization during calibration batches so that fp32 activations
98-
# flow through the model unmodified while hooks accumulate running stats.
99-
# Re-enable once after epoch end when qparams have been flushed.
100-
from compressed_tensors.quantization import disable_quantization
101-
102-
state.model.apply(disable_quantization)
103-
10496
def on_event(self, state: State, event: Event, **kwargs):
10597
if event.type_ == EventType.CALIBRATION_EPOCH_START:
10698
if not self.started_:
10799
self.on_start(state, None)
108100

109101
if event.type_ == EventType.SEQUENTIAL_EPOCH_END:
110-
# Deferred qparam flush: compute scale/zero_point from accumulated
111-
# running statistics, then free those stats to reduce memory.
102+
# Compute scale/zero_point once from accumulated running statistics,
103+
# then free those stats to reduce memory.
112104
for _, module in match_named_modules(
113105
state.model, self.resolved_targets, self.ignore
114106
):

src/llmcompressor/observers/base.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@
77
from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
88
from compressed_tensors.quantization.utils import calculate_qparams, generate_gparam
99
from compressed_tensors.registry.registry import RegistryMixin
10-
from compressed_tensors.utils import align_module_device
11-
10+
from compressed_tensors.utils import align_module_device, update_offload_parameter
1211
from llmcompressor.observers.helpers import flatten_for_calibration
1312

1413
__all__ = ["Observer", "MinMaxTuple", "ScaleZpTuple", "calibrate_module_from_observer"]
@@ -213,8 +212,6 @@ def calibrate_module_from_observer(
213212
:param base_name: one of "input", "output", "q", "k", "v"
214213
:return: True if qparams were updated, False if observer had no accumulated stats
215214
"""
216-
from compressed_tensors.utils import align_module_device, update_offload_parameter
217-
218215
observer: Optional[Observer] = getattr(module, f"{base_name}_observer", None)
219216
if observer is None:
220217
return False

src/llmcompressor/pipelines/sequential/pipeline.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from typing import TYPE_CHECKING, Iterator
44

55
import torch
6+
from compressed_tensors.quantization import disable_quantization, enable_quantization
67
from compressed_tensors.utils import disable_offloading
78
from torch.utils.data.dataloader import DataLoader
89
from tqdm import tqdm
@@ -19,7 +20,6 @@
1920
)
2021
from llmcompressor.utils.dev import get_main_device
2122
from llmcompressor.utils.helpers import (
22-
DISABLE_QAC_MODIFIERS,
2323
DisableQuantization,
2424
calibration_forward_context,
2525
)
@@ -111,18 +111,13 @@ def __call__(
111111

112112
LifecycleCallbacks.calibration_epoch_start()
113113

114-
# TODO: remove this to enable quantization aware calibration
115-
# for GPTQ, AWQ and AutoRound.
116-
disable_qac = any(
117-
type(mod).__name__ in DISABLE_QAC_MODIFIERS
118-
for mod in session.lifecycle.recipe.modifiers
119-
)
120-
121114
with contextlib.ExitStack() as stack:
122115
stack.enter_context(calibration_forward_context(model))
123-
# Optionally disable quantization
124-
if not dataset_args.quantization_aware_calibration or disable_qac:
125-
stack.enter_context(DisableQuantization(model))
116+
# Always disable quantization during calibration so that observer hooks
117+
# accumulate statistics from unquantized activations. Quantization is
118+
# re-enabled during the propagation pass so that downstream subgraphs
119+
# receive realistic (quantized) inputs.
120+
stack.enter_context(DisableQuantization(model))
126121

127122
# prepare intermediates cache
128123
activations = IntermediatesCache.from_dataloader(
@@ -148,7 +143,7 @@ def __call__(
148143
num_batches = len(dataloader)
149144
use_prefetch = getattr(dataset_args, "sequential_prefetch", False)
150145
with disable_offloading():
151-
# do a preliminary pass to trigger modifier hooks
146+
# calibration pass: hooks accumulate activation statistics
152147
for batch_idx, inputs in _get_batches(
153148
activations,
154149
num_batches,
@@ -159,10 +154,13 @@ def __call__(
159154
session.state.current_batch_idx = batch_idx
160155
subgraph.forward(model, **inputs)
161156

157+
# flush accumulated stats -> write scale/zero_point once per subgraph
162158
LifecycleCallbacks.sequential_epoch_end(subgraph)
163159

164-
# this pass does not trigger modifier hooks
165-
# and is only used for capturing outputs of newly compressed modules
160+
# propagation pass: modifier hooks are disabled but quantization is
161+
# re-enabled so that compressed module outputs are quantized.
162+
# This ensures downstream subgraphs receive realistic inputs.
163+
model.apply(enable_quantization)
166164
with HooksMixin.disable_hooks():
167165
for batch_idx, inputs in _get_batches(
168166
activations,
@@ -175,6 +173,7 @@ def __call__(
175173
if subgraph_index < num_subgraphs - 1:
176174
activations.update(batch_idx, output)
177175
activations.delete(batch_idx, subgraph.consumed_names)
176+
model.apply(disable_quantization)
178177

179178
# redundant, finish any remaining compression
180179
LifecycleCallbacks.calibration_epoch_end()

0 commit comments

Comments (0)