Skip to content

Commit bf63a4c

Browse files
committed
perf: make MSE observer compatible with torch.compile
Compile the inner _compute_candidate_error via torch.compile(dynamic=True). Early stopping is preserved in the outer loop. A compile flag is added as a oneshot argument. Requires: vllm-project/compressed-tensors#627. Related: pytorch/pytorch#177131. Signed-off-by: Jaewoo Kim <pewpewplay315@gmail.com>
1 parent 370c04c commit bf63a4c

File tree

4 files changed

+198
-67
lines changed

4 files changed

+198
-67
lines changed

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from llmcompressor.core.session_functions import active_session
2323
from llmcompressor.datasets import get_calibration_dataloader
2424
from llmcompressor.entrypoints.utils import post_process, pre_process
25+
from llmcompressor.observers.compile_config import set_observer_compile
2526
from llmcompressor.modeling.moe_context import moe_calibration_context
2627
from llmcompressor.pipelines import CalibrationPipeline
2728

@@ -300,6 +301,7 @@ def oneshot(
300301
sequential_offload_device: str = "cpu",
301302
quantization_aware_calibration: bool = True,
302303
sequential_prefetch: bool = False,
304+
enable_observer_compile: bool = False,
303305
# Miscellaneous arguments
304306
output_dir: str | None = None,
305307
log_dir: str | None = None,
@@ -406,9 +408,10 @@ def oneshot(
406408

407409
# pass all args directly into Oneshot
408410
local_args = {
409-
k: v for k, v in locals().items() if k not in ("local_args", "kwargs")
411+
k: v for k, v in locals().items() if k not in ("local_args", "kwargs", "enable_observer_compile")
410412
}
411413
one_shot = Oneshot(**local_args, **kwargs)
414+
set_observer_compile(enable_observer_compile)
412415
one_shot()
413416

414417
return one_shot.model
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
"""
2+
Global configuration for observer torch.compile support.
3+
4+
The compile flag is set by the oneshot entrypoint and read by observer
5+
instances at call time. This avoids threading the flag through recipe
6+
and modifier layers.
7+
"""
8+
9+
_enable_observer_compile: bool = False
10+
11+
12+
def set_observer_compile(enabled: bool) -> None:
13+
global _enable_observer_compile
14+
_enable_observer_compile = enabled
15+
16+
17+
def get_observer_compile() -> bool:
18+
return _enable_observer_compile

src/llmcompressor/observers/mse.py

Lines changed: 143 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,25 @@
1-
from typing import Optional
1+
from typing import Optional, Tuple
22

33
import torch
4+
import torch._dynamo.config
45
from compressed_tensors.quantization import (
56
QuantizationArgs,
67
QuantizationStrategy,
78
)
89
from compressed_tensors.quantization.lifecycle import fake_quantize
910
from compressed_tensors.quantization.utils import calculate_qparams, generate_gparam
10-
from compressed_tensors.utils import patch_attr
1111

1212
from llmcompressor.observers.base import MinMaxTuple, Observer
13+
from llmcompressor.observers.compile_config import get_observer_compile
1314
from llmcompressor.observers.moving_base import MovingAverageObserverBase
1415

1516
__all__ = ["MovingAverageMSEObserver"]
1617

18+
# Allow torch.compile to handle scalar conversions inside
19+
# compressed_tensors' calculate_qparams (float(bit_range)).
20+
# Same approach as GPTQ compile path (commit a4f9ba2e).
21+
torch._dynamo.config.capture_scalar_outputs = True
22+
1723

1824
@Observer.register("memoryless_mse")
1925
class MemorylessMSEObserver(Observer):
@@ -32,7 +38,7 @@ class MemorylessMSEObserver(Observer):
3238
:param module: optional module with attached quantization parameters. This argument
3339
is required to utilize existing qparams such as global_scale or g_idx
3440
:param **observer_kwargs: keyword arguments for observer initialization\n
35-
maxshrink: maximum shrink amount (in grid steps). The number of
41+
maxshrink: maximum shrink amount (in "grid steps"). The number of
3642
search steps is int(maxshrink * grid)\n
3743
patience: number of consecutive search steps without improvement before
3844
early stopping\n
@@ -53,32 +59,39 @@ def __init__(self, *args, **kwargs):
5359
self.grid = observer_kwargs.get("grid", 100.0)
5460
self.norm = observer_kwargs.get("norm", 2.4)
5561

56-
def get_min_max(self, observed: torch.Tensor) -> MinMaxTuple:
57-
# min[min_vals, max_vals](mse_quant_error)
58-
global_scale = self._get_module_param("global_scale")
62+
# Pre-create token_args to avoid patch_attr context manager
63+
# which causes torch.compile graph breaks
64+
self._token_args = self.args.model_copy(
65+
update={"strategy": QuantizationStrategy.TOKEN}
66+
)
67+
68+
def _call_grid_search(
69+
self,
70+
observed: torch.Tensor,
71+
global_scale: Optional[torch.Tensor],
72+
optimize_global_scale: bool,
73+
) -> MinMaxTuple:
5974
return _grid_search_mse(
6075
observed,
6176
self.args,
77+
self._token_args,
6278
self.maxshrink,
6379
self.patience,
6480
self.grid,
6581
self.norm,
6682
global_scale=global_scale,
67-
optimize_global_scale=False,
83+
optimize_global_scale=optimize_global_scale,
84+
enable_compile=get_observer_compile(),
6885
)
6986

87+
def get_min_max(self, observed: torch.Tensor) -> MinMaxTuple:
88+
# min[min_vals, max_vals](mse_quant_error)
89+
global_scale = self._get_module_param("global_scale")
90+
return self._call_grid_search(observed, global_scale, False)
91+
7092
def get_global_min_max(self, observed: torch.Tensor) -> MinMaxTuple:
7193
# min[min_vals, max_vals, global_scale](mse_quant_error)
72-
return _grid_search_mse(
73-
observed,
74-
self.args,
75-
self.maxshrink,
76-
self.patience,
77-
self.grid,
78-
self.norm,
79-
global_scale=None,
80-
optimize_global_scale=True,
81-
)
94+
return self._call_grid_search(observed, None, True)
8295

8396

8497
@Observer.register("mse")
@@ -98,7 +111,7 @@ class MovingAverageMSEObserver(MovingAverageObserverBase):
98111
:param module: optional module with attached quantization parameters. This argument
99112
is required to utilize existing qparams such as global_scale or g_idx
100113
:param **observer_kwargs: keyword arguments for observer initialization\n
101-
maxshrink: maximum shrink amount (in grid steps). The number of
114+
maxshrink: maximum shrink amount (in "grid steps"). The number of
102115
search steps is int(maxshrink * grid)\n
103116
patience: number of consecutive search steps without improvement before
104117
early stopping\n
@@ -119,55 +132,134 @@ def __init__(self, *args, **kwargs):
119132
self.grid = observer_kwargs.get("grid", 100.0)
120133
self.norm = observer_kwargs.get("norm", 2.4)
121134

122-
def get_current_min_max(self, observed: torch.Tensor) -> MinMaxTuple:
123-
# min[min_vals, max_vals](mse_quant_error)
124-
global_scale = self._get_module_param("global_scale")
135+
# Pre-create token_args to avoid patch_attr context manager
136+
# which causes torch.compile graph breaks
137+
self._token_args = self.args.model_copy(
138+
update={"strategy": QuantizationStrategy.TOKEN}
139+
)
140+
141+
def _call_grid_search(
142+
self,
143+
observed: torch.Tensor,
144+
global_scale: Optional[torch.Tensor],
145+
optimize_global_scale: bool,
146+
) -> MinMaxTuple:
125147
return _grid_search_mse(
126148
observed,
127149
self.args,
150+
self._token_args,
128151
self.maxshrink,
129152
self.patience,
130153
self.grid,
131154
self.norm,
132155
global_scale=global_scale,
133-
optimize_global_scale=False,
156+
optimize_global_scale=optimize_global_scale,
157+
enable_compile=get_observer_compile(),
134158
)
135159

160+
def get_current_min_max(self, observed: torch.Tensor) -> MinMaxTuple:
161+
# min[min_vals, max_vals](mse_quant_error)
162+
global_scale = self._get_module_param("global_scale")
163+
return self._call_grid_search(observed, global_scale, False)
164+
136165
def get_current_global_min_max(self, observed: torch.Tensor) -> MinMaxTuple:
137166
# min[min_vals, max_vals, global_scale](mse_quant_error)
138-
return _grid_search_mse(
139-
observed,
140-
self.args,
141-
self.maxshrink,
142-
self.patience,
143-
self.grid,
144-
self.norm,
145-
global_scale=None,
146-
optimize_global_scale=True,
147-
)
167+
return self._call_grid_search(observed, None, True)
168+
169+
170+
def _compute_candidate_error(
171+
observed: torch.Tensor,
172+
args: QuantizationArgs,
173+
token_args: QuantizationArgs,
174+
min_val: torch.Tensor,
175+
max_val: torch.Tensor,
176+
p: float,
177+
norm: float,
178+
global_scale: Optional[torch.Tensor],
179+
optimize_global_scale: bool,
180+
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
181+
"""
182+
Compute the quantization error for a single shrink factor.
183+
184+
Shared helper used by the grid search. When enable_compile is set
185+
via oneshot, this function is called through its compiled wrapper
186+
for accelerated execution.
187+
188+
:param observed: value of shape (num_observations, *qparams_shape, group_size)
189+
:param args: quantization args used for computing qparams
190+
:param token_args: quantization args with strategy set to TOKEN, pre-created
191+
to avoid patch_attr context manager which causes torch.compile graph breaks
192+
:param min_val: per-channel minimum values
193+
:param max_val: per-channel maximum values
194+
:param p: shrink factor (1 - i/grid)
195+
:param norm: exponent used when computing the error
196+
:param global_scale: precomputed global scale to use for quantization
197+
:param optimize_global_scale: If True, recompute global_scale from candidates
198+
:return: (error, shrinked_min_val, shrinked_max_val)
199+
"""
200+
shrinked_min_val = p * min_val
201+
shrinked_max_val = p * max_val
202+
203+
if optimize_global_scale:
204+
global_scale = generate_gparam(shrinked_min_val, shrinked_max_val)
205+
206+
candidate_scales, candidate_zero_points = calculate_qparams(
207+
min_vals=shrinked_min_val,
208+
max_vals=shrinked_max_val,
209+
quantization_args=args,
210+
global_scale=global_scale,
211+
)
212+
213+
# Use pre-created token_args instead of patch_attr context manager
214+
# to maintain torch.compile compatibility
215+
q = fake_quantize(
216+
observed,
217+
candidate_scales.unsqueeze(-1),
218+
candidate_zero_points.unsqueeze(-1),
219+
token_args,
220+
global_scale=global_scale,
221+
).to(observed.dtype)
222+
223+
err = torch.sum((q - observed).abs().pow(norm), dim=(0, -1))
224+
return err, shrinked_min_val, shrinked_max_val
225+
226+
227+
# Compiled variant of the inner computation.
228+
# The outer grid search loop stays in eager mode to preserve
229+
# early stopping (data-dependent control flow).
230+
_compute_candidate_error_compiled = torch.compile(
231+
_compute_candidate_error, dynamic=True
232+
)
148233

149234

150235
def _grid_search_mse(
151236
observed: torch.Tensor,
152237
args: QuantizationArgs,
238+
token_args: QuantizationArgs,
153239
maxshrink: float,
154240
patience: float,
155241
grid: float,
156242
norm: float,
157243
global_scale: Optional[torch.Tensor] = None,
158244
optimize_global_scale: bool = False,
245+
enable_compile: bool = False,
159246
) -> MinMaxTuple:
160247
"""
161248
Perform a 1-D grid search to find per-channel min/max ranges that minimize
162249
mean-squared quantization error.
163250
164-
This routine progressively “shrinks” the absolute min/max ranges of the
165-
observed tensor and evaluates the quantization error at each candidate
166-
range. For each shrink factor ``p = 1 - i/grid`` up to ``maxshrink``.
251+
Progressively shrinks the absolute min/max ranges of the observed tensor
252+
and evaluates the quantization error at each candidate. Early stopping
253+
exits when no improvement is found for ``patience`` consecutive steps.
254+
255+
When enable_compile is True, the inner error computation is executed
256+
through a torch.compiled wrapper for accelerated execution while
257+
preserving early stopping in the outer loop.
167258
168259
:param observed: value of shape (num_observations, *qparams_shape, group_size)
169260
:param args: quantization args used for computing qparams and fake quant
170-
:param maxshrink: maximum shrink amount (in “grid steps”). The number of
261+
:param token_args: quantization args with strategy set to TOKEN
262+
:param maxshrink: maximum shrink amount (in "grid steps"). The number of
171263
search steps is int(maxshrink * grid)
172264
:param patience: number of consecutive search steps without improvement before
173265
early stopping
@@ -178,50 +270,35 @@ def _grid_search_mse(
178270
`optimize_global_scale` is True
179271
:param optimize_global_scale: If True, recompute ``global_scale`` from the
180272
candidate min/max during each step of the search
273+
:param enable_compile: If True, use torch.compiled inner computation
181274
"""
182275
min_val = torch.amin(observed, dim=(0, -1))
183276
max_val = torch.amax(observed, dim=(0, -1))
184277
best_error = torch.full_like(min_val, torch.finfo(min_val.dtype).max)
185278
best_min_val = min_val.clone()
186279
best_max_val = max_val.clone()
187280

188-
# Early stopping params
281+
compute_fn = (
282+
_compute_candidate_error_compiled if enable_compile
283+
else _compute_candidate_error
284+
)
189285
no_improve_count = 0
190286

191287
# @ksayers @HGCharles: investigate searching over separate shrinking factors
192288
for i in range(int(maxshrink * grid)):
193289
p = 1 - i / grid
194-
shrinked_min_val = p * min_val
195-
shrinked_max_val = p * max_val
196-
197-
if optimize_global_scale:
198-
global_scale = generate_gparam(shrinked_min_val, shrinked_max_val)
199-
200-
candidate_scales, candidate_zero_points = calculate_qparams(
201-
min_vals=shrinked_min_val,
202-
max_vals=shrinked_max_val,
203-
quantization_args=args,
204-
global_scale=global_scale,
290+
err, shrinked_min_val, shrinked_max_val = compute_fn(
291+
observed,
292+
args,
293+
token_args,
294+
min_val,
295+
max_val,
296+
p,
297+
norm,
298+
global_scale,
299+
optimize_global_scale,
205300
)
206301

207-
# Note that observed.shape = (num_observations, *qparams_shape, group_size).
208-
# For the purposes of fake quantization, this is equivalent to token quant
209-
with patch_attr(args, "strategy", QuantizationStrategy.TOKEN):
210-
q = fake_quantize(
211-
observed,
212-
candidate_scales.unsqueeze(-1),
213-
candidate_zero_points.unsqueeze(-1),
214-
args,
215-
global_scale=global_scale,
216-
).to(observed.dtype)
217-
# Note that due to forward quantization implementation, token quant,
218-
# unlike tensor_group, requires extra dtype cast
219-
220-
q -= observed
221-
q.abs_()
222-
q.pow_(norm)
223-
err = torch.sum(q, dim=(0, -1))
224-
225302
tmp = err < best_error
226303
if torch.any(tmp):
227304
best_error[tmp] = err[tmp]

0 commit comments

Comments
 (0)