new design

sychen52 · sychen52 · commit f3315e4e4e9c · 2026-06-05T11:49:30.000-07:00
Signed-off-by: Shiyang Chen <shiychen@nvidia.com>
diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py
@@ -716,47 +716,48 @@ class MaxCalibConfig(QuantizeAlgorithmConfig):
         ),
     )
 
-    shared_patterns: dict[str, list[str]] | None = ModeloptField(
+    shared_states: dict[str, dict[str, list[str]]] | None = ModeloptField(
         default=None,
-        title="Regex patterns for groups that share quantization state",
+        title="Concrete shared quantization states and their grouping patterns",
         description=(
-            "Optional dict keyed by quantizer kind (``'weight'`` and/or ``'input'``), each a list "
-            "of regexes matched (full-match) against module fully-qualified names. They must list "
-            "every group you want for that kind. Modules whose match yields the same capture-group "
-            "tuple form one group; the capture boundary chooses granularity: capture the immediate "
-            "parent for per-parent / per-expert groups (e.g. ``r'(.*)\\.(?:q_proj|k_proj|v_proj)'``, "
-            "``r'(.*)\\.(?:w1|w3)'``); leave the expert index uncaptured for one cross-expert group "
-            "(``r'(.*)\\.experts\\.\\d+\\.(?:w1|w3)'``). Only ``'weight'`` is used today; ``'input'`` is "
-            "reserved for future input-quantizer sharing. When the ``'weight'`` list is omitted, "
-            "the default fusible patterns (q/k/v, gate/up, w1/w3) are used — these match exactly "
-            "the sibling groups export fuses, avoiding the over-grouping a shared-input heuristic "
-            "would cause (e.g. a ``shared_expert_gate`` that reads the same input but is not fused)."
+            "Optional dict keyed by shared-state name. ``'weight_global_amax'`` is implemented "
+            "today and accepts ``{'patterns': [...]}``, where patterns are full-match regexes "
+            "against module fully-qualified names. Omitted patterns use the state's defaults; "
+            "an empty pattern list disables that state."
         ),
     )
 
-    @field_validator("shared_patterns")
+    @field_validator("shared_states")
     @classmethod
-    def validate_shared_patterns(cls, v):
-        """Reject unknown quantizer kinds and invalid regexes at the config boundary."""
+    def validate_shared_states(cls, v):
+        """Reject unknown shared-state names, fields, and invalid regexes."""
         if v is None:
             return v
-        supported = {"weight", "input"}
+        supported = {"weight_global_amax"}
         unknown = set(v) - supported
         if unknown:
             raise ValueError(
-                f"shared_patterns has unsupported quantizer kind(s) {sorted(unknown)}; "
+                f"shared_states has unsupported state(s) {sorted(unknown)}; "
                 f"expected keys from {sorted(supported)}."
             )
-        offending = ("", "")  # (kind, pattern) of the last regex tried; set before each compile
+
+        offending = ("", "")  # (state, pattern) of the last regex tried; set before each compile
         try:
-            for kind, patterns in v.items():
-                for pattern in patterns:
-                    offending = (kind, pattern)
+            for state_name, state_cfg in v.items():
+                unknown_fields = set(state_cfg) - {"patterns"}
+                if unknown_fields:
+                    raise ValueError(
+                        f"shared_states[{state_name!r}] has unsupported field(s) "
+                        f"{sorted(unknown_fields)}; expected ['patterns']."
+                    )
+                for pattern in state_cfg.get("patterns", []):
+                    offending = (state_name, pattern)
                     re.compile(pattern)
         except re.error as e:
-            bad_kind, bad_pattern = offending
+            bad_state, bad_pattern = offending
             raise ValueError(
-                f"shared_patterns[{bad_kind!r}] has an invalid regex {bad_pattern!r}: {e}"
+                f"shared_states[{bad_state!r}]['patterns'] has an invalid regex "
+                f"{bad_pattern!r}: {e}"
             ) from e
         return v
 
diff --git a/modelopt/torch/quantization/conversion.py b/modelopt/torch/quantization/conversion.py
@@ -45,6 +45,12 @@
     TensorQuantizer,
 )
 from .utils import is_quantized, is_quantized_linear
+from .utils.shared_input import (
+    DEFAULT_WEIGHT_SHARED_PATTERNS,
+    rebuild_shared_quant_states,
+    resolve_weight_global_amax_patterns,
+    shared_quant_states_metadata,
+)
 
 __all__ = [
     "register",
@@ -105,6 +111,24 @@ def maybe_promote_nvfp4_static_quantizer(module: nn.Module, quantizer_state: dic
         NVFP4StaticQuantizer.from_tensor_quantizer(module)
 
 
+def _restore_shared_quant_state_aliases(
+    model: nn.Module, config: QuantizeConfig, metadata: MetadataDict
+) -> None:
+    """Re-tie shared quantization state on ``model`` when ``metadata`` records any."""
+    if not metadata.get("shared_quant_states"):
+        return  # metadata carries no shared-state entry, so there is nothing to rebuild
+    calib_method = getattr(config, "method", None)
+    if calib_method == "max":
+        # Max calibration resolves its patterns from the config's ``shared_states`` (if any).
+        shared_states = getattr(config, "shared_states", None)
+        resolved = resolve_weight_global_amax_patterns(shared_states=shared_states)
+        rebuild_shared_quant_states(model, patterns=resolved)
+    elif calib_method in {"mse", "local_hessian"}:
+        # These methods always rebuild with the default weight patterns.
+        rebuild_shared_quant_states(model, patterns=DEFAULT_WEIGHT_SHARED_PATTERNS)
+    # Any other calibration method leaves the model untouched.
+
+
 def restore_quantizer_state(model: nn.Module, config: QuantizeConfig, metadata: MetadataDict):
     """Restore the quantizer states from the given state dict.
 
@@ -146,6 +170,8 @@ def restore_quantizer_state(model: nn.Module, config: QuantizeConfig, metadata:
             name = get_unwrapped_name(name, model)
             module.modelopt_post_restore(name)
 
+    _restore_shared_quant_state_aliases(model, config, metadata)
+
     return model
 
 
@@ -176,6 +202,10 @@ def update_quantize_metadata(
 ) -> None:
     """Update the quantizer state in the metadata dict."""
     metadata["quantizer_state"] = quantizer_state(model)
+    if shared_state_metadata := shared_quant_states_metadata(model):
+        metadata["shared_quant_states"] = shared_state_metadata
+    else:
+        metadata.pop("shared_quant_states", None)
 
 
 def quantizer_state(model: nn.Module) -> dict[str, Any]:
diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py
@@ -54,6 +54,7 @@
     populate_shared_state,
     promote_nvfp4_static_quantizers,
     reduce_amax,
+    resolve_weight_global_amax_patterns,
 )
 from .utils.calib_utils import _GPTQ_HELPER_REGISTRY, GPTQHelper
 
@@ -112,16 +113,16 @@ def _collect_grouped_linears(model: nn.Module) -> list[list[nn.Module]]:
 
 @torch.no_grad()
 def _check_grouped_weight_global_amax_synced(model: nn.Module) -> None:
-    """Verify SharedQuantState unified each name-based fusible group's weight global_amax.
+    """Verify shared NVFP4 state unified each name-based fusible group's weight global_amax.
 
     The legacy name-based grouping (Q/K/V, gate/up, w1/w3) is kept here as a *check*
     rather than performed: after attach/populate/promote, the promoted static-NVFP4 weight
     quantizers in each name group must already share one ``global_amax``. This catches the
-    SharedQuantState path failing to form or sync a group it should have (e.g. a
+    SharedNVFP4GlobalAmaxState path failing to form or sync a group it should have (e.g. a
     :data:`DEFAULT_WEIGHT_SHARED_PATTERNS` regression, or an architecture the regexes miss)
     before the MSE per-block search — computed against ``global_amax`` — bakes in the
     inconsistency. Run only when the default patterns are in effect (custom
-    ``shared_patterns`` may intentionally group differently). Members whose ``global_amax``
+    ``shared_states`` may intentionally group differently). Members whose ``global_amax``
     is not materialized (``None``/meta, e.g. an ``init_empty_weights`` model) are skipped.
     """
     for group in _collect_grouped_linears(model):
@@ -132,8 +133,8 @@ def _check_grouped_weight_global_amax_synced(model: nn.Module) -> None:
         ref = amaxes[0]
         assert all(torch.equal(a, ref) for a in amaxes), (
             "A fusible sibling group (q/k/v or gate/up) was not unified to a shared weight "
-            "global_amax; SharedQuantState failed to sync it, so the per-block MSE scales "
-            "would be inconsistent across the group."
+            "global_amax; SharedNVFP4GlobalAmaxState failed to sync it, so the per-block "
+            "MSE scales would be inconsistent across the group."
         )
 
 
@@ -148,7 +149,7 @@ def _finalize_with_shared_state(model: nn.Module, weight_patterns: list[str]) ->
     populate_shared_state(model)
     promote_nvfp4_static_quantizers(model)
     # Under the default patterns, verify the fusible name groups were actually synced.
-    if weight_patterns is DEFAULT_WEIGHT_SHARED_PATTERNS:
+    if weight_patterns == DEFAULT_WEIGHT_SHARED_PATTERNS:
         _check_grouped_weight_global_amax_synced(model)
 
 
@@ -264,7 +265,7 @@ def max_calibrate(
     forward_loop: ForwardLoop | None = None,
     distributed_sync=True,
     sync_expert_weight_amax=False,
-    shared_patterns: Mapping[str, Sequence[str]] | None = None,
+    shared_states: Mapping[str, Mapping[str, Sequence[str]]] | None = None,
 ):
     """Calibrate the model using max.
 
@@ -275,29 +276,19 @@ def max_calibrate(
         distributed_sync: Whether to sync input_quantizer amax across distributed processes.
         sync_expert_weight_amax: SequentialMLP only — share one weight amax across all experts
             in a MoE layer (within-rank sync + EP all-reduce when EP>1).
-        shared_patterns: Optional dict keyed by quantizer kind (``"weight"``/``"input"``), each a
-            list of regexes over module FQNs. When the ``"weight"`` list is omitted,
-            :data:`DEFAULT_WEIGHT_SHARED_PATTERNS` (q/k/v, gate/up, w1/w3) is used. Modules whose
-            regex match yields the same capture-group tuple form one group — capture the immediate
-            parent for per-parent (per-expert) grouping, or leave the expert index uncaptured for
-            cross-expert. Only ``"weight"`` is used today; ``"input"`` is reserved for future
-            input-quantizer sharing.
+        shared_states: Optional dict keyed by shared-state name. ``"weight_global_amax"`` is
+            implemented today and accepts ``{"patterns": [...]}``; omitted patterns use
+            :data:`DEFAULT_WEIGHT_SHARED_PATTERNS`, while an empty list disables the state.
 
     See :class:`MaxCalibConfig <modelopt.torch.quantization.config.MaxCalibConfig>` for
     details on the remaining arguments.
     """
     # Discover fusible sibling groups by name regex and attach the (initially empty) shared
-    # state up front, so the SharedQuantState container exists for the whole calibration —
-    # forward-time fields can accumulate into it. Discovery is structural (a pattern over the
-    # module tree), so it needs no ``_amax``; per-member values are aggregated later by
-    # populate_shared_state, after the forward and any cross-rank ``_amax`` sync. Default to
-    # q/k/v + gate/up when no "weight" key is given; an explicit (possibly empty) list
-    # overrides it — key presence, not truthiness, so {"weight": []} disables grouping.
-    # Only "weight" is consumed today; "input" is reserved.
-    if shared_patterns is not None and "weight" in shared_patterns:
-        weight_patterns = list(shared_patterns["weight"])
-    else:
-        weight_patterns = DEFAULT_WEIGHT_SHARED_PATTERNS
+    # state up front, so parent-level runtime hooks can be installed by future concrete
+    # states. Discovery is structural (a pattern over the module tree), so it needs no
+    # ``_amax``; per-member values are aggregated later by populate_shared_state, after the
+    # forward and any cross-rank ``_amax`` sync.
+    weight_patterns = resolve_weight_global_amax_patterns(shared_states=shared_states)
     attach_shared_quant_states(model, patterns=weight_patterns)
 
     # Always run weight calibration on the weight tensor directly so every weight
diff --git a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py
@@ -172,12 +172,25 @@ class TensorQuantizer(nn.Module):
         "pre_bwd_fn",
         # quantizer cache for custom backends, like luts
         "_quantizer_cache",
-        # Runtime-only back-reference to a sibling group's SharedQuantState; it is
-        # re-established during calibration and must not be serialized (it points to a
-        # live module whose dynamic QuantLinear members are not picklable).
-        "_shared_quant_state_ref",
+        # Runtime-only references to concrete shared-state owners; they are re-established
+        # during calibration and must not be serialized.
+        "_shared_quant_state_refs",
+        # Runtime-only set of storage attributes tied to shared state. The tied
+        # aliases are rebuilt from calibration config and tensor state during restore.
+        "_shared_quant_tied_attrs",
     }
 
+    def __setattr__(self, name, value):
+        """Block rebinding a tied shared-quant attribute to a different object."""
+        if name in self.__dict__.get("_shared_quant_tied_attrs", ()):
+            existing = self.__dict__.get("_buffers", {}).get(name)
+            if value is not existing:
+                raise RuntimeError(
+                    f"{name} is tied shared quant state; update it in-place or replace it "
+                    "through the owning shared-state object."
+                )
+        return super().__setattr__(name, value)
+
     def __init__(
         self,
         quant_attribute_cfg=None,
@@ -1368,8 +1381,11 @@ def _preserve_amax_in_fp32(self):
         if amax is not None:
             self._amax = amax.to(dtype=torch.float32)
         global_amax = getattr(self, "_global_amax", None)
-        if global_amax is not None:
-            self._global_amax = global_amax.to(dtype=torch.float32)
+        if global_amax is not None and global_amax.dtype != torch.float32:
+            if "_global_amax" in self.__dict__.get("_shared_quant_tied_attrs", set()):
+                global_amax.data = global_amax.to(dtype=torch.float32)
+            else:
+                self._global_amax = global_amax.to(dtype=torch.float32)
 
     def _amax_setter_helper(self, value):
         super()._amax_setter_helper(value)
diff --git a/modelopt/torch/quantization/utils/__init__.py b/modelopt/torch/quantization/utils/__init__.py
@@ -23,7 +23,7 @@
 __all__ = [
     "DEFAULT_WEIGHT_SHARED_PATTERNS",
     "EXPORT_MODE",
-    "SharedQuantState",
+    "SharedNVFP4GlobalAmaxState",
     "attach_shared_quant_states",
     "convert_quantization_axis_to_reduce_axis",
     "export_torch_mode",
@@ -32,11 +32,15 @@
     "is_quantized_column_parallel_linear",
     "is_quantized_linear",
     "is_quantized_row_parallel_linear",
+    "iter_shared_quant_states",
     "populate_shared_state",
+    "rebuild_shared_quant_states",
     "reduce_amax",
     "reduce_sum",
     "replace_function",
     "representative_weight_quantizer",
+    "resolve_weight_global_amax_patterns",
+    "shared_quant_states_metadata",
     "update_quant_cfg_with_kv_cache_quant",
     "weight_attr_names",
 ]
diff --git a/modelopt/torch/quantization/utils/core_utils.py b/modelopt/torch/quantization/utils/core_utils.py
@@ -954,24 +954,26 @@ def promote_nvfp4_static_quantizers(model: nn.Module) -> int:
     need to be promoted so they use the two-level scaling path (global amax +
     per-block amax) instead of the generic E4M3 path.
 
-    If the quantizer has a ``_shared_quant_state_ref`` with a populated
-    ``weight_global_amax`` (sibling group) whose owning state lives within ``model``,
-    that shared value is used instead of this quantizer's own ``_amax`` reduction,
-    keeping siblings on a common FP8 grid.
+    If the quantizer has a shared-state reference with a populated
+    ``global_amax`` (sibling group) whose owning state lives within ``model``, the
+    promoted quantizer's ``_global_amax`` buffer is tied to that canonical state
+    buffer instead of receiving an independent copy.
 
     Returns the number of quantizers converted.
     """
     from modelopt.torch.quantization.nn import NVFP4StaticQuantizer, TensorQuantizer
+    from modelopt.torch.quantization.utils.shared_input import (
+        SharedNVFP4GlobalAmaxState,
+        iter_shared_quant_states,
+    )
 
     # Shared states owned within THIS promotion root. This function also runs on
     # submodules / individual linears; a quantizer may still carry a back-reference from
-    # an earlier full-model calibration whose owning ``_shared_quant_state`` is outside
-    # ``model``. Only trust refs reachable here — otherwise the global_amax would come
+    # an earlier full-model calibration whose owning state is outside ``model``. Only
+    # trust refs reachable here — otherwise the global_amax would come
     # from an unrelated prior run; fall back to the quantizer's own amax instead.
     valid_shared_states = {
-        id(state)
-        for owner in model.modules()
-        if (state := getattr(owner, "_shared_quant_state", None)) is not None
+        id(state) for state in iter_shared_quant_states(model, SharedNVFP4GlobalAmaxState)
     }
 
     converted = 0
@@ -984,19 +986,23 @@ def promote_nvfp4_static_quantizers(model: nn.Module) -> int:
         if amax is None:
             continue
 
-        # Grouped siblings share one ``weight_global_amax`` (common FP8 grid);
-        # otherwise fall back to this quantizer's own per-block amax.
+        # Grouped siblings share one canonical global_amax (common FP8 grid); otherwise
+        # fall back to this quantizer's own per-block amax.
         already_promoted = isinstance(module, NVFP4StaticQuantizer)
-        shared = getattr(module, "_shared_quant_state_ref", None)
+        shared_refs = module.__dict__.get("_shared_quant_state_refs", {})
+        shared = shared_refs.get(SharedNVFP4GlobalAmaxState.state_name)
         if (
-            shared is not None
+            isinstance(shared, SharedNVFP4GlobalAmaxState)
             and id(shared) in valid_shared_states
-            and shared.weight_global_amax is not None
+            and shared.global_amax is not None
         ):
-            global_amax = shared.weight_global_amax
+            NVFP4StaticQuantizer.from_tensor_quantizer(module)
+            shared.tie_member_quantizer(module)
         else:
+            if isinstance(shared, SharedNVFP4GlobalAmaxState):
+                shared.untie_member_quantizer(module)
             global_amax = reduce_amax(amax.clone().detach(), axis=None)
-        NVFP4StaticQuantizer.from_tensor_quantizer(module, global_amax=global_amax)
+            NVFP4StaticQuantizer.from_tensor_quantizer(module, global_amax=global_amax)
         if not already_promoted:
             converted += 1
     return converted
diff --git a/modelopt/torch/quantization/utils/shared_input.py b/modelopt/torch/quantization/utils/shared_input.py
diff --git a/tests/unit/torch/quantization/test_shared_input.py b/tests/unit/torch/quantization/test_shared_input.py