Fix bugs for MSE

cjluo-nv · cjluo-nv · commit 35872383feaa · 2026-05-08T23:34:05.000Z
Signed-off-by: Chenjie Luo <chenjiel@nvidia.com>
diff --git a/modelopt/torch/export/moe_utils.py b/modelopt/torch/export/moe_utils.py
@@ -110,11 +110,25 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
                 and w_quantizer._amax.dim() >= 1
             ):
                 amax = w_quantizer._amax
+                # Static block-quant calibration (e.g. NVFP4 MSE FP8 sweep)
+                # produces a per-block _amax with shape (num_blocks_total, ...)
+                # where num_blocks_total = fused_total * blocks_per_row. That
+                # shape collapses the row axis we want to slice on. Restore the
+                # row dimension so the dim-0 slicing below splits gate / up
+                # correctly.  No-op when _amax is already aligned with fused_total.
+                if amax.numel() != fused_total and amax.numel() % fused_total == 0:
+                    amax = amax.contiguous().view(fused_total, amax.numel() // fused_total)
                 amax_dim0 = amax.shape[0]
                 if fused_total % amax_dim0 == 0:
                     slice_start = fused_start * amax_dim0 // fused_total
                     slice_end = (fused_start + weight_slice.shape[0]) * amax_dim0 // fused_total
-                    w_quantizer.amax = amax[slice_start:slice_end].contiguous()
+                    sliced = amax[slice_start:slice_end].contiguous()
+                    # The amax setter refuses shape changes once `_amax` exists,
+                    # so drop the existing buffer before re-registering with the
+                    # sliced shape.
+                    if hasattr(w_quantizer, "_amax"):
+                        delattr(w_quantizer, "_amax")
+                    w_quantizer.amax = sliced
                 else:
                     warnings.warn(
                         f"Expert {idx} {proj_name}: fused amax dim0 ({amax_dim0}) does not "
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
@@ -16,6 +16,7 @@
 """Code that export quantized Hugging Face models for deployment."""
 
 import collections.abc
+import contextlib
 import json
 import re
 import tempfile
@@ -1134,6 +1135,30 @@ def _unpatch_revert_weight_conversion(patches: list[tuple[Any, Any]]) -> None:
         mod.revert_weight_conversion = original
 
 
+def _sanitize_generation_config_for_save(model: torch.nn.Module) -> None:
+    """Coerce ``model.generation_config`` so it passes transformers' strict validation.
+
+    Some upstream HF checkpoints ship a ``generation_config.json`` that mixes
+    ``do_sample=False`` with sampling-only attrs (``top_p``, ``top_k``, ...).
+    Newer transformers raise ``ValueError("GenerationConfig is invalid: ...")``
+    inside ``save_pretrained``, blocking export. We try a strict validate and
+    on failure flip ``do_sample`` to ``True`` so the upstream sampling intent
+    is preserved (rather than silently dropping ``top_p`` etc.). Quietly does
+    nothing if the model has no generation_config or it's already valid.
+    """
+    gc = getattr(model, "generation_config", None)
+    if gc is None or not hasattr(gc, "validate"):
+        return
+    try:
+        gc.validate(strict=True)
+        return
+    except Exception:
+        pass
+    if not getattr(gc, "do_sample", False):
+        with contextlib.suppress(Exception):
+            gc.do_sample = True
+
+
 def export_speculative_decoding(
     model: torch.nn.Module,
     dtype: torch.dtype | None = None,
@@ -1228,6 +1253,12 @@ def export_hf_checkpoint(
         # modeling_utils does `from core_model_loading import revert_weight_conversion`.
         _patches = _patch_revert_weight_conversion()
 
+        # Some upstream HF checkpoints ship a generation_config.json that fails
+        # transformers' strict validation on save (e.g. ``top_p`` set without
+        # ``do_sample=True`` — newer transformers raise). Set ``do_sample=True``
+        # to match the sampling attrs so save_pretrained can write the file.
+        _sanitize_generation_config_for_save(model)
+
         try:
             model.save_pretrained(
                 export_dir,
diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py
@@ -64,8 +64,98 @@
     "max_calibrate",
     "smoothquant",
     "svdquant",
+    "sync_grouped_weight_global_amax",
 ]
 
+
+# Sibling weight-quantizer name groups whose ``global_amax`` should share an
+# FP8 scale-of-scales. All members of a group sit under the same parent module
+# (e.g. one self-attention or one MLP block) and either consume the same input
+# tensor or get fused at deployment, so a divergent global_amax across siblings
+# would split their FP8 grids and skew the round.
+_GROUPED_WEIGHT_QUANTIZER_PATTERNS: tuple[tuple[str, ...], ...] = (
+    # Standard self-attention (skipped for fused qkv_proj — single weight).
+    ("q_proj", "k_proj", "v_proj"),
+    # Gated MLP, modern naming (Llama / Qwen / Mistral / etc.).
+    ("gate_proj", "up_proj"),
+    # Gated MLP, older Mixtral-style naming.
+    ("w1", "w3"),
+)
+
+
+def _is_calibrated_nvfp4_static_weight_quantizer(q) -> bool:
+    """True for an NVFP4-static weight quantizer that ``max_calibrate`` already
+    populated with a per-block ``_amax`` and that is currently enabled.
+    """
+    return (
+        isinstance(q, TensorQuantizer)
+        and not q._disabled
+        and q.is_nvfp4_static
+        and hasattr(q, "_amax")
+        and q._amax is not None
+    )
+
+
+def _collect_grouped_linears(model: nn.Module) -> list[list[nn.Module]]:
+    """Find groups of Linear-like submodules whose NVFP4-static weight quantizers
+    should share ``global_amax`` (Q/K/V under one attention parent; gate/up under
+    one MLP parent).
+    """
+    groups: list[list[nn.Module]] = []
+    wq_attr = quantizer_attr_names("weight").weight_quantizer
+    for parent in model.modules():
+        for sibling_names in _GROUPED_WEIGHT_QUANTIZER_PATTERNS:
+            members: list[nn.Module] = []
+            for n in sibling_names:
+                child = getattr(parent, n, None)
+                if child is None:
+                    continue
+                wq = getattr(child, wq_attr, None)
+                if _is_calibrated_nvfp4_static_weight_quantizer(wq):
+                    members.append(child)
+            if len(members) >= 2:
+                groups.append(members)
+    return groups
+
+
+@torch.no_grad()
+def sync_grouped_weight_global_amax(model: nn.Module) -> int:
+    """Sync ``global_amax`` across sibling NVFP4-static weight quantizers.
+
+    For each group of siblings (Q/K/V projections under one attention parent;
+    gate/up — a.k.a. ``w1``/``w3`` — under one MLP parent) unifies the
+    NVFP4 ``global_amax`` so the per-block FP8 round picks scales against a
+    consistent FP8 grid across the group during MSE / local-Hessian search.
+
+    Reuses :func:`modelopt.torch.export.quant_utils.preprocess_linear_fusion`
+    (whose ``NVFP4StaticQuantizer`` branch performs the same
+    ``max(stack(global_amax))`` unification at export time). To call it before
+    MSE, this helper first promotes each grouped weight quantizer to
+    :class:`NVFP4StaticQuantizer` with its local ``global_amax`` (=
+    ``reduce_amax(_amax)``); ``preprocess_linear_fusion`` then unifies in
+    place.
+
+    Must be called after ``max_calibrate`` has populated each weight
+    quantizer's ``_amax``. Idempotent. Returns the number of groups synced.
+    """
+    from modelopt.torch.export.quant_utils import preprocess_linear_fusion
+
+    n_groups = 0
+    for group in _collect_grouped_linears(model):
+        # Promote each member's weight quantizer so `preprocess_linear_fusion`
+        # sees post-conversion NVFP4StaticQuantizers (its NVFP4 branch reads
+        # `global_amax`, which only exists post-promotion).
+        wq_attr = quantizer_attr_names("weight").weight_quantizer
+        for child in group:
+            wq = getattr(child, wq_attr)
+            if not isinstance(wq, NVFP4StaticQuantizer):
+                local_global = reduce_amax(wq._amax, axis=None)
+                NVFP4StaticQuantizer.from_tensor_quantizer(wq, global_amax=local_global)
+        preprocess_linear_fusion(group)
+        n_groups += 1
+    return n_groups
+
+
 CalibratorFactory: TypeAlias = Callable[
     [torch.Tensor, int | tuple | list | None, Callable[..., torch.Tensor]], _Calibrator
 ]
@@ -349,6 +439,13 @@ def mse_calibrate(
     # Step 1: First get initial amax using max calibration
     max_calibrate(model, forward_loop, distributed_sync)
 
+    # Step 1b: Sync global_amax across sibling NVFP4-static weight quantizers
+    # (q/k/v_proj under one attention block; gate/up — a.k.a. w1/w3 — under one
+    # MLP block) so their FP8 scale-of-scales matches and the per-block FP8
+    # round uses a consistent grid. No-op when there are no sibling groups
+    # (e.g. fused QKV / fused gate_up_proj).
+    sync_grouped_weight_global_amax(model)
+
     # Step 2: Replace calibrators with MseCalibrator for enabled quantizers
     # and identify weight quantizers
     weight_quantizers = []
@@ -360,19 +457,16 @@ def mse_calibrate(
                 # Get the initial amax from max calibration
                 initial_amax = module._amax.clone().detach()
 
-                is_nvfp4_static = (
-                    module.is_static_block_quant
-                    and module._num_bits == (2, 1)
-                    and module._block_sizes is not None
-                    and module._block_sizes.get("scale_bits") == (4, 3)
-                )
+                is_nvfp4_static = module.is_nvfp4_static
 
                 if is_nvfp4_static:
-                    # Compute and set global_amax
-                    global_amax = reduce_amax(initial_amax, axis=None)
-
-                    # Convert to NVFP4StaticQuantizer in-place
-                    NVFP4StaticQuantizer.from_tensor_quantizer(module, global_amax=global_amax)
+                    # If sync_grouped_weight_global_amax already promoted this
+                    # quantizer (it's a sibling in a Q/K/V or gate/up group),
+                    # its global_amax has been unified across the group; just
+                    # leave it. Otherwise convert + set local global_amax.
+                    if not isinstance(module, NVFP4StaticQuantizer):
+                        global_amax = reduce_amax(initial_amax, axis=None)
+                        NVFP4StaticQuantizer.from_tensor_quantizer(module, global_amax=global_amax)
 
                 if fp8_scale_sweep:
                     # Check if backend has a registered custom calibrator factory.
@@ -612,6 +706,11 @@ def forward(self, input, *args, **kwargs):
     print_rank_0("local_hessian: Running max calibration for all quantizers...")
     max_calibrate(model, forward_loop, distributed_sync)
 
+    # Sync global_amax across sibling NVFP4-static weight quantizers
+    # (q/k/v_proj, gate/up_proj a.k.a. w1/w3) so the FP8 scale-of-scales
+    # is consistent across the group. Idempotent; no-op when fused.
+    sync_grouped_weight_global_amax(model)
+
     # Setup helpers for all quantized linear modules
     name_to_module = dict(model.named_modules())
     weight_quantizers_info = []
@@ -666,14 +765,9 @@ def quant_func(x, amax, quantizer=weight_quantizer):
 
             return xq
 
-        is_nvfp4_static = (
-            weight_quantizer.is_static_block_quant
-            and weight_quantizer._num_bits == (2, 1)
-            and weight_quantizer._block_sizes is not None
-            and weight_quantizer._block_sizes.get("scale_bits") == (4, 3)
-        )
+        is_nvfp4_static = weight_quantizer.is_nvfp4_static
 
-        if is_nvfp4_static:
+        if is_nvfp4_static and not isinstance(weight_quantizer, NVFP4StaticQuantizer):
             global_amax = reduce_amax(initial_amax, axis=None)
             NVFP4StaticQuantizer.from_tensor_quantizer(weight_quantizer, global_amax=global_amax)
 
diff --git a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py
@@ -514,6 +514,22 @@ def is_mx_format(self):
             and self.block_sizes.get("scale_bits", None) == (8, 0)
         )
 
+    @property
+    def is_nvfp4_static(self):
+        """Check if this quantizer is configured for NVFP4 static block quantization.
+
+        Format-only check (does not consider whether ``_amax`` has been
+        populated by calibration). True when the quantizer holds E2M1 weights
+        with E4M3 per-block scales in a static layout — i.e. the two-level
+        scaling NVFP4 path consumed by :class:`NVFP4StaticQuantizer`.
+        """
+        return (
+            self.is_static_block_quant
+            and self._num_bits == (2, 1)
+            and self._block_sizes is not None
+            and self._block_sizes.get("scale_bits") == (4, 3)
+        )
+
     def is_mxfp(self, bits):
         """Check if is MXFP4/MXFP6/MXFP8."""
         if bits == 4:
diff --git a/modelopt/torch/quantization/utils/core_utils.py b/modelopt/torch/quantization/utils/core_utils.py
@@ -957,13 +957,7 @@ def promote_nvfp4_static_quantizers(model: nn.Module) -> int:
     for _name, module in list(model.named_modules()):
         if isinstance(module, TensorQuantizer) and not module._disabled:
             if module._calibrator is not None and not module._dynamic and hasattr(module, "_amax"):
-                is_nvfp4_static = (
-                    module.is_static_block_quant
-                    and module._num_bits == (2, 1)
-                    and module._block_sizes is not None
-                    and module._block_sizes.get("scale_bits") == (4, 3)
-                )
-                if is_nvfp4_static:
+                if module.is_nvfp4_static:
                     initial_amax = module._amax.clone().detach()
                     global_amax = reduce_amax(initial_amax, axis=None)
                     NVFP4StaticQuantizer.from_tensor_quantizer(module, global_amax=global_amax)