
Commit cfe4a4a

more reviewer feedback
Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
1 parent b5e2c71 commit cfe4a4a

4 files changed: 81 additions, 21 deletions

modelopt/torch/export/moe_utils.py

Lines changed: 3 additions & 4 deletions
@@ -90,13 +90,11 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
             and w_quantizer._amax is not None
             and w_quantizer._amax.dim() >= 1
         ):
-            amax = w_quantizer._amax  # CPU float32
+            amax = w_quantizer._amax
             amax_dim0 = amax.shape[0]
-            if amax_dim0 % fused_total == 0:
+            if fused_total % amax_dim0 == 0:
                 slice_start = fused_start * amax_dim0 // fused_total
                 slice_end = (fused_start + weight_slice.shape[0]) * amax_dim0 // fused_total
-                # Bypass amax.setter (which forbids shape changes); w_quantizer is a
-                # deepcopy for gate/up so mutating it is safe.
                 w_quantizer._amax = amax[slice_start:slice_end].contiguous()
             else:
                 warnings.warn(
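
For intuition, a minimal standalone sketch of the slicing arithmetic this hunk corrects; the expert counts, row counts, and amax values here are invented for illustration and are not taken from the module:

import torch

# Hypothetical fused layout: 2 experts x 4 rows each -> fused_total = 8 rows.
fused_total = 8
amax = torch.tensor([0.5, 0.7, 0.9, 1.1])  # amax_dim0 = 4: one entry per 2 rows
amax_dim0 = amax.shape[0]

# Corrected direction of the divisibility check: the fused row count must be a
# whole multiple of the amax leading dim, so each expert's row range maps onto
# whole amax entries. The old check (amax_dim0 % fused_total == 0) only passed
# when amax_dim0 was a multiple of fused_total, the opposite relationship.
assert fused_total % amax_dim0 == 0

fused_start, expert_rows = 4, 4  # second expert; expert_rows stands in for weight_slice.shape[0]
slice_start = fused_start * amax_dim0 // fused_total                  # -> 2
slice_end = (fused_start + expert_rows) * amax_dim0 // fused_total    # -> 4
print(amax[slice_start:slice_end])  # tensor([0.9000, 1.1000])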
@@ -114,6 +112,7 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
             hasattr(w_quantizer, "_amax")
             and w_quantizer._amax is not None
             and w_quantizer._amax.numel() > 1
+            and (getattr(w_quantizer, "block_sizes", None) or {}).get(-1) is not None
         ):
             amax_cpu = w_quantizer._amax
             invalid_mask = ~(
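
The new block_sizes guard restricts the invalid-amax repair path to quantizers that actually block along the last dimension. A small illustration of how the expression evaluates (Q is a hypothetical stand-in, not a modelopt class):

class Q:
    """Hypothetical stand-in for a quantizer that may carry block_sizes."""
    def __init__(self, block_sizes=None):
        if block_sizes is not None:
            self.block_sizes = block_sizes

for q in (Q(), Q({}), Q({-1: 16}), Q({-2: 32})):
    has_last_dim_blocks = (getattr(q, "block_sizes", None) or {}).get(-1) is not None
    print(getattr(q, "block_sizes", "<unset>"), "->", has_last_dim_blocks)
# <unset> -> False, {} -> False, {-1: 16} -> True, {-2: 32} -> False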

modelopt/torch/quantization/model_calib.py

Lines changed: 53 additions & 14 deletions
@@ -410,7 +410,12 @@ def mse_calibrate(
             quant_func=partial(_mse_quant_func, quantizer=module),
         )

-    # Identify weight quantizers by checking if they have corresponding weight parameters
+    # Collect weight quantizers (standard + fused-experts per-expert lists).
+    try:
+        from modelopt.torch.quantization.plugins.huggingface import _QuantFusedExperts as _qfe_cls
+    except ImportError:
+        _qfe_cls = None  # type: ignore[misc]
+
     name_to_module = dict(model.named_modules())
     for parent_module in name_to_module.values():
         if parent_module in seen_modules:
@@ -421,22 +426,56 @@ def mse_calibrate(
             if isinstance(weight_quantizer, TensorQuantizer) and weight_quantizer.is_enabled:
                 if getattr(weight_quantizer, "_calibrator", None) is not None:
                     weight_quantizers.append((parent_module, weight_name, weight_quantizer))
-        # _QuantFusedExperts stores per-expert weight quantizers as nn.ModuleList named
-        # {param_name}_weight_quantizers (plural). Detect this pattern and enqueue each
-        # per-expert quantizer individually. The isinstance(qlist, nn.ModuleList) +
-        # isinstance(wq, TensorQuantizer) check below guards against false positives on
-        # unrelated modules that happen to have similarly-named attributes.
-        for param_name, _ in parent_module.named_parameters(recurse=False):
-            qlist = getattr(parent_module, f"{param_name}_weight_quantizers", None)
-            if not isinstance(qlist, nn.ModuleList):
-                continue
-            for expert_idx, wq in enumerate(qlist):
-                if isinstance(wq, TensorQuantizer) and wq.is_enabled:
-                    if getattr(wq, "_calibrator", None) is not None:
-                        weight_quantizers.append((parent_module, (param_name, expert_idx), wq))
+        # Enqueue per-expert quantizers from {param}_weight_quantizers ModuleLists.
+        if _qfe_cls is not None and isinstance(parent_module, _qfe_cls):
+            for param_name, param in parent_module.named_parameters(recurse=False):
+                qlist = getattr(parent_module, f"{param_name}_weight_quantizers", None)
+                if not isinstance(qlist, nn.ModuleList):
+                    continue
+                if len(qlist) != param.shape[0]:
+                    warnings.warn(
+                        f"Skipping {param_name}_weight_quantizers: list length {len(qlist)} "
+                        f"does not match parameter leading dimension {param.shape[0]}. "
+                        "This may indicate a misconfigured fused-experts module.",
+                        stacklevel=2,
+                    )
+                    continue
+                for expert_idx, wq in enumerate(qlist):
+                    if isinstance(wq, TensorQuantizer) and wq.is_enabled:
+                        if getattr(wq, "_calibrator", None) is not None:
+                            weight_quantizers.append((parent_module, (param_name, expert_idx), wq))

         seen_modules.add(parent_module)

+    # Warn about enabled weight quantizers that weren't scheduled for MSE calibration.
+    picked_ids = {id(wq) for _, _, wq in weight_quantizers}
+
+    def _is_active_unpicked(q: Any) -> bool:
+        return (
+            isinstance(q, TensorQuantizer)
+            and q.is_enabled
+            and getattr(q, "_calibrator", None) is not None
+            and id(q) not in picked_ids
+        )
+
+    missed: list[str] = []
+    for mod_name, module in name_to_module.items():
+        for attr_name, attr in module._modules.items():
+            if isinstance(attr, TensorQuantizer) and attr_name.endswith("weight_quantizer"):
+                if _is_active_unpicked(attr):
+                    missed.append(f"{mod_name}.{attr_name}")
+            elif isinstance(attr, nn.ModuleList) and attr_name.endswith("_weight_quantizers"):
+                for i, wq in enumerate(attr):
+                    if _is_active_unpicked(wq):
+                        missed.append(f"{mod_name}.{attr_name}[{i}]")
+    if missed:
+        warnings.warn(
+            f"MSE weight calibration: {len(missed)} weight quantizer(s) are enabled but were "
+            f"not scheduled for calibration and will retain max-calibration amax values. "
+            f"First {min(5, len(missed))}: {missed[:5]}",
+            stacklevel=2,
+        )
+
     # Step 3: Calibrate weight quantizers ONE AT A TIME with immediate amax computation
     # This prevents massive memory accumulation seen in large models
     for idx, (parent_module, weight_name, weight_quantizer) in enumerate(
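
To make the {param}_weight_quantizers convention concrete, a self-contained mock of the enumeration the new block performs; FakeQuantizer and FakeExperts are stand-ins invented here, not modelopt classes:

import torch
import torch.nn as nn

class FakeQuantizer(nn.Module):
    """Stand-in for an enabled TensorQuantizer with a live calibrator."""
    is_enabled = True
    _calibrator = object()

class FakeExperts(nn.Module):
    def __init__(self, num_experts=4):
        super().__init__()
        # Fused parameter: leading dim = number of experts.
        self.gate_up_proj = nn.Parameter(torch.zeros(num_experts, 8, 8))
        # Per-expert quantizers under the {param}_weight_quantizers name.
        self.gate_up_proj_weight_quantizers = nn.ModuleList(
            FakeQuantizer() for _ in range(num_experts)
        )

mod = FakeExperts()
picked = []
for param_name, param in mod.named_parameters(recurse=False):
    qlist = getattr(mod, f"{param_name}_weight_quantizers", None)
    if not isinstance(qlist, nn.ModuleList) or len(qlist) != param.shape[0]:
        continue  # mirrors the length-vs-leading-dim guard above
    for expert_idx, wq in enumerate(qlist):
        if wq.is_enabled and getattr(wq, "_calibrator", None) is not None:
            picked.append((param_name, expert_idx))
print(picked)  # [('gate_up_proj', 0), ('gate_up_proj', 1), ('gate_up_proj', 2), ('gate_up_proj', 3)]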

tests/unit/torch/quantization/plugins/test_fused_experts.py

Lines changed: 23 additions & 2 deletions
@@ -713,7 +713,7 @@ def _quantize_with_block_sizes(self):

     @pytest.mark.parametrize("zero_amax", [False, True])
     def test_fallback_warning_emitted(self, zero_amax):
-        """Fallback warning must fire for uncalibrated (_amax=None) and zero-amax experts."""
+        """Fallback warning must fire and produce valid per-block _amax + global_amax."""
         import warnings
         from unittest.mock import patch

@@ -725,8 +725,16 @@ def test_fallback_warning_emitted(self, zero_amax):
             converted.gate_up_proj_weight_quantizers[idx]._amax = bad_amax
             converted.down_proj_weight_quantizers[idx]._amax = bad_amax

+        captured_wrappers = []
+
+        def _capture(wrapper, dtype):
+            captured_wrappers.append(wrapper)
+
         with (
-            patch("modelopt.torch.export.unified_export_hf._export_quantized_weight"),
+            patch(
+                "modelopt.torch.export.unified_export_hf._export_quantized_weight",
+                side_effect=_capture,
+            ),
             warnings.catch_warnings(record=True) as caught,
         ):
             warnings.simplefilter("always")
@@ -735,4 +743,17 @@ def test_fallback_warning_emitted(self, zero_amax):
         assert any("weight-derived per-block amax" in str(w.message) for w in caught), (
             f"No fallback warning emitted for {'zero' if zero_amax else 'None'} amax — Bug 3 regression"
         )
+
+        # Every per-block weight quantizer must have a repaired per-block _amax and global_amax.
+        for wrapper in captured_wrappers:
+            wq = wrapper.weight_quantizer
+            if not (getattr(wq, "block_sizes", None) or {}).get(-1):
+                continue
+            assert wq._amax is not None and wq._amax.numel() > 1, (
+                "Fallback did not produce per-block _amax"
+            )
+            assert hasattr(wq, "global_amax") and wq.global_amax > 0, (
+                "global_amax missing or zero after fallback"
+            )
+
         self._cleanup_registry(expert_type)
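
The side_effect=_capture pattern in the second hunk (stubbing a heavy export function while still recording its call arguments) is a standard unittest.mock idiom. A minimal standalone illustration with invented names:

from unittest.mock import patch

def export_quantized_weight(wrapper, dtype):
    """Hypothetical heavy function we want to stub out in a test."""
    raise RuntimeError("too expensive to run in a unit test")

captured = []

def _capture(wrapper, dtype):
    # side_effect replaces the real body; the patched callable still routes here.
    captured.append(wrapper)

with patch(f"{__name__}.export_quantized_weight", side_effect=_capture):
    export_quantized_weight("wrapper-0", "bfloat16")
    export_quantized_weight("wrapper-1", "bfloat16")

assert captured == ["wrapper-0", "wrapper-1"]  # arguments available for later asserts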

tests/unit/torch/quantization/test_nvfp4_tensor.py

Lines changed: 2 additions & 1 deletion
@@ -29,7 +29,8 @@ def test_no_zero_scales_for_tiny_weights(self):
         """Tiny per-block amax (<<FP8 min) must not underflow to zero after FP8 cast."""
         block_size = 16
         tiny_weight = torch.full((4, block_size), 1e-10)
-        wsf2 = torch.tensor(1e-10 / (6.0 * 448.0))
+        # wsf2=1.0 → per_block_scale = amax/(6*wsf2) ≈ 1.7e-11 << 2^-9, exercises FP8-min clamp
+        wsf2 = torch.tensor(1.0)

         per_block_scale, _ = NVFP4QTensor.get_weights_scaling_factor(tiny_weight, block_size, wsf2)
         per_block_scale_f32 = per_block_scale.float()
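
The arithmetic behind the new comment, assuming per_block_scale = amax / (6 * wsf2) as stated there and 2**-9 as the smallest positive FP8 E4M3 value (requires a PyTorch build with torch.float8_e4m3fn):

import torch

amax = torch.tensor(1e-10)               # per-block amax of the tiny test weight
wsf2 = torch.tensor(1.0)
per_block_scale = amax / (6.0 * wsf2)    # ~1.67e-11, far below FP8's smallest value

fp8_min = 2.0 ** -9                      # smallest positive float8_e4m3fn (subnormal)
print(per_block_scale.item() < fp8_min)  # True

# A naive FP8 cast underflows to zero; clamping first is what the test exercises.
print(per_block_scale.to(torch.float8_e4m3fn).float().item())                     # 0.0
print(per_block_scale.clamp(min=fp8_min).to(torch.float8_e4m3fn).float().item())  # 0.001953125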
