
Commit b161f3b: more reviewers' feedback

Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
Parent: ab8a162

3 files changed: 70 additions & 119 deletions

modelopt/torch/quantization/qtensor/nvfp4_tensor.py (13 additions & 18 deletions)
@@ -78,6 +78,16 @@ def get_weights_scaling_factor_2_from_quantizer(cls, weight_quantizer):
         )
         return weight_quantizer._amax.float() / (6.0 * 448.0)
 
+    @classmethod
+    def _cast_per_block_scale_to_fp8(cls, per_block_scale: torch.Tensor) -> torch.Tensor:
+        """Clamp to FP8 E4M3FN representable range, then cast.
+
+        FP8 E4M3FN has no Inf and a smallest positive subnormal of ``2**-9`` (~0.00195).
+        Values below the min silently underflow to 0 (zero outputs at inference); values
+        above 448 cast to NaN.
+        """
+        return per_block_scale.clamp(min=2**-9, max=448.0).to(torch.float8_e4m3fn)
+
     @classmethod
     def get_weights_scaling_factor_from_quantizer(
         cls,
@@ -122,17 +132,9 @@ def get_weights_scaling_factor_from_quantizer(
         expected_shape = (*weight.shape[:-1], num_blocks_per_row)
         per_block_scale = per_block_scale.view(expected_shape)
 
-        # Quantize scales to FP8. Saturate to the fp8_e4m3fn max (448) before the
-        # cast: when the [==0]=1.0 safety net above fires (per_block_amax was zero
-        # for an all-zero weight block) and global_amax is small, the pre-cast value
-        # explodes to ``1.0 * 448 / (global_amax/6)``. fp8_e4m3fn has no Inf, so any
-        # value >= 480 casts to NaN — clamp first to keep the stored byte finite.
         if not keep_high_precision:
-            fp8_e4m3fn_min = 2**-9  # 0.001953125 — smallest positive subnormal
-            per_block_scale = (
-                (per_block_scale * 448.0 / per_block_scale_max)
-                .clamp(min=fp8_e4m3fn_min, max=448.0)
-                .to(torch.float8_e4m3fn)
+            per_block_scale = cls._cast_per_block_scale_to_fp8(
+                per_block_scale * 448.0 / per_block_scale_max
             )
             return per_block_scale, weights_scaling_factor_2
         else:
@@ -172,15 +174,8 @@ def get_weights_scaling_factor(
         )
         # Set all zero values in scale to 1.0
         per_block_scale[per_block_scale == 0] = 1.0
-        # Convert to torch.float8_e4m3fn
        if not keep_high_precision:
-            # Clamp to the minimum positive FP8 E4M3FN subnormal (~0.00195 = 2^-9) before
-            # casting. Without this, blocks whose scale falls below the FP8 representable
-            # range silently underflow to 0, causing those blocks to produce zero output at
-            # inference even when the weights are non-trivial.
-            fp8_e4m3fn_min = 2**-9  # 0.001953125 — smallest positive subnormal
-            per_block_scale = per_block_scale.clamp(min=fp8_e4m3fn_min)
-            per_block_scale = per_block_scale.to(torch.float8_e4m3fn)
+            per_block_scale = cls._cast_per_block_scale_to_fp8(per_block_scale)
         return per_block_scale, weights_scaling_factor_2
 
     @classmethod
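
The refactor centralizes the clamp-then-cast contract in one helper shared by the static and default scale paths. Its effect is easy to reproduce outside the library; a minimal sketch of the two failure modes it guards against (illustrative values; assumes a PyTorch build that ships torch.float8_e4m3fn):

import torch

scales = torch.tensor([1e-6, 1.0, 1e6])  # under-range, in-range, over-range
naive = scales.to(torch.float8_e4m3fn).float()
# -> tensor([0., 1., nan]): the tiny scale silently underflows to 0 and the
#    huge one casts to NaN, since fp8_e4m3fn has no Inf.
safe = scales.clamp(min=2**-9, max=448.0).to(torch.float8_e4m3fn).float()
# -> approximately tensor([0.0020, 1., 448.]): every stored byte stays finite
#    and non-zero, which is what _cast_per_block_scale_to_fp8 enforces.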

modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml (13 additions & 99 deletions)
@@ -13,118 +13,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+imports:
+  base_disable_all: configs/ptq/units/base_disable_all
+  default_disabled_quantizers: configs/ptq/units/default_disabled_quantizers
+  nvfp4: configs/numerics/nvfp4
+  nvfp4_static: configs/numerics/nvfp4_static
+
 metadata:
   recipe_type: ptq
-  description: >
-    NVFP4 W4A4 for MoE routed experts only. Static weight scales via MSE + FP8 scale sweep;
-    dynamic activation scales. Supports sequential experts (nn.Linear-based) and fused experts
-    (_QuantFusedExperts, HF transformers 5.0+ 3D nn.Parameter style).
+  description: NVFP4 static weight (MSE FP8-scale sweep) and dynamic activation for expert layers only (W4A4), no KV-cache quantization.
 quantize:
   algorithm:
     method: mse
     fp8_scale_sweep: true
     layerwise: false
   quant_cfg:
-    # ── Disable everything first ─────────────────────────────────────────────
-    - quantizer_name: '*'
-      enable: false
-
-    # ── Sequential experts (nn.Linear per expert) ────────────────────────────
+    - $import: base_disable_all
     - quantizer_name: '*mlp.experts*weight_quantizer'
-      enable: true
       cfg:
-        block_sizes:
-          -1: 16
-        type: static
-        scale_bits: e4m3
-        num_bits: e2m1
+        $import: nvfp4_static
     - quantizer_name: '*mlp.experts*input_quantizer'
-      enable: true
       cfg:
-        block_sizes:
-          -1: 16
-        type: dynamic
-        scale_bits: e4m3
-        num_bits: e2m1
-
-    # ── Sequential experts: Mixtral / block_sparse_moe style ────────────────
+        $import: nvfp4
     - quantizer_name: '*block_sparse_moe*weight_quantizer'
-      enable: true
       cfg:
-        block_sizes:
-          -1: 16
-        type: static
-        scale_bits: e4m3
-        num_bits: e2m1
+        $import: nvfp4_static
     - quantizer_name: '*block_sparse_moe*input_quantizer'
-      enable: true
-      cfg:
-        block_sizes:
-          -1: 16
-        type: dynamic
-        scale_bits: e4m3
-        num_bits: e2m1
-
-    # ── Fused experts (_QuantFusedExperts, HF transformers 5.0+ 3D nn.Parameter style) ──
-    - quantizer_name: '*gate_up_proj_weight_quantizers*'
-      enable: true
       cfg:
-        block_sizes:
-          -1: 16
-        type: static
-        scale_bits: e4m3
-        num_bits: e2m1
-    - quantizer_name: '*gate_up_proj_input_quantizer*'
-      enable: true
-      cfg:
-        block_sizes:
-          -1: 16
-        type: dynamic
-        scale_bits: e4m3
-        num_bits: e2m1
-    - quantizer_name: '*down_proj_weight_quantizers*'
-      enable: true
-      cfg:
-        block_sizes:
-          -1: 16
-        type: static
-        scale_bits: e4m3
-        num_bits: e2m1
-    - quantizer_name: '*down_proj_input_quantizer*'
-      enable: true
-      cfg:
-        block_sizes:
-          -1: 16
-        type: dynamic
-        scale_bits: e4m3
-        num_bits: e2m1
-
-    # ── Exclusions: shared experts, attention, routers, lm_head ─────────────
-    - quantizer_name: '*block_sparse_moe.gate*'
-      enable: false
-    - quantizer_name: '*linear_attn.conv1d*'
-      enable: false
-    - quantizer_name: '*lm_head*'
-      enable: false
-    - quantizer_name: '*mlp.gate.*'
-      enable: false
-    - quantizer_name: '*mlp.shared_expert*'
-      enable: false
-    - quantizer_name: '*mlp.shared_expert_gate.*'
-      enable: false
-    - quantizer_name: '*router*'
-      enable: false
-    - quantizer_name: 'output.*'
-      enable: false
-    - parent_class: 'nn.BatchNorm1d'
-      quantizer_name: '*'
-      enable: false
-    - parent_class: 'nn.BatchNorm2d'
-      quantizer_name: '*'
-      enable: false
-    - parent_class: 'nn.BatchNorm3d'
-      quantizer_name: '*'
-      enable: false
-    - parent_class: 'nn.LeakyReLU'
-      quantizer_name: '*'
-      enable: false
+        $import: nvfp4
+    - $import: default_disabled_quantizers
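
The recipe now composes shared units via $import rather than repeating each quantizer block inline. The unit files are not part of this commit, so their exact contents are an assumption, but judging from the inline settings they replace, nvfp4_static would expand to something like:

# Hypothetical contents of configs/numerics/nvfp4_static, inferred from the
# inline config removed above (not the actual file).
enable: true
block_sizes:
  -1: 16          # 16-element quantization blocks along the last dimension
type: static      # static (calibrated) weight scales
scale_bits: e4m3  # FP8 E4M3 per-block scale factors
num_bits: e2m1    # FP4 E2M1 quantized values

The nvfp4 unit is presumably identical except for type: dynamic, matching the removed activation-quantizer blocks.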

tests/unit/torch/quantization/test_nvfp4_tensor.py (44 additions & 2 deletions)
@@ -13,17 +13,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Tests for NVFP4QTensor per-block FP8 scale underflow clamping."""
+"""Tests for NVFP4QTensor per-block FP8 scale clamping (underflow + overflow)."""
+
+from types import SimpleNamespace
 
 import torch
 
 from modelopt.torch.quantization.qtensor.nvfp4_tensor import NVFP4QTensor
 
 _FP8_E4M3FN_MIN = 2**-9  # 0.001953125 — smallest positive FP8 E4M3FN subnormal
+_FP8_E4M3FN_MAX = 448.0
 
 
 class TestNVFP4ScaleClamping:
-    """Per-block weight scales below the FP8 E4M3FN minimum must be clamped, not rounded to zero."""
+    """Per-block weight scales outside the FP8 E4M3FN range must be clamped, not turned into 0/NaN."""
 
     def test_no_zero_scales_for_tiny_weights(self):
         """Tiny per-block amax (<<FP8 min) must not underflow to zero after FP8 cast."""
@@ -67,3 +70,42 @@ def test_mixed_weight_no_zeros(self):
         assert (per_block_scale.float() > 0).all(), (
             "Zero scales in mixed-magnitude tensor after FP8 cast."
         )
+
+    def test_helper_clamps_overflow_to_max(self):
+        """Values above 448 must saturate to 448, not cast to NaN (fp8_e4m3fn has no Inf)."""
+        oversized = torch.tensor([100.0, 448.0, 1e3, 1e6])
+        out = NVFP4QTensor._cast_per_block_scale_to_fp8(oversized).float()
+        assert torch.isfinite(out).all(), f"FP8 cast produced non-finite values: {out.tolist()}"
+        assert (out <= _FP8_E4M3FN_MAX).all(), f"FP8 cast values exceed 448: {out.tolist()}"
+
+    def test_helper_clamps_underflow_to_min(self):
+        """Values below the FP8 subnormal must clamp up, not collapse to 0."""
+        tiny = torch.tensor([0.0, 1e-12, 1e-6, _FP8_E4M3FN_MIN / 2])
+        out = NVFP4QTensor._cast_per_block_scale_to_fp8(tiny).float()
+        assert (out > 0).all(), f"FP8 cast produced zero scales: {out.tolist()}"
+
+    def test_static_path_no_nan_when_block_amax_zero(self):
+        """Static path: when a block's amax is 0 (all-zero weights), the `[==0]=1.0` safety net
+        and a small global_amax push the pre-cast value above 448. Without the max clamp,
+        fp8_e4m3fn would cast it to NaN — regression for the export-time NaN reported on this PR.
+        """
+        block_size = 16
+        # global_amax small enough that 1.0 * 448 / (global_amax/6) >> 448.
+        global_amax = torch.tensor(0.01)
+        # One block with amax=0 (triggers safety net), three normal blocks.
+        per_block_amax = torch.tensor([[0.0, 0.005, 0.008, 0.01]])
+        weight = torch.randn(1, 4 * block_size)
+        q = SimpleNamespace(
+            global_amax=global_amax,
+            _amax=per_block_amax,
+            block_sizes={-1: block_size},
+        )
+
+        per_block_scale, _ = NVFP4QTensor.get_weights_scaling_factor_from_quantizer(q, weight)
+        per_block_scale_f32 = per_block_scale.float()
+        assert torch.isfinite(per_block_scale_f32).all(), (
+            f"NaN/Inf in exported static per-block scale: {per_block_scale_f32.tolist()}"
+        )
+        assert (per_block_scale_f32 <= _FP8_E4M3FN_MAX).all(), (
+            f"Static per-block scale exceeds FP8 max 448: {per_block_scale_f32.tolist()}"
+        )
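
The last test pins down a concrete arithmetic failure. Using the test's own numbers, and taking per_block_scale_max = global_amax / 6 as stated in the comment removed from nvfp4_tensor.py, the pre-cast value for the all-zero block works out as follows (a worked sketch, not library code):

global_amax = 0.01
safety_net_scale = 1.0  # the per_block_scale[per_block_scale == 0] = 1.0 net fired
pre_cast = safety_net_scale * 448.0 / (global_amax / 6)
print(pre_cast)  # 268800.0, vastly above the FP8 E4M3FN max of 448

Cast directly to torch.float8_e4m3fn, 268800.0 becomes NaN; clamped to 448.0 first, as _cast_per_block_scale_to_fp8 now does, the exported scale stays finite.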
