Skip to content

Commit 79d2e37

Browse files
[TRTLLM-11770][feat] Skip nvfp4 fused norm if the dim doesn't meet the requirement (NVIDIA#12901)
Signed-off-by: Pamela <179191831+pamelap-nvidia@users.noreply.github.com>
1 parent b1385fe commit 79d2e37

File tree

2 files changed

+25
-4
lines changed

2 files changed

+25
-4
lines changed

tensorrt_llm/_torch/modules/mamba/layernorm_gated.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/layernorm_gated.py
22
# Copyright (c) 2024, Tri Dao, Albert Gu.
33
#
4-
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4+
# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
55
# SPDX-License-Identifier: Apache-2.0
66
#
77
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -23,6 +23,23 @@
2323
from ...utils import Fp4QuantizedTensor
2424

2525

26+
def fused_gated_rmsnorm_quant_shape_ok(hidden_size: int,
                                       group_size: int) -> bool:
    """Return True when ``torch.ops.trtllm.fused_gated_rmsnorm_quant`` accepts this shape.

    Mirrors the TORCH_CHECKs in cpp/tensorrt_llm/thop/fusedGatedRMSNormQuant.cpp;
    keep the two in sync.
    """
    # The group size must be positive and evenly tile the hidden dimension.
    divides_evenly = group_size > 0 and hidden_size % group_size == 0
    if not divides_evenly:
        return False
    # The kernel only supports group sizes that are multiples of 256 within
    # [256, 8192]; the hidden dimension must be 16-aligned for FP4 packing.
    group_ok = group_size % 256 == 0 and 256 <= group_size <= 8192
    hidden_ok = hidden_size % 16 == 0
    return group_ok and hidden_ok
41+
42+
2643
@triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
2744
@triton.heuristics({"HAS_Z": lambda args: args["Z"] is not None})
2845
@triton.jit
@@ -208,7 +225,8 @@ def forward(
208225

209226
# NVFP4 quantized path - uses optimized fused CUDA kernel
210227
# Fuses: SiLU gating + Group RMSNorm + FP4 quantization
211-
if self.is_nvfp4 and z is not None and not self.norm_before_gate:
228+
if self.is_nvfp4 and z is not None and not self.norm_before_gate and \
229+
fused_gated_rmsnorm_quant_shape_ok(self.hidden_size, self.group_size):
212230
if self.nvfp4_scale is None:
213231
raise ValueError(
214232
"RMSNormGated NVFP4 output requested but no `nvfp4_scale` is attached. "

tensorrt_llm/_torch/modules/mamba/mamba2_mixer.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33
#
44
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -39,6 +39,7 @@
3939
from .fuse_elementwise_ops import (extract_transpose_xbc_prefill,
4040
fused_split_rearrange_after_conv1d)
4141
from .layernorm_gated import RMSNorm as RMSNormGated
42+
from .layernorm_gated import fused_gated_rmsnorm_quant_shape_ok
4243
from .selective_state_update import \
4344
selective_state_update as selective_state_update_native
4445
from .ssd_combined import mamba_chunk_scan_combined
@@ -234,7 +235,9 @@ def __init__(
234235

235236
def post_load_weights(self):
236237
"""Post-process after loading weights."""
237-
if self.norm.is_nvfp4 and self.norm.nvfp4_scale is None:
238+
if (self.norm.is_nvfp4 and fused_gated_rmsnorm_quant_shape_ok(
239+
self.norm.hidden_size, self.norm.group_size)
240+
and self.norm.nvfp4_scale is None):
238241
self._try_attach_nvfp4_scale()
239242

240243
def _try_attach_nvfp4_scale(self):

0 commit comments

Comments (0)