Skip to content

Commit ce80c14

Browse files
authored
[None][fix] skip inference_mode() when torch.compile=True for gemma3 fp8 (NVIDIA#12367)
Signed-off-by: Anurag Mukkara <134339030+amukkara@users.noreply.github.com>
1 parent 79d2e37 commit ce80c14

File tree

6 files changed

+30
-8
lines changed

6 files changed

+30
-8
lines changed

tensorrt_llm/_torch/models/modeling_gemma3.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from ..modules.gated_mlp import GatedMLP
2323
from ..modules.linear import TensorParallelMode
2424
from ..modules.rms_norm import RMSNorm
25+
from ..utils import inference_mode_unless_compiling
2526
from .modeling_utils import (DecoderModel, DecoderModelForCausalLM,
2627
register_auto_model)
2728

@@ -47,7 +48,7 @@ def __init__(
4748
)
4849
self.embed_scale = torch.sqrt(torch.tensor(hidden_size)).to(self.dtype)
4950

50-
@torch.inference_mode()
51+
@inference_mode_unless_compiling
5152
def forward(self, input_ids):
5253
return super().forward(input_ids) * self.embed_scale
5354

@@ -90,7 +91,7 @@ def __init__(
9091
q_scaling=q_scaling,
9192
)
9293

93-
@torch.inference_mode()
94+
@inference_mode_unless_compiling
9495
def forward(
9596
self,
9697
position_ids: Optional[torch.IntTensor],
@@ -163,7 +164,7 @@ def __init__(
163164
eps=config.rms_norm_eps,
164165
dtype=config.torch_dtype)
165166

166-
@torch.inference_mode()
167+
@inference_mode_unless_compiling
167168
def forward(
168169
self,
169170
position_ids: torch.IntTensor,
@@ -222,7 +223,7 @@ def __init__(self, model_config: ModelConfig[Gemma3TextConfig]):
222223
eps=config.pretrained_config.rms_norm_eps,
223224
dtype=config.pretrained_config.torch_dtype)
224225

225-
@torch.inference_mode()
226+
@inference_mode_unless_compiling
226227
def forward(
227228
self,
228229
attn_metadata: AttentionMetadata,
@@ -392,7 +393,7 @@ def get_flashinfer_attention_mask(
392393
context_mask_list.append(mask_i.flatten())
393394
return torch.cat(context_mask_list, dim=0).contiguous()
394395

395-
@torch.inference_mode()
396+
@inference_mode_unless_compiling
396397
def forward(
397398
self,
398399
attn_metadata: AttentionMetadata,

tensorrt_llm/_torch/modules/qk_norm_attention.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,7 @@ def k_l2norm():
228228
self.ln_events[0],
229229
self.ln_events[1],
230230
self.aux_stream,
231+
disable_on_compile=True,
231232
)
232233

233234
return q, k

tensorrt_llm/_torch/utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import contextlib
2+
import functools
23
import os
34
import threading
45
from dataclasses import dataclass
@@ -424,6 +425,19 @@ def wrapper(*args, **kwargs):
424425
return decorator(func) if func else decorator
425426

426427

428+
# This decorator selectively disables inference_mode() to avoid conflicts with torch.dynamo tracing.
429+
def inference_mode_unless_compiling(func):
430+
431+
@functools.wraps(func)
432+
def wrapper(*args, **kwargs):
433+
if torch.compiler.is_compiling():
434+
return func(*args, **kwargs)
435+
with torch.inference_mode():
436+
return func(*args, **kwargs)
437+
438+
return wrapper
439+
440+
427441
def split(x: torch.Tensor,
428442
tp_size: int,
429443
idx: int,

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1317,14 +1317,17 @@ def test_auto_dtype(self):
13171317
task = MMLU(self.MODEL_NAME)
13181318
task.evaluate(llm)
13191319

1320-
def test_fp8_prequantized(self):
1320+
@parametrize_with_ids("torch_compile", [False, True])
1321+
def test_fp8_prequantized(self, torch_compile):
13211322
# Disabling kv cache reuse as a WAR to deal with gaps in kernel support for Gemma3's non-inclusive sliding window size.
13221323
kv_cache_config = KvCacheConfig(enable_block_reuse=False,
13231324
enable_partial_reuse=False,
13241325
dtype="fp8")
1326+
torch_compile_config = _get_default_torch_compile_config(torch_compile)
13251327
prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-1b-it-fp8/"
13261328
with LLM(prequantized_model_path,
1327-
kv_cache_config=kv_cache_config) as llm:
1329+
kv_cache_config=kv_cache_config,
1330+
torch_compile_config=torch_compile_config) as llm:
13281331
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
13291332
task = CnnDailymail(self.MODEL_NAME)
13301333
task.evaluate(llm)

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,8 @@ accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
272272
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
273273
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
274274
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
275+
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized[torch_compile=False]
276+
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized[torch_compile=True]
275277
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_vswa_reuse
276278
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_guided_decoding_vswa_reuse[xgrammar]
277279
accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,8 @@ l0_h100:
330330
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
331331
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
332332
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0]
333-
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized
333+
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized[torch_compile=False]
334+
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized[torch_compile=True]
334335
- accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
335336
- accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype TIMEOUT (90)
336337
- accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype

0 commit comments

Comments (0)