Skip to content

Commit 2f02816

Browse files
authored
[None][feat] AutoDeploy: Onboard google/gemma-4-31B-it dense model, including nvfp4 (NVIDIA#12866)
Signed-off-by: Suyog Gupta <41447211+suyoggupta@users.noreply.github.com>
1 parent ce80c14 commit 2f02816

File tree

7 files changed

+616
-34
lines changed

7 files changed

+616
-34
lines changed

docs/source/models/supported-models.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ The following is a table of supported models for the PyTorch backend:
1414
| `ExaoneMoEForCausalLM` | K-EXAONE | `LGAI-EXAONE/K-EXAONE-236B-A23B` |
1515
| `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it` |
1616
| `Gemma3nForConditionalGeneration` [^8]| Gemma 3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it` |
17-
| `Gemma4ForConditionalGeneration` [^7]| Gemma 4 | `google/gemma-4-26B-A4B-it` |
17+
| `Gemma4ForConditionalGeneration` [^7]| Gemma 4 | `google/gemma-4-26B-A4B-it`, `google/gemma-4-31B-it` |
1818
| `Glm4MoeForCausalLM` | GLM-4.5, GLM-4.6, GLM-4.7 | `THUDM/GLM-4-100B-A10B` |
1919
| `Glm4MoeLiteForCausalLM` [^6] | GLM-4.7-Flash | `zai-org/GLM-4.7-Flash` |
2020
| `GlmMoeDsaForCausalLM` | GLM-5 | `zai-org/GLM-5` |
@@ -62,7 +62,7 @@ Note: Support for other models may vary. Features marked "N/A" are not applicabl
6262
[^4]: Overlap scheduler isn't supported when using EAGLE-3 (Two Model Engine) for GPT-OSS.
6363
[^5]: Supported via the [AutoDeploy](../features/auto_deploy/auto-deploy.md) backend. See [AD config](../../../examples/auto_deploy/model_registry/configs/qwen3.5_moe_400b.yaml).
6464
[^6]: Supported via the [AutoDeploy](../features/auto_deploy/auto-deploy.md) backend. See [AD config](../../../examples/auto_deploy/model_registry/configs/glm-4.7-flash.yaml).
65-
[^7]: Text-only support via the [AutoDeploy](../features/auto_deploy/auto-deploy.md) backend. See [AD config](../../../examples/auto_deploy/model_registry/configs/gemma4_moe.yaml).
65+
[^7]: Text-only support via the [AutoDeploy](../features/auto_deploy/auto-deploy.md) backend. See AD configs for [MoE](../../../examples/auto_deploy/model_registry/configs/gemma4_moe.yaml) and [dense](../../../examples/auto_deploy/model_registry/configs/gemma4_dense.yaml).
6666
[^8]: Text-only support via the [AutoDeploy](../features/auto_deploy/auto-deploy.md) backend. See [AD config](../../../examples/auto_deploy/model_registry/configs/gemma3n_e2b_it.yaml).
6767

6868

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# Gemma 4 dense (31B) — text-only AD export path.
5+
# Uses triton paged attention backend: supports head_dim=512 (global_head_dim),
6+
# paged KV cache, CUDA-graph-compatible, FlashDecoding for decode.
7+
model_factory: Gemma4ForConditionalGeneration
8+
tokenizer: google/gemma-4-31B-it
9+
attn_backend: triton_paged
10+
compile_backend: torch-cudagraph
11+
cuda_graph_config:
12+
batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
13+
max_num_tokens: 8192
14+
max_batch_size: 512
15+
max_seq_len: 8192
16+
enable_chunked_prefill: true
17+
kv_cache_config:
18+
enable_block_reuse: false
19+
free_gpu_memory_fraction: 0.8
20+
transforms:
21+
compile_model:
22+
piecewise_enabled: true
23+
mlir_elementwise_fusion:
24+
enabled: true
25+
gather_logits_before_lm_head:
26+
enabled: true
27+
fuse_gemms:
28+
enabled: true

examples/auto_deploy/model_registry/models.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,9 @@ models:
315315
yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml', 'gemma4_moe_base.yaml']
316316
- name: google/gemma-4-26B-A4B-it
317317
yaml_extra: ['dashboard_default.yaml', 'world_size_1.yaml', 'gemma4_moe.yaml']
318+
# --- Gemma 4 (2026) - Dense 31B ---
319+
- name: google/gemma-4-31B-it
320+
yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'gemma4_dense.yaml']
318321
# --- JetBrains Mellum (Apr 2025) - code specialist ---
319322
- name: JetBrains/Mellum-4b-sft-all
320323
yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']

tensorrt_llm/_torch/auto_deploy/custom_ops/attention/triton_paged_attention.py

Lines changed: 41 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -347,14 +347,14 @@ def _flash_decode_stage1_kernel(
347347
)
348348
page_mask_2d = page_mask[:, None]
349349

350-
k = tl.load(
351-
kv_cache_ptr + cache_base, mask=page_mask_2d, other=0.0
352-
) # [PAGE_SIZE, HEAD_DIM]
350+
k = tl.load(kv_cache_ptr + cache_base, mask=page_mask_2d, other=0.0).to(
351+
q_all.dtype
352+
) # [PAGE_SIZE, HEAD_DIM]; cast from fp8 if kv cache is fp8
353353
v = tl.load(
354354
kv_cache_ptr + cache_base + cache_stride_kv,
355355
mask=page_mask_2d,
356356
other=0.0,
357-
) # [PAGE_SIZE, HEAD_DIM]
357+
).to(q_all.dtype) # [PAGE_SIZE, HEAD_DIM]; cast from fp8 if kv cache is fp8
358358

359359
# [HEAD_RATIO_PADDED, HEAD_DIM] @ [HEAD_DIM, PAGE_SIZE] -> [HEAD_RATIO_PADDED, PAGE_SIZE]
360360
attn = tl.dot(q_all, tl.trans(k)) * SM_SCALE
@@ -728,12 +728,12 @@ def _paged_context_kernel(
728728
kv_cache_ptr + page_base + local_kv,
729729
mask=tl.full([PAGE_SIZE, HEAD_DIM], 1, tl.int1),
730730
other=0.0,
731-
)
731+
).to(q.dtype) # cast from fp8 if kv cache is fp8
732732
v = tl.load(
733733
kv_cache_ptr + page_base + local_kv + cache_stride_kv,
734734
mask=tl.full([PAGE_SIZE, HEAD_DIM], 1, tl.int1),
735735
other=0.0,
736-
)
736+
).to(q.dtype) # cast from fp8 if kv cache is fp8
737737

738738
qk = tl.dot(q, tl.trans(k)) * SM_SCALE
739739

@@ -745,24 +745,16 @@ def _paged_context_kernel(
745745
full_mask_p1 = q_mask[:, None] & sw_mask
746746
qk = tl.where(full_mask_p1, qk, float("-inf"))
747747
else:
748-
k_block_ptr = tl.make_block_ptr(
749-
base=kv_cache_ptr + page_base,
750-
shape=(PAGE_SIZE, HEAD_DIM),
751-
strides=(cache_stride_token, 1),
752-
offsets=(0, 0),
753-
block_shape=(PAGE_SIZE, HEAD_DIM),
754-
order=(1, 0),
755-
)
756-
v_block_ptr = tl.make_block_ptr(
757-
base=kv_cache_ptr + page_base + cache_stride_kv,
758-
shape=(PAGE_SIZE, HEAD_DIM),
759-
strides=(cache_stride_token, 1),
760-
offsets=(0, 0),
761-
block_shape=(PAGE_SIZE, HEAD_DIM),
762-
order=(1, 0),
763-
)
764-
k = tl.load(k_block_ptr)
765-
v = tl.load(v_block_ptr)
748+
k = tl.load(
749+
kv_cache_ptr + page_base + local_kv,
750+
mask=tl.full([PAGE_SIZE, HEAD_DIM], 1, tl.int1),
751+
other=0.0,
752+
).to(q.dtype) # cast from fp8 if kv cache is fp8
753+
v = tl.load(
754+
kv_cache_ptr + page_base + local_kv + cache_stride_kv,
755+
mask=tl.full([PAGE_SIZE, HEAD_DIM], 1, tl.int1),
756+
other=0.0,
757+
).to(q.dtype) # cast from fp8 if kv cache is fp8
766758

767759
qk = tl.dot(q, tl.trans(k)) * SM_SCALE
768760

@@ -799,12 +791,14 @@ def _paged_context_kernel(
799791
# Use int64 to avoid overflow when physical_page * stride > 2^31
800792
page_base = physical_page.to(tl.int64) * cache_stride_block + kv_head_offset
801793
page_mask_2d = page_mask[:, None]
802-
k = tl.load(kv_cache_ptr + page_base + local_kv, mask=page_mask_2d, other=0.0)
794+
k = tl.load(kv_cache_ptr + page_base + local_kv, mask=page_mask_2d, other=0.0).to(
795+
q.dtype
796+
) # cast from fp8 if kv cache is fp8
803797
v = tl.load(
804798
kv_cache_ptr + page_base + local_kv + cache_stride_kv,
805799
mask=page_mask_2d,
806800
other=0.0,
807-
)
801+
).to(q.dtype) # cast from fp8 if kv cache is fp8
808802

809803
qk = tl.dot(q, tl.trans(k)) * SM_SCALE
810804
kv_positions = kv_base_pos + page_offsets[None, :]
@@ -938,11 +932,24 @@ def triton_paged_context(
938932

939933
max_pages = (max_q_len + page_size - 1) // page_size
940934
total_expected_pages = num_seq * max_pages
935+
# Force SDPA for large head_dim: the Triton paged kernel's tl.dot produces
936+
# misaligned shared memory accesses on Blackwell when HEAD_DIM > 256.
937+
large_head_dim = head_dim > 256
938+
# kv_indices may be a pre-allocated buffer larger than the actual page count;
939+
# fall back to the page table indptr which always reflects the true count.
940+
pages_uniform = kv_indices.shape[0] == total_expected_pages or (
941+
max_pages > 0 and int(kv_indptr[-1].item()) == total_expected_pages
942+
)
943+
# SDPA reshape requires all sequences to have the same q_len (since q is
944+
# packed as [total_tokens, ...] and we reshape to [num_seq, max_q_len, ...]).
945+
# Check without a GPU sync: since each q_len_i <= max_q_len, the sum
# sum(q_len_i) reaches num_seq * max_q_len iff every q_len_i == max_q_len.
946+
all_same_q_len = total_tokens == num_seq * max_q_len
941947
use_sdpa = (
942-
max_q_len >= 512
943-
and num_seq <= 64
948+
(max_q_len >= 512 or large_head_dim)
949+
and (num_seq <= 64 or large_head_dim)
944950
and max_pages > 0
945-
and kv_indices.shape[0] == total_expected_pages # all seqs same page count
951+
and pages_uniform
952+
and all_same_q_len
946953
and sw == 0 # SDPA doesn't support sliding window natively
947954
)
948955

@@ -979,6 +986,11 @@ def triton_paged_context(
979986
HEAD_DIM=head_dim,
980987
)
981988

989+
# Cast k/v to query dtype if kv cache uses a different dtype (e.g., fp8)
990+
if kv_cache.dtype != q.dtype:
991+
k_sdpa = k_sdpa.to(q.dtype)
992+
v_sdpa = v_sdpa.to(q.dtype)
993+
982994
# SDPA with GQA
983995
o_sdpa = torch.nn.functional.scaled_dot_product_attention(
984996
q.view(num_seq, max_q_len, n_heads, head_dim).transpose(1, 2),

tensorrt_llm/_torch/auto_deploy/models/quant_config_reader.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,13 @@ def has(cls, reader_cls: str) -> bool:
8585

8686
@QuantConfigReaderRegistry.register("modelopt")
8787
class ModelOPTQuantConfigReader(QuantConfigReader):
88-
_ALWAYS_EXCLUDE = ("lm_head", "model.embed_tokens", "*.mixer.gate*", "*.mlp.gate")
88+
_ALWAYS_EXCLUDE = (
89+
"lm_head",
90+
"model.embed_tokens",
91+
"*.embed_tokens",
92+
"*.mixer.gate*",
93+
"*.mlp.gate",
94+
)
8995
DEFAULT_TORCH_DTYPE = "float16"
9096
DEFAULT_KV_CACHE_DTYPE = "fp8"
9197

tests/unittest/_torch/auto_deploy/unit/singlegpu/models/test_gemma4_modeling.py

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,55 @@ def _small_text_config() -> Gemma4TextConfig:
107107
return config
108108

109109

110+
def _small_dense_text_config() -> Gemma4TextConfig:
111+
"""Small config mimicking gemma-4-31B-it (dense, no MoE)."""
112+
config = Gemma4TextConfig(
113+
vocab_size=256,
114+
hidden_size=64,
115+
intermediate_size=128,
116+
num_hidden_layers=3,
117+
num_attention_heads=4,
118+
num_key_value_heads=2,
119+
num_global_key_value_heads=1,
120+
head_dim=16,
121+
global_head_dim=32,
122+
hidden_activation="gelu_pytorch_tanh",
123+
max_position_embeddings=64,
124+
rms_norm_eps=1e-6,
125+
attention_bias=False,
126+
attention_dropout=0.0,
127+
attention_k_eq_v=True,
128+
sliding_window=16,
129+
layer_types=["sliding_attention", "sliding_attention", "full_attention"],
130+
enable_moe_block=False,
131+
num_experts=None,
132+
top_k_experts=None,
133+
expert_intermediate_size=None,
134+
final_logit_softcapping=30.0,
135+
hidden_size_per_layer_input=0,
136+
num_kv_shared_layers=0,
137+
use_double_wide_mlp=False,
138+
use_bidirectional_attention="vision",
139+
rope_parameters={
140+
"full_attention": {
141+
"rope_type": "proportional",
142+
"rope_theta": 1000000.0,
143+
"partial_rotary_factor": 0.25,
144+
},
145+
"sliding_attention": {
146+
"rope_type": "default",
147+
"rope_theta": 10000.0,
148+
},
149+
},
150+
pad_token_id=0,
151+
eos_token_id=1,
152+
bos_token_id=2,
153+
tie_word_embeddings=True,
154+
)
155+
config._attn_implementation = "eager"
156+
return config
157+
158+
110159
def _position_ids(batch_size: int, seq_len: int, device: str) -> torch.Tensor:
111160
return torch.arange(seq_len, device=device).unsqueeze(0).expand(batch_size, -1)
112161

@@ -695,3 +744,133 @@ def test_export():
695744
logits2 = out2[0] if isinstance(out2, tuple) else getattr(out2, "logits", out2)
696745
assert logits2.shape == (B2, S2, config.vocab_size)
697746
assert torch.isfinite(logits2).all()
747+
748+
749+
# ---------------------------------------------------------------------------
750+
# Tests — Dense variant (gemma-4-31B-it style, no MoE)
751+
# ---------------------------------------------------------------------------
752+
753+
754+
def test_dense_decoder_layer_equivalence():
755+
"""Dense (non-MoE) decoder layer matches reference for sliding and full attention."""
756+
device, dtype = _device_and_dtype()
757+
config = _small_dense_text_config()
758+
759+
for layer_idx in [0, 2]:
760+
layer_type = config.layer_types[layer_idx]
761+
ref = _RefDecoderLayer(config, layer_idx).to(device=device, dtype=dtype).eval()
762+
ad = Gemma4TextDecoderLayer(config, layer_idx).to(device=device, dtype=dtype).eval()
763+
_load_ref_into_ad(ad, ref)
764+
765+
B, S = 2, 8
766+
x = torch.randn(B, S, config.hidden_size, device=device, dtype=dtype)
767+
pos_ids = _position_ids(B, S, device)
768+
rope = _build_ref_rope(config, layer_type, device, dtype)
769+
cos, sin = rope(x, pos_ids)
770+
771+
causal_mask = (
772+
torch.triu(torch.full((S, S), float("-inf"), device=device, dtype=dtype), diagonal=1)
773+
.unsqueeze(0)
774+
.unsqueeze(0)
775+
)
776+
777+
with torch.no_grad():
778+
ad_out = ad(x, (cos, sin))
779+
ref_out = ref(x, (cos, sin), attention_mask=causal_mask)
780+
assert_rmse_close(
781+
ad_out,
782+
ref_out,
783+
rmse_ratio_tol=0.05,
784+
msg=f"Dense layer {layer_idx} ({layer_type}): ",
785+
)
786+
787+
788+
def test_dense_full_model_equivalence():
789+
"""Dense CausalLM logits (no MoE) match reference."""
790+
device, dtype = _device_and_dtype()
791+
config = _small_dense_text_config()
792+
793+
ref = _RefForCausalLM(config).to(device=device, dtype=dtype).eval()
794+
ad = Gemma4ForCausalLM(config).to(device=device, dtype=dtype).eval()
795+
_transfer_ref_to_ad_full_model(ad, ref)
796+
797+
B, S = 2, 8
798+
input_ids = torch.randint(0, config.vocab_size, (B, S), device=device)
799+
pos_ids = _position_ids(B, S, device)
800+
801+
with torch.no_grad():
802+
ref_logits = ref(input_ids, pos_ids)
803+
ad_out = ad(input_ids=input_ids, position_ids=pos_ids)
804+
805+
assert ad_out.logits.shape == (B, S, config.vocab_size)
806+
assert torch.isfinite(ad_out.logits).all()
807+
assert_rmse_close(ad_out.logits, ref_logits, rmse_ratio_tol=0.05, msg="Dense full model: ")
808+
809+
810+
def test_dense_conditional_generation_wrapper():
811+
"""ConditionalGeneration wrapper works with dense (non-MoE) text config."""
812+
device, dtype = _device_and_dtype()
813+
config = Gemma4Config(
814+
text_config=_small_dense_text_config(),
815+
vision_config=Gemma4VisionConfig(hidden_size=32),
816+
)
817+
model = Gemma4ForConditionalGeneration(config).to(device=device, dtype=dtype).eval()
818+
819+
B, S = 2, 8
820+
input_ids = torch.randint(0, config.text_config.vocab_size, (B, S), device=device)
821+
pos_ids = _position_ids(B, S, device)
822+
823+
with torch.no_grad():
824+
out = model(input_ids=input_ids, position_ids=pos_ids)
825+
assert out.logits is not None
826+
assert out.logits.shape == (B, S, config.text_config.vocab_size)
827+
assert torch.isfinite(out.logits).all()
828+
829+
830+
def test_dense_export():
831+
"""Dense model (no MoE) can be exported with torch.export."""
832+
device = "cpu"
833+
dtype = torch.float32
834+
config = _small_dense_text_config()
835+
836+
model = Gemma4ForCausalLM(config).to(device=device, dtype=dtype).eval()
837+
838+
B, S = 2, 8
839+
input_ids = torch.randint(0, config.vocab_size, (B, S), device=device)
840+
pos_ids = _position_ids(B, S, device)
841+
842+
batch_dim = Dim("batch", min=1, max=4)
843+
seq_dim = Dim("seq", min=1, max=64)
844+
dynamic_shapes = {
845+
"input_ids": {0: batch_dim, 1: seq_dim},
846+
"position_ids": {0: batch_dim, 1: seq_dim},
847+
}
848+
849+
gm = torch_export_to_gm(
850+
model,
851+
args=(input_ids,),
852+
kwargs={"position_ids": pos_ids},
853+
dynamic_shapes=dynamic_shapes,
854+
)
855+
856+
with torch.no_grad():
857+
pre_export_out = model(input_ids=input_ids, position_ids=pos_ids)
858+
exported_out = gm(input_ids, position_ids=pos_ids)
859+
860+
logits = (
861+
exported_out[0]
862+
if isinstance(exported_out, tuple)
863+
else getattr(exported_out, "logits", exported_out)
864+
)
865+
assert torch.isfinite(logits).all(), "Dense export produced non-finite values"
866+
torch.testing.assert_close(logits, pre_export_out.logits, rtol=1e-3, atol=1e-3)
867+
868+
# Test different shape
869+
B2, S2 = 1, 4
870+
ids2 = torch.randint(0, config.vocab_size, (B2, S2), device=device)
871+
pos2 = _position_ids(B2, S2, device)
872+
with torch.no_grad():
873+
out2 = gm(ids2, position_ids=pos2)
874+
logits2 = out2[0] if isinstance(out2, tuple) else getattr(out2, "logits", out2)
875+
assert logits2.shape == (B2, S2, config.vocab_size)
876+
assert torch.isfinite(logits2).all()

0 commit comments

Comments
 (0)