import json
import struct

-from tensorrt_llm._torch.pyexecutor.config_utils import load_pretrained_config
+from tensorrt_llm._torch.pyexecutor.config_utils import (
+    load_pretrained_config, get_qwen3_hybrid_layer_types)


def parse_safetensors_file_metadata(model_path, filename):
@@ -113,8 +114,9 @@ def _parse(filename: str) -> None:


class ModelConfig(BaseModel):
-    """ Model specific configurations. The parameters are needed in engine
-    setting calculation.
+    """Model specific configurations.
+
+    The parameters are needed in engine setting calculation.
    """
    name: str
    model_type: str
@@ -254,3 +256,55 @@ def cache_memory_fraction(self, cache_memory_fraction):

    def set_mamba_ssm_cache_dtype(self, mamba_ssm_cache_dtype: str):
        self.mamba_ssm_cache_dtype = mamba_ssm_cache_dtype
+
+
+class Qwen3HybridConfig(ModelConfig):
+    """Config for Qwen3 hybrid models (full-attention + linear-attention layers).
+
+    Maps Qwen3.5 linear-attention parameters to the same cache estimation
+    formulas used by NemotronHybridConfig.
+    """
+    linear_key_head_dim: int  # d_state
+    linear_conv_kernel_dim: int  # d_conv
+    linear_num_value_heads: int  # num_heads (mamba_num_heads)
+    linear_num_key_heads: int  # n_groups
+    linear_value_head_dim: int  # head_dim (mamba_head_dim)
+    num_linear_attention_layers: Optional[int] = Field(default=None)
+    mamba_ssm_cache_dtype: Optional[str] = Field(default="auto")
+
+    @model_validator(mode="after")
+    def set_values_if_none(self):
+        """Derive num_attention_layers and num_linear_attention_layers.
+
+        Uses the HF config's layer_types / full_attention_interval.
+        """
+        if self.num_linear_attention_layers is None or self.num_attention_layers is None:
+            pretrained_config = load_pretrained_config(self.name,
+                                                       trust_remote_code=True)
+            layer_types = get_qwen3_hybrid_layer_types(pretrained_config)
+            if self.num_attention_layers is None:
+                self.num_attention_layers = sum(1 for lt in layer_types
+                                                if lt == "full_attention")
+            if self.num_linear_attention_layers is None:
+                self.num_linear_attention_layers = sum(
+                    1 for lt in layer_types if lt == "linear_attention")
+
+        super().set_values_if_none()
+        return self
+
+    def extra_model_cache_in_gb(self, bytes_per_elem, target_seq_len=None):
+        d_inner = self.linear_value_head_dim * self.linear_num_value_heads
+        conv_dim = d_inner + 2 * self.linear_num_key_heads * self.linear_key_head_dim
+        conv_state_elems = conv_dim * (self.linear_conv_kernel_dim - 1)
+        ssm_state_elems = (self.linear_num_value_heads *
+                           self.linear_value_head_dim *
+                           self.linear_key_head_dim)
+        gb_per_cache = bytes_per_elem * self.num_linear_attention_layers * (
+            conv_state_elems + ssm_state_elems) / (1024**3)
+        return gb_per_cache
+
+    def cache_memory_fraction(self, cache_memory_fraction):
+        return cache_memory_fraction**2
+
+    def set_mamba_ssm_cache_dtype(self, mamba_ssm_cache_dtype: str):
+        self.mamba_ssm_cache_dtype = mamba_ssm_cache_dtype
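The per-request state-size arithmetic in `extra_model_cache_in_gb` is easy to misread in diff form, so here is a minimal standalone sketch of the same formula. The dimension values in the example are hypothetical placeholders for illustration, not the actual Qwen3.5 configuration.

```python
# Standalone sketch of the per-request linear-attention cache arithmetic above.
# All dimensions below are hypothetical placeholders, not real Qwen3.5 values.

def linear_attn_cache_gb(bytes_per_elem: float,
                         num_linear_layers: int,
                         key_head_dim: int,      # d_state
                         conv_kernel_dim: int,   # d_conv
                         num_value_heads: int,   # mamba_num_heads
                         num_key_heads: int,     # n_groups
                         value_head_dim: int) -> float:  # mamba_head_dim
    # Conv state: a (d_conv - 1)-step window over the projected channels.
    d_inner = value_head_dim * num_value_heads
    conv_dim = d_inner + 2 * num_key_heads * key_head_dim
    conv_state_elems = conv_dim * (conv_kernel_dim - 1)
    # SSM state: one (head_dim x d_state) matrix per value head.
    ssm_state_elems = num_value_heads * value_head_dim * key_head_dim
    per_layer = conv_state_elems + ssm_state_elems
    return bytes_per_elem * num_linear_layers * per_layer / (1024**3)


if __name__ == "__main__":
    # ~0.037 GiB per request with these placeholder dimensions and 2-byte states.
    print(linear_attn_cache_gb(bytes_per_elem=2,
                               num_linear_layers=36,
                               key_head_dim=128,
                               conv_kernel_dim=4,
                               num_value_heads=32,
                               num_key_heads=16,
                               value_head_dim=128))
```

With these placeholder dimensions the linear-attention state adds roughly 0.037 GiB per request, dominated by the SSM-state term. The `cache_memory_fraction` override squares the fraction, mirroring NemotronHybridConfig per the class docstring, presumably to leave extra headroom for state caches that are allocated outside the KV-cache pool.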