Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion examples/disaggregated/slurm/benchmark/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,9 +228,25 @@ def build_worker_environment(worker_config, env_config, role, benchmark_mode,
'TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP',
'TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP=1')
if role == "GEN":
gen_config = worker_config.get('gen', {})
concurrency_int = int(concurrency)
max_batch_size = int(
gen_config.get('max_batch_size', concurrency_int))
enable_attention_dp = gen_config.get('enable_attention_dp', False)
tp_size = int(gen_config.get('tensor_parallel_size', 1))
max_capacity = ((max_batch_size * tp_size)
if enable_attention_dp else max_batch_size)
queue_size = min(max_capacity, concurrency_int)
if queue_size < concurrency_int:
print(f"[WARNING] TLLM_BENCHMARK_REQ_QUEUES_SIZE capped to "
f"{queue_size} (max_batch_size={max_batch_size} x "
f"tp_size={tp_size} with "
f"attention_dp={enable_attention_dp}) "
f"which is less than concurrency={concurrency}. "
f"Fill loop would hang if set to {concurrency}.")
upsert_env_config(env_config, 'gen_worker_env_var',
'TLLM_BENCHMARK_REQ_QUEUES_SIZE',
f'TLLM_BENCHMARK_REQ_QUEUES_SIZE={concurrency}')
f'TLLM_BENCHMARK_REQ_QUEUES_SIZE={queue_size}')

# 2. Add profiling env vars to env_config (conditional)
if nsys_on:
Expand Down
31 changes: 29 additions & 2 deletions tests/integration/defs/perf/pytorch_model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,10 +415,11 @@ def get_model_yaml_config(model_label: str,
'attn_backend': 'FLASHINFER',
}
},
# Nemotron-3-Super-120B-NVFP4: chunked prefill + MTP-3 speculative decoding
# Nemotron-3-Super-120B-NVFP4: (no MTP)
{
'patterns': ['nemotron_3_super_120b_nvfp4'],
'patterns': ['nemotron_3_super_120b_nvfp4-'],
'config': {
'max_seq_len': 1048576,
'enable_chunked_prefill': True,
'enable_attention_dp': False,
'stream_interval': 1,
Expand All @@ -431,6 +432,32 @@ def get_model_yaml_config(model_label: str,
},
'kv_cache_config': {
'enable_block_reuse': False,
'mamba_ssm_cache_dtype': 'float16',
'mamba_ssm_stochastic_rounding': True,
'mamba_ssm_philox_rounds': 5,
},
}
},
# Nemotron-3-Super-120B-NVFP4: MTP speculative decoding
{
'patterns': ['nemotron_3_super_120b_nvfp4_mtp'],
'config': {
'max_seq_len': 1048576,
'enable_chunked_prefill': True,
'enable_attention_dp': False,
'stream_interval': 1,
'moe_config': {
'backend': 'CUTLASS',
},
'cuda_graph_config': {
'enable_padding': True,
'max_batch_size': 8,
},
'kv_cache_config': {
'enable_block_reuse': False,
'mamba_ssm_cache_dtype': 'float16',
'mamba_ssm_stochastic_rounding': True,
'mamba_ssm_philox_rounds': 5,
},
'speculative_config': {
'decoding_type': 'MTP',
Expand Down
160 changes: 141 additions & 19 deletions tests/integration/defs/perf/test_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"""
TensorRT LLM perf tests
"""
import json
import os
import re
import shutil
Expand Down Expand Up @@ -53,6 +54,7 @@
"llama_v3.1_70b": "llama-3.1-model/Meta-Llama-3.1-70B",
"llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct",
"llama_v3.1_70b_instruct_fp8": "llama-3.1-model/Llama-3.1-70B-Instruct-FP8",
"llama_v3.3_8b": "llama-models-v3/llama-v3-8b-instruct-hf",
"llama_v3.3_70b_instruct_fp8":
"modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8",
"llama_v3.3_70b_instruct_fp4":
Expand Down Expand Up @@ -179,6 +181,8 @@
"nemotron_nano_12b_v2": "NVIDIA-Nemotron-Nano-12B-v2",
"nvidia_nemotron_nano_9b_v2_nvfp4": "NVIDIA-Nemotron-Nano-9B-v2-NVFP4",
"nemotron_3_super_120b_nvfp4": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
"nemotron_3_super_120b_nvfp4_mtp":
"NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
"kimi_k2_nvfp4": "Kimi-K2-Thinking-NVFP4",
}
# Model PATH of HuggingFace
Expand Down Expand Up @@ -238,18 +242,24 @@

TIMING_CACHE_DIR = os.environ.get("TIMING_CACHE_DIR", "")

NEMOTRON_SUPER_MODELS = {
"nemotron_3_super_120b_nvfp4",
"nemotron_3_super_120b_nvfp4_mtp",
}

TRUST_REMOTE_CODE_MODELS = { # these models require explicit trust_remote_code=True
"llama_v3.3_nemotron_super_49b",
"llama_v3.3_nemotron_super_49b_fp8",
"llama_v3.1_nemotron_ultra_253b",
"llama_v3.1_nemotron_ultra_253b_fp8",
"kimi_k2_nvfp4",
"nemotron_3_super_120b_nvfp4",
"nemotron_3_super_120b_nvfp4_mtp",
}

# Models requiring TLLM_ALLOW_LONG_MAX_MODEL_LEN=1 due to max_seq_len > 128K
LONG_MAX_SEQ_LEN_MODELS = {
"nemotron_3_super_120b_nvfp4",
# Spec-dec models that use a real dataset in serve perf tests.
SPEC_DEC_REAL_DATASET_MODELS = {
"nemotron_3_super_120b_nvfp4_mtp": "cnn_dailymail",
}

# Autodeploy model configs - maps model name to config file path (relative to TRT-LLM root)
Expand Down Expand Up @@ -1437,10 +1447,93 @@ def get_trtllm_serve_server_command(self, engine_dir):
yaml.dump(serve_config, f, default_flow_style=False)
server_cmd += ["--config", config_path]

if self._config.model_name in NEMOTRON_SUPER_MODELS:
server_cmd += [
"--reasoning_parser", "nano-v3", "--tool_parser", "qwen3_coder"
]

return server_cmd

def get_trtllm_serve_client_command(self, engine_dir, input_len,
output_len):
def generate_trtllm_custom_dataset(self, dst_dataset_path: str,
                                   input_len: int, output_len: int,
                                   dataset_source: str):
    """Write a fixed-length JSONL benchmark dataset sampled from a real corpus.

    Streams articles from the HuggingFace ``cnn_dailymail`` validation split,
    forces each prompt to exactly ``input_len`` tokens (repeating or
    truncating token ids), and writes ``self._config.num_reqs`` chat-style
    samples to ``dst_dataset_path`` in the ``trtllm_custom`` client format.

    Args:
        dst_dataset_path: Output JSONL file path.
        input_len: Exact prompt length in tokens; values <= 0 keep the
            tokenized prompt as-is.
        output_len: Written as ``max_tokens`` for every sample.
        dataset_source: Real dataset name; only "cnn_dailymail" is supported.

    Raises:
        ValueError: If ``dataset_source`` is unsupported, or the corpus
            yields fewer than ``num_reqs`` usable articles.
    """
    # Currently only support cnn_dailymail dataset source.
    if dataset_source != "cnn_dailymail":
        raise ValueError(
            f"Unsupported real dataset source: {dataset_source}. "
            "Only 'cnn_dailymail' is supported.")
    # Imported lazily: only serve-perf runs for spec-dec models need these.
    from datasets import load_dataset
    from transformers import AutoTokenizer

    model_dir = self.get_trtllm_bench_model()
    tokenizer = AutoTokenizer.from_pretrained(
        model_dir,
        trust_remote_code=self._config.model_name
        in TRUST_REMOTE_CODE_MODELS)
    dataset = load_dataset("cnn_dailymail",
                           "3.0.0",
                           split="validation",
                           streaming=True,
                           trust_remote_code=True)
    # BUGFIX: os.makedirs("") raises FileNotFoundError, so only create the
    # parent directory when the path actually has one. The previous
    # os.path.exists() pre-check was redundant (exist_ok=True already
    # tolerates an existing directory) and did not guard the "" case.
    dst_dir = os.path.dirname(dst_dataset_path)
    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)

    num_reqs = self._config.num_reqs
    req_count = 0
    with open(dst_dataset_path, "w", encoding="utf-8") as f:
        for req in dataset:
            article = req.get("article")
            if not article:
                continue

            prompt = f"Summarize: {article}"
            prompt_ids = tokenizer.encode(prompt, add_special_tokens=False)
            if input_len > 0:
                # Skip articles that tokenize to nothing; they cannot be
                # stretched to the requested fixed length.
                if not prompt_ids:
                    continue
                if len(prompt_ids) < input_len:
                    # Keep strict fixed-length requests for perf coverage
                    # by extending short real-dataset prompts.
                    repeats = (input_len + len(prompt_ids) -
                               1) // len(prompt_ids)
                    prompt_ids = (prompt_ids * repeats)[:input_len]
                elif len(prompt_ids) > input_len:
                    prompt_ids = prompt_ids[:input_len]
            prompt_text = tokenizer.decode(prompt_ids,
                                           skip_special_tokens=False)

            # One request per line in the trtllm_custom chat schema.
            sample = {
                "input": {
                    "messages": [{
                        "role": "system",
                        "content": ""
                    }, {
                        "role": "user",
                        "content": prompt_text
                    }],
                    "max_tokens":
                    int(output_len),
                    "num_tokens":
                    len(prompt_ids),
                }
            }
            f.write(json.dumps(sample, ensure_ascii=False) + "\n")
            req_count += 1
            if req_count >= num_reqs:
                break

    if req_count < num_reqs:
        raise ValueError(
            f"Cannot sample enough requests from cnn_dailymail: requested={num_reqs}, sampled={req_count}"
        )
    print_info(f"Generated {req_count} samples from {dataset_source} to "
               f"{dst_dataset_path}")

def get_trtllm_serve_client_command(self,
engine_dir,
input_len,
output_len,
real_dataset_path: str = ""):
model_dir = self.get_trtllm_bench_model()
client_cmd = [
"python",
Expand All @@ -1453,20 +1546,30 @@ def get_trtllm_serve_client_command(self, engine_dir, input_len,
"--num-prompts",
str(self._config.num_reqs),
"--ignore-eos",
"--tokenize-on-client",
"--no-test-input",
"--percentile-metrics",
"ttft,tpot,itl,e2el",
"--dataset-name",
"random",
"--random-ids",
"--tokenize-on-client",
"--random-input-len",
str(input_len),
"--random-output-len",
str(output_len),
"--random-range-ratio",
"0.0",
]
if real_dataset_path:
client_cmd += [
"--dataset-name",
"trtllm_custom",
"--dataset-path",
real_dataset_path,
]
else:
client_cmd += [
"--dataset-name",
"random",
"--random-ids",
"--random-input-len",
str(input_len),
"--random-output-len",
str(output_len),
"--random-range-ratio",
"0.0",
]
if self._config.concurrency != -1:
client_cmd += ["--max-concurrency", str(self._config.concurrency)]
if self._config.streaming == "streaming":
Expand All @@ -1489,19 +1592,38 @@ def get_commands(self):
if self._config.runtime == "serve":
server_cmd = self.get_trtllm_serve_server_command(engine_dir)
client_cmds = []
data_cmds = []
for bs in self._config.batch_sizes:
for len_idx, input_len in enumerate(self._config.input_lens):
output_len = self._config.output_lens[len_idx]
real_dataset_path = ""
if self._config.model_name in SPEC_DEC_REAL_DATASET_MODELS:
dataset_source = SPEC_DEC_REAL_DATASET_MODELS[
self._config.model_name]
print_info(
f"Using real dataset source '{dataset_source}' for "
f"spec-dec model: {self._config.model_name}.")
real_dataset_path = os.path.join(
engine_dir,
f"dataset_custom_{input_len}_{output_len}.jsonl")
self.generate_trtllm_custom_dataset(
real_dataset_path,
input_len,
output_len,
dataset_source=dataset_source)
client_cmd = self.get_trtllm_serve_client_command(
engine_dir, input_len, output_len)
engine_dir,
input_len,
output_len,
real_dataset_path=real_dataset_path)
client_cmds.append(client_cmd)
server_env = os.environ.copy()
if self._config.model_name in LONG_MAX_SEQ_LEN_MODELS:
if self._config.model_name in NEMOTRON_SUPER_MODELS:
server_env["TLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
server_timeout = 3600 if self._config.model_name in LONG_MAX_SEQ_LEN_MODELS else 600
server_timeout = 3600 if self._config.model_name in NEMOTRON_SUPER_MODELS else 600
return PerfServeScriptTestCmds(server_cmd=server_cmd,
client_cmds=client_cmds,
data_cmds=[],
data_cmds=data_cmds,
server_env=server_env,
server_timeout=server_timeout)

Expand Down
Loading
Loading