Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion examples/disaggregated/slurm/benchmark/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,9 +228,25 @@ def build_worker_environment(worker_config, env_config, role, benchmark_mode,
'TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP',
'TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP=1')
if role == "GEN":
gen_config = worker_config.get('gen', {})
concurrency_int = int(concurrency)
max_batch_size = int(
gen_config.get('max_batch_size', concurrency_int))
enable_attention_dp = gen_config.get('enable_attention_dp', False)
tp_size = int(gen_config.get('tensor_parallel_size', 1))
max_capacity = ((max_batch_size * tp_size)
if enable_attention_dp else max_batch_size)
queue_size = min(max_capacity, concurrency_int)
if queue_size < concurrency_int:
print(f"[WARNING] TLLM_BENCHMARK_REQ_QUEUES_SIZE capped to "
f"{queue_size} (max_batch_size={max_batch_size} x "
f"tp_size={tp_size} with "
f"attention_dp={enable_attention_dp}) "
f"which is less than concurrency={concurrency}. "
f"Fill loop would hang if set to {concurrency}.")
upsert_env_config(env_config, 'gen_worker_env_var',
'TLLM_BENCHMARK_REQ_QUEUES_SIZE',
f'TLLM_BENCHMARK_REQ_QUEUES_SIZE={concurrency}')
f'TLLM_BENCHMARK_REQ_QUEUES_SIZE={queue_size}')

# 2. Add profiling env vars to env_config (conditional)
if nsys_on:
Expand Down
31 changes: 29 additions & 2 deletions tests/integration/defs/perf/pytorch_model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,10 +415,11 @@ def get_model_yaml_config(model_label: str,
'attn_backend': 'FLASHINFER',
}
},
# Nemotron-3-Super-120B-NVFP4: chunked prefill + MTP-3 speculative decoding
# Nemotron-3-Super-120B-NVFP4: (no MTP)
{
'patterns': ['nemotron_3_super_120b_nvfp4'],
'patterns': ['nemotron_3_super_120b_nvfp4-'],
'config': {
'max_seq_len': 1048576,
'enable_chunked_prefill': True,
'enable_attention_dp': False,
'stream_interval': 1,
Expand All @@ -431,6 +432,32 @@ def get_model_yaml_config(model_label: str,
},
'kv_cache_config': {
'enable_block_reuse': False,
'mamba_ssm_cache_dtype': 'float16',
'mamba_ssm_stochastic_rounding': True,
'mamba_ssm_philox_rounds': 5,
},
}
},
# Nemotron-3-Super-120B-NVFP4: MTP speculative decoding
{
'patterns': ['nemotron_3_super_120b_nvfp4_mtp'],
'config': {
'max_seq_len': 1048576,
'enable_chunked_prefill': True,
'enable_attention_dp': False,
'stream_interval': 1,
'moe_config': {
'backend': 'CUTLASS',
},
'cuda_graph_config': {
'enable_padding': True,
'max_batch_size': 8,
},
'kv_cache_config': {
'enable_block_reuse': False,
'mamba_ssm_cache_dtype': 'float16',
'mamba_ssm_stochastic_rounding': True,
'mamba_ssm_philox_rounds': 5,
},
'speculative_config': {
'decoding_type': 'MTP',
Expand Down
160 changes: 141 additions & 19 deletions tests/integration/defs/perf/test_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"""
TensorRT LLM perf tests
"""
import json
import os
import re
import shutil
Expand Down Expand Up @@ -53,6 +54,7 @@
"llama_v3.1_70b": "llama-3.1-model/Meta-Llama-3.1-70B",
"llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct",
"llama_v3.1_70b_instruct_fp8": "llama-3.1-model/Llama-3.1-70B-Instruct-FP8",
"llama_v3.3_8b": "llama-models-v3/llama-v3-8b-instruct-hf",
"llama_v3.3_70b_instruct_fp8":
"modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8",
"llama_v3.3_70b_instruct_fp4":
Expand Down Expand Up @@ -179,6 +181,8 @@
"nemotron_nano_12b_v2": "NVIDIA-Nemotron-Nano-12B-v2",
"nvidia_nemotron_nano_9b_v2_nvfp4": "NVIDIA-Nemotron-Nano-9B-v2-NVFP4",
"nemotron_3_super_120b_nvfp4": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
"nemotron_3_super_120b_nvfp4_mtp":
"NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
"kimi_k2_nvfp4": "Kimi-K2-Thinking-NVFP4",
}
# Model PATH of HuggingFace
Expand Down Expand Up @@ -238,18 +242,24 @@

TIMING_CACHE_DIR = os.environ.get("TIMING_CACHE_DIR", "")

NEMOTRON_SUPER_MODELS = {
"nemotron_3_super_120b_nvfp4",
"nemotron_3_super_120b_nvfp4_mtp",
}

TRUST_REMOTE_CODE_MODELS = { # these models require explicit trust_remote_code=True
"llama_v3.3_nemotron_super_49b",
"llama_v3.3_nemotron_super_49b_fp8",
"llama_v3.1_nemotron_ultra_253b",
"llama_v3.1_nemotron_ultra_253b_fp8",
"kimi_k2_nvfp4",
"nemotron_3_super_120b_nvfp4",
"nemotron_3_super_120b_nvfp4_mtp",
}

# Models requiring TLLM_ALLOW_LONG_MAX_MODEL_LEN=1 due to max_seq_len > 128K
LONG_MAX_SEQ_LEN_MODELS = {
"nemotron_3_super_120b_nvfp4",
# Spec-dec models that use a real dataset in serve perf tests.
SPEC_DEC_REAL_DATASET_MODELS = {
"nemotron_3_super_120b_nvfp4_mtp": "cnn_dailymail",
}

# Autodeploy model configs - maps model name to config file path (relative to TRT-LLM root)
Expand Down Expand Up @@ -1437,10 +1447,93 @@ def get_trtllm_serve_server_command(self, engine_dir):
yaml.dump(serve_config, f, default_flow_style=False)
server_cmd += ["--config", config_path]

if self._config.model_name in NEMOTRON_SUPER_MODELS:
server_cmd += [
"--reasoning_parser", "nano-v3", "--tool_parser", "qwen3_coder"
]

return server_cmd

def get_trtllm_serve_client_command(self, engine_dir, input_len,
output_len):
def generate_trtllm_custom_dataset(self, dst_dataset_path: str,
                                   input_len: int, output_len: int,
                                   dataset_source: str):
    """Write a fixed-length JSONL benchmark dataset sampled from a real corpus.

    Streams articles from the HuggingFace ``cnn_dailymail`` validation split,
    forces each prompt to exactly ``input_len`` tokens (repeating or
    truncating token ids), and writes ``self._config.num_reqs`` chat-style
    samples to ``dst_dataset_path`` in the ``trtllm_custom`` client format.

    Args:
        dst_dataset_path: Output JSONL file path.
        input_len: Exact prompt length in tokens; values <= 0 keep the
            tokenized prompt as-is.
        output_len: Written as ``max_tokens`` for every sample.
        dataset_source: Real dataset name; only "cnn_dailymail" is supported.

    Raises:
        ValueError: If ``dataset_source`` is unsupported, or the corpus
            yields fewer than ``num_reqs`` usable articles.
    """
    # Currently only support cnn_dailymail dataset source.
    if dataset_source != "cnn_dailymail":
        raise ValueError(
            f"Unsupported real dataset source: {dataset_source}. "
            "Only 'cnn_dailymail' is supported.")
    # Imported lazily: only serve-perf runs for spec-dec models need these.
    from datasets import load_dataset
    from transformers import AutoTokenizer

    model_dir = self.get_trtllm_bench_model()
    tokenizer = AutoTokenizer.from_pretrained(
        model_dir,
        trust_remote_code=self._config.model_name
        in TRUST_REMOTE_CODE_MODELS)
    dataset = load_dataset("cnn_dailymail",
                           "3.0.0",
                           split="validation",
                           streaming=True,
                           trust_remote_code=True)
    # BUGFIX: os.makedirs("") raises FileNotFoundError, so only create the
    # parent directory when the path actually has one. The previous
    # os.path.exists() pre-check was redundant (exist_ok=True already
    # tolerates an existing directory) and did not guard the "" case.
    dst_dir = os.path.dirname(dst_dataset_path)
    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)

    num_reqs = self._config.num_reqs
    req_count = 0
    with open(dst_dataset_path, "w", encoding="utf-8") as f:
        for req in dataset:
            article = req.get("article")
            if not article:
                continue

            prompt = f"Summarize: {article}"
            prompt_ids = tokenizer.encode(prompt, add_special_tokens=False)
            if input_len > 0:
                # Skip articles that tokenize to nothing; they cannot be
                # stretched to the requested fixed length.
                if not prompt_ids:
                    continue
                if len(prompt_ids) < input_len:
                    # Keep strict fixed-length requests for perf coverage
                    # by extending short real-dataset prompts.
                    repeats = (input_len + len(prompt_ids) -
                               1) // len(prompt_ids)
                    prompt_ids = (prompt_ids * repeats)[:input_len]
                elif len(prompt_ids) > input_len:
                    prompt_ids = prompt_ids[:input_len]
            prompt_text = tokenizer.decode(prompt_ids,
                                           skip_special_tokens=False)

            # One request per line in the trtllm_custom chat schema.
            sample = {
                "input": {
                    "messages": [{
                        "role": "system",
                        "content": ""
                    }, {
                        "role": "user",
                        "content": prompt_text
                    }],
                    "max_tokens":
                    int(output_len),
                    "num_tokens":
                    len(prompt_ids),
                }
            }
            f.write(json.dumps(sample, ensure_ascii=False) + "\n")
            req_count += 1
            if req_count >= num_reqs:
                break

    if req_count < num_reqs:
        raise ValueError(
            f"Cannot sample enough requests from cnn_dailymail: requested={num_reqs}, sampled={req_count}"
        )
    print_info(f"Generated {req_count} samples from {dataset_source} to "
               f"{dst_dataset_path}")

def get_trtllm_serve_client_command(self,
engine_dir,
input_len,
output_len,
real_dataset_path: str = ""):
model_dir = self.get_trtllm_bench_model()
client_cmd = [
"python",
Expand All @@ -1453,20 +1546,30 @@ def get_trtllm_serve_client_command(self, engine_dir, input_len,
"--num-prompts",
str(self._config.num_reqs),
"--ignore-eos",
"--tokenize-on-client",
"--no-test-input",
"--percentile-metrics",
"ttft,tpot,itl,e2el",
"--dataset-name",
"random",
"--random-ids",
"--tokenize-on-client",
"--random-input-len",
str(input_len),
"--random-output-len",
str(output_len),
"--random-range-ratio",
"0.0",
]
if real_dataset_path:
client_cmd += [
"--dataset-name",
"trtllm_custom",
"--dataset-path",
real_dataset_path,
]
else:
client_cmd += [
"--dataset-name",
"random",
"--random-ids",
"--random-input-len",
str(input_len),
"--random-output-len",
str(output_len),
"--random-range-ratio",
"0.0",
]
if self._config.concurrency != -1:
client_cmd += ["--max-concurrency", str(self._config.concurrency)]
if self._config.streaming == "streaming":
Expand All @@ -1489,19 +1592,38 @@ def get_commands(self):
if self._config.runtime == "serve":
server_cmd = self.get_trtllm_serve_server_command(engine_dir)
client_cmds = []
data_cmds = []
for bs in self._config.batch_sizes:
for len_idx, input_len in enumerate(self._config.input_lens):
output_len = self._config.output_lens[len_idx]
real_dataset_path = ""
if self._config.model_name in SPEC_DEC_REAL_DATASET_MODELS:
dataset_source = SPEC_DEC_REAL_DATASET_MODELS[
self._config.model_name]
print_info(
f"Using real dataset source '{dataset_source}' for "
f"spec-dec model: {self._config.model_name}.")
real_dataset_path = os.path.join(
engine_dir,
f"dataset_custom_{input_len}_{output_len}.jsonl")
self.generate_trtllm_custom_dataset(
real_dataset_path,
input_len,
output_len,
dataset_source=dataset_source)
client_cmd = self.get_trtllm_serve_client_command(
engine_dir, input_len, output_len)
engine_dir,
input_len,
output_len,
real_dataset_path=real_dataset_path)
client_cmds.append(client_cmd)
server_env = os.environ.copy()
if self._config.model_name in LONG_MAX_SEQ_LEN_MODELS:
if self._config.model_name in NEMOTRON_SUPER_MODELS:
server_env["TLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
server_timeout = 3600 if self._config.model_name in LONG_MAX_SEQ_LEN_MODELS else 600
server_timeout = 3600 if self._config.model_name in NEMOTRON_SUPER_MODELS else 600
return PerfServeScriptTestCmds(server_cmd=server_cmd,
client_cmds=client_cmds,
data_cmds=[],
data_cmds=data_cmds,
server_env=server_env,
server_timeout=server_timeout)

Expand Down
Loading
Loading