diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py index cecf30fa9db3..ba470d71c7c0 100644 --- a/examples/disaggregated/slurm/benchmark/submit.py +++ b/examples/disaggregated/slurm/benchmark/submit.py @@ -228,9 +228,25 @@ def build_worker_environment(worker_config, env_config, role, benchmark_mode, 'TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP', 'TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP=1') if role == "GEN": + gen_config = worker_config.get('gen', {}) + concurrency_int = int(concurrency) + max_batch_size = int( + gen_config.get('max_batch_size', concurrency_int)) + enable_attention_dp = gen_config.get('enable_attention_dp', False) + tp_size = int(gen_config.get('tensor_parallel_size', 1)) + max_capacity = ((max_batch_size * tp_size) + if enable_attention_dp else max_batch_size) + queue_size = min(max_capacity, concurrency_int) + if queue_size < concurrency_int: + print(f"[WARNING] TLLM_BENCHMARK_REQ_QUEUES_SIZE capped to " + f"{queue_size} (max_batch_size={max_batch_size} x " + f"tp_size={tp_size} with " + f"attention_dp={enable_attention_dp}) " + f"which is less than concurrency={concurrency}. " + f"The fill loop would hang if it were set to {concurrency}.") upsert_env_config(env_config, 'gen_worker_env_var', 'TLLM_BENCHMARK_REQ_QUEUES_SIZE', - f'TLLM_BENCHMARK_REQ_QUEUES_SIZE={concurrency}') + f'TLLM_BENCHMARK_REQ_QUEUES_SIZE={queue_size}') # 2. Add profiling env vars to env_config (conditional) if nsys_on: diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py index 09e913d1473c..54724757b5e0 100644 --- a/tests/integration/defs/perf/pytorch_model_config.py +++ b/tests/integration/defs/perf/pytorch_model_config.py @@ -415,10 +415,11 @@ def get_model_yaml_config(model_label: str, 'attn_backend': 'FLASHINFER', } }, - # Nemotron-3-Super-120B-NVFP4: chunked prefill + MTP-3 speculative decoding + # Nemotron-3-Super-120B-NVFP4: chunked prefill, no MTP speculative decoding { - 'patterns': ['nemotron_3_super_120b_nvfp4'], + 'patterns': ['nemotron_3_super_120b_nvfp4-'], 'config': { + 'max_seq_len': 1048576, 'enable_chunked_prefill': True, 'enable_attention_dp': False, 'stream_interval': 1, @@ -431,6 +432,32 @@ def get_model_yaml_config(model_label: str, }, 'kv_cache_config': { 'enable_block_reuse': False, + 'mamba_ssm_cache_dtype': 'float16', + 'mamba_ssm_stochastic_rounding': True, + 'mamba_ssm_philox_rounds': 5, + }, + } + }, + # Nemotron-3-Super-120B-NVFP4: MTP speculative decoding + { + 'patterns': ['nemotron_3_super_120b_nvfp4_mtp'], + 'config': { + 'max_seq_len': 1048576, + 'enable_chunked_prefill': True, + 'enable_attention_dp': False, + 'stream_interval': 1, + 'moe_config': { + 'backend': 'CUTLASS', + }, + 'cuda_graph_config': { + 'enable_padding': True, + 'max_batch_size': 8, + }, + 'kv_cache_config': { + 'enable_block_reuse': False, + 'mamba_ssm_cache_dtype': 'float16', + 'mamba_ssm_stochastic_rounding': True, + 'mamba_ssm_philox_rounds': 5, }, 'speculative_config': { 'decoding_type': 'MTP', diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index 97e4ed166243..e1a0acb5b551 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -15,6 +15,7 @@ """ TensorRT LLM perf tests """ +import json import os import re import shutil @@ -53,6 +54,7 @@ "llama_v3.1_70b": "llama-3.1-model/Meta-Llama-3.1-70B", "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct", "llama_v3.1_70b_instruct_fp8":
"llama-3.1-model/Llama-3.1-70B-Instruct-FP8", + "llama_v3.3_8b": "llama-models-v3/llama-v3-8b-instruct-hf", "llama_v3.3_70b_instruct_fp8": "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8", "llama_v3.3_70b_instruct_fp4": @@ -179,6 +181,8 @@ "nemotron_nano_12b_v2": "NVIDIA-Nemotron-Nano-12B-v2", "nvidia_nemotron_nano_9b_v2_nvfp4": "NVIDIA-Nemotron-Nano-9B-v2-NVFP4", "nemotron_3_super_120b_nvfp4": "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", + "nemotron_3_super_120b_nvfp4_mtp": + "NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", "kimi_k2_nvfp4": "Kimi-K2-Thinking-NVFP4", } # Model PATH of HuggingFace @@ -238,6 +242,11 @@ TIMING_CACHE_DIR = os.environ.get("TIMING_CACHE_DIR", "") +NEMOTRON_SUPER_MODELS = { + "nemotron_3_super_120b_nvfp4", + "nemotron_3_super_120b_nvfp4_mtp", +} + TRUST_REMOTE_CODE_MODELS = { # these models require explicit trust_remote_code=True "llama_v3.3_nemotron_super_49b", "llama_v3.3_nemotron_super_49b_fp8", @@ -245,11 +254,12 @@ "llama_v3.1_nemotron_ultra_253b_fp8", "kimi_k2_nvfp4", "nemotron_3_super_120b_nvfp4", + "nemotron_3_super_120b_nvfp4_mtp", } -# Models requiring TLLM_ALLOW_LONG_MAX_MODEL_LEN=1 due to max_seq_len > 128K -LONG_MAX_SEQ_LEN_MODELS = { - "nemotron_3_super_120b_nvfp4", +# Spec-dec models real dataset in serve perf tests. +SPEC_DEC_REAL_DATASET_MODELS = { + "nemotron_3_super_120b_nvfp4_mtp": "cnn_dailymail", } # Autodeploy model configs - maps model name to config file path (relative to TRT-LLM root) @@ -1437,10 +1447,93 @@ def get_trtllm_serve_server_command(self, engine_dir): yaml.dump(serve_config, f, default_flow_style=False) server_cmd += ["--config", config_path] + if self._config.model_name in NEMOTRON_SUPER_MODELS: + server_cmd += [ + "--reasoning_parser", "nano-v3", "--tool_parser", "qwen3_coder" + ] + return server_cmd - def get_trtllm_serve_client_command(self, engine_dir, input_len, - output_len): + def generate_trtllm_custom_dataset(self, dst_dataset_path: str, + input_len: int, output_len: int, + dataset_source: str): + # Currently only support cnn_dailymail dataset source. + if dataset_source != "cnn_dailymail": + raise ValueError( + f"Unsupported real dataset source: {dataset_source}. " + "Only 'cnn_dailymail' is supported.") + from datasets import load_dataset + from transformers import AutoTokenizer + + model_dir = self.get_trtllm_bench_model() + tokenizer = AutoTokenizer.from_pretrained( + model_dir, + trust_remote_code=self._config.model_name + in TRUST_REMOTE_CODE_MODELS) + dataset = load_dataset("cnn_dailymail", + "3.0.0", + split="validation", + streaming=True, + trust_remote_code=True) + if not os.path.exists(os.path.dirname(dst_dataset_path)): + os.makedirs(os.path.dirname(dst_dataset_path), exist_ok=True) + + num_reqs = self._config.num_reqs + req_count = 0 + with open(dst_dataset_path, "w", encoding="utf-8") as f: + for req in dataset: + article = req.get("article") + if not article: + continue + + prompt = f"Summarize: {article}" + prompt_ids = tokenizer.encode(prompt, add_special_tokens=False) + if input_len > 0: + if not prompt_ids: + continue + if len(prompt_ids) < input_len: + # Keep strict fixed-length requests for perf coverage + # by extending short real-dataset prompts. 
+ repeats = (input_len + len(prompt_ids) - + 1) // len(prompt_ids) + prompt_ids = (prompt_ids * repeats)[:input_len] + elif len(prompt_ids) > input_len: + prompt_ids = prompt_ids[:input_len] + prompt_text = tokenizer.decode(prompt_ids, + skip_special_tokens=False) + + sample = { + "input": { + "messages": [{ + "role": "system", + "content": "" + }, { + "role": "user", + "content": prompt_text + }], + "max_tokens": + int(output_len), + "num_tokens": + len(prompt_ids), + } + } + f.write(json.dumps(sample, ensure_ascii=False) + "\n") + req_count += 1 + if req_count >= num_reqs: + break + + if req_count < num_reqs: + raise ValueError( + f"Cannot sample enough requests from cnn_dailymail: requested={num_reqs}, sampled={req_count}" + ) + print_info(f"Generated {req_count} samples from {dataset_source} to " + f"{dst_dataset_path}") + + def get_trtllm_serve_client_command(self, + engine_dir, + input_len, + output_len, + real_dataset_path: str = ""): model_dir = self.get_trtllm_bench_model() client_cmd = [ "python", @@ -1453,20 +1546,30 @@ def get_trtllm_serve_client_command(self, engine_dir, input_len, "--num-prompts", str(self._config.num_reqs), "--ignore-eos", + "--tokenize-on-client", "--no-test-input", "--percentile-metrics", "ttft,tpot,itl,e2el", - "--dataset-name", - "random", - "--random-ids", - "--tokenize-on-client", - "--random-input-len", - str(input_len), - "--random-output-len", - str(output_len), - "--random-range-ratio", - "0.0", ] + if real_dataset_path: + client_cmd += [ + "--dataset-name", + "trtllm_custom", + "--dataset-path", + real_dataset_path, + ] + else: + client_cmd += [ + "--dataset-name", + "random", + "--random-ids", + "--random-input-len", + str(input_len), + "--random-output-len", + str(output_len), + "--random-range-ratio", + "0.0", + ] if self._config.concurrency != -1: client_cmd += ["--max-concurrency", str(self._config.concurrency)] if self._config.streaming == "streaming": @@ -1489,19 +1592,38 @@ def get_commands(self): if self._config.runtime == "serve": server_cmd = self.get_trtllm_serve_server_command(engine_dir) client_cmds = [] + data_cmds = [] for bs in self._config.batch_sizes: for len_idx, input_len in enumerate(self._config.input_lens): output_len = self._config.output_lens[len_idx] + real_dataset_path = "" + if self._config.model_name in SPEC_DEC_REAL_DATASET_MODELS: + dataset_source = SPEC_DEC_REAL_DATASET_MODELS[ + self._config.model_name] + print_info( + f"Using real dataset source '{dataset_source}' for " + f"spec-dec model: {self._config.model_name}.") + real_dataset_path = os.path.join( + engine_dir, + f"dataset_custom_{input_len}_{output_len}.jsonl") + self.generate_trtllm_custom_dataset( + real_dataset_path, + input_len, + output_len, + dataset_source=dataset_source) client_cmd = self.get_trtllm_serve_client_command( - engine_dir, input_len, output_len) + engine_dir, + input_len, + output_len, + real_dataset_path=real_dataset_path) client_cmds.append(client_cmd) server_env = os.environ.copy() - if self._config.model_name in LONG_MAX_SEQ_LEN_MODELS: + if self._config.model_name in NEMOTRON_SUPER_MODELS: server_env["TLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1" - server_timeout = 3600 if self._config.model_name in LONG_MAX_SEQ_LEN_MODELS else 600 + server_timeout = 3600 if self._config.model_name in NEMOTRON_SUPER_MODELS else 600 return PerfServeScriptTestCmds(server_cmd=server_cmd, client_cmds=client_cmds, - data_cmds=[], + data_cmds=data_cmds, server_env=server_env, server_timeout=server_timeout) diff --git 
a/tests/integration/test_lists/qa/llm_perf_core.yml b/tests/integration/test_lists/qa/llm_perf_core.yml index e16404058955..2f17e504cb80 100644 --- a/tests/integration/test_lists/qa/llm_perf_core.yml +++ b/tests/integration/test_lists/qa/llm_perf_core.yml @@ -3,7 +3,7 @@ llm_perf_core: # =============================================================================== # Test Conditions Index # =============================================================================== -# 1: All GPUs common tests(L20, L40S, H100, H20, H200, GB200, B200, B300, GB300, RTX-6000D, RTX-6000-Server test cases) +# 1: All GPUs common tests(L20, L40S, H100, H200, GB200, B200, B300, GB300, RTX-6000D, RTX-6000-Server test cases) # 2: L20, L40S, H100, H20, H200 # 3: L40S, H100, H20, H200 # 4: H100, H20, H200 test cases @@ -26,14 +26,6 @@ llm_perf_core: system_gpu_count: gte: 2 tests: - - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-input_output_len:1000,2000-reqs:500-con:200-gpus:2] - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,2000-reqs:8-con:1-gpus:2] - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1-gpus:2] TIMEOUT(120) - - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] - - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-con:250] - - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:512,32] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] @@ -47,55 +39,23 @@ llm_perf_core: compute_capability: lt: 10.0 tests: - # E2E trtllm-bench - #llama_v3.1_8b_instruct - #pytorch backend - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:512,32] - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[qwen2_7b_instruct-bench-pytorch-float16-input_output_len:128,128] - - perf/test_perf.py::test_perf[starcoder2_3b-bench-pytorch-bfloat16-input_output_len:512,200] - - perf/test_perf.py::test_perf[starcoder2_3b-bench-pytorch-bfloat16-input_output_len:500,2000-con:50] - - perf/test_perf.py::test_perf[starcoder2_7b-bench-pytorch-bfloat16-input_output_len:500,2000-con:50] - - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-pytorch-float16-input_output_len:128,128] - # Phi-4-multimodal-instruct with chunked prefill and kv_cache_reuse - - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-maxbs:48-maxnt:256-input_output_len:500,2000-con:250] - - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-maxbs:128-maxnt:512-input_output_len:1000,1000-con:250] - # Bielik-11B-v2.2-Instruct - - 
perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct-bench-pytorch-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct-bench-pytorch-bfloat16-input_output_len:512,32] - - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct-bench-pytorch-bfloat16-input_output_len:2000,2000-con:250] - #pytorch backend - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-pytorch-bfloat16-input_output_len:500,2000] - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-pytorch-bfloat16-input_output_len:2000,500] - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-pytorch-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-pytorch-bfloat16-input_output_len:512,32] - #llama_v3.1_8b - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2] - #mixtral_8x7b_v0.1 + #llama_v3.3_8b #pytorch backend - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-float16-input_output_len:128,128-loras:8-gpus:2] - #llama_v3.2_1b + - perf/test_perf.py::test_perf[llama_v3.3_8b-bench-pytorch-bfloat16-input_output_len:128,128] + - perf/test_perf.py::test_perf[llama_v3.3_8b-bench-pytorch-bfloat16-input_output_len:512,32] + - perf/test_perf.py::test_perf[llama_v3.3_8b-bench-pytorch-streaming-bfloat16-input_output_len:128,128] + - perf/test_perf.py::test_perf[llama_v3.3_8b-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2] + #llama_v3.1_8b_instruct_fp8 #pytorch backend - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:2000,500-reqs:10-con:1-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:512,32-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:512,200-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-pytorch-bfloat16-input_output_len:500,2000-reqs:10-con:1-gpus:2] - #Mistral-Small-3.1-24B-Instruct-2503 - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:4096-maxnt:20000-input_output_len:20000,2000-reqs:500-con:200-gpus:2] TIMEOUT(120) - #pytorch backend - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-loras:1-reqs:100-con:2-gpus:1] - - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-con:250] - - perf/test_perf.py::test_perf[bielik_11b_v2.2_instruct_fp8-bench-pytorch-float8-input_output_len:2000,2000-con:250] - #mixtral_8x7b_v0.1_fp8 pytorch backend - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:2] - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:2] + #nemotron_nano_12b_v2 + - 
perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-con:1] #min_latency + - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:500,2000-con:250] #max_throughput + - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:128,128] # 3: L40S, H100, H20, H200 @@ -107,10 +67,10 @@ llm_perf_core: gt: 8.0 lte: 9.0 tests: - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-streaming-bfloat16-input_output_len:512,32-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:512,32-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:512,32-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:128,128-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:512,32-gpus:4] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000-gpus:4] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-gpus:4] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4] @@ -120,9 +80,8 @@ llm_perf_core: - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:8] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,200-gpus:8] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:2000,200-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,200-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-gpus:8] # 4: H100, H20, H200 test cases @@ -136,20 +95,15 @@ llm_perf_core: gpu_memory: gt: 80000 tests: - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-float16-input_output_len:128,128-gpus:2] - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-streaming-float16-input_output_len:128,128-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,200-reqs:64-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:2000,200-reqs:64-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,200-reqs:64-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:2000,200-reqs:64-gpus:8] - 
perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-pytorch-bfloat16-input_output_len:200,2000-reqs:64-con:200-gpus:8] - # deepseek_v3_lite_fp8 - - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-input_output_len:2000,500] - - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500] - - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-input_output_len:500,2000] - - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:3000,500-reqs:200] - - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-streaming-float8-input_output_len:128,128] - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:4096-maxnt:20000-input_output_len:20000,2000-reqs:500-con:200] TIMEOUT(120) + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:200,2000-reqs:64-con:200-gpus:8] + # gpt_oss_20b_fp4 (NVBug 5720470: MMHA vs XQA kernel regression) + - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-maxbs:512-maxnt:8192-input_output_len:2000,200-con:64] + - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-maxbs:512-maxnt:8192-input_output_len:128,128] + - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-float4-maxbs:512-maxnt:8192-input_output_len:2000,200-con:256] # 5: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases @@ -161,17 +115,11 @@ llm_perf_core: gte: 9.0 lte: 12.0 tests: - - perf/test_perf.py::test_perf[starcoder2_15b-bench-pytorch-bfloat16-input_output_len:500,2000-con:100] - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1] TIMEOUT(120) - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-input_output_len:1000,2000-reqs:500-con:200] - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,2000-reqs:8-con:1] TIMEOUT(120) - # Mistral-Small-3.1-24B-Instruct-2503 with chunked prefill and kv_cache_reuse - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:48-maxnt:256-input_output_len:1000,2000-reqs:500-con:200] - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:128-maxnt:512-input_output_len:1000,2000-reqs:500-con:200] - - perf/test_perf.py::test_perf[phi_4_multimodal_instruct_audio-bench-pytorch-bfloat16-input_output_len:1000,1000-loras:1-con:250] - - perf/test_perf.py::test_perf[phi_4_multimodal_instruct_image-bench-pytorch-bfloat16-input_output_len:1000,1000-loras:1-con:250] - - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-input_output_len:128,128] - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8] + #nemotron_nano_12b_v2 + - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,1000-reqs:10-con:1] #min_latency + - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] #max_throughput + - 
perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-streaming-bfloat16-input_output_len:500,2000-con:250] #max_throughput streaming # 6: GB200, B200, B300, GB300, RTX6000-Server test cases @@ -183,29 +131,16 @@ llm_perf_core: gte: 10.0 lte: 12.0 tests: - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:128,128] - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:512,32-quant:fp8] - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500] - - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:1000,1000-quant:fp8] - - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:500,2000-quant:fp8] - - perf/test_perf.py::test_perf[deepseek_v3_lite_nvfp4-bench-pytorch-streaming-float4-maxbs:2048-maxnt:8192-input_output_len:256,256-reqs:200] - #Mistral-Small-3.1-24B-Instruct-2503 - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:4096-maxnt:20000-input_output_len:20000,2000-reqs:300-con:200] TIMEOUT(120) - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:nvfp4-gpus:2] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:256-input_output_len:128,128-gpus:2] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:256-input_output_len:512,32-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-maxbs:256-input_output_len:128,128-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:256-input_output_len:128,128-gpus:2] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:256-input_output_len:512,32-gpus:2] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:256-input_output_len:512,32-gpus:2] - - perf/test_perf.py::test_perf[llama_v2_13b-bench-float16-input_output_len:128,128-loras:8-gpus:2] - #Mistral-Small-3.1-24B-Instruct-2503 - - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:4096-maxnt:20000-input_output_len:20000,2000-reqs:300-con:200-gpus:2] TIMEOUT(120) - - perf/test_perf.py::test_perf[starcoder2_15b-bench-float16-input_output_len:512,200-gpus:4] - #llama_v3.1_405b_instruct_fp4 - - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:128,128-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,1000-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1024,2048-gpus:4] - - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-gpus:4] TIMEOUT(120) + #llama_v3.3_8b + - perf/test_perf.py::test_perf[llama_v3.3_8b_instruct-bench-pytorch-bfloat16-input_output_len:128,128] + - perf/test_perf.py::test_perf[llama_v3.3_8b-bench-bfloat16-maxbs:256-input_output_len:512,32-quant:fp8] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500] + - perf/test_perf.py::test_perf[llama_v3.3_8b-bench-bfloat16-input_output_len:128,128-quant:nvfp4-gpus:2] #llama_v3.3_70b_instruct_fp4 - 
perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:4] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-gpus:4] @@ -223,8 +158,7 @@ llm_perf_core: gte: 10.0 lte: 10.3 tests: - # for chunked prefill cases - - perf/test_perf.py::test_perf[deepseek_v3_lite_nvfp4-bench-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:3000,500-reqs:200] + # deepseek_r1_0528_fp4 with chunked prefill - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:5000,500-reqs:200-ep:4-tp:4-gpus:4] TIMEOUT(120) - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-maxbs:256-maxnt:1024-kv_frac:0.85-input_output_len:2000,2000-reqs:200-ep:4-tp:4-gpus:4] TIMEOUT(120) - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-maxbs:1000-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:2000-ep:4-tp:4-gpus:4] TIMEOUT(120) @@ -234,8 +168,7 @@ llm_perf_core: - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-maxbs:1000-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:20000-ep:4-tp:4-gpus:4] TIMEOUT(120) - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-maxbs:512-input_output_len:128,128-ep:4-tp:4-gpus:4] - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-streaming-float4-maxbs:512-input_output_len:128,128-ep:4-tp:4-gpus:4] - #deepseek_r1_nvfp4 with chunked prefill - - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:32-maxnt:4096-kv_frac:0.80-input_output_len:8192,512-reqs:3000-ep:2-tp:4-gpus:4] + # 8: B200, B300 test cases - condition: @@ -251,22 +184,6 @@ llm_perf_core: - perf/test_perf.py::test_perf[kimi_k2_nvfp4-bench-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.6-input_output_len:1000,1000-ep:8-tp:8-gpus:8] - perf/test_perf.py::test_perf[kimi_k2_nvfp4-bench-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.6-input_output_len:5000,500-reqs:2000-ep:8-tp:8-gpus:8] TIMEOUT(120) - # llama_v3.1_405b_instruct_fp4 - - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:128,128-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:512,32-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-tp:8-gpus:8] TIMEOUT(120) - - # deepseek_r1_fp8 - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] - - # deepseek_r1_0528_fp4 - - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,1000-reqs:20000-ep:8-tp:8-gpus:8] TIMEOUT(120) - - 
perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,2000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(120) - # gpt_oss_120b_fp4 - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:1280-con:256-ep:8-tp:8-gpus:8] - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:2560-con:512-ep:8-tp:8-gpus:8] @@ -275,6 +192,11 @@ llm_perf_core: - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:8-con:1-ep:8-tp:8-gpus:8] - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:100-con:32-ep:8-tp:8-gpus:8] + # deepseek_r1_0528_fp4 + - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,1000-reqs:20000-ep:8-tp:8-gpus:8] TIMEOUT(120) + - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,2000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(120) + + # 9: H100, H20, H200, B200, B300 test cases - condition: ranges: @@ -284,20 +206,6 @@ llm_perf_core: gte: 9.0 lt: 12.0 tests: - # E2E trtllm-bench - #mixtral_8x7b_v0.1_instruct - #pytorch backend - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-input_output_len:128,128-reqs:64-gpus:8] # timeout for a100 - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-input_output_len:128,128-reqs:10-con:50-gpus:8] # timeout for a100 - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-input_output_len:128,128-reqs:10-con:1-gpus:8] # timeout for a100 - # llama_v3.1_405b_fp8 - #pytorch backend - - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-maxbs:1-input_output_len:2000,500-reqs:8-con:1-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:1500-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:1000-tp:8-gpus:8] TIMEOUT(120) - - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-tp:8-gpus:8] #llama_v3.3_70b_instruct_fp8 #pytorch backend - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:1000-tp:8-gpus:8] TIMEOUT(120) @@ -314,18 +222,10 @@ llm_perf_core: gpu_memory: gt: 90000 tests: - #pytorch backend - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:32-input_output_len:128,128-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-streaming-float8-maxbs:32-input_output_len:128,128-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1024-maxnt:4096-input_output_len:1000,1000-reqs:3000-ep:8-tp:8-gpus:8] - - 
perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-con:1-ep:4-tp:8-gpus:8] #min latency test - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:128-maxnt:1127-input_output_len:1000,2000-reqs:5120-con:1024-ep:8-tp:8-gpus:8] TIMEOUT(80) #max throughput test + # deepseek_r1_0528 - perf/test_perf.py::test_perf[deepseek_r1_0528_fp8-bench-pytorch-float8-input_output_len:1000,1000-reqs:20000-ep:8-tp:8-gpus:8] TIMEOUT(120) - perf/test_perf.py::test_perf[deepseek_r1_0528_fp8-bench-pytorch-float8-input_output_len:1000,2000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(100) - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:10000-con:3072-ep:8-tp:8-gpus:8] TIMEOUT(120) #max throughput test - # for chunked prefill cases - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:5000,500-reqs:200-ep:8-tp:8-gpus:8] TIMEOUT(120) - - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:256-maxnt:1024-kv_frac:0.85-input_output_len:2000,2000-reqs:200-ep:8-tp:8-gpus:8] TIMEOUT(120) # 11: RTX-6000D, RTX-6000 Server test cases @@ -337,29 +237,20 @@ llm_perf_core: gte: 12.0 lte: 12.0 tests: - - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:nvfp4] + #llama_v3.3_8b + - perf/test_perf.py::test_perf[llama_v3.3_8b-bench-bfloat16-input_output_len:128,128-quant:nvfp4] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-kv_cache_dtype:fp8] - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] - - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-kv_cache_dtype:fp8] - #llama_v3.1_70b - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] - - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2] - - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-kv_cache_dtype:fp8] + - perf/test_perf.py::test_perf[llama_v3.3_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128] + - perf/test_perf.py::test_perf[llama_v3.3_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-kv_cache_dtype:fp8] + #llama_v3.3_70b + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-kv_cache_dtype:fp8] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-tp:2-gpus:2] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-kv_cache_dtype:fp8-tp:2-gpus:2] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:500,2000-tp:2-gpus:2] - 
perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:1000,1000-tp:2-gpus:2] - #llama_v3.3_nemotron_super_49b - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128-tp:2-gpus:2] - #deepseek_v3_lite - - perf/test_perf.py::test_perf[deepseek_v3_lite_nvfp4-bench-pytorch-float4-input_output_len:128,128] #max throughput test - - perf/test_perf.py::test_perf[deepseek_v3_lite_nvfp4-bench-pytorch-streaming-float4-input_output_len:128,128] - #mixtral_8x7b_v0.1 - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-float16-input_output_len:128,128-tp:2-gpus:2] - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-tp:2-gpus:2] - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-tp:2-gpus:2] - - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-kv_cache_dtype:fp8-tp:2-gpus:2] # 12: RTX6000-Server test cases @@ -371,13 +262,10 @@ llm_perf_core: gte: 12.0 lte: 12.0 tests: - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-input_output_len:500,2000-quant:fp8-con:250-gpus:8] - - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-bfloat16-input_output_len:500,2000-con:250-gpus:8] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-reqs:3000-tp:8-gpus:8] - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8] - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:8-ep:8-tp:8-gpus:8] - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:512-ep:8-tp:8-gpus:8] + # deepseek_r1_0528 - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] TIMEOUT(120) - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-streaming-float4-maxbs:512-maxnt:5220-input_output_len:4000,2000-reqs:512-ep:8-tp:8-gpus:8] - - perf/test_perf.py::test_perf[deepseek_v3_lite_nvfp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] - - perf/test_perf.py::test_perf[deepseek_v3_lite_nvfp4-bench-pytorch-float4-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:10000-con:3072-ep:8-tp:8-gpus:8] TIMEOUT(120) diff --git a/tests/integration/test_lists/qa/llm_spark_perf.yml b/tests/integration/test_lists/qa/llm_spark_perf.yml index 11100cc804ce..f9940781daa5 100644 --- a/tests/integration/test_lists/qa/llm_spark_perf.yml +++ b/tests/integration/test_lists/qa/llm_spark_perf.yml @@ -18,8 +18,10 @@ llm_spark_perf: - perf/test_perf.py::test_perf[gpt_oss_120b_eagle3_throughput-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1] - perf/test_perf.py::test_perf[gpt_oss_120b_eagle3_throughput-bench-pytorch-streaming-float4-maxbs:1-input_output_len:128,2048-reqs:1-con:1] - perf/test_perf.py::test_perf[nvidia_nemotron_nano_9b_v2_nvfp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1] - - perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-streaming-float4-maxbs:8-maxnt:8192-input_output_len:2048,512-kv_cache_dtype:fp8-reqs:1-con:1] - - 
perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-streaming-float4-maxbs:8-maxnt:8192-input_output_len:16384,512-kv_cache_dtype:fp8-reqs:1-con:1] + - perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-streaming-float4-maxbs:8-maxnt:4096-input_output_len:2048,256-kv_cache_dtype:fp8-reqs:1-con:1] + - perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4_mtp-serve-pytorch-streaming-float4-maxbs:8-maxnt:4096-input_output_len:2048,256-kv_cache_dtype:fp8-reqs:1-con:1] + - perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4-serve-pytorch-streaming-float4-maxbs:8-maxnt:4096-input_output_len:8192,512-kv_cache_dtype:fp8-reqs:1-con:1] + - perf/test_perf.py::test_perf[nemotron_3_super_120b_nvfp4_mtp-serve-pytorch-streaming-float4-maxbs:8-maxnt:4096-input_output_len:8192,512-kv_cache_dtype:fp8-reqs:1-con:1] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1] - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 947cf22a39b5..8125f7530f2b 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -282,7 +282,6 @@ unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend[act=Relu2-e60_ accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_piecewise_cuda_graph[mtp3_fp8kv_chunked] SKIP (https://nvbugs/5989920) accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5992113) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[use_temperature=False-attn_backend=TRTLLM] SKIP (https://nvbugs/5997547) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_python_scheduler[ep4-mtp_nextn=0] SKIP (https://nvbugs/5997051) accuracy/test_llm_api_pytorch.py::TestQwen3_5_35B_A3B::test_fp8 SKIP (https://nvbugs/6004530) unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=DEEPEP-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=TRTLLM-quant=NVFP4-routing=Renormalize] SKIP (https://nvbugs/6007285) accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[latency] SKIP (https://nvbugs/6012526)