diff --git a/examples/serve/aiperf_client.sh b/examples/serve/aiperf_client.sh index 8a150714de2e..d901f99cde14 100755 --- a/examples/serve/aiperf_client.sh +++ b/examples/serve/aiperf_client.sh @@ -2,7 +2,7 @@ aiperf profile \ -m TinyLlama-1.1B-Chat-v1.0 \ - --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --tokenizer ${AIPERF_TOKENIZER_PATH:-TinyLlama/TinyLlama-1.1B-Chat-v1.0} \ --endpoint-type chat \ --random-seed 123 \ --synthetic-input-tokens-mean 128 \ diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 9bc1ead4912c..ce5842d7c219 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -748,9 +748,9 @@ def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p } // End of Methods to run Slurm job with Jenkins Agent -def getNodeArgs(int nodeCount, int gpuCount) { +def getNodeArgs(int nodeCount, int gpuCount, boolean setSegment = false) { int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue() - return nodeCount == 1 ? [ + def args = nodeCount == 1 ? [ "--nodes=${nodeCount}", "--gpus=${gpuCount}" ] : [ @@ -759,6 +759,10 @@ def getNodeArgs(int nodeCount, int gpuCount) { "--ntasks-per-node=${gpusPerNode}", "--gpus-per-node=${gpusPerNode}", ] + if (setSegment && gpuCount > 1) { + args += ["--segment=${nodeCount}"] + } + return args } def getPytestBaseCommandLine( @@ -883,6 +887,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG // Create a unique suffix for the job name String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase() def jobUID = "${cluster.host}-multi_node_test-${customSuffix}" + def disaggMode = stageName.contains("Perf-Sanity-Disagg") + def setSegment = disaggMode Utils.exec(pipeline, script: "env | sort && pwd && ls -alh") @@ -914,6 +920,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG def llmSrcLocal = "${llmPath}/TensorRT-LLM/src" def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh" def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh" + def scriptInstallLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_install.sh" + def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh" def testListPathNode = "${jobWorkspace}/${testList}.txt" def waivesListPathNode = "${jobWorkspace}/waives.txt" def outputPath = "${jobWorkspace}/job-output.log" @@ -940,6 +948,15 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG true ) + Utils.exec(pipeline, script: "echo \"Script to install environment: \" && cat ${scriptInstallLocalPath}") + Utils.copyFileToRemoteHost( + pipeline, + remote, + scriptInstallLocalPath, + scriptInstallPathNode, + true + ) + // Generate Test List and Upload to Frontend Node def makoArgs = getMakoArgsFromStageName(stageName, true) // TODO: currently the options will only be processed if the first @@ -1013,7 +1030,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG // Generate Job Launch Script def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#") def mounts = getMountListForSlurmTest(cluster, true).join(",") - String[] taskArgs = getNodeArgs(nodeCount, gpuCount) + String[] taskArgs = getNodeArgs(nodeCount, gpuCount, setSegment) if (taskArgs == null) { error "Invalid Slurm test stage name is set" } @@ -1083,10 +1100,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG 
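For reference, a minimal Python sketch of the updated `getNodeArgs` logic above: the new optional `setSegment` flag appends a `--segment` argument sized to the node count. This is an illustrative helper, not part of the change, and the multi-node argument list is abbreviated to the flags visible in this hunk.

```python
import math

def get_node_args(node_count: int, gpu_count: int, set_segment: bool = False) -> list:
    """Sketch of jenkins/L0_Test.groovy getNodeArgs() after this change."""
    gpus_per_node = math.ceil(gpu_count / node_count)
    if node_count == 1:
        args = [f"--nodes={node_count}", f"--gpus={gpu_count}"]
    else:
        # Multi-node branch abbreviated to the flags shown in the hunk context.
        args = [
            f"--ntasks-per-node={gpus_per_node}",
            f"--gpus-per-node={gpus_per_node}",
        ]
    # New behavior: when setSegment is true and more than one GPU is requested,
    # a --segment flag sized to the node count is appended for the sbatch job.
    if set_segment and gpu_count > 1:
        args.append(f"--segment={node_count}")
    return args

# Example: a 3-node / 12-GPU disaggregated stage also gets --segment=3.
print(get_node_args(3, 12, set_segment=True))
```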
envVarsToExport.each { varName, varValue -> srunArgs.add("--container-env=${varName}") } - if(nodeCount > 1) { - srunArgs.add("--mpi=pmi2") - } - def exemptionComment = "" if (cluster.host.contains("oci-nrt") || cluster.host.contains("oci-hsg") || cluster.host.contains("lbd-lax")) { exemptionComment = """--comment='{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"90","reason":"other","description":"Long data and model loading time and disaggregated serving tests"}}'""" @@ -1102,8 +1115,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG "export ${varName}=\"${escapedValue}\"" }.join('\n') - def scriptContent = """#!/bin/bash - #SBATCH ${exemptionComment} --output=${outputPath} + def scriptLaunchPrefix = """#!/bin/bash + #SBATCH ${exemptionComment} + #SBATCH --output=${outputPath} ${taskArgs.collect { "#SBATCH $it" }.join('\n')} #SBATCH ${partition.additionalArgs} ${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""} @@ -1128,10 +1142,48 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG echo "Env NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES" ${srunPrologue} - - srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode} """.replaceAll("(?m)^\\s*", "") - pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent) + + if (disaggMode) { + if(nodeCount > 1) { + srunArgs.add("--mpi=pmix") + } + + def scriptLaunchPrefixPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch_prefix.sh") + def scriptLaunchSrunArgsPathLocal = Utils.createTempLocation(pipeline, "./slurm_srun_args.txt") + def scriptLaunchDraftPathLocal = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh" + def scriptSubmitLocalPath = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/submit.py" + + pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix) + pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" ")) + Utils.exec(pipeline, script: "echo \"Script launch prefix: \" && cat ${scriptLaunchPrefixPathLocal}") + Utils.exec(pipeline, script: "echo \"Srun args content: \" && cat ${scriptLaunchSrunArgsPathLocal}") + + // Output is the corresponding scriptLaunchPathLocal script under the disaggMode + sh """ + python3 ${scriptSubmitLocalPath} \\ + --run-ci \\ + --llm-src ${llmSrcLocal} \\ + --test-list ${testListPathLocal} \\ + --draft-launch-sh ${scriptLaunchDraftPathLocal} \\ + --launch-sh ${scriptLaunchPathLocal} \\ + --run-sh ${scriptRunPathNode} \\ + --install-sh ${scriptInstallPathNode} \\ + --script-prefix ${scriptLaunchPrefixPathLocal} \\ + --srun-args ${scriptLaunchSrunArgsPathLocal} + """ + } else { + if(nodeCount > 1) { + srunArgs.add("--mpi=pmi2") + } + + def scriptContent = """ + ${scriptLaunchPrefix} + srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode} + """.replaceAll("(?m)^\\s*", "") + pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent) + } + Utils.exec(pipeline, script: "echo \"Script to trigger Slurm sbatch job: \" && cat ${scriptLaunchPathLocal}") Utils.copyFileToRemoteHost( pipeline, @@ -2634,7 +2686,6 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO if (noRegularTests && noIsolateTests) { error "No tests were executed for stage ${stageName}, please check the test list and test-db rendering result." 
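The branch above selects PMIx for the new disaggregated perf-sanity stages and keeps PMI-2 elsewhere; only the aggregated path still writes the single-`srun` launch script inline, while the disaggregated path delegates assembly to `submit.py`. A rough Python sketch of that selection, with a hypothetical helper name and illustrative argument values:

```python
def build_aggregated_launch_script(stage_name: str, node_count: int,
                                   prefix: str, srun_args: list,
                                   run_script_path: str) -> str:
    """Sketch of the non-disaggregated branch above; disaggregated stages are
    instead assembled by jenkins/scripts/perf/disaggregated/submit.py."""
    disagg_mode = "Perf-Sanity-Disagg" in stage_name
    args = list(srun_args)
    if node_count > 1:
        # pmix for disaggregated perf-sanity stages, pmi2 otherwise.
        args.append("--mpi=pmix" if disagg_mode else "--mpi=pmi2")
    if disagg_mode:
        raise RuntimeError("disaggregated stages are assembled by submit.py")
    return f"{prefix}\nsrun --kill-on-bad-exit=1 {' '.join(args)} {run_script_path}\n"

print(build_aggregated_launch_script(
    "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1", 2,
    "#!/bin/bash\n#SBATCH --output=job-output.log",
    ["--container-env=stageName"], "/workspace/slurm_run.sh"))
```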
} - } } @@ -2653,7 +2704,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO stage("Check perf result") { def perfCheckResult = sh( script: """ - python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \ + python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \ ${stageName}/perf_script_test_results.csv \ ${basePerfPath} """, @@ -2672,6 +2723,22 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO """ } } + + if (perfMode && stageName.contains("Perf-Sanity")) { + stage ("Check perf result") { + def perfCheckResult = sh( + script: """ + python3 ${llmSrc}/tests/integration/defs/perf/perf_regression_check.py \ + ${WORKSPACE}/${stageName} + """, + returnStatus: true + ) + // TODO: Enable this when perf regression check is stable + // if (perfCheckResult != 0) { + // error "Performance regression detected and failing the build (exit code: ${perfCheckResult})" + // } + } + } } } @@ -3111,8 +3178,13 @@ def launchTestJobs(pipeline, testFilter) "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2], - // Perf sanity post merge test - "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_perf_sanity", 1, 1, 8, 2], + // Perf sanity post merge aggr tests + "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001", 1, 1, 8, 2], + // Perf sanity post merge disagg tests + "GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001", 1, 1, 12, 3], + // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001", 1, 1, 24, 6], + // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002", 1, 1, 24, 6], + // "GB200-32_GPUs-8_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001", 1, 1, 32, 8], ] fullSet += multiNodesSBSAConfigs.keySet() diff --git a/jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh b/jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh new file mode 100644 index 000000000000..c5dc80c971a2 --- /dev/null +++ b/jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh @@ -0,0 +1,76 @@ + +cleanup_on_failure() { + echo "Error: $1" + scancel ${SLURM_JOB_ID} + exit 1 +} + +mkdir -p $jobWorkspace +chmod +x $runScript +chmod +x $installScript + +# Run installation on all nodes +echo "Running installation on all nodes..." +if ! srun "${srunArgs[@]}" $installScript &> $jobWorkspace/install.log; then + cleanup_on_failure "Failed to run installation. Check $jobWorkspace/install.log" +fi +echo "Installation completed on all nodes" + +# Start gen servers +echo "Starting gen servers..." 
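The gen/ctx loops that follow fan the workers out with backgrounded `srun` calls, distinguishing roles through the `DISAGG_SERVING_TYPE` environment variable. A hedged Python equivalent of that fan-out is sketched below; the real pipeline keeps this in bash, the names mirror the shell variables, and a Slurm environment is assumed.

```python
import os
import subprocess

def launch_workers(kind, num_servers, nodes_per_server, gpus_per_node,
                   srun_args, run_script, workspace):
    """Sketch of the backgrounded gen/ctx srun fan-out in the loops below."""
    procs = []
    for i in range(num_servers):
        # Each worker gets its role (GEN_i / CTX_i) and the worker pytest command.
        env = dict(os.environ,
                   DISAGG_SERVING_TYPE=f"{kind}_{i}",
                   pytestCommand=os.environ.get("pytestCommandWorker", ""))
        world_size = nodes_per_server * gpus_per_node
        log = open(os.path.join(workspace, f"{kind.lower()}_server_{i}.log"), "w")
        procs.append(subprocess.Popen(
            ["srun", *srun_args, "--kill-on-bad-exit=1",
             "-N", str(nodes_per_server),
             f"--ntasks={world_size}",
             f"--ntasks-per-node={gpus_per_node}",
             run_script],
            env=env, stdout=log, stderr=subprocess.STDOUT))
    return procs

# e.g. launch_workers("GEN", num_gen_servers, nodes_per_gen_server,
#                     gpus_per_node, srun_args, run_script, job_workspace)
```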
+for i in $(seq 0 $((numGenServers - 1))); do + gen_world_size=$((nodesPerGenServer * gpusPerNode)) + export DISAGG_SERVING_TYPE="GEN_$i" + export pytestCommand="$pytestCommandWorker" + srun "${srunArgs[@]}" --kill-on-bad-exit=1 \ + -N $nodesPerGenServer \ + --ntasks=$gen_world_size \ + --ntasks-per-node=$gpusPerNode \ + $runScript &> $jobWorkspace/gen_server_$i.log & + echo "Started gen server $i" +done + +# Start ctx servers (skip if gen_only mode) +if [ "${TRTLLM_DISAGG_BENCHMARK_GEN_ONLY:-0}" != "1" ]; then + echo "Starting ctx servers..." + for i in $(seq 0 $((numCtxServers - 1))); do + ctx_world_size=$((nodesPerCtxServer * gpusPerNode)) + export DISAGG_SERVING_TYPE="CTX_$i" + export pytestCommand="$pytestCommandWorker" + srun "${srunArgs[@]}" --kill-on-bad-exit=1 \ + -N $nodesPerCtxServer \ + --ntasks=$ctx_world_size \ + --ntasks-per-node=$gpusPerNode \ + $runScript &> $jobWorkspace/ctx_server_$i.log & + echo "Started ctx server $i" + done +else + echo "Skipping ctx servers (gen_only mode)" +fi + + +# Start disagg server +echo "Starting disagg server..." +export DISAGG_SERVING_TYPE="DISAGG_SERVER" +export pytestCommand="$pytestCommandDisaggServer" +srun "${srunArgs[@]}" --kill-on-bad-exit=1 --overlap \ + -N 1 \ + --ntasks=1 \ + --ntasks-per-node=1 \ + $runScript &> $jobWorkspace/disagg_server.log & +echo "Started disagg server" + +# Start benchmark +echo "Starting benchmark..." +export DISAGG_SERVING_TYPE="BENCHMARK" +export pytestCommand="$pytestCommandBenchmark" +if ! srun "${srunArgs[@]}" --kill-on-bad-exit=1 --overlap \ + -N 1 \ + --ntasks=1 \ + --ntasks-per-node=1 \ + $runScript; then + cleanup_on_failure "Benchmark failed. Check logs in ${jobWorkspace} for details" +fi + +echo "Disagg server and benchmark completed successfully" +echo "Total runtime: $SECONDS seconds" diff --git a/jenkins/scripts/perf/disaggregated/submit.py b/jenkins/scripts/perf/disaggregated/submit.py new file mode 100644 index 000000000000..5e8e374f4f08 --- /dev/null +++ b/jenkins/scripts/perf/disaggregated/submit.py @@ -0,0 +1,292 @@ +#!/usr/bin/env python3 +import argparse +import os + +import yaml + + +def get_hardware_config(config, benchmark_mode): + hardware = config.get("hardware", {}) + worker_config = config.get("worker_config", {}) + + num_ctx_servers = 0 if "gen_only" in benchmark_mode else hardware.get("num_ctx_servers") + num_gen_servers = hardware.get("num_gen_servers") + gpus_per_node = hardware.get("gpus_per_node") + + # Get gpus_per_ctx_server and gpus_per_gen_server from worker_config's tensor_parallel_size + ctx_config = worker_config.get("ctx", {}) + gen_config = worker_config.get("gen", {}) + ctx_tp = ctx_config.get("tensor_parallel_size", 1) + ctx_pp = ctx_config.get("pipeline_parallel_size", 1) + ctx_cp = ctx_config.get("context_parallel_size", 1) + gpus_per_ctx_server = ctx_tp * ctx_pp * ctx_cp + gen_tp = gen_config.get("tensor_parallel_size", 1) + gen_pp = gen_config.get("pipeline_parallel_size", 1) + gen_cp = gen_config.get("context_parallel_size", 1) + gpus_per_gen_server = gen_tp * gen_pp * gen_cp + + if None in [ + num_ctx_servers, + num_gen_servers, + gpus_per_node, + gpus_per_ctx_server, + gpus_per_gen_server, + ]: + raise ValueError("Missing required hardware configuration") + + # Calculate nodes per server + nodes_per_ctx_server = (gpus_per_ctx_server + gpus_per_node - 1) // gpus_per_node + nodes_per_gen_server = (gpus_per_gen_server + gpus_per_node - 1) // gpus_per_node + + total_nodes = num_ctx_servers * nodes_per_ctx_server + num_gen_servers * nodes_per_gen_server + 
total_gpus = total_nodes * gpus_per_node + + return { + "num_ctx_servers": num_ctx_servers, + "num_gen_servers": num_gen_servers, + "gpus_per_node": gpus_per_node, + "gpus_per_ctx_server": gpus_per_ctx_server, + "gpus_per_gen_server": gpus_per_gen_server, + "nodes_per_ctx_server": nodes_per_ctx_server, + "nodes_per_gen_server": nodes_per_gen_server, + "total_nodes": total_nodes, + "total_gpus": total_gpus, + } + + +def get_env_config(config): + env = config.get("environment", {}) + + container = env.get("container_image", "") + mounts = env.get("container_mount", "") + workdir = env.get("container_workdir", "") + llm_models_root = env.get("llm_models_root", "") + llmsrc = env.get("trtllm_repo", "") + build_wheel = env.get("build_wheel", False) + # Use work_dir as job_workspace + job_workspace = env.get("work_dir", "") + worker_env_var = env.get("worker_env_var", "") + server_env_var = env.get("server_env_var", "") + benchmark_env_var = env.get("benchmark_env_var", "") + open_search_db_base_url = env.get("open_search_db_base_url", "") + + return { + "container": container, + "mounts": mounts, + "workdir": workdir, + "llm_models_root": llm_models_root, + "llmsrc": llmsrc, + "build_wheel": build_wheel, + "job_workspace": job_workspace, + "worker_env_var": worker_env_var, + "server_env_var": server_env_var, + "benchmark_env_var": benchmark_env_var, + "open_search_db_base_url": open_search_db_base_url, + } + + +def get_benchmark_config(config): + benchmark = config.get("benchmark", {}) + + mode = benchmark.get("mode", "e2e") + concurrency_str = benchmark.get("concurrency_list", "1") + concurrency = int(concurrency_str) if isinstance(concurrency_str, str) else concurrency_str + + return { + "mode": mode, + "concurrency": concurrency, + } + + +def remove_whitespace_lines(lines): + return [line.strip() for line in lines if line.strip()] + + +def get_pytest_command_no_llmapilaunch(script_prefix_lines): + pytest_command_line = None + for line in script_prefix_lines: + if "export pytestCommand=" in line: + pytest_command_line = line + break + + if not pytest_command_line: + return "" + + # Replace pytestCommand with pytestCommandNoLLMAPILaunch + replaced_line = pytest_command_line.replace("pytestCommand", "pytestCommandNoLLMAPILaunch") + + # Split by space, find and remove the substring with trtllm-llmapi-launch + replaced_line_parts = replaced_line.split() + replaced_line_parts_no_llmapi = [ + part for part in replaced_line_parts if "trtllm-llmapi-launch" not in part + ] + return " ".join(replaced_line_parts_no_llmapi) + + +def get_config_yaml(test_list_path, llm_src): + with open(test_list_path, "r") as f: + first_line = f.readline().strip() + + if "[" not in first_line or "]" not in first_line: + raise ValueError( + f"Invalid test list format. Expected test name with brackets: {first_line}" + ) + bracket_content = first_line.split("[")[-1].split("]")[0] + parts = bracket_content.split("-") + if len(parts) < 2: + raise ValueError( + f"Invalid test name format. Expected format: prefix-config_name, got: {bracket_content}" + ) + + # parts[0] is the prefix, parts[1:] is the config name + if "disagg" not in parts[0]: + raise ValueError( + f"Invalid test name format. 
Expected format: disagg-config_name, got: {bracket_content}" + ) + config_base_name = "-".join(parts[1:]) + config_yaml_path = os.path.join( + llm_src, + "tests", + "integration", + "defs", + "perf", + "disagg", + "test_configs", + "disagg", + "perf", + f"{config_base_name}.yaml", + ) + if not os.path.exists(config_yaml_path): + raise FileNotFoundError(f"Config file not found: {config_yaml_path}") + return config_yaml_path + + +def main(): + parser = argparse.ArgumentParser( + description="Generate SLURM launch script for both CI and local modes" + ) + parser.add_argument( + "--run-ci", + action="store_true", + default=False, + help="Run in CI mode (true) or local mode (false)", + ) + parser.add_argument("--draft-launch-sh", required=True, help="Path to draft-launch.sh script") + parser.add_argument("--launch-sh", required=True, help="Path to output launch.sh script") + parser.add_argument("--run-sh", required=True, help="Path to slurm_run.sh script") + parser.add_argument("--install-sh", required=True, help="Path to slurm_install.sh script") + + # Optional arguments for local mode + parser.add_argument("--config-yaml", default="", help="Path to config YAML file") + parser.add_argument("--stage-name", default="", help="Stage name (optional, local mode only)") + + # Optional arguments for CI mode + parser.add_argument("--llm-src", default="", help="Path to LLM source code") + parser.add_argument("--test-list", default="", help="Path to test list file") + parser.add_argument( + "--script-prefix", + default="", + help="Launch script prefix file path (optional, CI mode only)", + ) + parser.add_argument( + "--srun-args", + default="", + help="Path to file containing srun args (optional, CI mode only)", + ) + + args = parser.parse_args() + + config_yaml = get_config_yaml(args.test_list, args.llm_src) + + with open(config_yaml, "r") as f: + config = yaml.safe_load(f) + + # Determine install script path + install_script = args.install_sh + + env_config = get_env_config(config) + print(f"Environment configuration: {env_config}") + + benchmark_config = get_benchmark_config(config) + print(f"Benchmark configuration: {benchmark_config}") + benchmark_mode = benchmark_config["mode"] + + hardware_config = get_hardware_config(config, benchmark_mode) + print(f"Hardware configuration: {hardware_config}") + + script_prefix_lines = [] + srun_args_lines = [] + + with open(args.script_prefix, "r") as f: + script_prefix_content = f.read() + script_prefix_lines = script_prefix_content.split("\n") + with open(args.srun_args, "r") as f: + srun_args_content = f.read() + + srun_args_lines = srun_args_content.split() + + # Extract pytestCommand and generate pytestCommandNoLLMAPILaunch + pytest_command_no_llmapi_launch = get_pytest_command_no_llmapilaunch(script_prefix_lines) + + # Build worker env vars, add extra env vars for gen_only mode + worker_env_vars = env_config["worker_env_var"] + server_env_vars = env_config["server_env_var"] + if "gen_only" in benchmark_config["mode"]: + concurrency = benchmark_config["concurrency"] + worker_env_vars = ( + "TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 " + f"TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP=1 " + f"TLLM_BENCHMARK_REQ_QUEUES_SIZE={concurrency} {worker_env_vars}" + ) + server_env_vars = f"TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 {server_env_vars}" + script_prefix_lines.append("export TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1") + srun_args_lines.append("--container-env=TRTLLM_DISAGG_BENCHMARK_GEN_ONLY") + + script_prefix_lines.extend( + [ + pytest_command_no_llmapi_launch, + f'export 
pytestCommandWorker="unset UCX_TLS && {worker_env_vars} $pytestCommand"', + f'export pytestCommandDisaggServer="{server_env_vars} $pytestCommandNoLLMAPILaunch"', + f'export pytestCommandBenchmark="{env_config["benchmark_env_var"]} $pytestCommandNoLLMAPILaunch"', + f"export runScript={args.run_sh}", + f"export installScript={install_script}", + f"export numCtxServers={hardware_config['num_ctx_servers']}", + f"export numGenServers={hardware_config['num_gen_servers']}", + f"export gpusPerNode={hardware_config['gpus_per_node']}", + f"export gpusPerCtxServer={hardware_config['gpus_per_ctx_server']}", + f"export gpusPerGenServer={hardware_config['gpus_per_gen_server']}", + f"export nodesPerCtxServer={hardware_config['nodes_per_ctx_server']}", + f"export nodesPerGenServer={hardware_config['nodes_per_gen_server']}", + f"export totalNodes={hardware_config['total_nodes']}", + f"export totalGpus={hardware_config['total_gpus']}", + ] + ) + + remove_whitespace_lines(script_prefix_lines) + script_prefix = "\n".join(script_prefix_lines) + + remove_whitespace_lines(srun_args_lines) + srun_args_lines.extend( + [ + "--container-env=DISAGG_SERVING_TYPE", + "--container-env=pytestCommand", + ] + ) + srun_args_lines = ["srunArgs=("] + [f' "{line}"' for line in srun_args_lines] + [")"] + srun_args = "\n".join(srun_args_lines) + + with open(args.draft_launch_sh, "r") as f: + draft_launch_content = f.read() + draft_launch_lines = draft_launch_content.split("\n") + remove_whitespace_lines(draft_launch_lines) + draft_launch_content = "\n".join(draft_launch_lines) + + with open(args.launch_sh, "w") as f: + f.write(f"{script_prefix}\n{srun_args}\n{draft_launch_content}") + + print(f"Launch script generated at: {args.launch_sh}") + print(f"Launch script:\n{script_prefix}\n{srun_args}\n{draft_launch_content}") + + +if __name__ == "__main__": + main() diff --git a/jenkins/scripts/slurm_install.sh b/jenkins/scripts/slurm_install.sh new file mode 100644 index 000000000000..00fcd2b0935e --- /dev/null +++ b/jenkins/scripts/slurm_install.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# Set up error handling +set -Eeuo pipefail +trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR + +slurm_install_setup() { + cd $resourcePathNode + llmSrcNode=$resourcePathNode/TensorRT-LLM/src + + if [ $SLURM_LOCALID -eq 0 ]; then + wget -nv $llmTarfile + tar -zxf $tarName + which python3 + python3 --version + apt-get install -y libffi-dev + nvidia-smi && nvidia-smi -q && nvidia-smi topo -m + if [[ $pytestCommand == *--run-ray* ]]; then + pip3 install --retries 10 ray[default] + fi + cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt + cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl + gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true) + hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}" + echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName" + touch install_lock.lock + else + while [ ! 
-f install_lock.lock ]; do + sleep 5 + done + fi +} + +# Only run slurm_install_setup when script is executed directly (not sourced) +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + slurm_install_setup +fi diff --git a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh index ce2712881b7c..e86092b7ea2e 100755 --- a/jenkins/scripts/slurm_run.sh +++ b/jenkins/scripts/slurm_run.sh @@ -39,26 +39,12 @@ if [ $SLURM_PROCID -eq 0 ]; then fi fi -if [ $SLURM_LOCALID -eq 0 ]; then - wget -nv $llmTarfile - tar -zxf $tarName - which python3 - python3 --version - apt-get install -y libffi-dev - nvidia-smi && nvidia-smi -q && nvidia-smi topo -m - if [[ $pytestCommand == *--run-ray* ]]; then - pip3 install --retries 10 ray[default] - fi - cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt - cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl - gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true) - hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}" - echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName" - touch install_lock.lock -else - while [ ! -f install_lock.lock ]; do - sleep 5 - done +# Aggregated mode will run install together with pytest in slurm_run.sh +# Disaggregated mode will run install separately in slurm_install.sh +if [[ "$stageName" != *Disagg* ]]; then + installScriptPath="$(dirname "${BASH_SOURCE[0]}")/$(basename "${BASH_SOURCE[0]}" | sed 's/slurm_run\.sh/slurm_install.sh/')" + source "$installScriptPath" + slurm_install_setup fi if [[ "$stageName" == *GB200* ]]; then @@ -131,3 +117,9 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Pe --files $stageName/perf_script_test_results.csv \ $basePerfPath fi + +if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" == *Perf-Sanity* ]]; then + echo "Check Perf-Sanity Result" + python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \ + $jobWorkspace +fi diff --git a/tensorrt_llm/serve/harmony_adapter.py b/tensorrt_llm/serve/harmony_adapter.py index 9299b2b68aea..1ba163581edc 100644 --- a/tensorrt_llm/serve/harmony_adapter.py +++ b/tensorrt_llm/serve/harmony_adapter.py @@ -217,8 +217,9 @@ def _create_delta_from_parser_state(self) -> dict[str, Any] | None: # Check if tool is allowed if self.should_filter_tools and func_name not in self.available_tools: - logger.debug("Request %s: tool %s not in available tools", - self.request_id, func_name) + logger.debug( + f"Request {self.request_id}: tool {func_name} not in available tools" + ) return None # Get or create tool call @@ -273,8 +274,9 @@ def _create_delta_from_parser_state(self) -> dict[str, Any] | None: else: return {"content": self.parser.last_content_delta} else: - logger.debug("Request %s: no delta generated for channel=%s", - self.request_id, self.parser.current_channel) + logger.debug( + f"Request {self.request_id}: no delta generated for channel={self.parser.current_channel}" + ) return None def _get_or_create_tool_call(self, func_name: str) -> str: @@ -295,8 +297,9 @@ def _get_or_create_tool_call(self, func_name: str) -> str: "active": True } self.tool_call_index += 1 - logger.debug("Request %s: created new tool call %s for function %s", - self.request_id, tool_id, func_name) + logger.debug( + f"Request {self.request_id}: created new tool call {tool_id} for function {func_name}" + ) return tool_id def get_debug_info(self) -> dict[str, Any]: @@ -896,8 +899,8 @@ def 
_parse_tool_call_from_harmony_message( } except json.JSONDecodeError: logger.warning( - "Failed to parse tool call arguments as JSON: %s", - function_call_args) + f"Failed to parse tool call arguments as JSON: {function_call_args}" + ) return None elif msg_content_type and "code" in msg_content_type: function_name = str(msg_recipient) @@ -1023,10 +1026,11 @@ def harmony_output_to_openai( except (HarmonyError, UnicodeDecodeError, ValueError) as parse_error: logger.warning( - "Failed to parse harmony messages from tokens: %s", - parse_error) - logger.debug("Problematic clean tokens (%d): %s", - len(clean_tokens), clean_tokens) + f"Failed to parse harmony messages from tokens: {parse_error}" + ) + logger.debug( + f"Problematic clean tokens ({len(clean_tokens)}): {clean_tokens}" + ) # Fallback to raw text parsing raise RuntimeError(f"Harmony parsing failed: {parse_error}" ) # This will be caught by outer try-catch @@ -1103,9 +1107,9 @@ def harmony_output_to_openai( except Exception as e: raw_text = self._safe_decode_utf8(harmony_output_tokens, "HARMONY _OUTPUT: ") - logger.warning("Failed to parse harmony output: %s. Raw output: %s", - e, raw_text) - logger.debug("Detailed error: %s", traceback.format_exc()) + logger.warning( + f"Failed to parse harmony output: {e}. Raw output: {raw_text}") + logger.debug(f"Detailed error: {traceback.format_exc()}") # Check if raw_text contains a decode error (fallback content) if "HARMONY_OUTPUT:" in raw_text: @@ -1276,9 +1280,9 @@ def stateful_stream_harmony_tokens_to_openai_deltas( return deltas except (HarmonyError, UnicodeDecodeError, ValueError): logger.error( - f"Streaming: Failed to process token batch of {len(tokens)} tokens for request {request_id}", + f"Streaming: Failed to process token batch of {len(tokens)} tokens for request {request_id}" ) - logger.debug("Problematic streaming tokens: %s", tokens) + logger.debug(f"Problematic streaming tokens: {tokens}") # Return empty deltas to continue processing return [] @@ -1457,8 +1461,8 @@ def create_stream_state( """ if request_id in self._stream_states: logger.warning( - "Stream state already exists for request %s, replacing", - request_id) + f"Stream state already exists for request {request_id}, replacing" + ) stream_state = HarmonyStreamState( request_id=request_id, @@ -1494,7 +1498,7 @@ def _filter_tool_calls( # Filter unavailable external tools if should_filter_external_tools and func_name not in external_tools: - logger.debug("Filtered unavailable tool call: %s", func_name) + logger.debug(f"Filtered unavailable tool call: {func_name}") continue filtered.append(tool_call) @@ -1644,7 +1648,7 @@ def handle_non_streaming_response(tools: List[ChatCompletionToolsParam], output.token_ids, tools_for_parser, tool_choice) # CONVERTED OUTPUT (after harmony to openai conversion) - logger.debug("✅ CONVERTED OUTPUT: %s", json.dumps(parsed_output, indent=2)) + logger.debug(f"✅ CONVERTED OUTPUT: {json.dumps(parsed_output, indent=2)}") # Create response message response_message = _create_response_message(parsed_output) diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 2d6b02897d52..c52911fe00bb 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -1091,11 +1091,13 @@ def test_auto_dtype(self, block_reuse, mocker): "max_attention_window": [128, 32768], "enable_block_reuse": block_reuse, "enable_partial_reuse": False, + 
"free_gpu_memory_fraction": 0.5, } gen_server_config["kv_cache_config"] = { "max_attention_window": [128, 32768], "enable_block_reuse": block_reuse, "enable_partial_reuse": False, + "free_gpu_memory_fraction": 0.5, } disaggregated_server_config = { "hostname": "localhost", diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index ab9c1591c178..66b07ebf7b68 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -4369,6 +4369,11 @@ def test_eagle3_4gpus(self, moe_backend, one_model, overlap_scheduler, "https://nvbugs/5636916: Remaining Hopper Eagle Accuracy Issue for only TP=4" ) + if not one_model and overlap_scheduler: + pytest.skip( + "https://nvbugs/5745152: two_model + overlap_scheduler can sometimes time out." + ) + MAX_OUTPUT_LEN = 128179 MAX_INPUT_LEN = 32768 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml index 90a198897b6c..23340d1f506c 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml index 120fc40b3c2e..774f321175a3 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml new file mode 100644 index 000000000000..387704da4a16 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml @@ -0,0 +1,105 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: -1 +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '6144' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 
+environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + context_parallel_size: 1 + max_batch_size: 768 + max_num_tokens: 768 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + ctx: + max_batch_size: 16 + max_num_tokens: 16896 + max_seq_len: 2044 + tensor_parallel_size: 4 + context_parallel_size: 1 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml index 6a4f5f5ddfeb..1eaa085ba6c6 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml index e8f1b31a4117..46072b7585b8 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml index 2f9d1ad7c8ab..2851dc8ce405 100644 --- 
a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml index e60204a5624b..390e68f26015 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml index a307a87f173f..cd29bd85102b 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml index d44c4d51e069..b9f7881c60df 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX.yaml new file mode 100644 index 000000000000..b6299357d2ad --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX.yaml @@ -0,0 +1,122 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: -1 +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: gen_only + 
use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 9256 + tensor_parallel_size: 4 + context_parallel_size: 1 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml index 05c6794dd63e..14c52a0fd8c5 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml index 10aa98c4b30d..532753822974 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml @@ -1,5 +1,5 @@ metadata: - model_name: 
deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml index 64dd806fa6df..56e02bd4e2ee 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml index b0b731322616..c94a0698b97c 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml index 796fdbd8747c..9f85592d0aee 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml index 4a45880f1477..d1f20e16054a 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml index bc46d9fea34b..923b72aa33a4 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml +++ 
b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml index c397316b3559..fc47dd9bdb17 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/open_search_db_utils.py b/tests/integration/defs/perf/open_search_db_utils.py index 8a4caa704e3e..87f0b0fed629 100644 --- a/tests/integration/defs/perf/open_search_db_utils.py +++ b/tests/integration/defs/perf/open_search_db_utils.py @@ -33,6 +33,8 @@ PROJECT_ROOT = "sandbox-temp-trtllm-ci-perf-v1" # "sandbox-trtllm-ci-perf" TEST_INFO_PROJECT_NAME = f"{PROJECT_ROOT}-test_info" +PRE_MERGE_THRESHOLD = 0.1 +POST_MERGE_THRESHOLD = 0.05 # Metrics where larger is better MAXIMIZE_METRICS = [ @@ -268,24 +270,7 @@ def match(history_data, new_data, match_keys): def is_empty(value): return value is None or value == "" - def should_skip_field(field): - # Skip fields starting with @, _, ts_ - if field.startswith('@') or field.startswith('_') or field.startswith( - 'ts_'): - return True - # Skip log links and speculative_model_dir and job configs - if field in [ - 's_speculative_model_dir', 's_server_log_link', - 's_ctx_server_log_link', 's_gen_server_log_link', - 's_client_log_link' - ]: - return True - return False - for field in match_keys: - # Skip excluded fields - if should_skip_field(field): - continue history_value = history_data.get(field, None) new_value = new_data.get(field, None) if is_empty(history_value) and is_empty(new_value): @@ -412,6 +397,33 @@ def parse_timestamp(timestamp): return history_baseline_dict, history_data_dict +def get_threshold(baseline_data, metric): + """ + Get the threshold for a metric from baseline data. + """ + is_post_merge = baseline_data.get("b_is_post_merge", False) + + metric_suffix = metric[2:] # Remove "d_" prefix + if is_post_merge: + threshold_key = f"d_threshold_post_merge_{metric_suffix}" + else: + threshold_key = f"d_threshold_pre_merge_{metric_suffix}" + + # Try to get the specific threshold (post_merge or pre_merge) + if threshold_key in baseline_data: + return baseline_data[threshold_key] + + # Fall back to general threshold + fallback_key = f"d_threshold_{metric_suffix}" + if fallback_key in baseline_data: + return baseline_data[fallback_key] + + # No threshold found, raise error + raise KeyError( + f"No threshold found for metric '{metric}'. " + f"Expected '{threshold_key}' or '{fallback_key}' in baseline data.") + + def prepare_regressive_test_cases(history_baseline_dict, new_data_dict): """Get regressive test cases 1. 
For Maximize metrics, if new perf is below baseline * (1 - threshold) @@ -419,8 +431,9 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict): Set it as regressive. """ regressive_data_list = [] + cmd_idxs = new_data_dict.keys() # Find regressive test cases - for cmd_idx in new_data_dict: + for cmd_idx in cmd_idxs: if history_baseline_dict[cmd_idx] is None: continue @@ -433,8 +446,7 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict): for metric in MAXIMIZE_METRICS: if metric not in new_data or metric not in baseline_data: continue - threshold_key = f"d_threshold_{metric[2:]}" - threshold = baseline_data[threshold_key] + threshold = get_threshold(baseline_data, metric) baseline_value = baseline_data[metric] new_value = new_data[metric] # Regressive if new_value < baseline_value * (1 - threshold) @@ -446,8 +458,7 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict): for metric in MINIMIZE_METRICS: if metric not in new_data or metric not in baseline_data: continue - threshold_key = f"d_threshold_{metric[2:]}" - threshold = baseline_data.get(threshold_key, 0.1) + threshold = get_threshold(baseline_data, metric) baseline_value = baseline_data[metric] new_value = new_data[metric] # Regressive if new_value > baseline_value * (1 + threshold) @@ -464,10 +475,16 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict): baseline_key = f"d_baseline_{metric[2:]}" regressive_data[baseline_key] = baseline_data[metric] - threshold_key = f"d_threshold_{metric[2:]}" - if threshold_key in baseline_data: - regressive_data[threshold_key] = baseline_data[ - threshold_key] + # Copy all threshold keys from baseline + metric_suffix = metric[2:] + for threshold_key in [ + f"d_threshold_{metric_suffix}", + f"d_threshold_post_merge_{metric_suffix}", + f"d_threshold_pre_merge_{metric_suffix}" + ]: + if threshold_key in baseline_data: + regressive_data[threshold_key] = baseline_data[ + threshold_key] # Add regression info string regressive_data["s_regression_info"] = ", ".join(regressive_metrics) @@ -478,8 +495,7 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict): return regressive_data_list -def prepare_baseline_data(history_baseline_dict, history_data_dict, - new_data_dict): +def prepare_baseline_data(history_data_dict, new_data_dict): """ Calculate new baseline from history post-merge data and new data. Then return new baseline data. 
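A worked example of the per-direction thresholds introduced above: post-merge baselines now default to `POST_MERGE_THRESHOLD = 0.05` and pre-merge ones to `PRE_MERGE_THRESHOLD = 0.1`, and `get_threshold` prefers the pre/post-merge-specific key before falling back to the generic `d_threshold_*` key. The baseline values below are illustrative.

```python
def get_threshold(baseline_data: dict, metric: str) -> float:
    """Mirror of get_threshold() above: specific pre/post-merge key first,
    then the generic d_threshold_<metric> key, otherwise raise."""
    suffix = metric[2:]  # strip the "d_" prefix
    prefix = "post_merge_" if baseline_data.get("b_is_post_merge", False) else "pre_merge_"
    for key in (f"d_threshold_{prefix}{suffix}", f"d_threshold_{suffix}"):
        if key in baseline_data:
            return baseline_data[key]
    raise KeyError(f"No threshold found for metric '{metric}'")

# Illustrative baseline: a post-merge run with 1000 tok/s output throughput.
baseline = {
    "b_is_post_merge": True,
    "d_token_throughput": 1000.0,
    "d_threshold_post_merge_token_throughput": 0.05,
    "d_threshold_pre_merge_token_throughput": 0.1,
}
threshold = get_threshold(baseline, "d_token_throughput")   # 0.05 (post-merge)
floor = baseline["d_token_throughput"] * (1 - threshold)    # ~950 tok/s
assert 940.0 < floor      # a 940 tok/s result would be flagged regressive
assert 960.0 > floor      # a 960 tok/s result passes
```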
@@ -491,20 +507,19 @@ def prepare_baseline_data(history_baseline_dict, history_data_dict, # Calculate best metrics from history post-merge data and new data best_metrics = calculate_best_perf_result(history_data_dict[cmd_idx], new_data_dict[cmd_idx]) - new_baseline_data = history_baseline_dict[cmd_idx] - if new_baseline_data: - print_info(f"Baseline data found (cmd_idx: {cmd_idx}) in history") - else: - print_info( - f"No baseline data found (cmd_idx: {cmd_idx}), created a new baseline" - ) - new_baseline_data = new_data_dict[cmd_idx].copy() - new_baseline_data["b_is_baseline"] = True - add_id(new_baseline_data) - # Add or update baseline metrics + new_baseline_data = new_data_dict[cmd_idx].copy() + new_baseline_data["b_is_baseline"] = True + # Add or update baseline metrics and thresholds for metric, value in best_metrics.items(): new_baseline_data[metric] = value - new_baseline_data[f"d_threshold_{metric[2:]}"] = 0.1 + metric_suffix = metric[2:] + post_merge_key = f"d_threshold_post_merge_{metric_suffix}" + pre_merge_key = f"d_threshold_pre_merge_{metric_suffix}" + new_baseline_data[post_merge_key] = new_baseline_data.get( + post_merge_key, POST_MERGE_THRESHOLD) + new_baseline_data[pre_merge_key] = new_baseline_data.get( + pre_merge_key, PRE_MERGE_THRESHOLD) + add_id(new_baseline_data) new_baseline_data_dict[cmd_idx] = new_baseline_data return new_baseline_data_dict diff --git a/tests/integration/defs/perf/perf_regression_check.py b/tests/integration/defs/perf/perf_regression_check.py new file mode 100644 index 000000000000..7c29845eb011 --- /dev/null +++ b/tests/integration/defs/perf/perf_regression_check.py @@ -0,0 +1,185 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
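The checker that follows walks a job workspace for `perf_data.yaml` and `regression.yaml` files, prints their contents, and returns a non-zero exit code when regression entries are present (worker ranks identified by `DISAGG_SERVING_TYPE` skip the check; CI currently only logs the result, per the TODO above). A hypothetical stand-alone invocation, with an illustrative workspace path:

```python
import subprocess
import sys

# Illustrative invocation; in CI the script is called from slurm_run.sh and from
# the "Check perf result" stage in L0_Test.groovy with the stage workspace.
workspace = "/workspace/GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1"  # hypothetical path
result = subprocess.run(
    [sys.executable, "tests/integration/defs/perf/perf_regression_check.py", workspace],
    check=False,
)
print("perf regression detected" if result.returncode != 0 else "perf check passed")
```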
+ +import os +import sys + +import yaml + +METRICS = [ + "seq_throughput", + "token_throughput", + "total_token_throughput", + "user_throughput", + "mean_tpot", + "median_tpot", + "p99_tpot", + "mean_ttft", + "median_ttft", + "p99_ttft", + "mean_itl", + "median_itl", + "p99_itl", + "mean_e2el", + "median_e2el", + "p99_e2el", +] + + +def should_skip_execution(): + disagg_type = os.getenv("DISAGG_SERVING_TYPE", "") + if ( + disagg_type.startswith("GEN") + or disagg_type.startswith("CTX") + or disagg_type == "DISAGG_SERVER" + ): + return True + return False + + +def find_yaml_files(job_workspace, filename): + yaml_files = [] + for root, dirs, files in os.walk(job_workspace): + for file in files: + if file == filename: + yaml_files.append(os.path.join(root, file)) + return yaml_files + + +def read_yaml_data(yaml_files): + all_data = [] + for file_path in yaml_files: + try: + with open(file_path, "r") as f: + data = yaml.safe_load(f) + if data: + if isinstance(data, list): + all_data.extend(data) + else: + all_data.append(data) + except Exception as e: + print(f"Error reading {file_path}: {e}") + return all_data + + +def get_metric_keys(): + metric_keys = set() + for metric in METRICS: + metric_keys.add(f"d_{metric}") + metric_keys.add(f"d_baseline_{metric}") + metric_keys.add(f"d_threshold_{metric}") + return metric_keys + + +def print_perf_data(data): + print("=== Metrics ===") + for metric in METRICS: + value_key = f"d_{metric}" + if value_key in data: + value = data.get(value_key, "N/A") + print(f'"{value_key}": {value}') + + metric_keys = get_metric_keys() + print("\n=== Config ===") + config_keys = sorted([key for key in data.keys() if key not in metric_keys]) + for key in config_keys: + value = data[key] + print(f'"{key}": {value}') + + +def print_regression_data(data): + if "s_regression_info" in data: + print("=== Regression Info ===") + print(f"{data['s_regression_info']}") + + metric_keys = get_metric_keys() + + print("=== Metrics ===") + for metric in METRICS: + value_key = f"d_{metric}" + baseline_key = f"d_baseline_{metric}" + threshold_key = f"d_threshold_{metric}" + # Only print if at least one of the keys exists + if value_key in data or baseline_key in data or threshold_key in data: + value = data.get(value_key, "N/A") + baseline = data.get(baseline_key, "N/A") + threshold = data.get(threshold_key, "N/A") + # Calculate percentage difference between value and baseline + if ( + isinstance(value, (int, float)) + and isinstance(baseline, (int, float)) + and baseline != 0 + ): + percentage = (value - baseline) / baseline * 100 + percentage_str = f"{percentage:+.2f}%" + else: + percentage_str = "N/A" + print( + f'"{value_key}": {value}, "{baseline_key}": {baseline}, ' + f'"{threshold_key}": {threshold}, "diff": {percentage_str}' + ) + + print("\n=== Config ===") + config_keys = sorted([key for key in data.keys() if key not in metric_keys]) + for key in config_keys: + if key == "s_regression_info": + continue + value = data[key] + print(f'"{key}": {value}') + + +def main(): + if should_skip_execution(): + print("Skipping check_perf_regression.py due to DISAGG_SERVING_TYPE") + return 0 + + job_workspace = sys.argv[1] + + if not os.path.isdir(job_workspace): + print(f"Error: {job_workspace} is not a valid directory") + sys.exit(1) + + perf_data_files = find_yaml_files(job_workspace, "perf_data.yaml") + all_perf_data = read_yaml_data(perf_data_files) + print(f"Found {len(all_perf_data)} perf data") + for i, data in enumerate(all_perf_data): + print(f"\n{'=' * 60}") + print(f"Perf Data 
#{i + 1}") + print("=" * 60) + print_perf_data(data) + + print(f"\n{'=' * 60}\n") + + regression_files = find_yaml_files(job_workspace, "regression.yaml") + all_regression_data = read_yaml_data(regression_files) + print(f"Found {len(all_regression_data)} regression data") + for i, data in enumerate(all_regression_data): + print(f"\n{'=' * 60}") + print(f"Regression Data #{i + 1}") + print("=" * 60) + print_regression_data(data) + + if len(all_regression_data) == 0: + print("\n No regression data found. Perf check is successful.") + return 0 + else: + print( + f"\n Warning: Found {len(all_regression_data)} regression data. Perf check is failed." + ) + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index 980f0d11606a..6074f2f310f2 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -19,14 +19,15 @@ import re import shutil import socket +import subprocess import sys from typing import Dict, List, NamedTuple import pytest import yaml from defs.common import get_cpp_benchmark -from defs.trt_test_alternative import (is_linux, is_windows, print_info, - print_warning) +from defs.trt_test_alternative import (is_linux, is_windows, print_error, + print_info, print_warning) from ..conftest import get_llm_root, llm_models_root, trt_environment from .open_search_db_utils import (SCENARIO_MATCH_FIELDS, add_id, @@ -227,6 +228,11 @@ def get_model_dir(model_name: str): return model_dir +def get_dataset_path(): + return os.path.join(llm_models_root(), "datasets", + "ShareGPT_V3_unfiltered_cleaned_split.json") + + def cpu_socket_count_gt_1(): global MAP_BY_SOCKET if MAP_BY_SOCKET is not None: @@ -319,37 +325,37 @@ def import_allowed_perf_config(): AGGR_SERVER_PERF_METRIC_LOG_QUERIES = { PerfMetricType.SEQ_THROUGHPUT: - re.compile(r"Request throughput \(req\/s\):\s+([\d\.]+)"), + re.compile(r"Request throughput \(req\/s\):\s+(-?[\d\.]+)"), PerfMetricType.TOKEN_THROUGHPUT: - re.compile(r"Output token throughput \(tok\/s\):\s+([\d\.]+)"), + re.compile(r"Output token throughput \(tok\/s\):\s+(-?[\d\.]+)"), PerfMetricType.TOTAL_TOKEN_THROUGHPUT: - re.compile(r"Total Token throughput \(tok\/s\):\s+([\d\.]+)"), + re.compile(r"Total Token throughput \(tok\/s\):\s+(-?[\d\.]+)"), PerfMetricType.USER_THROUGHPUT: - re.compile(r"User throughput \(tok\/s\):\s+([\d\.]+)"), + re.compile(r"User throughput \(tok\/s\):\s+(-?[\d\.]+)"), PerfMetricType.FIRST_TOKEN_TIME: - re.compile(r"Mean TTFT \(ms\):\s+([\d\.]+)"), + re.compile(r"Mean TTFT \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.MEDIAN_FIRST_TOKEN_TIME: - re.compile(r"Median TTFT \(ms\):\s+([\d\.]+)"), + re.compile(r"Median TTFT \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.P99_FIRST_TOKEN_TIME: - re.compile(r"P99 TTFT \(ms\):\s+([\d\.]+)"), + re.compile(r"P99 TTFT \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.INTER_TOKEN_TIME: - re.compile(r"Mean ITL \(ms\):\s+([\d\.]+)"), + re.compile(r"Mean ITL \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.MEDIAN_INTER_TOKEN_TIME: - re.compile(r"Median ITL \(ms\):\s+([\d\.]+)"), + re.compile(r"Median ITL \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.P99_INTER_TOKEN_TIME: - re.compile(r"P99 ITL \(ms\):\s+([\d\.]+)"), + re.compile(r"P99 ITL \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.OUTPUT_TOKEN_TIME: - re.compile(r"Mean TPOT \(ms\):\s+([\d\.]+)"), + re.compile(r"Mean TPOT \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.MEDIAN_OUTPUT_TOKEN_TIME: - re.compile(r"Median TPOT \(ms\):\s+([\d\.]+)"), + re.compile(r"Median 
TPOT \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.P99_OUTPUT_TOKEN_TIME: - re.compile(r"P99 TPOT \(ms\):\s+([\d\.]+)"), + re.compile(r"P99 TPOT \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.INFERENCE_TIME: - re.compile(r"Mean E2EL \(ms\):\s+([\d\.]+)"), + re.compile(r"Mean E2EL \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.MEDIAN_INFERENCE_TIME: - re.compile(r"Median E2EL \(ms\):\s+([\d\.]+)"), + re.compile(r"Median E2EL \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.P99_INFERENCE_TIME: - re.compile(r"P99 E2EL \(ms\):\s+([\d\.]+)"), + re.compile(r"P99 E2EL \(ms\):\s+(-?[\d\.]+)"), } # (Relative threshold, Absolute threshold) for all metric types @@ -512,17 +518,21 @@ class ServerConfig: def __init__(self, server_config_data: dict, env_vars: str = ""): # Extract required fields + self.mode = server_config_data.get('mode', 'e2e') + self.concurrency = server_config_data.get('concurrency', 1) self.name = server_config_data['name'] self.model_name = server_config_data['model_name'] - self.gpus = server_config_data['gpus'] self.model_path = "" self.env_vars = env_vars # Extract optional fields with defaults - self.tp = server_config_data.get('tensor_parallel_size', self.gpus) + self.tp = server_config_data.get('tensor_parallel_size', 1) self.ep = server_config_data.get('moe_expert_parallel_size', 1) self.pp = server_config_data.get('pipeline_parallel_size', 1) - self.gpus_per_node = server_config_data.get('gpus_per_node', self.gpus) + self.cp = server_config_data.get('context_parallel_size', 1) + self.gpus = server_config_data.get('gpus', self.tp * self.cp * self.pp) + self.gpus_per_node = server_config_data.get('gpus_per_node', + 0) or self.gpus self.max_num_tokens = server_config_data.get('max_num_tokens', 2048) self.max_batch_size = server_config_data.get('max_batch_size', 512) self.max_seq_len = server_config_data.get('max_seq_len', 0) @@ -538,6 +548,8 @@ def __init__(self, server_config_data: dict, env_vars: str = ""): 'enable_attention_dp', False) self.trust_remote_code = server_config_data.get('trust_remote_code', False) + self.enable_lm_head_tp_in_adp = server_config_data.get( + 'enable_lm_head_tp_in_adp', False) # attention_dp_config attention_dp_config = server_config_data.get('attention_dp_config', {}) @@ -551,6 +563,12 @@ def __init__(self, server_config_data: dict, env_vars: str = ""): moe_config = server_config_data.get('moe_config', {}) self.moe_backend = moe_config.get('backend', "") self.moe_max_num_tokens = moe_config.get('max_num_tokens', 0) + self.use_low_precision_moe_combine = moe_config.get( + 'use_low_precision_moe_combine', False) + load_balancer_config = moe_config.get('load_balancer', {}) + self.load_balancer_num_slots = load_balancer_config.get('num_slots', 0) + self.load_balancer_layer_updates_per_iter = load_balancer_config.get( + 'layer_updates_per_iter', 0) # cuda_graph_config cuda_graph_config = server_config_data.get('cuda_graph_config', {}) @@ -605,10 +623,13 @@ def __init__(self, server_config_data: dict, env_vars: str = ""): self.match_mode = server_config_data.get('match_mode', "config") # Store filtered config for extra_llm_api_config (exclude name, model_name, gpus, client_configs) + exclude_keys = [ + 'mode', 'concurrency', 'name', 'model_name', 'gpus', + 'gpus_per_node', 'client_configs' + ] self.extra_llm_api_config_data = { k: v - for k, v in server_config_data.items() - if k not in ['name', 'model_name', 'gpus', 'client_configs'] + for k, v in server_config_data.items() if k not in exclude_keys } def to_cmd(self, @@ -634,8 +655,41 @@ def to_cmd(self, def to_env(self) -> 
Dict[str, str]:
         return to_env_dict(self.env_vars)
 
+    def to_match_keys(self) -> List[str]:
+        return [
+            "s_mode",
+            "s_model_name",
+            "l_tp",
+            "l_ep",
+            "l_pp",
+            "l_cp",
+            "l_gpus_per_node",
+            "l_max_batch_size",
+            "b_disable_overlap_scheduler",
+            "l_num_postprocess_workers",
+            "s_attn_backend",
+            "b_enable_chunked_prefill",
+            "b_enable_attention_dp",
+            "b_enable_lm_head_tp_in_adp",
+            # attention_dp_config
+            "b_attention_dp_balance",
+            # moe_config
+            "s_moe_backend",
+            # cuda_graph_config
+            "b_enable_cuda_graph",
+            # kv_cache_config
+            "s_kv_cache_dtype",
+            # cache_transceiver_config
+            "s_cache_transceiver_backend",
+            # speculative_config
+            "s_spec_decoding_type",
+            "l_num_nextn_predict_layers",
+        ]
+
     def to_db_data(self) -> dict:
         db_data = {
+            "s_mode":
+            self.mode,
             "s_model_name":
             self.model_name.lower(),
             "l_gpus":
@@ -646,6 +700,8 @@
             self.ep,
             "l_pp":
             self.pp,
+            "l_cp":
+            self.cp,
             "l_gpus_per_node":
             self.gpus_per_node,
             "l_max_num_tokens":
@@ -668,6 +724,8 @@
             self.enable_attention_dp,
             "b_trust_remote_code":
             self.trust_remote_code,
+            "b_enable_lm_head_tp_in_adp":
+            self.enable_lm_head_tp_in_adp,
             # attention_dp_config
             "b_attention_dp_balance":
             self.attention_dp_balance,
@@ -680,6 +738,12 @@
             self.moe_backend,
             "l_moe_max_num_tokens":
             self.moe_max_num_tokens,
+            "b_use_low_precision_moe_combine":
+            self.use_low_precision_moe_combine,
+            "l_load_balancer_num_slots":
+            self.load_balancer_num_slots,
+            "l_load_balancer_layer_updates_per_iter":
+            self.load_balancer_layer_updates_per_iter,
             # cuda_graph_config
             "b_enable_cuda_graph":
             self.enable_cuda_graph,
@@ -754,7 +818,7 @@
         self.osl = client_config_data.get('osl', 1024)
         self.random_range_ratio = client_config_data.get(
             'random_range_ratio', 0.0)
-        self.backend = client_config_data.get('backend', "")
+        self.backend = client_config_data.get('backend', "openai")
         self.use_chat_template = client_config_data.get('use_chat_template',
                                                         False)
         self.streaming = client_config_data.get('streaming', True)
@@ -765,18 +829,36 @@
     def to_cmd(self) -> List[str]:
         model_dir = get_model_dir(self.model_name)
         self.model_path = model_dir if os.path.exists(
             model_dir) else self.model_name
-
+        dataset_path = get_dataset_path()
         benchmark_cmd = [
-            "python", "-m", "tensorrt_llm.serve.scripts.benchmark_serving",
-            "--model", self.model_path, "--dataset-name", "random",
-            "--random-ids", "--num-prompts",
-            str(self.concurrency * self.iterations), "--random-input-len",
-            str(self.isl), "--random-output-len",
-            str(self.osl), "--random-range-ratio",
-            str(self.random_range_ratio), "--ignore-eos",
-            "--percentile-metrics", "ttft,tpot,itl,e2el", "--max-concurrency",
-            str(self.concurrency)
+            "python",
+            "-m",
+            "tensorrt_llm.serve.scripts.benchmark_serving",
+            "--model",
+            self.model_path,
+            "--tokenizer",
+            self.model_path,
+            "--dataset-name",
+            "random",
+            "--random-ids",
+            "--num-prompts",
+            str(self.concurrency * self.iterations),
+            "--max-concurrency",
+            str(self.concurrency),
+            "--random-input-len",
+            str(self.isl),
+            "--random-output-len",
+            str(self.osl),
+            "--random-range-ratio",
+            str(self.random_range_ratio),
+            "--trust-remote-code",
+            "--ignore-eos",
+            "--percentile-metrics",
+            "ttft,tpot,itl,e2el",
         ]
+        if dataset_path and os.path.exists(dataset_path):
+            benchmark_cmd.append("--dataset-path")
+            benchmark_cmd.append(dataset_path)
         if self.backend:
             benchmark_cmd.append("--backend")
             benchmark_cmd.append(self.backend)
@@ -789,6 +871,18 @@
def to_env(self) -> Dict[str, str]: return to_env_dict(self.env_vars) + def to_match_keys(self) -> List[str]: + return [ + "l_concurrency", + "l_iterations", + "l_isl", + "l_osl", + "d_random_range_ratio", + "s_backend", + "b_use_chat_template", + "b_streaming", + ] + def to_db_data(self) -> dict: """Convert ClientConfig to Database data""" db_data = { @@ -867,36 +961,37 @@ def parse_aggr_config_file(config_file_path: str, select_pattern: str = None): else: execution_plan = None - # Read YAML config file with open(config_file_path, 'r') as f: config = yaml.safe_load(f) - # Read environment config + metadata = config.get('metadata', {}) environment = config.get('environment', {}) - if not environment: - environment = {} + hardware = config.get('hardware', {}) + gpus_per_node = hardware.get('gpus_per_node', 0) - # Get environment variables - environment.get('worker_env_var', '') + model_name = metadata.get('model_name', '') server_env_var = environment.get('server_env_var', '') client_env_var = environment.get('client_env_var', '') server_configs = [] server_client_configs = {} - for server_config_data in config['server_configs']: server_name = server_config_data['name'] + server_config_data[ + 'model_name'] = model_name if 'model_name' not in server_config_data else server_config_data[ + 'model_name'] + server_config_data['mode'] = 'e2e' + server_config_data['concurrency'] = -1 + server_config_data['gpus_per_node'] = gpus_per_node # Check if this server should be included based on execution_plan if execution_plan is not None and server_name not in execution_plan: continue - # Create ServerConfig object directly from dict server_config = ServerConfig(server_config_data, server_env_var) server_id = len(server_configs) server_configs.append(server_config) - # Create ClientConfig objects client_configs = [] selected_client_names = execution_plan.get( server_name) if execution_plan else None @@ -905,7 +1000,6 @@ def parse_aggr_config_file(config_file_path: str, select_pattern: str = None): client_name = client_config_data['name'] # Check if this client should be included - # Include if: execution_plan is None OR selected_client_names is None OR client_name in selected_client_names if execution_plan is not None and selected_client_names is not None: if client_name not in selected_client_names: continue @@ -929,46 +1023,48 @@ def parse_multi_node_disagg_config_file(config_file_path: str, config = yaml.safe_load(f) disagg_configs = [] + metadata = config.get('metadata', {}) hardware = config.get('hardware', {}) benchmark = config.get('benchmark', {}) environment = config.get('environment', {}) slurm_config = config.get('slurm', {}) worker_config = config.get('worker_config', {}) - timeout = slurm_config.get('timeout', 3600) + timeout = slurm_config.get('timeout', 7200) numa_bind = slurm_config.get('numa_bind', False) + gpus_per_node = hardware.get('gpus_per_node', 0) + model_name = metadata.get('model_name', '') + assert model_name, "model_name is required in metadata section" - # Get model name from environment - model_name = environment.get('model_name', '') - assert model_name, "model_name is required in environment section" + benchmark_mode = benchmark.get('mode', 'e2e') + if "gen_only" in benchmark_mode: + hardware['num_ctx_servers'] = 0 - # Get environment variables worker_env_var = environment.get('worker_env_var', '') server_env_var = environment.get('server_env_var', '') client_env_var = environment.get('client_env_var', '') - # Create ctx_server config data + concurrency_str = 
benchmark.get('concurrency_list', '1') + if isinstance(concurrency_str, str): + concurrency = max(int(x) for x in concurrency_str.split()) + else: + concurrency = int(concurrency_str) + ctx_server_config_data = { + 'mode': benchmark_mode, + 'concurrency': concurrency, 'name': 'ctx', 'model_name': model_name, - 'gpus': hardware.get('gpus_per_ctx_server'), - 'gpus_per_node': hardware.get('gpus_per_node'), + 'gpus_per_node': gpus_per_node, **worker_config.get('ctx', {}) } - - # Create gen_server config data gen_server_config_data = { + 'mode': benchmark_mode, + 'concurrency': concurrency, 'name': 'gen', 'model_name': model_name, - 'gpus': hardware.get('gpus_per_gen_server'), - 'gpus_per_node': hardware.get('gpus_per_node'), + 'gpus_per_node': gpus_per_node, **worker_config.get('gen', {}) } - - # Create client config data - concurrency_str = benchmark.get('concurrency_list', '1') - concurrency = int(concurrency_str) if isinstance(concurrency_str, - str) else concurrency_str - client_config_data = { 'name': 'client', 'concurrency': concurrency, @@ -980,13 +1076,12 @@ def parse_multi_node_disagg_config_file(config_file_path: str, 'use_chat_template': False, 'streaming': benchmark.get('streaming', True), } - - # Create disagg_config dict disagg_config = { 'disagg_serving_type': disagg_serving_type, 'hostname': socket.gethostname(), 'numa_bind': numa_bind, 'timeout': timeout, + 'mode': benchmark_mode, 'name': 'disagg_config', 'model_name': model_name, 'hardware': hardware, @@ -995,9 +1090,7 @@ def parse_multi_node_disagg_config_file(config_file_path: str, 'server_env_var': server_env_var, 'client': ClientConfig(client_config_data, model_name, client_env_var), } - print_info(f"disagg_config: {disagg_config}") disagg_configs.append(disagg_config) - return disagg_configs @@ -1114,6 +1207,8 @@ def __init__( self.upload_to_db = False self.config_file = None self.gpu_type = None + self.config_dir = None + self.config_file = None self.config_path = None self.select_pattern = None # Aggregated mode @@ -1330,35 +1425,47 @@ def load_from_str(self, test_param_labels) -> None: # Extract configs from test param labels. labels = test_param_labels.split("-") - def get_gpu_type(label: str) -> str: - parts = label.split("_") - if len(parts) < 2 or parts[0] != "l0": - return "" - if parts[1] == "dgx": - if len(parts) >= 3: - gpu_type = f"{parts[1]}_{parts[2]}" - else: - gpu_type = "" - else: - gpu_type = parts[1] - return gpu_type.lower() + def get_gpu_type() -> str: + try: + output = subprocess.check_output(["nvidia-smi", "-L"], + stderr=subprocess.DEVNULL, + text=True) + first_line = output.strip().split("\n")[0] + gpu_models = ["GB300", "GB200", "B300", "B200"] + for model in gpu_models: + if model in first_line: + if model.startswith("B") and not model.startswith("GB"): + return f"dgx_{model.lower()}" + return model.lower() + except (subprocess.CalledProcessError, FileNotFoundError, + IndexError): + print_error( + f"Failed to get GPU type: {subprocess.CalledProcessError}") + return "" - # Used for perf sanity test if "perf_sanity" in labels[0]: assert len(labels) > 1, "perf_sanity test must have a config file!" 
+ is_disagg = "disagg" in labels[0] self.upload_to_db = "upload" in labels[0] - self.config_file = labels[1] - if "disagg" in labels[1]: + self.gpu_type = get_gpu_type() + if is_disagg: + # For disagg, test name is like: perf_sanity_disagg-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp0_ccb-UCX + # labels[0] is perf_sanity_disagg, "-".join(labels[1:]) is config file base name self.runtime = "multi_node_disagg_server" + self.config_dir = "tests/integration/defs/perf/disagg/test_configs/disagg/perf" + config_base = "-".join(labels[1:]) + self.config_file = f"{config_base}.yaml" if not config_base.endswith( + ".yaml") else config_base + self.select_pattern = None else: + # For aggr, test name is like: perf_sanity_aggr-l0_dgx_b300-r1_fp8_dep8_mtp1_1k1k + # labels[0] is perf_sanity_aggr, labels[1] is config file base name, labels[2] is select_pattern (optional) self.runtime = "aggr_server" - self.gpu_type = get_gpu_type(labels[1]) - config_folder = os.getenv("TRTLLM_CONFIG_FOLDER", - "tests/scripts/perf-sanity") - self.config_path = os.path.join( - config_folder, f"{labels[1]}.yaml" - if not labels[1].endswith(".yaml") else labels[1]) - self.select_pattern = labels[2] if len(labels) > 2 else None + self.config_dir = "tests/scripts/perf-sanity" + config_base = labels[1] + self.config_file = f"{config_base}.yaml" if config_base and not config_base.endswith( + ".yaml") else config_base + self.select_pattern = labels[2] if len(labels) > 2 else None return self.model_name = labels.pop(0) @@ -1578,21 +1685,19 @@ def validate(self): [b >= 32 for b in self.batch_sizes] ), f"gpt_350m and bloom_560m with small BS are very unstable! Please increase to at least 32." - def set_aggr_server_configs(self, llm_root: str) -> None: + def set_aggr_server_configs(self) -> None: """ Set the server and client configs. """ - config_file_path = os.path.join(llm_root, self.config_path) _, self.server_configs, self.server_client_configs = parse_aggr_config_file( - config_file_path, self.select_pattern) + self.config_path, self.select_pattern) - def set_multi_node_disagg_server_configs(self, llm_root: str) -> None: + def set_multi_node_disagg_server_configs(self) -> None: """ Set the multi-node disaggregated server configs. 
""" - config_file_path = os.path.join(llm_root, self.config_path) self.disagg_configs = parse_multi_node_disagg_config_file( - config_file_path, self.select_pattern) + self.config_path, self.select_pattern) def get_model_family(self) -> str: """ @@ -1682,6 +1787,13 @@ def set_runtime_configs(self, output_dir, perf_cache_fpath, gpu_clock_lock=None) -> None: + if self._config.runtime == "aggr_server" or self._config.runtime == "multi_node_disagg_server": + self._config.config_dir = os.getenv( + "TRTLLM_CONFIG_FOLDER", + os.path.join(llm_root, self._config.config_dir)) + self._config.config_path = os.path.join(self._config.config_dir, + self._config.config_file) + if self._config.runtime == "cpp": if not self._config.is_bert_like(): raise ValueError( @@ -1695,12 +1807,12 @@ def set_runtime_configs(self, benchmark_script = "trtllm-bench" elif self._config.runtime == "aggr_server": benchmark_script = None - self._config.set_aggr_server_configs(llm_root) + self._config.set_aggr_server_configs() elif self._config.runtime == "disagg_server": benchmark_script = None elif self._config.runtime == "multi_node_disagg_server": benchmark_script = None - self._config.set_multi_node_disagg_server_configs(llm_root) + self._config.set_multi_node_disagg_server_configs() else: raise RuntimeError(f"Invalid runtime {self._config.runtime}.") @@ -1730,15 +1842,12 @@ def set_runtime_configs(self, def get_trtllm_aggr_commands(self, output_dir): server_cmds = [] - server_envs = [] client_cmds = [] - client_envs = [] names = [] for server_idx, client_configs in self._config.server_client_configs.items( ): server_config = self._config.server_configs[server_idx] server_cmd = server_config.to_cmd(output_dir) - server_env = server_config.to_env() # Generate extra-llm-api-config.yml config_content = server_config.generate_extra_llm_api_config() config_filename = f"extra-llm-api-config.{server_config.name}.yml" @@ -1747,49 +1856,35 @@ def get_trtllm_aggr_commands(self, output_dir): f.write(config_content) for client_config in client_configs: server_cmds.append(server_cmd) - server_envs.append(server_env) client_cmd = client_config.to_cmd() - client_env = client_config.to_env() client_cmds.append(client_cmd) - client_envs.append(client_env) names.append(f"{server_config.name}-{client_config.name}") - return server_cmds, server_envs, client_cmds, client_envs, names + return server_cmds, client_cmds, names def get_trtllm_multi_node_disagg_commands(self, output_dir): ctx_server_cmds = [] - ctx_server_envs = [] gen_server_cmds = [] - gen_server_envs = [] disagg_server_cmds = [] - disagg_server_envs = [] benchmark_cmds = [] - benchmark_envs = [] cmd_idx = 0 for disagg_config in self._config.disagg_configs: disagg_serving_type = disagg_config['disagg_serving_type'] disagg_config['hostname'] numa_bind = disagg_config['numa_bind'] ctx_server_cmd = None - ctx_server_env = None gen_server_cmd = None - gen_server_env = None disagg_server_cmd = None - disagg_server_env = None benchmark_cmd = None - benchmark_env = None if "CTX" in disagg_serving_type or "GEN" in disagg_serving_type: is_ctx = "CTX" in disagg_serving_type server_config = disagg_config[ 'ctx_server'] if is_ctx else disagg_config['gen_server'] server_cmd = server_config.to_cmd(output_dir, numa_bind, disagg_serving_type) - server_env = server_config.to_env() if is_ctx: ctx_server_cmd = server_cmd - ctx_server_env = server_env else: gen_server_cmd = server_cmd - gen_server_env = server_env # Generate extra-llm-api-config.yml config_content = 
server_config.generate_extra_llm_api_config() config_filename = f"extra-llm-api-config.{server_config.name}.yml" @@ -1805,21 +1900,15 @@ def get_trtllm_multi_node_disagg_commands(self, output_dir): str(timeout), "-r", str(timeout) ] - disagg_server_env = to_env_dict(disagg_config['server_env_var']) elif "BENCHMARK" in disagg_serving_type: # Generate benchmark command if this is the BENCHMARK server node benchmark_cmd = disagg_config['client'].to_cmd() - benchmark_env = disagg_config['client'].to_env() ctx_server_cmds.append(ctx_server_cmd) - ctx_server_envs.append(ctx_server_env) gen_server_cmds.append(gen_server_cmd) - gen_server_envs.append(gen_server_env) disagg_server_cmds.append(disagg_server_cmd) - disagg_server_envs.append(disagg_server_env) benchmark_cmds.append(benchmark_cmd) - benchmark_envs.append(benchmark_env) cmd_idx += 1 - return ctx_server_cmds, ctx_server_envs, gen_server_cmds, gen_server_envs, disagg_server_cmds, disagg_server_envs, benchmark_cmds, benchmark_envs + return ctx_server_cmds, gen_server_cmds, disagg_server_cmds, benchmark_cmds def get_trtllm_build_command(self, engine_dir, checkpoint_dir) -> list: build_cmd = [ @@ -2094,12 +2183,10 @@ def get_commands(self): if is_aggr: if not os.path.exists(perf_sanity_output_dir): os.makedirs(perf_sanity_output_dir, exist_ok=True) - server_cmds, server_envs, client_cmds, client_envs, names = self.get_trtllm_aggr_commands( + server_cmds, client_cmds, names = self.get_trtllm_aggr_commands( perf_sanity_output_dir) return PerfAggrScriptTestCmds(server_cmds=server_cmds, - server_envs=server_envs, client_cmds=client_cmds, - client_envs=client_envs, names=names, timeout=3600, output_dir=perf_sanity_output_dir) @@ -2115,17 +2202,13 @@ def get_commands(self): if is_multi_node_disagg: if not os.path.exists(perf_sanity_output_dir): os.makedirs(perf_sanity_output_dir, exist_ok=True) - ctx_server_cmds, ctx_server_envs, gen_server_cmds, gen_server_envs, disagg_server_cmds, disagg_server_envs, benchmark_cmds, benchmark_envs = self.get_trtllm_multi_node_disagg_commands( + ctx_server_cmds, gen_server_cmds, disagg_server_cmds, benchmark_cmds = self.get_trtllm_multi_node_disagg_commands( perf_sanity_output_dir) return PerfMultiNodeDisaggScriptTestCmds( ctx_server_cmds=ctx_server_cmds, - ctx_server_envs=ctx_server_envs, gen_server_cmds=gen_server_cmds, - gen_server_envs=gen_server_envs, disagg_server_cmds=disagg_server_cmds, - disagg_server_envs=disagg_server_envs, benchmark_cmds=benchmark_cmds, - benchmark_envs=benchmark_envs, timeout=self._config.disagg_configs[0]['timeout'], hostname=self._config.disagg_configs[0]['hostname'], disagg_serving_type=self._config.disagg_configs[0] @@ -2156,6 +2239,7 @@ def get_commands(self): build_cmd = self.get_trtllm_bench_build_command(engine_dir) else: pytest.skip("only support trtllm-bench runtime for now") + # Construct prepare synthetic data command data_cmds = [] @@ -2293,32 +2377,24 @@ def run_metrics(self, llm_venv, gpu_clock_lock, session_data_writer, #print info to separate cases self._current_cmd_idx = 0 metrics = self._get_metrics() + commands = self.get_commands() outputs = {} result_states = {} errors = [] - def add_myelin_time_pass_to(input_env): - time_pass_flag = r" -time_pass=on" - old_myelin_env = input_env.get("__LUNOWUD", "") - if time_pass_flag not in old_myelin_env: - input_env["__LUNOWUD"] = old_myelin_env + time_pass_flag - return old_myelin_env - - old_llm_venv = add_myelin_time_pass_to(llm_venv._new_env) + # Only trtllm-bench needs to prepare dataset first. 
if self._config.runtime == 'bench': - #prepare dataset first for trtllm-bench print_info(f"Running command for generating dataset") - outputs = self.run_ex("prepare_dataset", - None, - llm_venv, - gpu_clock_lock, - session_data_writer, - output_dir, + outputs = self.run_ex(commands=commands, + cmd_idx=self._current_cmd_idx, + full_test_name="prepare_dataset", + metric_type=None, + venv=llm_venv, + gpu_clock_lock=gpu_clock_lock, + session_data_writer=session_data_writer, + output_dir=output_dir, outputs=outputs, - original_test_name="prepare_dataset", - cmd_idx=self._current_cmd_idx) - - # Save the result state. + original_test_name="prepare_dataset") result_state = self.get_result_state() result_states[self._current_cmd_idx] = result_state if result_state != "valid": @@ -2349,15 +2425,16 @@ def add_myelin_time_pass_to(input_env): # Run the command or reuse the existing output logs. print_info(f"Running command for {metric.metric_name}") outputs = self.run_ex( - metric.metric_name, - metric.metric_type, - llm_venv, - gpu_clock_lock, - session_data_writer, - output_dir, + commands=commands, + cmd_idx=self._current_cmd_idx, + full_test_name=metric.metric_name, + metric_type=metric.metric_type, + venv=llm_venv, + gpu_clock_lock=gpu_clock_lock, + session_data_writer=session_data_writer, + output_dir=output_dir, outputs=outputs, - original_test_name=metric.original_test_name, - cmd_idx=self._current_cmd_idx) + original_test_name=metric.original_test_name) # Save the result state. result_state = self.get_result_state() @@ -2373,6 +2450,14 @@ def add_myelin_time_pass_to(input_env): # Clean up engine dir after use. shutil.rmtree(self._get_engine_dir(), ignore_errors=True) + def add_myelin_time_pass_to(input_env): + time_pass_flag = r" -time_pass=on" + old_myelin_env = input_env.get("__LUNOWUD", "") + if time_pass_flag not in old_myelin_env: + input_env["__LUNOWUD"] = old_myelin_env + time_pass_flag + return old_myelin_env + + old_llm_venv = add_myelin_time_pass_to(llm_venv._new_env) llm_venv._new_env["__LUNOWUD"] = old_llm_venv # Check if any commands failed. @@ -2393,14 +2478,19 @@ def upload_test_results_to_database(self): Upload the test results and baseline to database. """ - def prefix_server_config_dict(config_dict: dict, - prefix_name: str) -> dict: - prefixed_dict = {} - for key, value in config_dict.items(): - type_prefix = key[0:2] # 'l_', 's_', 'b_', 'd_' - rest = key[2:] - prefixed_dict[f"{type_prefix}{prefix_name}_{rest}"] = value - return prefixed_dict + def add_prefix(key: str, prefix_name: str) -> dict: + type_prefix = key[0:2] # 'l_', 's_', 'b_', 'd_' + rest = key[2:] + return f"{type_prefix}{prefix_name}_{rest}" + + def add_list_prefix(config_list: List, prefix_name: str) -> List: + return [add_prefix(key, prefix_name) for key in config_list] + + def add_dict_prefix(config_dict: dict, prefix_name: str) -> dict: + return { + add_prefix(key, prefix_name): value + for key, value in config_dict.items() + } match_keys = [] # Only aggr_server and multi_node_disagg_server will upload. 
@@ -2441,12 +2531,12 @@ def prefix_server_config_dict(config_dict: dict, new_data_dict[cmd_idx] = new_data cmd_idx += 1 if not match_keys: + match_keys.append("s_runtime") if server_config.match_mode == "scenario": match_keys = SCENARIO_MATCH_FIELDS.copy() else: - match_keys.append("s_runtime") - match_keys.extend(server_config_dict.keys()) - match_keys.extend(client_config_dict.keys()) + match_keys.extend(server_config.to_match_keys()) + match_keys.extend(client_config.to_match_keys()) elif self._config.runtime == "multi_node_disagg_server": if self._config.disagg_configs[0][ @@ -2472,27 +2562,28 @@ def prefix_server_config_dict(config_dict: dict, ) gen_server_config_dict = disagg_config['gen_server'].to_db_data( ) - ctx_server_config_dict = prefix_server_config_dict( + client_config_dict = disagg_config['client'].to_db_data() + ctx_server_config_dict = add_dict_prefix( ctx_server_config_dict, 'ctx') - gen_server_config_dict = prefix_server_config_dict( + gen_server_config_dict = add_dict_prefix( gen_server_config_dict, 'gen') - client_config_dict = disagg_config['client'].to_db_data() - # Build new_data + + hardware = disagg_config.get('hardware', {}) + num_ctx_servers = hardware.get('num_ctx_servers', 0) + num_gen_servers = hardware.get('num_gen_servers', 0) new_data = { "s_runtime": "multi_node_disagg_server", - "s_server_env_var": disagg_config['server_env_var'] + "s_benchmark_mode": disagg_config['mode'], + "s_server_env_var": disagg_config['server_env_var'], + "l_num_ctx_servers": num_ctx_servers, + "l_num_gen_servers": num_gen_servers } new_data.update(job_config) - new_data.update(ctx_server_config_dict) - new_data.update(gen_server_config_dict) + if num_ctx_servers > 0: + new_data.update(ctx_server_config_dict) + if num_gen_servers > 0: + new_data.update(gen_server_config_dict) new_data.update(client_config_dict) - # Add hardware information - hardware = disagg_config.get('hardware', {}) - new_data["l_num_ctx_servers"] = hardware.get( - 'num_ctx_servers', 0) - new_data["l_num_gen_servers"] = hardware.get( - 'num_gen_servers', 0) - # Add metrics from test results for metric_type in AGGR_SERVER_METRICS: new_data[ f"d_{PERF_METRIC_STRING[metric_type]}"] = self._test_results[ @@ -2503,9 +2594,17 @@ def prefix_server_config_dict(config_dict: dict, if not match_keys: match_keys.extend( ["s_runtime", "l_num_ctx_servers", "l_num_gen_servers"]) - match_keys.extend(ctx_server_config_dict.keys()) - match_keys.extend(gen_server_config_dict.keys()) - match_keys.extend(client_config_dict.keys()) + if num_ctx_servers > 0: + match_keys.extend( + add_list_prefix( + disagg_config['ctx_server'].to_match_keys(), + 'ctx')) + if num_gen_servers > 0: + match_keys.extend( + add_list_prefix( + disagg_config['gen_server'].to_match_keys(), + 'gen')) + match_keys.extend(disagg_config['client'].to_match_keys()) else: return @@ -2519,7 +2618,7 @@ def prefix_server_config_dict(config_dict: dict, if is_post_merge: # Prepare new baseline data for post-merge new_baseline_data_dict = prepare_baseline_data( - history_baseline_dict, history_data_dict, new_data_dict) + history_data_dict, new_data_dict) else: # Pre-merge does not need to upload baseline data new_baseline_data_dict = None diff --git a/tests/integration/defs/perf/utils.py b/tests/integration/defs/perf/utils.py index 6e14592a37e3..9f2ed7bb32fa 100644 --- a/tests/integration/defs/perf/utils.py +++ b/tests/integration/defs/perf/utils.py @@ -245,9 +245,7 @@ def get_cmd_str(self, cmd_idx) -> List[str]: class PerfAggrScriptTestCmds(NamedTuple): server_cmds: 
List[List[str]] - server_envs: List[Dict[str, str]] client_cmds: List[List[str]] - client_envs: List[Dict[str, str]] names: List[str] timeout: int output_dir: str @@ -345,13 +343,9 @@ def get_cmd_str(self, cmd_idx) -> List[str]: class PerfMultiNodeDisaggScriptTestCmds(NamedTuple): ctx_server_cmds: List[List[str]] - ctx_server_envs: List[Dict[str, str]] gen_server_cmds: List[List[str]] - gen_server_envs: List[Dict[str, str]] disagg_server_cmds: List[List[str]] - disagg_server_envs: List[Dict[str, str]] benchmark_cmds: List[List[str]] - benchmark_envs: List[Dict[str, str]] timeout: int hostname: str disagg_serving_type: str @@ -694,23 +688,21 @@ def _check_benchmark_output_for_errors(self, output: str) -> None: ) def run_ex(self, + commands, full_test_name: str, metric_type: PerfMetricType, venv: Optional[PythonVenvRunnerImpl], gpu_clock_lock: GPUClockLock, session_data_writer: SessionDataWriter, output_dir: str, + cmd_idx: int = 0, outputs: Dict[int, str] = {}, original_test_name: str = None, - cmd_idx: int = 0, **kwargs) -> List[str]: """ Run the commands and write the results to the output csv and/or yaml files. """ - # Get the commands. - commands = self.get_commands() - # Avoid modifying argument directly outputs = outputs.copy() @@ -723,7 +715,6 @@ def run_ex(self, cmd_str = commands.get_cmd_str(cmd_idx) is_prepare_dataset_cmd = 'prepare_dataset' in cmd_str or "prepare-dataset" in cmd_str - is_perf_sanity_test = "perf_sanity" in full_test_name is_disagg_server = False @@ -804,7 +795,8 @@ def run_ex(self, outputs.pop(cmd_idx) elif is_disagg_server: print_info( - f"skip writing perf result when running disagg's server.") + f"skip writing perf result when running disagg's worker or server." + ) else: self._perf_result = self.get_perf_result(outputs) diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_dgx_b200_perf_sanity.yml index d4470fe1a421..4bf4f6ce67d5 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200_perf_sanity.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200_perf_sanity.yml @@ -15,9 +15,9 @@ l0_dgx_b200_perf_sanity: backend: pytorch orchestrator: mpi tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180) - condition: ranges: @@ -34,8 +34,8 @@ l0_dgx_b200_perf_sanity: backend: pytorch orchestrator: mpi tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-gpt_oss_fp4_dep2,gpt_oss_fp4_dep4] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-gpt_oss_fp4_tp4_eagle3] TIMEOUT (180) + - 
perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-gpt_oss_fp4_blackwell-gpt_oss_fp4_dep2,gpt_oss_fp4_dep4] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-gpt_oss_fp4_blackwell-gpt_oss_fp4_tp4_eagle3] TIMEOUT (180) diff --git a/tests/integration/test_lists/test-db/l0_dgx_b300_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_dgx_b300_perf_sanity.yml index ff0b9eafe387..d90907d9b40c 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b300_perf_sanity.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b300_perf_sanity.yml @@ -16,9 +16,9 @@ l0_dgx_b300_perf_sanity: backend: pytorch orchestrator: mpi tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180) - condition: ranges: @@ -36,6 +36,6 @@ l0_dgx_b300_perf_sanity: backend: pytorch orchestrator: mpi tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml index fcbe711760e6..e06e18772505 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml @@ -14,6 +14,6 @@ l0_gb200_multi_gpus_perf_sanity: stage: post_merge backend: pytorch tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_dep4_mtp1_1k1k] - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tep4_mtp3_1k1k] - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tp4_mtp3_1k1k] + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] + - 
perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001.yml similarity index 58% rename from tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity.yml rename to tests/integration/test_lists/test-db/l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001.yml index bc7d95b047c5..ad69e70c867e 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001.yml @@ -1,5 +1,5 @@ version: 0.0.1 -l0_gb200_multi_nodes_perf_sanity: +l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001: - condition: ranges: # 2 nodes with each node has 4 GPUs @@ -13,4 +13,4 @@ l0_gb200_multi_nodes_perf_sanity: stage: post_merge backend: pytorch tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_nodes-r1_fp4_v2_dep8_mtp1] + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_dep8_mtp1] diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001.yml new file mode 100644 index 000000000000..456bb7a48ed4 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001.yml @@ -0,0 +1,16 @@ +version: 0.0.1 +l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001: +- condition: + ranges: + # 3 nodes with each node has 4 GPUs + system_gpu_count: + gte: 12 + lte: 12 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] TIMEOUT (180) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001.yml new file mode 100644 index 000000000000..3e34d0cb2199 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001.yml @@ -0,0 +1,16 @@ +version: 0.0.1 +l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001: +- condition: + ranges: + # 6 nodes with each node has 4 GPUs + system_gpu_count: + gte: 24 + lte: 24 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (180) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002.yml new file mode 100644 index 000000000000..273790a21800 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002.yml @@ -0,0 +1,16 @@ +version: 0.0.1 +l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002: +- condition: + ranges: + # 6 nodes with each node has 4 GPUs + system_gpu_count: + gte: 24 + lte: 24 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL] TIMEOUT (180) 
diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001.yml new file mode 100644 index 000000000000..b4784d073687 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001.yml @@ -0,0 +1,16 @@ +version: 0.0.1 +l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001: +- condition: + ranges: + # 8 nodes with each node has 4 GPUs + system_gpu_count: + gte: 32 + lte: 32 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (180) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 6de1fa6b552c..81284f44e61b 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -320,8 +320,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequan accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5640697) accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5640697) accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 SKIP (https://nvbugs/5640697) -accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] SKIP (https://nvbugs/5644632) -accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] SKIP (https://nvbugs/5644632) test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5648560) test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] SKIP (https://nvbugs/5648560) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] SKIP (https://nvbugs/5629136) @@ -369,7 +367,6 @@ accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutla accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm] SKIP (https://nvbugs/5702795) test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] SKIP (https://nvbugs/5648560) test_e2e.py::test_ptp_quickstart_multimodal_2gpu[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] SKIP (https://nvbugs/5648560) -test_e2e.py::test_openai_chat_harmony SKIP (https://nvbugs/5633700) accuracy/test_cli_flow.py::TestGpt2::test_weight_only[int4] SKIP (https://nvbugs/5705193) accuracy/test_cli_flow.py::TestGpt2::test_int8_kv_cache SKIP (https://nvbugs/5705193) accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_2gpus[cp2] SKIP (https://nvbugs/5705194) diff --git a/tests/scripts/perf-sanity/README.md b/tests/scripts/perf-sanity/README.md index 66f9a93fc6c3..6cd917996ba5 100644 --- a/tests/scripts/perf-sanity/README.md +++ b/tests/scripts/perf-sanity/README.md @@ -4,106 +4,31 @@ Performance sanity testing scripts for TensorRT-LLM with configuration-driven te ## Overview -- Run performance sanity benchmarks across multiple model configurations +- Run performance sanity benchmarks across multiple model configs - Support three deployment architectures: single-node, multi-node aggregated, and multi-node 
disaggregated -- Manage test cases through YAML configuration files +- Manage test cases through YAML config files - Automated resource calculation and job submission via SLURM ## Configuration File Types -There are three types of YAML configuration files for different deployment architectures: +There are three types of YAML config files for different deployment architectures. +Aggregated config files are in [`tests/scripts/perf-sanity`](./). +Disaggregated config files are in [`tests/integration/defs/perf/disagg/test_configs/disagg/perf`](../../integration/defs/perf/disagg/test_configs/disagg/perf). ### 1. Single-Node Aggregated Test Configuration -**File Example**: `l0_dgx_b200.yaml` +**File Example**: `deepseek_r1_fp4_v2_grace_blackwell.yaml` **Use Case**: Single-node performance tests on a single server with multiple GPUs. -**Structure**: -```yaml -server_configs: - - name: "r1_fp8_dep8_mtp1_1k1k" - model_name: "deepseek_r1_0528_fp8" - gpus: 8 - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - pipeline_parallel_size: 1 - max_batch_size: 512 - max_num_tokens: 8192 - attention_backend: "TRTLLM" - enable_attention_dp: true - attention_dp_config: - batching_wait_iters: 0 - enable_balance: true - timeout_iters: 60 - moe_config: - backend: 'DEEPGEMM' - cuda_graph_config: - enable_padding: true - max_batch_size: 512 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 1 - client_configs: - - name: "con4096_iter10_1k1k" - concurrency: 4096 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.8 - backend: "openai" -``` - - ### 2. Multi-Node Aggregated Test Configuration -**File Example**: `l0_gb200_multi_nodes.yaml` +**File Example**: `deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml` **Use Case**: Multi-node aggregated architecture where model runs across multiple nodes with unified execution. -**Structure**: -```yaml -# Hardware Config -hardware: - gpus_per_node: 4 - gpus_per_server: 8 +### 3. Multi-Node Disaggregated Test Configuration + +**File Example**: `deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml` -server_configs: - - name: "r1_fp4_v2_dep8_mtp1" - model_name: "deepseek_r1_0528_fp4_v2" - gpus: 8 - gpus_per_node: 4 - trust_remote_code: true - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - pipeline_parallel_size: 1 - max_batch_size: 512 - max_num_tokens: 2112 - attn_backend: "TRTLLM" - enable_attention_dp: true - attention_dp_config: - batching_wait_iters: 0 - enable_balance: true - timeout_iters: 60 - moe_config: - backend: 'CUTLASS' - cuda_graph_config: - enable_padding: true - max_batch_size: 512 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.5 - client_configs: - - name: "con32_iter12_1k1k" - concurrency: 32 - iterations: 12 - isl: 1024 - osl: 1024 - random_range_ratio: 0.8 - backend: "openai" -``` +**Use Case**: Disaggregated architecture where model runs across multiple nodes with separate context (prefill) and generation (decode) servers. 
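+
+**Structure** (illustrative sketch only): the authoritative schema is whatever
+`parse_multi_node_disagg_config_file` in `test_perf.py` reads; the field values
+below are placeholders, not a tested configuration.
+
+```yaml
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2   # required
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+benchmark:
+  mode: e2e
+  concurrency_list: "128"
+  streaming: true
+environment:
+  worker_env_var: ""
+  server_env_var: ""
+  client_env_var: ""
+slurm:
+  timeout: 7200
+  numa_bind: false
+worker_config:
+  ctx:
+    tensor_parallel_size: 4
+    max_batch_size: 128
+  gen:
+    tensor_parallel_size: 8
+    max_batch_size: 128
+```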
diff --git a/tests/scripts/perf-sanity/l0_gb200_multi_nodes.yaml b/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml similarity index 93% rename from tests/scripts/perf-sanity/l0_gb200_multi_nodes.yaml rename to tests/scripts/perf-sanity/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml index 432c6ee1452b..1a5c5e5212ba 100644 --- a/tests/scripts/perf-sanity/l0_gb200_multi_nodes.yaml +++ b/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml @@ -1,13 +1,13 @@ -# Hardware Config +metadata: + model_name: deepseek_r1_0528_fp4_v2 + supported_gpus: + - GB200 + - GB300 hardware: gpus_per_node: 4 - gpus_per_server: 8 - server_configs: - name: "r1_fp4_v2_dep8_mtp1" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 8 - gpus_per_node: 4 trust_remote_code: true tensor_parallel_size: 8 moe_expert_parallel_size: 8 @@ -37,11 +37,8 @@ server_configs: osl: 1024 random_range_ratio: 0.2 backend: "openai" - - name: "r1_fp4_v2_tep8_mtp3" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 8 - gpus_per_node: 4 trust_remote_code: true tensor_parallel_size: 8 moe_expert_parallel_size: 8 diff --git a/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_blackwell.yaml b/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_blackwell.yaml new file mode 100644 index 000000000000..06c629d3f3bb --- /dev/null +++ b/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_blackwell.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + supported_gpus: + - B200 + - B300 +server_configs: + - name: "r1_fp4_v2_dep4_mtp1_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 + client_configs: + - name: "con2048_iter10_1k1k" + concurrency: 2048 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.2 + backend: "openai" + + - name: "r1_fp4_v2_tep4_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con32_iter10_1k1k" + concurrency: 32 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.2 + backend: "openai" + + - name: "r1_fp4_v2_tp4_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con4_iter10_1k1k" + concurrency: 4 + 
iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.2 + backend: "openai" diff --git a/tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml b/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_grace_blackwell.yaml similarity index 98% rename from tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml rename to tests/scripts/perf-sanity/deepseek_r1_fp4_v2_grace_blackwell.yaml index ab14148b202c..388fec27c80d 100644 --- a/tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml +++ b/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_grace_blackwell.yaml @@ -1,8 +1,12 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + supported_gpus: + - GB200 + - GB300 server_configs: # 1k1k configs - name: "r1_fp4_v2_dep4_mtp1_1k1k" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 tensor_parallel_size: 4 moe_expert_parallel_size: 4 pipeline_parallel_size: 1 @@ -37,7 +41,6 @@ server_configs: - name: "r1_fp4_v2_tep4_mtp3_1k1k" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 tensor_parallel_size: 4 moe_expert_parallel_size: 4 pipeline_parallel_size: 1 @@ -68,7 +71,6 @@ server_configs: - name: "r1_fp4_v2_tp4_mtp3_1k1k" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 tensor_parallel_size: 4 moe_expert_parallel_size: 1 pipeline_parallel_size: 1 @@ -100,7 +102,6 @@ server_configs: # 8k1k configs - name: "r1_fp4_v2_dep4_mtp1_8k1k" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 tensor_parallel_size: 4 moe_expert_parallel_size: 4 pipeline_parallel_size: 1 @@ -135,7 +136,6 @@ server_configs: - name: "r1_fp4_v2_tep4_mtp3_8k1k" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 tensor_parallel_size: 4 moe_expert_parallel_size: 4 pipeline_parallel_size: 1 @@ -166,7 +166,6 @@ server_configs: - name: "r1_fp4_v2_tp4_mtp3_8k1k" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 tensor_parallel_size: 4 moe_expert_parallel_size: 1 pipeline_parallel_size: 1 @@ -198,7 +197,6 @@ server_configs: # 1k8k configs - name: "r1_fp4_v2_dep4_mtp1_1k8k" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 tensor_parallel_size: 4 moe_expert_parallel_size: 4 pipeline_parallel_size: 1 @@ -233,7 +231,6 @@ server_configs: - name: "r1_fp4_v2_tep4_mtp3_1k8k" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 tensor_parallel_size: 4 moe_expert_parallel_size: 4 pipeline_parallel_size: 1 @@ -264,7 +261,6 @@ server_configs: - name: "r1_fp4_v2_tp4_mtp3_1k8k" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 tensor_parallel_size: 4 moe_expert_parallel_size: 1 pipeline_parallel_size: 1 diff --git a/tests/scripts/perf-sanity/deepseek_r1_fp8_blackwell.yaml b/tests/scripts/perf-sanity/deepseek_r1_fp8_blackwell.yaml new file mode 100644 index 000000000000..6ee4fbb97285 --- /dev/null +++ b/tests/scripts/perf-sanity/deepseek_r1_fp8_blackwell.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp8 + supported_gpus: + - B200 + - B300 +server_configs: + - name: "r1_fp8_dep8_mtp1_1k1k" + model_name: "deepseek_r1_0528_fp8" + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'DEEPGEMM' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 + client_configs: + - name: "con4096_iter10_1k1k" + concurrency: 4096 + iterations: 10 + isl: 1024 + osl: 1024 + 
random_range_ratio: 0.2 + backend: "openai" + + - name: "r1_fp8_tep8_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp8" + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'DEEPGEMM' + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con64_iter10_1k1k" + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.2 + backend: "openai" + + - name: "r1_fp8_tp8_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp8" + tensor_parallel_size: 8 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con8_iter10_1k1k" + concurrency: 8 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.2 + backend: "openai" diff --git a/tests/scripts/perf-sanity/gpt_oss_120b_fp4_blackwell.yaml b/tests/scripts/perf-sanity/gpt_oss_120b_fp4_blackwell.yaml new file mode 100644 index 000000000000..1696347f0fd8 --- /dev/null +++ b/tests/scripts/perf-sanity/gpt_oss_120b_fp4_blackwell.yaml @@ -0,0 +1,101 @@ +metadata: + model_name: gpt_oss_120b_fp4 + supported_gpus: + - B200 + - B300 +server_configs: + - name: "gpt_oss_fp4_dep2_1k1k" + model_name: "gpt_oss_120b_fp4" + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + max_batch_size: 1024 + max_num_tokens: 20000 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + enable_balance: true + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 1024 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + num_postprocess_workers: 4 + stream_interval: 20 + client_configs: + - name: "con2048_iter5_1k1k" + concurrency: 2048 + iterations: 5 + isl: 1024 + osl: 1024 + random_range_ratio: 0.2 + backend: "openai" + + - name: "gpt_oss_fp4_dep4_1k1k" + model_name: "gpt_oss_120b_fp4" + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 20000 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + enable_balance: true + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + num_postprocess_workers: 4 + stream_interval: 20 + client_configs: + - name: "con2048_iter5_1k1k" + concurrency: 2048 + iterations: 5 + isl: 1024 + osl: 1024 + random_range_ratio: 0.2 + backend: "openai" + + - name: "gpt_oss_fp4_tp4_eagle3_1k1k" + model_name: "gpt_oss_120b_fp4" + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 20000 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + dtype: 'fp8' 
+ enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'Eagle' + eagle3_layers_to_capture: [-1] + max_draft_len: 3 + speculative_model_dir: "gpt_oss/gpt-oss-120b-Eagle3" + stream_interval: 20 + num_postprocess_workers: 4 + client_configs: + - name: "con1_iter32_1k1k" + concurrency: 1 + iterations: 32 + isl: 1024 + osl: 1024 + random_range_ratio: 0.2 + backend: "openai" diff --git a/tests/scripts/perf-sanity/l0_dgx_b200.yaml b/tests/scripts/perf-sanity/l0_dgx_b200.yaml deleted file mode 100644 index 3074bef6c1b8..000000000000 --- a/tests/scripts/perf-sanity/l0_dgx_b200.yaml +++ /dev/null @@ -1,293 +0,0 @@ -server_configs: - - name: "r1_fp8_dep8_mtp1_1k1k" - model_name: "deepseek_r1_0528_fp8" - gpus: 8 - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - pipeline_parallel_size: 1 - max_batch_size: 512 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: true - attention_dp_config: - batching_wait_iters: 0 - enable_balance: true - timeout_iters: 60 - moe_config: - backend: 'DEEPGEMM' - cuda_graph_config: - enable_padding: true - max_batch_size: 512 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 1 - client_configs: - - name: "con4096_iter10_1k1k" - concurrency: 4096 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp8_tep8_mtp3_1k1k" - model_name: "deepseek_r1_0528_fp8" - gpus: 8 - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: false - moe_config: - backend: 'DEEPGEMM' - cuda_graph_config: - enable_padding: true - max_batch_size: 64 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 3 - client_configs: - - name: "con64_iter10_1k1k" - concurrency: 64 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp8_tp8_mtp3_1k1k" - model_name: "deepseek_r1_0528_fp8" - gpus: 8 - tensor_parallel_size: 8 - moe_expert_parallel_size: 1 - pipeline_parallel_size: 1 - max_batch_size: 8 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: false - moe_config: - backend: 'TRTLLM' - cuda_graph_config: - enable_padding: true - max_batch_size: 8 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 3 - client_configs: - - name: "con8_iter10_1k1k" - concurrency: 8 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp4_v2_dep4_mtp1_1k1k" - model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - pipeline_parallel_size: 1 - max_batch_size: 512 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: true - attention_dp_config: - batching_wait_iters: 0 - enable_balance: true - timeout_iters: 60 - moe_config: - backend: 'CUTLASS' - cuda_graph_config: - enable_padding: true - max_batch_size: 512 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 1 - client_configs: - - name: "con2048_iter10_1k1k" - concurrency: 2048 - iterations: 10 - isl: 1024 
- osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp4_v2_tep4_mtp3_1k1k" - model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: false - moe_config: - backend: 'TRTLLM' - cuda_graph_config: - enable_padding: true - max_batch_size: 32 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 3 - client_configs: - - name: "con32_iter10_1k1k" - concurrency: 32 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp4_v2_tp4_mtp3_1k1k" - model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 - tensor_parallel_size: 4 - moe_expert_parallel_size: 1 - pipeline_parallel_size: 1 - max_batch_size: 4 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: false - moe_config: - backend: 'TRTLLM' - cuda_graph_config: - enable_padding: true - max_batch_size: 4 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 3 - client_configs: - - name: "con4_iter10_1k1k" - concurrency: 4 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "gpt_oss_fp4_dep2_1k1k" - model_name: "gpt_oss_120b_fp4" - gpus: 2 - tensor_parallel_size: 2 - moe_expert_parallel_size: 2 - pipeline_parallel_size: 1 - max_batch_size: 1024 - max_num_tokens: 20000 - attn_backend: "TRTLLM" - enable_attention_dp: true - attention_dp_config: - enable_balance: true - moe_config: - backend: 'TRTLLM' - cuda_graph_config: - enable_padding: true - max_batch_size: 1024 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - num_postprocess_workers: 4 - stream_interval: 20 - client_configs: - - name: "con2048_iter5_1k1k" - concurrency: 2048 - iterations: 5 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "gpt_oss_fp4_dep4_1k1k" - model_name: "gpt_oss_120b_fp4" - gpus: 4 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - pipeline_parallel_size: 1 - max_batch_size: 512 - max_num_tokens: 20000 - attn_backend: "TRTLLM" - enable_attention_dp: true - attention_dp_config: - enable_balance: true - moe_config: - backend: 'TRTLLM' - cuda_graph_config: - enable_padding: true - max_batch_size: 512 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - num_postprocess_workers: 4 - stream_interval: 20 - client_configs: - - name: "con2048_iter5_1k1k" - concurrency: 2048 - iterations: 5 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "gpt_oss_fp4_tp4_eagle3_1k1k" - model_name: "gpt_oss_120b_fp4" - gpus: 4 - tensor_parallel_size: 4 - moe_expert_parallel_size: 1 - pipeline_parallel_size: 1 - max_batch_size: 1 - max_num_tokens: 20000 - attn_backend: "TRTLLM" - enable_attention_dp: false - moe_config: - backend: 'TRTLLM' - cuda_graph_config: - enable_padding: true - max_batch_size: 1 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'Eagle' - eagle3_layers_to_capture: [-1] - max_draft_len: 3 - speculative_model_dir: "gpt_oss/gpt-oss-120b-Eagle3" - stream_interval: 20 - num_postprocess_workers: 4 - client_configs: - - name: 
"con1_iter32_1k1k" - concurrency: 1 - iterations: 32 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" diff --git a/tests/scripts/perf-sanity/l0_dgx_b300.yaml b/tests/scripts/perf-sanity/l0_dgx_b300.yaml deleted file mode 100644 index 0306ad25a8a8..000000000000 --- a/tests/scripts/perf-sanity/l0_dgx_b300.yaml +++ /dev/null @@ -1,194 +0,0 @@ -server_configs: - - name: "r1_fp8_dep8_mtp1_1k1k" - model_name: "deepseek_r1_0528_fp8" - gpus: 8 - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - pipeline_parallel_size: 1 - max_batch_size: 512 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: true - attention_dp_config: - batching_wait_iters: 0 - enable_balance: true - timeout_iters: 60 - moe_config: - backend: 'DEEPGEMM' - cuda_graph_config: - enable_padding: true - max_batch_size: 512 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 1 - client_configs: - - name: "con4096_iter10_1k1k" - concurrency: 4096 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp8_tep8_mtp3_1k1k" - model_name: "deepseek_r1_0528_fp8" - gpus: 8 - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: false - moe_config: - backend: 'DEEPGEMM' - cuda_graph_config: - enable_padding: true - max_batch_size: 64 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 3 - client_configs: - - name: "con64_iter10_1k1k" - concurrency: 64 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp8_tp8_mtp3_1k1k" - model_name: "deepseek_r1_0528_fp8" - gpus: 8 - tensor_parallel_size: 8 - moe_expert_parallel_size: 1 - pipeline_parallel_size: 1 - max_batch_size: 8 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: false - moe_config: - backend: 'TRTLLM' - cuda_graph_config: - enable_padding: true - max_batch_size: 8 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 3 - client_configs: - - name: "con8_iter10_1k1k" - concurrency: 8 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp4_v2_dep4_mtp1_1k1k" - model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - pipeline_parallel_size: 1 - max_batch_size: 512 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: true - attention_dp_config: - batching_wait_iters: 0 - enable_balance: true - timeout_iters: 60 - moe_config: - backend: 'CUTLASS' - cuda_graph_config: - enable_padding: true - max_batch_size: 512 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 1 - client_configs: - - name: "con2048_iter10_1k1k" - concurrency: 2048 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp4_v2_tep4_mtp3_1k1k" - model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 8192 - attn_backend: 
"TRTLLM" - enable_attention_dp: false - moe_config: - backend: 'TRTLLM' - cuda_graph_config: - enable_padding: true - max_batch_size: 32 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 3 - client_configs: - - name: "con32_iter10_1k1k" - concurrency: 32 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp4_v2_tp4_mtp3_1k1k" - model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 - tensor_parallel_size: 4 - moe_expert_parallel_size: 1 - pipeline_parallel_size: 1 - max_batch_size: 4 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: false - moe_config: - backend: 'TRTLLM' - cuda_graph_config: - enable_padding: true - max_batch_size: 4 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 3 - client_configs: - - name: "con4_iter10_1k1k" - concurrency: 4 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_harmony.py b/tests/unittest/llmapi/apps/_test_openai_chat_harmony.py index 575cd2f0f138..fc550f082472 100644 --- a/tests/unittest/llmapi/apps/_test_openai_chat_harmony.py +++ b/tests/unittest/llmapi/apps/_test_openai_chat_harmony.py @@ -1,12 +1,18 @@ import json +import os import openai import pytest +from utils.llm_data import llm_datasets_root from ..test_llm import get_model_path from .openai_server import RemoteOpenAIServer pytestmark = pytest.mark.threadleak(enabled=False) +os.environ['TIKTOKEN_RS_CACHE_DIR'] = os.path.join(llm_datasets_root(), + 'tiktoken_vocab') +os.environ['TIKTOKEN_ENCODINGS_BASE'] = os.path.join(llm_datasets_root(), + 'tiktoken_vocab') @pytest.fixture(scope="module", ids=["GPT-OSS-20B"]) @@ -114,8 +120,10 @@ async def test_tool_calls(client: openai.AsyncOpenAI, model: str): model=model, messages=messages, tools=[tool_get_current_weather], + extra_body={"top_k": 1}, ) message = response.choices[0].message + print(message) assert response.choices[0].finish_reason == "tool_calls" assert message.content is None assert message.reasoning @@ -137,6 +145,7 @@ async def test_tool_calls(client: openai.AsyncOpenAI, model: str): response = await client.chat.completions.create( model=model, messages=messages, + extra_body={"top_k": 1}, ) message = response.choices[0].message assert message.content @@ -205,6 +214,7 @@ async def test_streaming_tool_call(client: openai.AsyncOpenAI, model: str): messages=messages, tools=[tool_get_current_weather], stream=True, + extra_body={"top_k": 1}, ) tool_name: str reasoning_chunks: list[str] = [] diff --git a/tests/unittest/llmapi/apps/_test_trtllm_serve_example.py b/tests/unittest/llmapi/apps/_test_trtllm_serve_example.py index 66677fcead3f..4b75e4c71ff1 100644 --- a/tests/unittest/llmapi/apps/_test_trtllm_serve_example.py +++ b/tests/unittest/llmapi/apps/_test_trtllm_serve_example.py @@ -1,17 +1,14 @@ import json import os import subprocess -import sys import tempfile import pytest import yaml +from ..test_llm import get_model_path from .openai_server import RemoteOpenAIServer -sys.path.append(os.path.join(os.path.dirname(__file__), '..')) -from test_llm import get_model_path - @pytest.fixture(scope="module", ids=["TinyLlama-1.1B-Chat"]) def model_name(): @@ -57,15 +54,19 @@ def example_root(): ("bash", "curl_completion_client.sh"), ("bash", 
"aiperf_client.sh"), ("bash", "curl_responses_client.sh")]) -def test_trtllm_serve_examples(exe: str, script: str, +def test_trtllm_serve_examples(exe: str, script: str, model_name: str, server: RemoteOpenAIServer, example_root: str): client_script = os.path.join(example_root, script) # CalledProcessError will be raised if any errors occur + custom_env = os.environ.copy() + if script.startswith("aiperf"): + custom_env[""] = get_model_path(model_name) result = subprocess.run([exe, client_script], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, - check=True) + check=True, + env=custom_env) if script.startswith("curl"): # For curl scripts, we expect a JSON response result_stdout = result.stdout.strip()