diff --git a/examples/serve/aiperf_client.sh b/examples/serve/aiperf_client.sh index 8a150714de2e..d901f99cde14 100755 --- a/examples/serve/aiperf_client.sh +++ b/examples/serve/aiperf_client.sh @@ -2,7 +2,7 @@ aiperf profile \ -m TinyLlama-1.1B-Chat-v1.0 \ - --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --tokenizer ${AIPERF_TOKENIZER_PATH:-TinyLlama/TinyLlama-1.1B-Chat-v1.0} \ --endpoint-type chat \ --random-seed 123 \ --synthetic-input-tokens-mean 128 \ diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 9bc1ead4912c..ce5842d7c219 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -748,9 +748,9 @@ def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p } // End of Methods to run Slurm job with Jenkins Agent -def getNodeArgs(int nodeCount, int gpuCount) { +def getNodeArgs(int nodeCount, int gpuCount, boolean setSegment = false) { int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue() - return nodeCount == 1 ? [ + def args = nodeCount == 1 ? [ "--nodes=${nodeCount}", "--gpus=${gpuCount}" ] : [ @@ -759,6 +759,10 @@ def getNodeArgs(int nodeCount, int gpuCount) { "--ntasks-per-node=${gpusPerNode}", "--gpus-per-node=${gpusPerNode}", ] + if (setSegment && gpuCount > 1) { + args += ["--segment=${nodeCount}"] + } + return args } def getPytestBaseCommandLine( @@ -883,6 +887,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG // Create a unique suffix for the job name String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase() def jobUID = "${cluster.host}-multi_node_test-${customSuffix}" + def disaggMode = stageName.contains("Perf-Sanity-Disagg") + def setSegment = disaggMode Utils.exec(pipeline, script: "env | sort && pwd && ls -alh") @@ -914,6 +920,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG def llmSrcLocal = "${llmPath}/TensorRT-LLM/src" def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh" def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh" + def scriptInstallLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_install.sh" + def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh" def testListPathNode = "${jobWorkspace}/${testList}.txt" def waivesListPathNode = "${jobWorkspace}/waives.txt" def outputPath = "${jobWorkspace}/job-output.log" @@ -940,6 +948,15 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG true ) + Utils.exec(pipeline, script: "echo \"Script to install environment: \" && cat ${scriptInstallLocalPath}") + Utils.copyFileToRemoteHost( + pipeline, + remote, + scriptInstallLocalPath, + scriptInstallPathNode, + true + ) + // Generate Test List and Upload to Frontend Node def makoArgs = getMakoArgsFromStageName(stageName, true) // TODO: currently the options will only be processed if the first @@ -1013,7 +1030,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG // Generate Job Launch Script def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#") def mounts = getMountListForSlurmTest(cluster, true).join(",") - String[] taskArgs = getNodeArgs(nodeCount, gpuCount) + String[] taskArgs = getNodeArgs(nodeCount, gpuCount, setSegment) if (taskArgs == null) { error "Invalid Slurm test stage name is set" } @@ -1083,10 +1100,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG 
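For reference, a minimal Python sketch of the updated `getNodeArgs` logic above: the new optional `setSegment` flag appends a `--segment` argument sized to the node count. This is an illustrative helper, not part of the change, and the multi-node argument list is abbreviated to the flags visible in this hunk.

```python
import math

def get_node_args(node_count: int, gpu_count: int, set_segment: bool = False) -> list:
    """Sketch of jenkins/L0_Test.groovy getNodeArgs() after this change."""
    gpus_per_node = math.ceil(gpu_count / node_count)
    if node_count == 1:
        args = [f"--nodes={node_count}", f"--gpus={gpu_count}"]
    else:
        # Multi-node branch abbreviated to the flags shown in the hunk context.
        args = [
            f"--ntasks-per-node={gpus_per_node}",
            f"--gpus-per-node={gpus_per_node}",
        ]
    # New behavior: when setSegment is true and more than one GPU is requested,
    # a --segment flag sized to the node count is appended for the sbatch job.
    if set_segment and gpu_count > 1:
        args.append(f"--segment={node_count}")
    return args

# Example: a 3-node / 12-GPU disaggregated stage also gets --segment=3.
print(get_node_args(3, 12, set_segment=True))
```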
envVarsToExport.each { varName, varValue -> srunArgs.add("--container-env=${varName}") } - if(nodeCount > 1) { - srunArgs.add("--mpi=pmi2") - } - def exemptionComment = "" if (cluster.host.contains("oci-nrt") || cluster.host.contains("oci-hsg") || cluster.host.contains("lbd-lax")) { exemptionComment = """--comment='{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"90","reason":"other","description":"Long data and model loading time and disaggregated serving tests"}}'""" @@ -1102,8 +1115,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG "export ${varName}=\"${escapedValue}\"" }.join('\n') - def scriptContent = """#!/bin/bash - #SBATCH ${exemptionComment} --output=${outputPath} + def scriptLaunchPrefix = """#!/bin/bash + #SBATCH ${exemptionComment} + #SBATCH --output=${outputPath} ${taskArgs.collect { "#SBATCH $it" }.join('\n')} #SBATCH ${partition.additionalArgs} ${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""} @@ -1128,10 +1142,48 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG echo "Env NVIDIA_VISIBLE_DEVICES: \$NVIDIA_VISIBLE_DEVICES" ${srunPrologue} - - srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode} """.replaceAll("(?m)^\\s*", "") - pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent) + + if (disaggMode) { + if(nodeCount > 1) { + srunArgs.add("--mpi=pmix") + } + + def scriptLaunchPrefixPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch_prefix.sh") + def scriptLaunchSrunArgsPathLocal = Utils.createTempLocation(pipeline, "./slurm_srun_args.txt") + def scriptLaunchDraftPathLocal = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh" + def scriptSubmitLocalPath = "${llmSrcLocal}/jenkins/scripts/perf/disaggregated/submit.py" + + pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix) + pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" ")) + Utils.exec(pipeline, script: "echo \"Script launch prefix: \" && cat ${scriptLaunchPrefixPathLocal}") + Utils.exec(pipeline, script: "echo \"Srun args content: \" && cat ${scriptLaunchSrunArgsPathLocal}") + + // Output is the corresponding scriptLaunchPathLocal script under the disaggMode + sh """ + python3 ${scriptSubmitLocalPath} \\ + --run-ci \\ + --llm-src ${llmSrcLocal} \\ + --test-list ${testListPathLocal} \\ + --draft-launch-sh ${scriptLaunchDraftPathLocal} \\ + --launch-sh ${scriptLaunchPathLocal} \\ + --run-sh ${scriptRunPathNode} \\ + --install-sh ${scriptInstallPathNode} \\ + --script-prefix ${scriptLaunchPrefixPathLocal} \\ + --srun-args ${scriptLaunchSrunArgsPathLocal} + """ + } else { + if(nodeCount > 1) { + srunArgs.add("--mpi=pmi2") + } + + def scriptContent = """ + ${scriptLaunchPrefix} + srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode} + """.replaceAll("(?m)^\\s*", "") + pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent) + } + Utils.exec(pipeline, script: "echo \"Script to trigger Slurm sbatch job: \" && cat ${scriptLaunchPathLocal}") Utils.copyFileToRemoteHost( pipeline, @@ -2634,7 +2686,6 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO if (noRegularTests && noIsolateTests) { error "No tests were executed for stage ${stageName}, please check the test list and test-db rendering result." 
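The branch above selects PMIx for the new disaggregated perf-sanity stages and keeps PMI-2 elsewhere; only the aggregated path still writes the single-`srun` launch script inline, while the disaggregated path delegates assembly to `submit.py`. A rough Python sketch of that selection, with a hypothetical helper name and illustrative argument values:

```python
def build_aggregated_launch_script(stage_name: str, node_count: int,
                                   prefix: str, srun_args: list,
                                   run_script_path: str) -> str:
    """Sketch of the non-disaggregated branch above; disaggregated stages are
    instead assembled by jenkins/scripts/perf/disaggregated/submit.py."""
    disagg_mode = "Perf-Sanity-Disagg" in stage_name
    args = list(srun_args)
    if node_count > 1:
        # pmix for disaggregated perf-sanity stages, pmi2 otherwise.
        args.append("--mpi=pmix" if disagg_mode else "--mpi=pmi2")
    if disagg_mode:
        raise RuntimeError("disaggregated stages are assembled by submit.py")
    return f"{prefix}\nsrun --kill-on-bad-exit=1 {' '.join(args)} {run_script_path}\n"

print(build_aggregated_launch_script(
    "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1", 2,
    "#!/bin/bash\n#SBATCH --output=job-output.log",
    ["--container-env=stageName"], "/workspace/slurm_run.sh"))
```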
} - } } @@ -2653,7 +2704,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO stage("Check perf result") { def perfCheckResult = sh( script: """ - python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \ + python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \ ${stageName}/perf_script_test_results.csv \ ${basePerfPath} """, @@ -2672,6 +2723,22 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO """ } } + + if (perfMode && stageName.contains("Perf-Sanity")) { + stage ("Check perf result") { + def perfCheckResult = sh( + script: """ + python3 ${llmSrc}/tests/integration/defs/perf/perf_regression_check.py \ + ${WORKSPACE}/${stageName} + """, + returnStatus: true + ) + // TODO: Enable this when perf regression check is stable + // if (perfCheckResult != 0) { + // error "Performance regression detected and failing the build (exit code: ${perfCheckResult})" + // } + } + } } } @@ -3111,8 +3178,13 @@ def launchTestJobs(pipeline, testFilter) "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2], - // Perf sanity post merge test - "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_perf_sanity", 1, 1, 8, 2], + // Perf sanity post merge aggr tests + "GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001", 1, 1, 8, 2], + // Perf sanity post merge disagg tests + "GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001", 1, 1, 12, 3], + // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001", 1, 1, 24, 6], + // "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002", 1, 1, 24, 6], + // "GB200-32_GPUs-8_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001", 1, 1, 32, 8], ] fullSet += multiNodesSBSAConfigs.keySet() diff --git a/jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh b/jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh new file mode 100644 index 000000000000..c5dc80c971a2 --- /dev/null +++ b/jenkins/scripts/perf/disaggregated/slurm_launch_draft.sh @@ -0,0 +1,76 @@ + +cleanup_on_failure() { + echo "Error: $1" + scancel ${SLURM_JOB_ID} + exit 1 +} + +mkdir -p $jobWorkspace +chmod +x $runScript +chmod +x $installScript + +# Run installation on all nodes +echo "Running installation on all nodes..." +if ! srun "${srunArgs[@]}" $installScript &> $jobWorkspace/install.log; then + cleanup_on_failure "Failed to run installation. Check $jobWorkspace/install.log" +fi +echo "Installation completed on all nodes" + +# Start gen servers +echo "Starting gen servers..." 
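The gen/ctx loops that follow fan the workers out with backgrounded `srun` calls, distinguishing roles through the `DISAGG_SERVING_TYPE` environment variable. A hedged Python equivalent of that fan-out is sketched below; the real pipeline keeps this in bash, the names mirror the shell variables, and a Slurm environment is assumed.

```python
import os
import subprocess

def launch_workers(kind, num_servers, nodes_per_server, gpus_per_node,
                   srun_args, run_script, workspace):
    """Sketch of the backgrounded gen/ctx srun fan-out in the loops below."""
    procs = []
    for i in range(num_servers):
        # Each worker gets its role (GEN_i / CTX_i) and the worker pytest command.
        env = dict(os.environ,
                   DISAGG_SERVING_TYPE=f"{kind}_{i}",
                   pytestCommand=os.environ.get("pytestCommandWorker", ""))
        world_size = nodes_per_server * gpus_per_node
        log = open(os.path.join(workspace, f"{kind.lower()}_server_{i}.log"), "w")
        procs.append(subprocess.Popen(
            ["srun", *srun_args, "--kill-on-bad-exit=1",
             "-N", str(nodes_per_server),
             f"--ntasks={world_size}",
             f"--ntasks-per-node={gpus_per_node}",
             run_script],
            env=env, stdout=log, stderr=subprocess.STDOUT))
    return procs

# e.g. launch_workers("GEN", num_gen_servers, nodes_per_gen_server,
#                     gpus_per_node, srun_args, run_script, job_workspace)
```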
+for i in $(seq 0 $((numGenServers - 1))); do + gen_world_size=$((nodesPerGenServer * gpusPerNode)) + export DISAGG_SERVING_TYPE="GEN_$i" + export pytestCommand="$pytestCommandWorker" + srun "${srunArgs[@]}" --kill-on-bad-exit=1 \ + -N $nodesPerGenServer \ + --ntasks=$gen_world_size \ + --ntasks-per-node=$gpusPerNode \ + $runScript &> $jobWorkspace/gen_server_$i.log & + echo "Started gen server $i" +done + +# Start ctx servers (skip if gen_only mode) +if [ "${TRTLLM_DISAGG_BENCHMARK_GEN_ONLY:-0}" != "1" ]; then + echo "Starting ctx servers..." + for i in $(seq 0 $((numCtxServers - 1))); do + ctx_world_size=$((nodesPerCtxServer * gpusPerNode)) + export DISAGG_SERVING_TYPE="CTX_$i" + export pytestCommand="$pytestCommandWorker" + srun "${srunArgs[@]}" --kill-on-bad-exit=1 \ + -N $nodesPerCtxServer \ + --ntasks=$ctx_world_size \ + --ntasks-per-node=$gpusPerNode \ + $runScript &> $jobWorkspace/ctx_server_$i.log & + echo "Started ctx server $i" + done +else + echo "Skipping ctx servers (gen_only mode)" +fi + + +# Start disagg server +echo "Starting disagg server..." +export DISAGG_SERVING_TYPE="DISAGG_SERVER" +export pytestCommand="$pytestCommandDisaggServer" +srun "${srunArgs[@]}" --kill-on-bad-exit=1 --overlap \ + -N 1 \ + --ntasks=1 \ + --ntasks-per-node=1 \ + $runScript &> $jobWorkspace/disagg_server.log & +echo "Started disagg server" + +# Start benchmark +echo "Starting benchmark..." +export DISAGG_SERVING_TYPE="BENCHMARK" +export pytestCommand="$pytestCommandBenchmark" +if ! srun "${srunArgs[@]}" --kill-on-bad-exit=1 --overlap \ + -N 1 \ + --ntasks=1 \ + --ntasks-per-node=1 \ + $runScript; then + cleanup_on_failure "Benchmark failed. Check logs in ${jobWorkspace} for details" +fi + +echo "Disagg server and benchmark completed successfully" +echo "Total runtime: $SECONDS seconds" diff --git a/jenkins/scripts/perf/disaggregated/submit.py b/jenkins/scripts/perf/disaggregated/submit.py new file mode 100644 index 000000000000..5e8e374f4f08 --- /dev/null +++ b/jenkins/scripts/perf/disaggregated/submit.py @@ -0,0 +1,292 @@ +#!/usr/bin/env python3 +import argparse +import os + +import yaml + + +def get_hardware_config(config, benchmark_mode): + hardware = config.get("hardware", {}) + worker_config = config.get("worker_config", {}) + + num_ctx_servers = 0 if "gen_only" in benchmark_mode else hardware.get("num_ctx_servers") + num_gen_servers = hardware.get("num_gen_servers") + gpus_per_node = hardware.get("gpus_per_node") + + # Get gpus_per_ctx_server and gpus_per_gen_server from worker_config's tensor_parallel_size + ctx_config = worker_config.get("ctx", {}) + gen_config = worker_config.get("gen", {}) + ctx_tp = ctx_config.get("tensor_parallel_size", 1) + ctx_pp = ctx_config.get("pipeline_parallel_size", 1) + ctx_cp = ctx_config.get("context_parallel_size", 1) + gpus_per_ctx_server = ctx_tp * ctx_pp * ctx_cp + gen_tp = gen_config.get("tensor_parallel_size", 1) + gen_pp = gen_config.get("pipeline_parallel_size", 1) + gen_cp = gen_config.get("context_parallel_size", 1) + gpus_per_gen_server = gen_tp * gen_pp * gen_cp + + if None in [ + num_ctx_servers, + num_gen_servers, + gpus_per_node, + gpus_per_ctx_server, + gpus_per_gen_server, + ]: + raise ValueError("Missing required hardware configuration") + + # Calculate nodes per server + nodes_per_ctx_server = (gpus_per_ctx_server + gpus_per_node - 1) // gpus_per_node + nodes_per_gen_server = (gpus_per_gen_server + gpus_per_node - 1) // gpus_per_node + + total_nodes = num_ctx_servers * nodes_per_ctx_server + num_gen_servers * nodes_per_gen_server + 
total_gpus = total_nodes * gpus_per_node + + return { + "num_ctx_servers": num_ctx_servers, + "num_gen_servers": num_gen_servers, + "gpus_per_node": gpus_per_node, + "gpus_per_ctx_server": gpus_per_ctx_server, + "gpus_per_gen_server": gpus_per_gen_server, + "nodes_per_ctx_server": nodes_per_ctx_server, + "nodes_per_gen_server": nodes_per_gen_server, + "total_nodes": total_nodes, + "total_gpus": total_gpus, + } + + +def get_env_config(config): + env = config.get("environment", {}) + + container = env.get("container_image", "") + mounts = env.get("container_mount", "") + workdir = env.get("container_workdir", "") + llm_models_root = env.get("llm_models_root", "") + llmsrc = env.get("trtllm_repo", "") + build_wheel = env.get("build_wheel", False) + # Use work_dir as job_workspace + job_workspace = env.get("work_dir", "") + worker_env_var = env.get("worker_env_var", "") + server_env_var = env.get("server_env_var", "") + benchmark_env_var = env.get("benchmark_env_var", "") + open_search_db_base_url = env.get("open_search_db_base_url", "") + + return { + "container": container, + "mounts": mounts, + "workdir": workdir, + "llm_models_root": llm_models_root, + "llmsrc": llmsrc, + "build_wheel": build_wheel, + "job_workspace": job_workspace, + "worker_env_var": worker_env_var, + "server_env_var": server_env_var, + "benchmark_env_var": benchmark_env_var, + "open_search_db_base_url": open_search_db_base_url, + } + + +def get_benchmark_config(config): + benchmark = config.get("benchmark", {}) + + mode = benchmark.get("mode", "e2e") + concurrency_str = benchmark.get("concurrency_list", "1") + concurrency = int(concurrency_str) if isinstance(concurrency_str, str) else concurrency_str + + return { + "mode": mode, + "concurrency": concurrency, + } + + +def remove_whitespace_lines(lines): + return [line.strip() for line in lines if line.strip()] + + +def get_pytest_command_no_llmapilaunch(script_prefix_lines): + pytest_command_line = None + for line in script_prefix_lines: + if "export pytestCommand=" in line: + pytest_command_line = line + break + + if not pytest_command_line: + return "" + + # Replace pytestCommand with pytestCommandNoLLMAPILaunch + replaced_line = pytest_command_line.replace("pytestCommand", "pytestCommandNoLLMAPILaunch") + + # Split by space, find and remove the substring with trtllm-llmapi-launch + replaced_line_parts = replaced_line.split() + replaced_line_parts_no_llmapi = [ + part for part in replaced_line_parts if "trtllm-llmapi-launch" not in part + ] + return " ".join(replaced_line_parts_no_llmapi) + + +def get_config_yaml(test_list_path, llm_src): + with open(test_list_path, "r") as f: + first_line = f.readline().strip() + + if "[" not in first_line or "]" not in first_line: + raise ValueError( + f"Invalid test list format. Expected test name with brackets: {first_line}" + ) + bracket_content = first_line.split("[")[-1].split("]")[0] + parts = bracket_content.split("-") + if len(parts) < 2: + raise ValueError( + f"Invalid test name format. Expected format: prefix-config_name, got: {bracket_content}" + ) + + # parts[0] is the prefix, parts[1:] is the config name + if "disagg" not in parts[0]: + raise ValueError( + f"Invalid test name format. 
Expected format: disagg-config_name, got: {bracket_content}" + ) + config_base_name = "-".join(parts[1:]) + config_yaml_path = os.path.join( + llm_src, + "tests", + "integration", + "defs", + "perf", + "disagg", + "test_configs", + "disagg", + "perf", + f"{config_base_name}.yaml", + ) + if not os.path.exists(config_yaml_path): + raise FileNotFoundError(f"Config file not found: {config_yaml_path}") + return config_yaml_path + + +def main(): + parser = argparse.ArgumentParser( + description="Generate SLURM launch script for both CI and local modes" + ) + parser.add_argument( + "--run-ci", + action="store_true", + default=False, + help="Run in CI mode (true) or local mode (false)", + ) + parser.add_argument("--draft-launch-sh", required=True, help="Path to draft-launch.sh script") + parser.add_argument("--launch-sh", required=True, help="Path to output launch.sh script") + parser.add_argument("--run-sh", required=True, help="Path to slurm_run.sh script") + parser.add_argument("--install-sh", required=True, help="Path to slurm_install.sh script") + + # Optional arguments for local mode + parser.add_argument("--config-yaml", default="", help="Path to config YAML file") + parser.add_argument("--stage-name", default="", help="Stage name (optional, local mode only)") + + # Optional arguments for CI mode + parser.add_argument("--llm-src", default="", help="Path to LLM source code") + parser.add_argument("--test-list", default="", help="Path to test list file") + parser.add_argument( + "--script-prefix", + default="", + help="Launch script prefix file path (optional, CI mode only)", + ) + parser.add_argument( + "--srun-args", + default="", + help="Path to file containing srun args (optional, CI mode only)", + ) + + args = parser.parse_args() + + config_yaml = get_config_yaml(args.test_list, args.llm_src) + + with open(config_yaml, "r") as f: + config = yaml.safe_load(f) + + # Determine install script path + install_script = args.install_sh + + env_config = get_env_config(config) + print(f"Environment configuration: {env_config}") + + benchmark_config = get_benchmark_config(config) + print(f"Benchmark configuration: {benchmark_config}") + benchmark_mode = benchmark_config["mode"] + + hardware_config = get_hardware_config(config, benchmark_mode) + print(f"Hardware configuration: {hardware_config}") + + script_prefix_lines = [] + srun_args_lines = [] + + with open(args.script_prefix, "r") as f: + script_prefix_content = f.read() + script_prefix_lines = script_prefix_content.split("\n") + with open(args.srun_args, "r") as f: + srun_args_content = f.read() + + srun_args_lines = srun_args_content.split() + + # Extract pytestCommand and generate pytestCommandNoLLMAPILaunch + pytest_command_no_llmapi_launch = get_pytest_command_no_llmapilaunch(script_prefix_lines) + + # Build worker env vars, add extra env vars for gen_only mode + worker_env_vars = env_config["worker_env_var"] + server_env_vars = env_config["server_env_var"] + if "gen_only" in benchmark_config["mode"]: + concurrency = benchmark_config["concurrency"] + worker_env_vars = ( + "TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 " + f"TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP=1 " + f"TLLM_BENCHMARK_REQ_QUEUES_SIZE={concurrency} {worker_env_vars}" + ) + server_env_vars = f"TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 {server_env_vars}" + script_prefix_lines.append("export TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1") + srun_args_lines.append("--container-env=TRTLLM_DISAGG_BENCHMARK_GEN_ONLY") + + script_prefix_lines.extend( + [ + pytest_command_no_llmapi_launch, + f'export 
pytestCommandWorker="unset UCX_TLS && {worker_env_vars} $pytestCommand"', + f'export pytestCommandDisaggServer="{server_env_vars} $pytestCommandNoLLMAPILaunch"', + f'export pytestCommandBenchmark="{env_config["benchmark_env_var"]} $pytestCommandNoLLMAPILaunch"', + f"export runScript={args.run_sh}", + f"export installScript={install_script}", + f"export numCtxServers={hardware_config['num_ctx_servers']}", + f"export numGenServers={hardware_config['num_gen_servers']}", + f"export gpusPerNode={hardware_config['gpus_per_node']}", + f"export gpusPerCtxServer={hardware_config['gpus_per_ctx_server']}", + f"export gpusPerGenServer={hardware_config['gpus_per_gen_server']}", + f"export nodesPerCtxServer={hardware_config['nodes_per_ctx_server']}", + f"export nodesPerGenServer={hardware_config['nodes_per_gen_server']}", + f"export totalNodes={hardware_config['total_nodes']}", + f"export totalGpus={hardware_config['total_gpus']}", + ] + ) + + remove_whitespace_lines(script_prefix_lines) + script_prefix = "\n".join(script_prefix_lines) + + remove_whitespace_lines(srun_args_lines) + srun_args_lines.extend( + [ + "--container-env=DISAGG_SERVING_TYPE", + "--container-env=pytestCommand", + ] + ) + srun_args_lines = ["srunArgs=("] + [f' "{line}"' for line in srun_args_lines] + [")"] + srun_args = "\n".join(srun_args_lines) + + with open(args.draft_launch_sh, "r") as f: + draft_launch_content = f.read() + draft_launch_lines = draft_launch_content.split("\n") + remove_whitespace_lines(draft_launch_lines) + draft_launch_content = "\n".join(draft_launch_lines) + + with open(args.launch_sh, "w") as f: + f.write(f"{script_prefix}\n{srun_args}\n{draft_launch_content}") + + print(f"Launch script generated at: {args.launch_sh}") + print(f"Launch script:\n{script_prefix}\n{srun_args}\n{draft_launch_content}") + + +if __name__ == "__main__": + main() diff --git a/jenkins/scripts/slurm_install.sh b/jenkins/scripts/slurm_install.sh new file mode 100644 index 000000000000..00fcd2b0935e --- /dev/null +++ b/jenkins/scripts/slurm_install.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# Set up error handling +set -Eeuo pipefail +trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR + +slurm_install_setup() { + cd $resourcePathNode + llmSrcNode=$resourcePathNode/TensorRT-LLM/src + + if [ $SLURM_LOCALID -eq 0 ]; then + wget -nv $llmTarfile + tar -zxf $tarName + which python3 + python3 --version + apt-get install -y libffi-dev + nvidia-smi && nvidia-smi -q && nvidia-smi topo -m + if [[ $pytestCommand == *--run-ray* ]]; then + pip3 install --retries 10 ray[default] + fi + cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt + cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl + gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true) + hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}" + echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName" + touch install_lock.lock + else + while [ ! 
-f install_lock.lock ]; do + sleep 5 + done + fi +} + +# Only run slurm_install_setup when script is executed directly (not sourced) +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + slurm_install_setup +fi diff --git a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh index ce2712881b7c..e86092b7ea2e 100755 --- a/jenkins/scripts/slurm_run.sh +++ b/jenkins/scripts/slurm_run.sh @@ -39,26 +39,12 @@ if [ $SLURM_PROCID -eq 0 ]; then fi fi -if [ $SLURM_LOCALID -eq 0 ]; then - wget -nv $llmTarfile - tar -zxf $tarName - which python3 - python3 --version - apt-get install -y libffi-dev - nvidia-smi && nvidia-smi -q && nvidia-smi topo -m - if [[ $pytestCommand == *--run-ray* ]]; then - pip3 install --retries 10 ray[default] - fi - cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt - cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl - gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true) - hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}" - echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName" - touch install_lock.lock -else - while [ ! -f install_lock.lock ]; do - sleep 5 - done +# Aggregated mode will run install together with pytest in slurm_run.sh +# Disaggregated mode will run install separately in slurm_install.sh +if [[ "$stageName" != *Disagg* ]]; then + installScriptPath="$(dirname "${BASH_SOURCE[0]}")/$(basename "${BASH_SOURCE[0]}" | sed 's/slurm_run\.sh/slurm_install.sh/')" + source "$installScriptPath" + slurm_install_setup fi if [[ "$stageName" == *GB200* ]]; then @@ -131,3 +117,9 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Pe --files $stageName/perf_script_test_results.csv \ $basePerfPath fi + +if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" == *Perf-Sanity* ]]; then + echo "Check Perf-Sanity Result" + python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \ + $jobWorkspace +fi diff --git a/tensorrt_llm/serve/harmony_adapter.py b/tensorrt_llm/serve/harmony_adapter.py index 9299b2b68aea..1ba163581edc 100644 --- a/tensorrt_llm/serve/harmony_adapter.py +++ b/tensorrt_llm/serve/harmony_adapter.py @@ -217,8 +217,9 @@ def _create_delta_from_parser_state(self) -> dict[str, Any] | None: # Check if tool is allowed if self.should_filter_tools and func_name not in self.available_tools: - logger.debug("Request %s: tool %s not in available tools", - self.request_id, func_name) + logger.debug( + f"Request {self.request_id}: tool {func_name} not in available tools" + ) return None # Get or create tool call @@ -273,8 +274,9 @@ def _create_delta_from_parser_state(self) -> dict[str, Any] | None: else: return {"content": self.parser.last_content_delta} else: - logger.debug("Request %s: no delta generated for channel=%s", - self.request_id, self.parser.current_channel) + logger.debug( + f"Request {self.request_id}: no delta generated for channel={self.parser.current_channel}" + ) return None def _get_or_create_tool_call(self, func_name: str) -> str: @@ -295,8 +297,9 @@ def _get_or_create_tool_call(self, func_name: str) -> str: "active": True } self.tool_call_index += 1 - logger.debug("Request %s: created new tool call %s for function %s", - self.request_id, tool_id, func_name) + logger.debug( + f"Request {self.request_id}: created new tool call {tool_id} for function {func_name}" + ) return tool_id def get_debug_info(self) -> dict[str, Any]: @@ -896,8 +899,8 @@ def 
_parse_tool_call_from_harmony_message( } except json.JSONDecodeError: logger.warning( - "Failed to parse tool call arguments as JSON: %s", - function_call_args) + f"Failed to parse tool call arguments as JSON: {function_call_args}" + ) return None elif msg_content_type and "code" in msg_content_type: function_name = str(msg_recipient) @@ -1023,10 +1026,11 @@ def harmony_output_to_openai( except (HarmonyError, UnicodeDecodeError, ValueError) as parse_error: logger.warning( - "Failed to parse harmony messages from tokens: %s", - parse_error) - logger.debug("Problematic clean tokens (%d): %s", - len(clean_tokens), clean_tokens) + f"Failed to parse harmony messages from tokens: {parse_error}" + ) + logger.debug( + f"Problematic clean tokens ({len(clean_tokens)}): {clean_tokens}" + ) # Fallback to raw text parsing raise RuntimeError(f"Harmony parsing failed: {parse_error}" ) # This will be caught by outer try-catch @@ -1103,9 +1107,9 @@ def harmony_output_to_openai( except Exception as e: raw_text = self._safe_decode_utf8(harmony_output_tokens, "HARMONY _OUTPUT: ") - logger.warning("Failed to parse harmony output: %s. Raw output: %s", - e, raw_text) - logger.debug("Detailed error: %s", traceback.format_exc()) + logger.warning( + f"Failed to parse harmony output: {e}. Raw output: {raw_text}") + logger.debug(f"Detailed error: {traceback.format_exc()}") # Check if raw_text contains a decode error (fallback content) if "HARMONY_OUTPUT:" in raw_text: @@ -1276,9 +1280,9 @@ def stateful_stream_harmony_tokens_to_openai_deltas( return deltas except (HarmonyError, UnicodeDecodeError, ValueError): logger.error( - f"Streaming: Failed to process token batch of {len(tokens)} tokens for request {request_id}", + f"Streaming: Failed to process token batch of {len(tokens)} tokens for request {request_id}" ) - logger.debug("Problematic streaming tokens: %s", tokens) + logger.debug(f"Problematic streaming tokens: {tokens}") # Return empty deltas to continue processing return [] @@ -1457,8 +1461,8 @@ def create_stream_state( """ if request_id in self._stream_states: logger.warning( - "Stream state already exists for request %s, replacing", - request_id) + f"Stream state already exists for request {request_id}, replacing" + ) stream_state = HarmonyStreamState( request_id=request_id, @@ -1494,7 +1498,7 @@ def _filter_tool_calls( # Filter unavailable external tools if should_filter_external_tools and func_name not in external_tools: - logger.debug("Filtered unavailable tool call: %s", func_name) + logger.debug(f"Filtered unavailable tool call: {func_name}") continue filtered.append(tool_call) @@ -1644,7 +1648,7 @@ def handle_non_streaming_response(tools: List[ChatCompletionToolsParam], output.token_ids, tools_for_parser, tool_choice) # CONVERTED OUTPUT (after harmony to openai conversion) - logger.debug("✅ CONVERTED OUTPUT: %s", json.dumps(parsed_output, indent=2)) + logger.debug(f"✅ CONVERTED OUTPUT: {json.dumps(parsed_output, indent=2)}") # Create response message response_message = _create_response_message(parsed_output) diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 2d6b02897d52..c52911fe00bb 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -1091,11 +1091,13 @@ def test_auto_dtype(self, block_reuse, mocker): "max_attention_window": [128, 32768], "enable_block_reuse": block_reuse, "enable_partial_reuse": False, + 
"free_gpu_memory_fraction": 0.5, } gen_server_config["kv_cache_config"] = { "max_attention_window": [128, 32768], "enable_block_reuse": block_reuse, "enable_partial_reuse": False, + "free_gpu_memory_fraction": 0.5, } disaggregated_server_config = { "hostname": "localhost", diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index ab9c1591c178..66b07ebf7b68 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -4369,6 +4369,11 @@ def test_eagle3_4gpus(self, moe_backend, one_model, overlap_scheduler, "https://nvbugs/5636916: Remaining Hopper Eagle Accuracy Issue for only TP=4" ) + if not one_model and overlap_scheduler: + pytest.skip( + "https://nvbugs/5745152: two_model + overlap_scheduler can sometimes time out." + ) + MAX_OUTPUT_LEN = 128179 MAX_INPUT_LEN = 32768 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml index 90a198897b6c..23340d1f506c 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml index 120fc40b3c2e..774f321175a3 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml new file mode 100644 index 000000000000..387704da4a16 --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml @@ -0,0 +1,105 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: -1 +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: e2e + use_nv_sa_benchmark: true + multi_round: 8 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '6144' + input_length: 1024 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 
+environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + context_parallel_size: 1 + max_batch_size: 768 + max_num_tokens: 768 + max_seq_len: 2068 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 768 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + dtype: fp8 + moe_config: + backend: CUTLASS + use_low_precision_moe_combine: true + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + ctx: + max_batch_size: 16 + max_num_tokens: 16896 + max_seq_len: 2044 + tensor_parallel_size: 4 + context_parallel_size: 1 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml index 6a4f5f5ddfeb..1eaa085ba6c6 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml index e8f1b31a4117..46072b7585b8 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml index 2f9d1ad7c8ab..2851dc8ce405 100644 --- 
a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-NIXL.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml index e60204a5624b..390e68f26015 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp3_ccb-UCX.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml index a307a87f173f..cd29bd85102b 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml index d44c4d51e069..b9f7881c60df 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX.yaml new file mode 100644 index 000000000000..b6299357d2ad --- /dev/null +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX.yaml @@ -0,0 +1,122 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + precision: fp4 + model_dir_name: DeepSeek-R1-0528-FP4-v2 + supported_gpus: + - GB200 + - GB300 + script_file: disaggr_torch.slurm + benchmark_type: 1k1k + config_index: -1 +slurm: + script_file: disaggr_torch.slurm + partition: + account: + job_time: 02:00:00 + job_name: unified-benchmark + extra_args: "--gres=gpu:4" + numa_bind: true +benchmark: + mode: gen_only + 
use_nv_sa_benchmark: true + multi_round: 1 + benchmark_ratio: 0.8 + streaming: true + concurrency_list: '1024' + input_length: 8192 + output_length: 1024 + dataset_file: +hardware: + gpus_per_node: 4 + num_ctx_servers: 1 + num_gen_servers: 1 +environment: + container_mount: + container_image: + model_path: + trtllm_repo: '' + build_wheel: false + work_dir: + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" +profiling: + nsys_on: false +accuracy: + enable_accuracy_test: false + model: local-completions + tasks: gsm8k + model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096 +worker_config: + gen: + tensor_parallel_size: 32 + moe_expert_parallel_size: 32 + context_parallel_size: 1 + enable_attention_dp: true + enable_lm_head_tp_in_adp: true + pipeline_parallel_size: 1 + max_batch_size: 128 + max_num_tokens: 512 + max_seq_len: 9256 + cuda_graph_config: + enable_padding: true + batch_sizes: + - 1 + - 2 + - 4 + - 8 + - 16 + - 24 + - 32 + - 40 + - 48 + - 56 + - 64 + - 72 + - 80 + - 88 + - 96 + - 104 + - 112 + - 120 + - 128 + print_iter_log: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + moe_config: + backend: CUTEDSL + use_low_precision_moe_combine: true + load_balancer: + num_slots: 288 + layer_updates_per_iter: 1 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + stream_interval: 100 + num_postprocess_workers: 4 + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 + ctx: + max_batch_size: 2 + max_num_tokens: 16896 + max_seq_len: 9256 + tensor_parallel_size: 4 + context_parallel_size: 1 + moe_expert_parallel_size: 4 + enable_attention_dp: true + pipeline_parallel_size: 1 + print_iter_log: true + cuda_graph_config: null + disable_overlap_scheduler: true + kv_cache_config: + enable_block_reuse: false + free_gpu_memory_fraction: 0.75 + dtype: fp8 + cache_transceiver_config: + max_tokens_in_buffer: 16384 + backend: UCX + speculative_config: + decoding_type: MTP + num_nextn_predict_layers: 3 diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml index 05c6794dd63e..14c52a0fd8c5 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-NIXL.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml index 10aa98c4b30d..532753822974 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX.yaml @@ -1,5 +1,5 @@ metadata: - model_name: 
deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml index 64dd806fa6df..56e02bd4e2ee 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-NIXL.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml index b0b731322616..c94a0698b97c 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs32_eplb0_mtp0_ccb-UCX.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml index 796fdbd8747c..9f85592d0aee 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-NIXL.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml index 4a45880f1477..d1f20e16054a 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb0_mtp0_ccb-UCX.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml index bc46d9fea34b..923b72aa33a4 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml +++ 
b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-NIXL.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml index c397316b3559..fc47dd9bdb17 100644 --- a/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml +++ b/tests/integration/defs/perf/disagg/test_configs/disagg/perf/deepseek-r1-fp4_8k1k_ctx8_gen1_dep32_bs16_eplb0_mtp3_ccb-UCX.yaml @@ -1,5 +1,5 @@ metadata: - model_name: deepseek-r1-fp4 + model_name: deepseek_r1_0528_fp4_v2 precision: fp4 model_dir_name: DeepSeek-R1-0528-FP4-v2 supported_gpus: diff --git a/tests/integration/defs/perf/open_search_db_utils.py b/tests/integration/defs/perf/open_search_db_utils.py index 8a4caa704e3e..87f0b0fed629 100644 --- a/tests/integration/defs/perf/open_search_db_utils.py +++ b/tests/integration/defs/perf/open_search_db_utils.py @@ -33,6 +33,8 @@ PROJECT_ROOT = "sandbox-temp-trtllm-ci-perf-v1" # "sandbox-trtllm-ci-perf" TEST_INFO_PROJECT_NAME = f"{PROJECT_ROOT}-test_info" +PRE_MERGE_THRESHOLD = 0.1 +POST_MERGE_THRESHOLD = 0.05 # Metrics where larger is better MAXIMIZE_METRICS = [ @@ -268,24 +270,7 @@ def match(history_data, new_data, match_keys): def is_empty(value): return value is None or value == "" - def should_skip_field(field): - # Skip fields starting with @, _, ts_ - if field.startswith('@') or field.startswith('_') or field.startswith( - 'ts_'): - return True - # Skip log links and speculative_model_dir and job configs - if field in [ - 's_speculative_model_dir', 's_server_log_link', - 's_ctx_server_log_link', 's_gen_server_log_link', - 's_client_log_link' - ]: - return True - return False - for field in match_keys: - # Skip excluded fields - if should_skip_field(field): - continue history_value = history_data.get(field, None) new_value = new_data.get(field, None) if is_empty(history_value) and is_empty(new_value): @@ -412,6 +397,33 @@ def parse_timestamp(timestamp): return history_baseline_dict, history_data_dict +def get_threshold(baseline_data, metric): + """ + Get the threshold for a metric from baseline data. + """ + is_post_merge = baseline_data.get("b_is_post_merge", False) + + metric_suffix = metric[2:] # Remove "d_" prefix + if is_post_merge: + threshold_key = f"d_threshold_post_merge_{metric_suffix}" + else: + threshold_key = f"d_threshold_pre_merge_{metric_suffix}" + + # Try to get the specific threshold (post_merge or pre_merge) + if threshold_key in baseline_data: + return baseline_data[threshold_key] + + # Fall back to general threshold + fallback_key = f"d_threshold_{metric_suffix}" + if fallback_key in baseline_data: + return baseline_data[fallback_key] + + # No threshold found, raise error + raise KeyError( + f"No threshold found for metric '{metric}'. " + f"Expected '{threshold_key}' or '{fallback_key}' in baseline data.") + + def prepare_regressive_test_cases(history_baseline_dict, new_data_dict): """Get regressive test cases 1. 
For Maximize metrics, if new perf is below baseline * (1 - threshold) @@ -419,8 +431,9 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict): Set it as regressive. """ regressive_data_list = [] + cmd_idxs = new_data_dict.keys() # Find regressive test cases - for cmd_idx in new_data_dict: + for cmd_idx in cmd_idxs: if history_baseline_dict[cmd_idx] is None: continue @@ -433,8 +446,7 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict): for metric in MAXIMIZE_METRICS: if metric not in new_data or metric not in baseline_data: continue - threshold_key = f"d_threshold_{metric[2:]}" - threshold = baseline_data[threshold_key] + threshold = get_threshold(baseline_data, metric) baseline_value = baseline_data[metric] new_value = new_data[metric] # Regressive if new_value < baseline_value * (1 - threshold) @@ -446,8 +458,7 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict): for metric in MINIMIZE_METRICS: if metric not in new_data or metric not in baseline_data: continue - threshold_key = f"d_threshold_{metric[2:]}" - threshold = baseline_data.get(threshold_key, 0.1) + threshold = get_threshold(baseline_data, metric) baseline_value = baseline_data[metric] new_value = new_data[metric] # Regressive if new_value > baseline_value * (1 + threshold) @@ -464,10 +475,16 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict): baseline_key = f"d_baseline_{metric[2:]}" regressive_data[baseline_key] = baseline_data[metric] - threshold_key = f"d_threshold_{metric[2:]}" - if threshold_key in baseline_data: - regressive_data[threshold_key] = baseline_data[ - threshold_key] + # Copy all threshold keys from baseline + metric_suffix = metric[2:] + for threshold_key in [ + f"d_threshold_{metric_suffix}", + f"d_threshold_post_merge_{metric_suffix}", + f"d_threshold_pre_merge_{metric_suffix}" + ]: + if threshold_key in baseline_data: + regressive_data[threshold_key] = baseline_data[ + threshold_key] # Add regression info string regressive_data["s_regression_info"] = ", ".join(regressive_metrics) @@ -478,8 +495,7 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict): return regressive_data_list -def prepare_baseline_data(history_baseline_dict, history_data_dict, - new_data_dict): +def prepare_baseline_data(history_data_dict, new_data_dict): """ Calculate new baseline from history post-merge data and new data. Then return new baseline data. 
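A worked example of the per-direction thresholds introduced above: post-merge baselines now default to `POST_MERGE_THRESHOLD = 0.05` and pre-merge ones to `PRE_MERGE_THRESHOLD = 0.1`, and `get_threshold` prefers the pre/post-merge-specific key before falling back to the generic `d_threshold_*` key. The baseline values below are illustrative.

```python
def get_threshold(baseline_data: dict, metric: str) -> float:
    """Mirror of get_threshold() above: specific pre/post-merge key first,
    then the generic d_threshold_<metric> key, otherwise raise."""
    suffix = metric[2:]  # strip the "d_" prefix
    prefix = "post_merge_" if baseline_data.get("b_is_post_merge", False) else "pre_merge_"
    for key in (f"d_threshold_{prefix}{suffix}", f"d_threshold_{suffix}"):
        if key in baseline_data:
            return baseline_data[key]
    raise KeyError(f"No threshold found for metric '{metric}'")

# Illustrative baseline: a post-merge run with 1000 tok/s output throughput.
baseline = {
    "b_is_post_merge": True,
    "d_token_throughput": 1000.0,
    "d_threshold_post_merge_token_throughput": 0.05,
    "d_threshold_pre_merge_token_throughput": 0.1,
}
threshold = get_threshold(baseline, "d_token_throughput")   # 0.05 (post-merge)
floor = baseline["d_token_throughput"] * (1 - threshold)    # ~950 tok/s
assert 940.0 < floor      # a 940 tok/s result would be flagged regressive
assert 960.0 > floor      # a 960 tok/s result passes
```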
@@ -491,20 +507,19 @@ def prepare_baseline_data(history_baseline_dict, history_data_dict, # Calculate best metrics from history post-merge data and new data best_metrics = calculate_best_perf_result(history_data_dict[cmd_idx], new_data_dict[cmd_idx]) - new_baseline_data = history_baseline_dict[cmd_idx] - if new_baseline_data: - print_info(f"Baseline data found (cmd_idx: {cmd_idx}) in history") - else: - print_info( - f"No baseline data found (cmd_idx: {cmd_idx}), created a new baseline" - ) - new_baseline_data = new_data_dict[cmd_idx].copy() - new_baseline_data["b_is_baseline"] = True - add_id(new_baseline_data) - # Add or update baseline metrics + new_baseline_data = new_data_dict[cmd_idx].copy() + new_baseline_data["b_is_baseline"] = True + # Add or update baseline metrics and thresholds for metric, value in best_metrics.items(): new_baseline_data[metric] = value - new_baseline_data[f"d_threshold_{metric[2:]}"] = 0.1 + metric_suffix = metric[2:] + post_merge_key = f"d_threshold_post_merge_{metric_suffix}" + pre_merge_key = f"d_threshold_pre_merge_{metric_suffix}" + new_baseline_data[post_merge_key] = new_baseline_data.get( + post_merge_key, POST_MERGE_THRESHOLD) + new_baseline_data[pre_merge_key] = new_baseline_data.get( + pre_merge_key, PRE_MERGE_THRESHOLD) + add_id(new_baseline_data) new_baseline_data_dict[cmd_idx] = new_baseline_data return new_baseline_data_dict diff --git a/tests/integration/defs/perf/perf_regression_check.py b/tests/integration/defs/perf/perf_regression_check.py new file mode 100644 index 000000000000..7c29845eb011 --- /dev/null +++ b/tests/integration/defs/perf/perf_regression_check.py @@ -0,0 +1,185 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
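The checker that follows walks a job workspace for `perf_data.yaml` and `regression.yaml` files, prints their contents, and returns a non-zero exit code when regression entries are present (worker ranks identified by `DISAGG_SERVING_TYPE` skip the check; CI currently only logs the result, per the TODO above). A hypothetical stand-alone invocation, with an illustrative workspace path:

```python
import subprocess
import sys

# Illustrative invocation; in CI the script is called from slurm_run.sh and from
# the "Check perf result" stage in L0_Test.groovy with the stage workspace.
workspace = "/workspace/GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1"  # hypothetical path
result = subprocess.run(
    [sys.executable, "tests/integration/defs/perf/perf_regression_check.py", workspace],
    check=False,
)
print("perf regression detected" if result.returncode != 0 else "perf check passed")
```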
+ +import os +import sys + +import yaml + +METRICS = [ + "seq_throughput", + "token_throughput", + "total_token_throughput", + "user_throughput", + "mean_tpot", + "median_tpot", + "p99_tpot", + "mean_ttft", + "median_ttft", + "p99_ttft", + "mean_itl", + "median_itl", + "p99_itl", + "mean_e2el", + "median_e2el", + "p99_e2el", +] + + +def should_skip_execution(): + disagg_type = os.getenv("DISAGG_SERVING_TYPE", "") + if ( + disagg_type.startswith("GEN") + or disagg_type.startswith("CTX") + or disagg_type == "DISAGG_SERVER" + ): + return True + return False + + +def find_yaml_files(job_workspace, filename): + yaml_files = [] + for root, dirs, files in os.walk(job_workspace): + for file in files: + if file == filename: + yaml_files.append(os.path.join(root, file)) + return yaml_files + + +def read_yaml_data(yaml_files): + all_data = [] + for file_path in yaml_files: + try: + with open(file_path, "r") as f: + data = yaml.safe_load(f) + if data: + if isinstance(data, list): + all_data.extend(data) + else: + all_data.append(data) + except Exception as e: + print(f"Error reading {file_path}: {e}") + return all_data + + +def get_metric_keys(): + metric_keys = set() + for metric in METRICS: + metric_keys.add(f"d_{metric}") + metric_keys.add(f"d_baseline_{metric}") + metric_keys.add(f"d_threshold_{metric}") + return metric_keys + + +def print_perf_data(data): + print("=== Metrics ===") + for metric in METRICS: + value_key = f"d_{metric}" + if value_key in data: + value = data.get(value_key, "N/A") + print(f'"{value_key}": {value}') + + metric_keys = get_metric_keys() + print("\n=== Config ===") + config_keys = sorted([key for key in data.keys() if key not in metric_keys]) + for key in config_keys: + value = data[key] + print(f'"{key}": {value}') + + +def print_regression_data(data): + if "s_regression_info" in data: + print("=== Regression Info ===") + print(f"{data['s_regression_info']}") + + metric_keys = get_metric_keys() + + print("=== Metrics ===") + for metric in METRICS: + value_key = f"d_{metric}" + baseline_key = f"d_baseline_{metric}" + threshold_key = f"d_threshold_{metric}" + # Only print if at least one of the keys exists + if value_key in data or baseline_key in data or threshold_key in data: + value = data.get(value_key, "N/A") + baseline = data.get(baseline_key, "N/A") + threshold = data.get(threshold_key, "N/A") + # Calculate percentage difference between value and baseline + if ( + isinstance(value, (int, float)) + and isinstance(baseline, (int, float)) + and baseline != 0 + ): + percentage = (value - baseline) / baseline * 100 + percentage_str = f"{percentage:+.2f}%" + else: + percentage_str = "N/A" + print( + f'"{value_key}": {value}, "{baseline_key}": {baseline}, ' + f'"{threshold_key}": {threshold}, "diff": {percentage_str}' + ) + + print("\n=== Config ===") + config_keys = sorted([key for key in data.keys() if key not in metric_keys]) + for key in config_keys: + if key == "s_regression_info": + continue + value = data[key] + print(f'"{key}": {value}') + + +def main(): + if should_skip_execution(): + print("Skipping check_perf_regression.py due to DISAGG_SERVING_TYPE") + return 0 + + job_workspace = sys.argv[1] + + if not os.path.isdir(job_workspace): + print(f"Error: {job_workspace} is not a valid directory") + sys.exit(1) + + perf_data_files = find_yaml_files(job_workspace, "perf_data.yaml") + all_perf_data = read_yaml_data(perf_data_files) + print(f"Found {len(all_perf_data)} perf data") + for i, data in enumerate(all_perf_data): + print(f"\n{'=' * 60}") + print(f"Perf Data 
#{i + 1}") + print("=" * 60) + print_perf_data(data) + + print(f"\n{'=' * 60}\n") + + regression_files = find_yaml_files(job_workspace, "regression.yaml") + all_regression_data = read_yaml_data(regression_files) + print(f"Found {len(all_regression_data)} regression data") + for i, data in enumerate(all_regression_data): + print(f"\n{'=' * 60}") + print(f"Regression Data #{i + 1}") + print("=" * 60) + print_regression_data(data) + + if len(all_regression_data) == 0: + print("\n No regression data found. Perf check is successful.") + return 0 + else: + print( + f"\n Warning: Found {len(all_regression_data)} regression data. Perf check is failed." + ) + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index 980f0d11606a..6074f2f310f2 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -19,14 +19,15 @@ import re import shutil import socket +import subprocess import sys from typing import Dict, List, NamedTuple import pytest import yaml from defs.common import get_cpp_benchmark -from defs.trt_test_alternative import (is_linux, is_windows, print_info, - print_warning) +from defs.trt_test_alternative import (is_linux, is_windows, print_error, + print_info, print_warning) from ..conftest import get_llm_root, llm_models_root, trt_environment from .open_search_db_utils import (SCENARIO_MATCH_FIELDS, add_id, @@ -227,6 +228,11 @@ def get_model_dir(model_name: str): return model_dir +def get_dataset_path(): + return os.path.join(llm_models_root(), "datasets", + "ShareGPT_V3_unfiltered_cleaned_split.json") + + def cpu_socket_count_gt_1(): global MAP_BY_SOCKET if MAP_BY_SOCKET is not None: @@ -319,37 +325,37 @@ def import_allowed_perf_config(): AGGR_SERVER_PERF_METRIC_LOG_QUERIES = { PerfMetricType.SEQ_THROUGHPUT: - re.compile(r"Request throughput \(req\/s\):\s+([\d\.]+)"), + re.compile(r"Request throughput \(req\/s\):\s+(-?[\d\.]+)"), PerfMetricType.TOKEN_THROUGHPUT: - re.compile(r"Output token throughput \(tok\/s\):\s+([\d\.]+)"), + re.compile(r"Output token throughput \(tok\/s\):\s+(-?[\d\.]+)"), PerfMetricType.TOTAL_TOKEN_THROUGHPUT: - re.compile(r"Total Token throughput \(tok\/s\):\s+([\d\.]+)"), + re.compile(r"Total Token throughput \(tok\/s\):\s+(-?[\d\.]+)"), PerfMetricType.USER_THROUGHPUT: - re.compile(r"User throughput \(tok\/s\):\s+([\d\.]+)"), + re.compile(r"User throughput \(tok\/s\):\s+(-?[\d\.]+)"), PerfMetricType.FIRST_TOKEN_TIME: - re.compile(r"Mean TTFT \(ms\):\s+([\d\.]+)"), + re.compile(r"Mean TTFT \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.MEDIAN_FIRST_TOKEN_TIME: - re.compile(r"Median TTFT \(ms\):\s+([\d\.]+)"), + re.compile(r"Median TTFT \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.P99_FIRST_TOKEN_TIME: - re.compile(r"P99 TTFT \(ms\):\s+([\d\.]+)"), + re.compile(r"P99 TTFT \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.INTER_TOKEN_TIME: - re.compile(r"Mean ITL \(ms\):\s+([\d\.]+)"), + re.compile(r"Mean ITL \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.MEDIAN_INTER_TOKEN_TIME: - re.compile(r"Median ITL \(ms\):\s+([\d\.]+)"), + re.compile(r"Median ITL \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.P99_INTER_TOKEN_TIME: - re.compile(r"P99 ITL \(ms\):\s+([\d\.]+)"), + re.compile(r"P99 ITL \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.OUTPUT_TOKEN_TIME: - re.compile(r"Mean TPOT \(ms\):\s+([\d\.]+)"), + re.compile(r"Mean TPOT \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.MEDIAN_OUTPUT_TOKEN_TIME: - re.compile(r"Median TPOT \(ms\):\s+([\d\.]+)"), + re.compile(r"Median 
TPOT \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.P99_OUTPUT_TOKEN_TIME: - re.compile(r"P99 TPOT \(ms\):\s+([\d\.]+)"), + re.compile(r"P99 TPOT \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.INFERENCE_TIME: - re.compile(r"Mean E2EL \(ms\):\s+([\d\.]+)"), + re.compile(r"Mean E2EL \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.MEDIAN_INFERENCE_TIME: - re.compile(r"Median E2EL \(ms\):\s+([\d\.]+)"), + re.compile(r"Median E2EL \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.P99_INFERENCE_TIME: - re.compile(r"P99 E2EL \(ms\):\s+([\d\.]+)"), + re.compile(r"P99 E2EL \(ms\):\s+(-?[\d\.]+)"), } # (Relative threshold, Absolute threshold) for all metric types @@ -512,17 +518,21 @@ class ServerConfig: def __init__(self, server_config_data: dict, env_vars: str = ""): # Extract required fields + self.mode = server_config_data.get('mode', 'e2e') + self.concurrency = server_config_data.get('concurrency', 1) self.name = server_config_data['name'] self.model_name = server_config_data['model_name'] - self.gpus = server_config_data['gpus'] self.model_path = "" self.env_vars = env_vars # Extract optional fields with defaults - self.tp = server_config_data.get('tensor_parallel_size', self.gpus) + self.tp = server_config_data.get('tensor_parallel_size', 1) self.ep = server_config_data.get('moe_expert_parallel_size', 1) self.pp = server_config_data.get('pipeline_parallel_size', 1) - self.gpus_per_node = server_config_data.get('gpus_per_node', self.gpus) + self.cp = server_config_data.get('context_parallel_size', 1) + self.gpus = server_config_data.get('gpus', self.tp * self.cp * self.pp) + self.gpus_per_node = server_config_data.get('gpus_per_node', + 0) or self.gpus self.max_num_tokens = server_config_data.get('max_num_tokens', 2048) self.max_batch_size = server_config_data.get('max_batch_size', 512) self.max_seq_len = server_config_data.get('max_seq_len', 0) @@ -538,6 +548,8 @@ def __init__(self, server_config_data: dict, env_vars: str = ""): 'enable_attention_dp', False) self.trust_remote_code = server_config_data.get('trust_remote_code', False) + self.enable_lm_head_tp_in_adp = server_config_data.get( + 'enable_lm_head_tp_in_adp', False) # attention_dp_config attention_dp_config = server_config_data.get('attention_dp_config', {}) @@ -551,6 +563,12 @@ def __init__(self, server_config_data: dict, env_vars: str = ""): moe_config = server_config_data.get('moe_config', {}) self.moe_backend = moe_config.get('backend', "") self.moe_max_num_tokens = moe_config.get('max_num_tokens', 0) + self.use_low_precision_moe_combine = moe_config.get( + 'use_low_precision_moe_combine', False) + load_balancer_config = moe_config.get('load_balancer', {}) + self.load_balancer_num_slots = load_balancer_config.get('num_slots', 0) + self.load_balancer_layer_updates_per_iter = load_balancer_config.get( + 'layer_updates_per_iter', 0) # cuda_graph_config cuda_graph_config = server_config_data.get('cuda_graph_config', {}) @@ -605,10 +623,13 @@ def __init__(self, server_config_data: dict, env_vars: str = ""): self.match_mode = server_config_data.get('match_mode', "config") # Store filtered config for extra_llm_api_config (exclude name, model_name, gpus, client_configs) + exclude_keys = [ + 'mode', 'concurrency', 'name', 'model_name', 'gpus', + 'gpus_per_node', 'client_configs' + ] self.extra_llm_api_config_data = { k: v - for k, v in server_config_data.items() - if k not in ['name', 'model_name', 'gpus', 'client_configs'] + for k, v in server_config_data.items() if k not in exclude_keys } def to_cmd(self, @@ -634,8 +655,41 @@ def to_cmd(self, def to_env(self) -> 
Dict[str, str]:
         return to_env_dict(self.env_vars)
 
+    def to_match_keys(self) -> List[str]:
+        return [
+            "s_mode",
+            "s_model_name",
+            "l_tp",
+            "l_ep",
+            "l_pp",
+            "l_cp",
+            "l_gpus_per_node",
+            "l_max_batch_size",
+            "b_disable_overlap_scheduler",
+            "l_num_postprocess_workers",
+            "s_attn_backend",
+            "b_enable_chunked_prefill",
+            "b_enable_attention_dp",
+            "b_enable_lm_head_tp_in_adp",
+            # attention_dp_config
+            "b_attention_dp_balance",
+            # moe_config
+            "s_moe_backend",
+            # cuda_graph_config
+            "b_enable_cuda_graph",
+            # kv_cache_config
+            "s_kv_cache_dtype",
+            # cache_transceiver_config
+            "s_cache_transceiver_backend",
+            # speculative_config
+            "s_spec_decoding_type",
+            "l_num_nextn_predict_layers",
+        ]
+
     def to_db_data(self) -> dict:
         db_data = {
+            "s_mode":
+            self.mode,
             "s_model_name":
             self.model_name.lower(),
             "l_gpus":
@@ -646,6 +700,8 @@
             self.ep,
             "l_pp":
             self.pp,
+            "l_cp":
+            self.cp,
             "l_gpus_per_node":
             self.gpus_per_node,
             "l_max_num_tokens":
@@ -668,6 +724,8 @@
             self.enable_attention_dp,
             "b_trust_remote_code":
             self.trust_remote_code,
+            "b_enable_lm_head_tp_in_adp":
+            self.enable_lm_head_tp_in_adp,
             # attention_dp_config
             "b_attention_dp_balance":
             self.attention_dp_balance,
@@ -680,6 +738,12 @@
             self.moe_backend,
             "l_moe_max_num_tokens":
             self.moe_max_num_tokens,
+            "b_use_low_precision_moe_combine":
+            self.use_low_precision_moe_combine,
+            "l_load_balancer_num_slots":
+            self.load_balancer_num_slots,
+            "l_load_balancer_layer_updates_per_iter":
+            self.load_balancer_layer_updates_per_iter,
             # cuda_graph_config
             "b_enable_cuda_graph":
             self.enable_cuda_graph,
@@ -754,7 +818,7 @@
         self.osl = client_config_data.get('osl', 1024)
         self.random_range_ratio = client_config_data.get(
             'random_range_ratio', 0.0)
-        self.backend = client_config_data.get('backend', "")
+        self.backend = client_config_data.get('backend', "openai")
         self.use_chat_template = client_config_data.get('use_chat_template',
                                                         False)
         self.streaming = client_config_data.get('streaming', True)
@@ -765,18 +829,36 @@
     def to_cmd(self) -> List[str]:
         model_dir = get_model_dir(self.model_name)
         self.model_path = model_dir if os.path.exists(
             model_dir) else self.model_name
-
+        dataset_path = get_dataset_path()
         benchmark_cmd = [
-            "python", "-m", "tensorrt_llm.serve.scripts.benchmark_serving",
-            "--model", self.model_path, "--dataset-name", "random",
-            "--random-ids", "--num-prompts",
-            str(self.concurrency * self.iterations), "--random-input-len",
-            str(self.isl), "--random-output-len",
-            str(self.osl), "--random-range-ratio",
-            str(self.random_range_ratio), "--ignore-eos",
-            "--percentile-metrics", "ttft,tpot,itl,e2el", "--max-concurrency",
-            str(self.concurrency)
+            "python",
+            "-m",
+            "tensorrt_llm.serve.scripts.benchmark_serving",
+            "--model",
+            self.model_path,
+            "--tokenizer",
+            self.model_path,
+            "--dataset-name",
+            "random",
+            "--random-ids",
+            "--num-prompts",
+            str(self.concurrency * self.iterations),
+            "--max-concurrency",
+            str(self.concurrency),
+            "--random-input-len",
+            str(self.isl),
+            "--random-output-len",
+            str(self.osl),
+            "--random-range-ratio",
+            str(self.random_range_ratio),
+            "--trust-remote-code",
+            "--ignore-eos",
+            "--percentile-metrics",
+            "ttft,tpot,itl,e2el",
         ]
+        if dataset_path and os.path.exists(dataset_path):
+            benchmark_cmd.append("--dataset-path")
+            benchmark_cmd.append(dataset_path)
         if self.backend:
             benchmark_cmd.append("--backend")
             benchmark_cmd.append(self.backend)
@@ -789,6 +871,18 @@
def to_env(self) -> Dict[str, str]: return to_env_dict(self.env_vars) + def to_match_keys(self) -> List[str]: + return [ + "l_concurrency", + "l_iterations", + "l_isl", + "l_osl", + "d_random_range_ratio", + "s_backend", + "b_use_chat_template", + "b_streaming", + ] + def to_db_data(self) -> dict: """Convert ClientConfig to Database data""" db_data = { @@ -867,36 +961,37 @@ def parse_aggr_config_file(config_file_path: str, select_pattern: str = None): else: execution_plan = None - # Read YAML config file with open(config_file_path, 'r') as f: config = yaml.safe_load(f) - # Read environment config + metadata = config.get('metadata', {}) environment = config.get('environment', {}) - if not environment: - environment = {} + hardware = config.get('hardware', {}) + gpus_per_node = hardware.get('gpus_per_node', 0) - # Get environment variables - environment.get('worker_env_var', '') + model_name = metadata.get('model_name', '') server_env_var = environment.get('server_env_var', '') client_env_var = environment.get('client_env_var', '') server_configs = [] server_client_configs = {} - for server_config_data in config['server_configs']: server_name = server_config_data['name'] + server_config_data[ + 'model_name'] = model_name if 'model_name' not in server_config_data else server_config_data[ + 'model_name'] + server_config_data['mode'] = 'e2e' + server_config_data['concurrency'] = -1 + server_config_data['gpus_per_node'] = gpus_per_node # Check if this server should be included based on execution_plan if execution_plan is not None and server_name not in execution_plan: continue - # Create ServerConfig object directly from dict server_config = ServerConfig(server_config_data, server_env_var) server_id = len(server_configs) server_configs.append(server_config) - # Create ClientConfig objects client_configs = [] selected_client_names = execution_plan.get( server_name) if execution_plan else None @@ -905,7 +1000,6 @@ def parse_aggr_config_file(config_file_path: str, select_pattern: str = None): client_name = client_config_data['name'] # Check if this client should be included - # Include if: execution_plan is None OR selected_client_names is None OR client_name in selected_client_names if execution_plan is not None and selected_client_names is not None: if client_name not in selected_client_names: continue @@ -929,46 +1023,48 @@ def parse_multi_node_disagg_config_file(config_file_path: str, config = yaml.safe_load(f) disagg_configs = [] + metadata = config.get('metadata', {}) hardware = config.get('hardware', {}) benchmark = config.get('benchmark', {}) environment = config.get('environment', {}) slurm_config = config.get('slurm', {}) worker_config = config.get('worker_config', {}) - timeout = slurm_config.get('timeout', 3600) + timeout = slurm_config.get('timeout', 7200) numa_bind = slurm_config.get('numa_bind', False) + gpus_per_node = hardware.get('gpus_per_node', 0) + model_name = metadata.get('model_name', '') + assert model_name, "model_name is required in metadata section" - # Get model name from environment - model_name = environment.get('model_name', '') - assert model_name, "model_name is required in environment section" + benchmark_mode = benchmark.get('mode', 'e2e') + if "gen_only" in benchmark_mode: + hardware['num_ctx_servers'] = 0 - # Get environment variables worker_env_var = environment.get('worker_env_var', '') server_env_var = environment.get('server_env_var', '') client_env_var = environment.get('client_env_var', '') - # Create ctx_server config data + concurrency_str = 
benchmark.get('concurrency_list', '1') + if isinstance(concurrency_str, str): + concurrency = max(int(x) for x in concurrency_str.split()) + else: + concurrency = int(concurrency_str) + ctx_server_config_data = { + 'mode': benchmark_mode, + 'concurrency': concurrency, 'name': 'ctx', 'model_name': model_name, - 'gpus': hardware.get('gpus_per_ctx_server'), - 'gpus_per_node': hardware.get('gpus_per_node'), + 'gpus_per_node': gpus_per_node, **worker_config.get('ctx', {}) } - - # Create gen_server config data gen_server_config_data = { + 'mode': benchmark_mode, + 'concurrency': concurrency, 'name': 'gen', 'model_name': model_name, - 'gpus': hardware.get('gpus_per_gen_server'), - 'gpus_per_node': hardware.get('gpus_per_node'), + 'gpus_per_node': gpus_per_node, **worker_config.get('gen', {}) } - - # Create client config data - concurrency_str = benchmark.get('concurrency_list', '1') - concurrency = int(concurrency_str) if isinstance(concurrency_str, - str) else concurrency_str - client_config_data = { 'name': 'client', 'concurrency': concurrency, @@ -980,13 +1076,12 @@ def parse_multi_node_disagg_config_file(config_file_path: str, 'use_chat_template': False, 'streaming': benchmark.get('streaming', True), } - - # Create disagg_config dict disagg_config = { 'disagg_serving_type': disagg_serving_type, 'hostname': socket.gethostname(), 'numa_bind': numa_bind, 'timeout': timeout, + 'mode': benchmark_mode, 'name': 'disagg_config', 'model_name': model_name, 'hardware': hardware, @@ -995,9 +1090,7 @@ def parse_multi_node_disagg_config_file(config_file_path: str, 'server_env_var': server_env_var, 'client': ClientConfig(client_config_data, model_name, client_env_var), } - print_info(f"disagg_config: {disagg_config}") disagg_configs.append(disagg_config) - return disagg_configs @@ -1114,6 +1207,8 @@ def __init__( self.upload_to_db = False self.config_file = None self.gpu_type = None + self.config_dir = None + self.config_file = None self.config_path = None self.select_pattern = None # Aggregated mode @@ -1330,35 +1425,47 @@ def load_from_str(self, test_param_labels) -> None: # Extract configs from test param labels. labels = test_param_labels.split("-") - def get_gpu_type(label: str) -> str: - parts = label.split("_") - if len(parts) < 2 or parts[0] != "l0": - return "" - if parts[1] == "dgx": - if len(parts) >= 3: - gpu_type = f"{parts[1]}_{parts[2]}" - else: - gpu_type = "" - else: - gpu_type = parts[1] - return gpu_type.lower() + def get_gpu_type() -> str: + try: + output = subprocess.check_output(["nvidia-smi", "-L"], + stderr=subprocess.DEVNULL, + text=True) + first_line = output.strip().split("\n")[0] + gpu_models = ["GB300", "GB200", "B300", "B200"] + for model in gpu_models: + if model in first_line: + if model.startswith("B") and not model.startswith("GB"): + return f"dgx_{model.lower()}" + return model.lower() + except (subprocess.CalledProcessError, FileNotFoundError, + IndexError): + print_error( + f"Failed to get GPU type: {subprocess.CalledProcessError}") + return "" - # Used for perf sanity test if "perf_sanity" in labels[0]: assert len(labels) > 1, "perf_sanity test must have a config file!" 
+ is_disagg = "disagg" in labels[0] self.upload_to_db = "upload" in labels[0] - self.config_file = labels[1] - if "disagg" in labels[1]: + self.gpu_type = get_gpu_type() + if is_disagg: + # For disagg, test name is like: perf_sanity_disagg-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp0_ccb-UCX + # labels[0] is perf_sanity_disagg, "-".join(labels[1:]) is config file base name self.runtime = "multi_node_disagg_server" + self.config_dir = "tests/integration/defs/perf/disagg/test_configs/disagg/perf" + config_base = "-".join(labels[1:]) + self.config_file = f"{config_base}.yaml" if not config_base.endswith( + ".yaml") else config_base + self.select_pattern = None else: + # For aggr, test name is like: perf_sanity_aggr-l0_dgx_b300-r1_fp8_dep8_mtp1_1k1k + # labels[0] is perf_sanity_aggr, labels[1] is config file base name, labels[2] is select_pattern (optional) self.runtime = "aggr_server" - self.gpu_type = get_gpu_type(labels[1]) - config_folder = os.getenv("TRTLLM_CONFIG_FOLDER", - "tests/scripts/perf-sanity") - self.config_path = os.path.join( - config_folder, f"{labels[1]}.yaml" - if not labels[1].endswith(".yaml") else labels[1]) - self.select_pattern = labels[2] if len(labels) > 2 else None + self.config_dir = "tests/scripts/perf-sanity" + config_base = labels[1] + self.config_file = f"{config_base}.yaml" if config_base and not config_base.endswith( + ".yaml") else config_base + self.select_pattern = labels[2] if len(labels) > 2 else None return self.model_name = labels.pop(0) @@ -1578,21 +1685,19 @@ def validate(self): [b >= 32 for b in self.batch_sizes] ), f"gpt_350m and bloom_560m with small BS are very unstable! Please increase to at least 32." - def set_aggr_server_configs(self, llm_root: str) -> None: + def set_aggr_server_configs(self) -> None: """ Set the server and client configs. """ - config_file_path = os.path.join(llm_root, self.config_path) _, self.server_configs, self.server_client_configs = parse_aggr_config_file( - config_file_path, self.select_pattern) + self.config_path, self.select_pattern) - def set_multi_node_disagg_server_configs(self, llm_root: str) -> None: + def set_multi_node_disagg_server_configs(self) -> None: """ Set the multi-node disaggregated server configs. 
""" - config_file_path = os.path.join(llm_root, self.config_path) self.disagg_configs = parse_multi_node_disagg_config_file( - config_file_path, self.select_pattern) + self.config_path, self.select_pattern) def get_model_family(self) -> str: """ @@ -1682,6 +1787,13 @@ def set_runtime_configs(self, output_dir, perf_cache_fpath, gpu_clock_lock=None) -> None: + if self._config.runtime == "aggr_server" or self._config.runtime == "multi_node_disagg_server": + self._config.config_dir = os.getenv( + "TRTLLM_CONFIG_FOLDER", + os.path.join(llm_root, self._config.config_dir)) + self._config.config_path = os.path.join(self._config.config_dir, + self._config.config_file) + if self._config.runtime == "cpp": if not self._config.is_bert_like(): raise ValueError( @@ -1695,12 +1807,12 @@ def set_runtime_configs(self, benchmark_script = "trtllm-bench" elif self._config.runtime == "aggr_server": benchmark_script = None - self._config.set_aggr_server_configs(llm_root) + self._config.set_aggr_server_configs() elif self._config.runtime == "disagg_server": benchmark_script = None elif self._config.runtime == "multi_node_disagg_server": benchmark_script = None - self._config.set_multi_node_disagg_server_configs(llm_root) + self._config.set_multi_node_disagg_server_configs() else: raise RuntimeError(f"Invalid runtime {self._config.runtime}.") @@ -1730,15 +1842,12 @@ def set_runtime_configs(self, def get_trtllm_aggr_commands(self, output_dir): server_cmds = [] - server_envs = [] client_cmds = [] - client_envs = [] names = [] for server_idx, client_configs in self._config.server_client_configs.items( ): server_config = self._config.server_configs[server_idx] server_cmd = server_config.to_cmd(output_dir) - server_env = server_config.to_env() # Generate extra-llm-api-config.yml config_content = server_config.generate_extra_llm_api_config() config_filename = f"extra-llm-api-config.{server_config.name}.yml" @@ -1747,49 +1856,35 @@ def get_trtllm_aggr_commands(self, output_dir): f.write(config_content) for client_config in client_configs: server_cmds.append(server_cmd) - server_envs.append(server_env) client_cmd = client_config.to_cmd() - client_env = client_config.to_env() client_cmds.append(client_cmd) - client_envs.append(client_env) names.append(f"{server_config.name}-{client_config.name}") - return server_cmds, server_envs, client_cmds, client_envs, names + return server_cmds, client_cmds, names def get_trtllm_multi_node_disagg_commands(self, output_dir): ctx_server_cmds = [] - ctx_server_envs = [] gen_server_cmds = [] - gen_server_envs = [] disagg_server_cmds = [] - disagg_server_envs = [] benchmark_cmds = [] - benchmark_envs = [] cmd_idx = 0 for disagg_config in self._config.disagg_configs: disagg_serving_type = disagg_config['disagg_serving_type'] disagg_config['hostname'] numa_bind = disagg_config['numa_bind'] ctx_server_cmd = None - ctx_server_env = None gen_server_cmd = None - gen_server_env = None disagg_server_cmd = None - disagg_server_env = None benchmark_cmd = None - benchmark_env = None if "CTX" in disagg_serving_type or "GEN" in disagg_serving_type: is_ctx = "CTX" in disagg_serving_type server_config = disagg_config[ 'ctx_server'] if is_ctx else disagg_config['gen_server'] server_cmd = server_config.to_cmd(output_dir, numa_bind, disagg_serving_type) - server_env = server_config.to_env() if is_ctx: ctx_server_cmd = server_cmd - ctx_server_env = server_env else: gen_server_cmd = server_cmd - gen_server_env = server_env # Generate extra-llm-api-config.yml config_content = 
server_config.generate_extra_llm_api_config() config_filename = f"extra-llm-api-config.{server_config.name}.yml" @@ -1805,21 +1900,15 @@ def get_trtllm_multi_node_disagg_commands(self, output_dir): str(timeout), "-r", str(timeout) ] - disagg_server_env = to_env_dict(disagg_config['server_env_var']) elif "BENCHMARK" in disagg_serving_type: # Generate benchmark command if this is the BENCHMARK server node benchmark_cmd = disagg_config['client'].to_cmd() - benchmark_env = disagg_config['client'].to_env() ctx_server_cmds.append(ctx_server_cmd) - ctx_server_envs.append(ctx_server_env) gen_server_cmds.append(gen_server_cmd) - gen_server_envs.append(gen_server_env) disagg_server_cmds.append(disagg_server_cmd) - disagg_server_envs.append(disagg_server_env) benchmark_cmds.append(benchmark_cmd) - benchmark_envs.append(benchmark_env) cmd_idx += 1 - return ctx_server_cmds, ctx_server_envs, gen_server_cmds, gen_server_envs, disagg_server_cmds, disagg_server_envs, benchmark_cmds, benchmark_envs + return ctx_server_cmds, gen_server_cmds, disagg_server_cmds, benchmark_cmds def get_trtllm_build_command(self, engine_dir, checkpoint_dir) -> list: build_cmd = [ @@ -2094,12 +2183,10 @@ def get_commands(self): if is_aggr: if not os.path.exists(perf_sanity_output_dir): os.makedirs(perf_sanity_output_dir, exist_ok=True) - server_cmds, server_envs, client_cmds, client_envs, names = self.get_trtllm_aggr_commands( + server_cmds, client_cmds, names = self.get_trtllm_aggr_commands( perf_sanity_output_dir) return PerfAggrScriptTestCmds(server_cmds=server_cmds, - server_envs=server_envs, client_cmds=client_cmds, - client_envs=client_envs, names=names, timeout=3600, output_dir=perf_sanity_output_dir) @@ -2115,17 +2202,13 @@ def get_commands(self): if is_multi_node_disagg: if not os.path.exists(perf_sanity_output_dir): os.makedirs(perf_sanity_output_dir, exist_ok=True) - ctx_server_cmds, ctx_server_envs, gen_server_cmds, gen_server_envs, disagg_server_cmds, disagg_server_envs, benchmark_cmds, benchmark_envs = self.get_trtllm_multi_node_disagg_commands( + ctx_server_cmds, gen_server_cmds, disagg_server_cmds, benchmark_cmds = self.get_trtllm_multi_node_disagg_commands( perf_sanity_output_dir) return PerfMultiNodeDisaggScriptTestCmds( ctx_server_cmds=ctx_server_cmds, - ctx_server_envs=ctx_server_envs, gen_server_cmds=gen_server_cmds, - gen_server_envs=gen_server_envs, disagg_server_cmds=disagg_server_cmds, - disagg_server_envs=disagg_server_envs, benchmark_cmds=benchmark_cmds, - benchmark_envs=benchmark_envs, timeout=self._config.disagg_configs[0]['timeout'], hostname=self._config.disagg_configs[0]['hostname'], disagg_serving_type=self._config.disagg_configs[0] @@ -2156,6 +2239,7 @@ def get_commands(self): build_cmd = self.get_trtllm_bench_build_command(engine_dir) else: pytest.skip("only support trtllm-bench runtime for now") + # Construct prepare synthetic data command data_cmds = [] @@ -2293,32 +2377,24 @@ def run_metrics(self, llm_venv, gpu_clock_lock, session_data_writer, #print info to separate cases self._current_cmd_idx = 0 metrics = self._get_metrics() + commands = self.get_commands() outputs = {} result_states = {} errors = [] - def add_myelin_time_pass_to(input_env): - time_pass_flag = r" -time_pass=on" - old_myelin_env = input_env.get("__LUNOWUD", "") - if time_pass_flag not in old_myelin_env: - input_env["__LUNOWUD"] = old_myelin_env + time_pass_flag - return old_myelin_env - - old_llm_venv = add_myelin_time_pass_to(llm_venv._new_env) + # Only trtllm-bench needs to prepare dataset first. 
if self._config.runtime == 'bench': - #prepare dataset first for trtllm-bench print_info(f"Running command for generating dataset") - outputs = self.run_ex("prepare_dataset", - None, - llm_venv, - gpu_clock_lock, - session_data_writer, - output_dir, + outputs = self.run_ex(commands=commands, + cmd_idx=self._current_cmd_idx, + full_test_name="prepare_dataset", + metric_type=None, + venv=llm_venv, + gpu_clock_lock=gpu_clock_lock, + session_data_writer=session_data_writer, + output_dir=output_dir, outputs=outputs, - original_test_name="prepare_dataset", - cmd_idx=self._current_cmd_idx) - - # Save the result state. + original_test_name="prepare_dataset") result_state = self.get_result_state() result_states[self._current_cmd_idx] = result_state if result_state != "valid": @@ -2349,15 +2425,16 @@ def add_myelin_time_pass_to(input_env): # Run the command or reuse the existing output logs. print_info(f"Running command for {metric.metric_name}") outputs = self.run_ex( - metric.metric_name, - metric.metric_type, - llm_venv, - gpu_clock_lock, - session_data_writer, - output_dir, + commands=commands, + cmd_idx=self._current_cmd_idx, + full_test_name=metric.metric_name, + metric_type=metric.metric_type, + venv=llm_venv, + gpu_clock_lock=gpu_clock_lock, + session_data_writer=session_data_writer, + output_dir=output_dir, outputs=outputs, - original_test_name=metric.original_test_name, - cmd_idx=self._current_cmd_idx) + original_test_name=metric.original_test_name) # Save the result state. result_state = self.get_result_state() @@ -2373,6 +2450,14 @@ def add_myelin_time_pass_to(input_env): # Clean up engine dir after use. shutil.rmtree(self._get_engine_dir(), ignore_errors=True) + def add_myelin_time_pass_to(input_env): + time_pass_flag = r" -time_pass=on" + old_myelin_env = input_env.get("__LUNOWUD", "") + if time_pass_flag not in old_myelin_env: + input_env["__LUNOWUD"] = old_myelin_env + time_pass_flag + return old_myelin_env + + old_llm_venv = add_myelin_time_pass_to(llm_venv._new_env) llm_venv._new_env["__LUNOWUD"] = old_llm_venv # Check if any commands failed. @@ -2393,14 +2478,19 @@ def upload_test_results_to_database(self): Upload the test results and baseline to database. """ - def prefix_server_config_dict(config_dict: dict, - prefix_name: str) -> dict: - prefixed_dict = {} - for key, value in config_dict.items(): - type_prefix = key[0:2] # 'l_', 's_', 'b_', 'd_' - rest = key[2:] - prefixed_dict[f"{type_prefix}{prefix_name}_{rest}"] = value - return prefixed_dict + def add_prefix(key: str, prefix_name: str) -> dict: + type_prefix = key[0:2] # 'l_', 's_', 'b_', 'd_' + rest = key[2:] + return f"{type_prefix}{prefix_name}_{rest}" + + def add_list_prefix(config_list: List, prefix_name: str) -> List: + return [add_prefix(key, prefix_name) for key in config_list] + + def add_dict_prefix(config_dict: dict, prefix_name: str) -> dict: + return { + add_prefix(key, prefix_name): value + for key, value in config_dict.items() + } match_keys = [] # Only aggr_server and multi_node_disagg_server will upload. 
@@ -2441,12 +2531,12 @@ def prefix_server_config_dict(config_dict: dict, new_data_dict[cmd_idx] = new_data cmd_idx += 1 if not match_keys: + match_keys.append("s_runtime") if server_config.match_mode == "scenario": match_keys = SCENARIO_MATCH_FIELDS.copy() else: - match_keys.append("s_runtime") - match_keys.extend(server_config_dict.keys()) - match_keys.extend(client_config_dict.keys()) + match_keys.extend(server_config.to_match_keys()) + match_keys.extend(client_config.to_match_keys()) elif self._config.runtime == "multi_node_disagg_server": if self._config.disagg_configs[0][ @@ -2472,27 +2562,28 @@ def prefix_server_config_dict(config_dict: dict, ) gen_server_config_dict = disagg_config['gen_server'].to_db_data( ) - ctx_server_config_dict = prefix_server_config_dict( + client_config_dict = disagg_config['client'].to_db_data() + ctx_server_config_dict = add_dict_prefix( ctx_server_config_dict, 'ctx') - gen_server_config_dict = prefix_server_config_dict( + gen_server_config_dict = add_dict_prefix( gen_server_config_dict, 'gen') - client_config_dict = disagg_config['client'].to_db_data() - # Build new_data + + hardware = disagg_config.get('hardware', {}) + num_ctx_servers = hardware.get('num_ctx_servers', 0) + num_gen_servers = hardware.get('num_gen_servers', 0) new_data = { "s_runtime": "multi_node_disagg_server", - "s_server_env_var": disagg_config['server_env_var'] + "s_benchmark_mode": disagg_config['mode'], + "s_server_env_var": disagg_config['server_env_var'], + "l_num_ctx_servers": num_ctx_servers, + "l_num_gen_servers": num_gen_servers } new_data.update(job_config) - new_data.update(ctx_server_config_dict) - new_data.update(gen_server_config_dict) + if num_ctx_servers > 0: + new_data.update(ctx_server_config_dict) + if num_gen_servers > 0: + new_data.update(gen_server_config_dict) new_data.update(client_config_dict) - # Add hardware information - hardware = disagg_config.get('hardware', {}) - new_data["l_num_ctx_servers"] = hardware.get( - 'num_ctx_servers', 0) - new_data["l_num_gen_servers"] = hardware.get( - 'num_gen_servers', 0) - # Add metrics from test results for metric_type in AGGR_SERVER_METRICS: new_data[ f"d_{PERF_METRIC_STRING[metric_type]}"] = self._test_results[ @@ -2503,9 +2594,17 @@ def prefix_server_config_dict(config_dict: dict, if not match_keys: match_keys.extend( ["s_runtime", "l_num_ctx_servers", "l_num_gen_servers"]) - match_keys.extend(ctx_server_config_dict.keys()) - match_keys.extend(gen_server_config_dict.keys()) - match_keys.extend(client_config_dict.keys()) + if num_ctx_servers > 0: + match_keys.extend( + add_list_prefix( + disagg_config['ctx_server'].to_match_keys(), + 'ctx')) + if num_gen_servers > 0: + match_keys.extend( + add_list_prefix( + disagg_config['gen_server'].to_match_keys(), + 'gen')) + match_keys.extend(disagg_config['client'].to_match_keys()) else: return @@ -2519,7 +2618,7 @@ def prefix_server_config_dict(config_dict: dict, if is_post_merge: # Prepare new baseline data for post-merge new_baseline_data_dict = prepare_baseline_data( - history_baseline_dict, history_data_dict, new_data_dict) + history_data_dict, new_data_dict) else: # Pre-merge does not need to upload baseline data new_baseline_data_dict = None diff --git a/tests/integration/defs/perf/utils.py b/tests/integration/defs/perf/utils.py index 6e14592a37e3..9f2ed7bb32fa 100644 --- a/tests/integration/defs/perf/utils.py +++ b/tests/integration/defs/perf/utils.py @@ -245,9 +245,7 @@ def get_cmd_str(self, cmd_idx) -> List[str]: class PerfAggrScriptTestCmds(NamedTuple): server_cmds: 
List[List[str]] - server_envs: List[Dict[str, str]] client_cmds: List[List[str]] - client_envs: List[Dict[str, str]] names: List[str] timeout: int output_dir: str @@ -345,13 +343,9 @@ def get_cmd_str(self, cmd_idx) -> List[str]: class PerfMultiNodeDisaggScriptTestCmds(NamedTuple): ctx_server_cmds: List[List[str]] - ctx_server_envs: List[Dict[str, str]] gen_server_cmds: List[List[str]] - gen_server_envs: List[Dict[str, str]] disagg_server_cmds: List[List[str]] - disagg_server_envs: List[Dict[str, str]] benchmark_cmds: List[List[str]] - benchmark_envs: List[Dict[str, str]] timeout: int hostname: str disagg_serving_type: str @@ -694,23 +688,21 @@ def _check_benchmark_output_for_errors(self, output: str) -> None: ) def run_ex(self, + commands, full_test_name: str, metric_type: PerfMetricType, venv: Optional[PythonVenvRunnerImpl], gpu_clock_lock: GPUClockLock, session_data_writer: SessionDataWriter, output_dir: str, + cmd_idx: int = 0, outputs: Dict[int, str] = {}, original_test_name: str = None, - cmd_idx: int = 0, **kwargs) -> List[str]: """ Run the commands and write the results to the output csv and/or yaml files. """ - # Get the commands. - commands = self.get_commands() - # Avoid modifying argument directly outputs = outputs.copy() @@ -723,7 +715,6 @@ def run_ex(self, cmd_str = commands.get_cmd_str(cmd_idx) is_prepare_dataset_cmd = 'prepare_dataset' in cmd_str or "prepare-dataset" in cmd_str - is_perf_sanity_test = "perf_sanity" in full_test_name is_disagg_server = False @@ -804,7 +795,8 @@ def run_ex(self, outputs.pop(cmd_idx) elif is_disagg_server: print_info( - f"skip writing perf result when running disagg's server.") + f"skip writing perf result when running disagg's worker or server." + ) else: self._perf_result = self.get_perf_result(outputs) diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_dgx_b200_perf_sanity.yml index d4470fe1a421..4bf4f6ce67d5 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200_perf_sanity.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200_perf_sanity.yml @@ -15,9 +15,9 @@ l0_dgx_b200_perf_sanity: backend: pytorch orchestrator: mpi tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180) - condition: ranges: @@ -34,8 +34,8 @@ l0_dgx_b200_perf_sanity: backend: pytorch orchestrator: mpi tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-gpt_oss_fp4_dep2,gpt_oss_fp4_dep4] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-gpt_oss_fp4_tp4_eagle3] TIMEOUT (180) + - 
perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-gpt_oss_fp4_blackwell-gpt_oss_fp4_dep2,gpt_oss_fp4_dep4] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-gpt_oss_fp4_blackwell-gpt_oss_fp4_tp4_eagle3] TIMEOUT (180) diff --git a/tests/integration/test_lists/test-db/l0_dgx_b300_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_dgx_b300_perf_sanity.yml index ff0b9eafe387..d90907d9b40c 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b300_perf_sanity.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b300_perf_sanity.yml @@ -16,9 +16,9 @@ l0_dgx_b300_perf_sanity: backend: pytorch orchestrator: mpi tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180) - condition: ranges: @@ -36,6 +36,6 @@ l0_dgx_b300_perf_sanity: backend: pytorch orchestrator: mpi tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180) - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_dgx_b200-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180) + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml index fcbe711760e6..e06e18772505 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus_perf_sanity.yml @@ -14,6 +14,6 @@ l0_gb200_multi_gpus_perf_sanity: stage: post_merge backend: pytorch tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_dep4_mtp1_1k1k] - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tep4_mtp3_1k1k] - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_gpus-r1_fp4_v2_tp4_mtp3_1k1k] + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] + - 
perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001.yml similarity index 58% rename from tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity.yml rename to tests/integration/test_lists/test-db/l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001.yml index bc7d95b047c5..ad69e70c867e 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_perf_sanity.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001.yml @@ -1,5 +1,5 @@ version: 0.0.1 -l0_gb200_multi_nodes_perf_sanity: +l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001: - condition: ranges: # 2 nodes with each node has 4 GPUs @@ -13,4 +13,4 @@ l0_gb200_multi_nodes_perf_sanity: stage: post_merge backend: pytorch tests: - - perf/test_perf.py::test_perf[perf_sanity_upload-l0_gb200_multi_nodes-r1_fp4_v2_dep8_mtp1] + - perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_dep8_mtp1] diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001.yml new file mode 100644 index 000000000000..456bb7a48ed4 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001.yml @@ -0,0 +1,16 @@ +version: 0.0.1 +l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001: +- condition: + ranges: + # 3 nodes with each node has 4 GPUs + system_gpu_count: + gte: 12 + lte: 12 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] TIMEOUT (180) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001.yml new file mode 100644 index 000000000000..3e34d0cb2199 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001.yml @@ -0,0 +1,16 @@ +version: 0.0.1 +l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001: +- condition: + ranges: + # 6 nodes with each node has 4 GPUs + system_gpu_count: + gte: 24 + lte: 24 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (180) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002.yml new file mode 100644 index 000000000000..273790a21800 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002.yml @@ -0,0 +1,16 @@ +version: 0.0.1 +l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002: +- condition: + ranges: + # 6 nodes with each node has 4 GPUs + system_gpu_count: + gte: 24 + lte: 24 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL] TIMEOUT (180) 
diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001.yml new file mode 100644 index 000000000000..b4784d073687 --- /dev/null +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001.yml @@ -0,0 +1,16 @@ +version: 0.0.1 +l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001: +- condition: + ranges: + # 8 nodes with each node has 4 GPUs + system_gpu_count: + gte: 32 + lte: 32 + wildcards: + gpu: + - '*gb200*' + terms: + stage: post_merge + backend: pytorch + tests: + - perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (180) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 6de1fa6b552c..81284f44e61b 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -320,8 +320,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequan accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5640697) accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5640697) accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 SKIP (https://nvbugs/5640697) -accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True] SKIP (https://nvbugs/5644632) -accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False] SKIP (https://nvbugs/5644632) test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5648560) test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] SKIP (https://nvbugs/5648560) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] SKIP (https://nvbugs/5629136) @@ -369,7 +367,6 @@ accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutla accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm] SKIP (https://nvbugs/5702795) test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] SKIP (https://nvbugs/5648560) test_e2e.py::test_ptp_quickstart_multimodal_2gpu[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] SKIP (https://nvbugs/5648560) -test_e2e.py::test_openai_chat_harmony SKIP (https://nvbugs/5633700) accuracy/test_cli_flow.py::TestGpt2::test_weight_only[int4] SKIP (https://nvbugs/5705193) accuracy/test_cli_flow.py::TestGpt2::test_int8_kv_cache SKIP (https://nvbugs/5705193) accuracy/test_cli_flow.py::TestLlama2_7B::test_fp8_2gpus[cp2] SKIP (https://nvbugs/5705194) diff --git a/tests/scripts/perf-sanity/README.md b/tests/scripts/perf-sanity/README.md index 66f9a93fc6c3..6cd917996ba5 100644 --- a/tests/scripts/perf-sanity/README.md +++ b/tests/scripts/perf-sanity/README.md @@ -4,106 +4,31 @@ Performance sanity testing scripts for TensorRT-LLM with configuration-driven te ## Overview -- Run performance sanity benchmarks across multiple model configurations +- Run performance sanity benchmarks across multiple model configs - Support three deployment architectures: single-node, multi-node aggregated, and multi-node 
disaggregated -- Manage test cases through YAML configuration files +- Manage test cases through YAML config files - Automated resource calculation and job submission via SLURM ## Configuration File Types -There are three types of YAML configuration files for different deployment architectures: +There are three types of YAML config files for different deployment architectures. +Aggregated config files are in [`tests/scripts/perf-sanity`](./). +Disaggregated config files are in [`tests/integration/defs/perf/disagg/test_configs/disagg/perf`](../../integration/defs/perf/disagg/test_configs/disagg/perf). ### 1. Single-Node Aggregated Test Configuration -**File Example**: `l0_dgx_b200.yaml` +**File Example**: `deepseek_r1_fp4_v2_grace_blackwell.yaml` **Use Case**: Single-node performance tests on a single server with multiple GPUs. -**Structure**: -```yaml -server_configs: - - name: "r1_fp8_dep8_mtp1_1k1k" - model_name: "deepseek_r1_0528_fp8" - gpus: 8 - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - pipeline_parallel_size: 1 - max_batch_size: 512 - max_num_tokens: 8192 - attention_backend: "TRTLLM" - enable_attention_dp: true - attention_dp_config: - batching_wait_iters: 0 - enable_balance: true - timeout_iters: 60 - moe_config: - backend: 'DEEPGEMM' - cuda_graph_config: - enable_padding: true - max_batch_size: 512 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 1 - client_configs: - - name: "con4096_iter10_1k1k" - concurrency: 4096 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.8 - backend: "openai" -``` - - ### 2. Multi-Node Aggregated Test Configuration -**File Example**: `l0_gb200_multi_nodes.yaml` +**File Example**: `deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml` **Use Case**: Multi-node aggregated architecture where model runs across multiple nodes with unified execution. -**Structure**: -```yaml -# Hardware Config -hardware: - gpus_per_node: 4 - gpus_per_server: 8 +### 3. Multi-Node Disaggregated Test Configuration + +**File Example**: `deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml` -server_configs: - - name: "r1_fp4_v2_dep8_mtp1" - model_name: "deepseek_r1_0528_fp4_v2" - gpus: 8 - gpus_per_node: 4 - trust_remote_code: true - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - pipeline_parallel_size: 1 - max_batch_size: 512 - max_num_tokens: 2112 - attn_backend: "TRTLLM" - enable_attention_dp: true - attention_dp_config: - batching_wait_iters: 0 - enable_balance: true - timeout_iters: 60 - moe_config: - backend: 'CUTLASS' - cuda_graph_config: - enable_padding: true - max_batch_size: 512 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.5 - client_configs: - - name: "con32_iter12_1k1k" - concurrency: 32 - iterations: 12 - isl: 1024 - osl: 1024 - random_range_ratio: 0.8 - backend: "openai" -``` +**Use Case**: Disaggregated architecture where model runs across multiple nodes with separate context (prefill) and generation (decode) servers. 
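+
+**Structure** (illustrative sketch only): the authoritative schema is whatever
+`parse_multi_node_disagg_config_file` in `test_perf.py` reads; the field values
+below are placeholders, not a tested configuration.
+
+```yaml
+metadata:
+  model_name: deepseek_r1_0528_fp4_v2   # required
+hardware:
+  gpus_per_node: 4
+  num_ctx_servers: 1
+  num_gen_servers: 1
+benchmark:
+  mode: e2e
+  concurrency_list: "128"
+  streaming: true
+environment:
+  worker_env_var: ""
+  server_env_var: ""
+  client_env_var: ""
+slurm:
+  timeout: 7200
+  numa_bind: false
+worker_config:
+  ctx:
+    tensor_parallel_size: 4
+    max_batch_size: 128
+  gen:
+    tensor_parallel_size: 8
+    max_batch_size: 128
+```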
diff --git a/tests/scripts/perf-sanity/l0_gb200_multi_nodes.yaml b/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml similarity index 93% rename from tests/scripts/perf-sanity/l0_gb200_multi_nodes.yaml rename to tests/scripts/perf-sanity/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml index 432c6ee1452b..1a5c5e5212ba 100644 --- a/tests/scripts/perf-sanity/l0_gb200_multi_nodes.yaml +++ b/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml @@ -1,13 +1,13 @@ -# Hardware Config +metadata: + model_name: deepseek_r1_0528_fp4_v2 + supported_gpus: + - GB200 + - GB300 hardware: gpus_per_node: 4 - gpus_per_server: 8 - server_configs: - name: "r1_fp4_v2_dep8_mtp1" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 8 - gpus_per_node: 4 trust_remote_code: true tensor_parallel_size: 8 moe_expert_parallel_size: 8 @@ -37,11 +37,8 @@ server_configs: osl: 1024 random_range_ratio: 0.2 backend: "openai" - - name: "r1_fp4_v2_tep8_mtp3" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 8 - gpus_per_node: 4 trust_remote_code: true tensor_parallel_size: 8 moe_expert_parallel_size: 8 diff --git a/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_blackwell.yaml b/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_blackwell.yaml new file mode 100644 index 000000000000..06c629d3f3bb --- /dev/null +++ b/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_blackwell.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + supported_gpus: + - B200 + - B300 +server_configs: + - name: "r1_fp4_v2_dep4_mtp1_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'CUTLASS' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 + client_configs: + - name: "con2048_iter10_1k1k" + concurrency: 2048 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.2 + backend: "openai" + + - name: "r1_fp4_v2_tep4_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 32 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 32 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con32_iter10_1k1k" + concurrency: 32 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.2 + backend: "openai" + + - name: "r1_fp4_v2_tp4_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp4_v2" + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 4 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 4 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con4_iter10_1k1k" + concurrency: 4 + 
iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.2 + backend: "openai" diff --git a/tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml b/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_grace_blackwell.yaml similarity index 98% rename from tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml rename to tests/scripts/perf-sanity/deepseek_r1_fp4_v2_grace_blackwell.yaml index ab14148b202c..388fec27c80d 100644 --- a/tests/scripts/perf-sanity/l0_gb200_multi_gpus.yaml +++ b/tests/scripts/perf-sanity/deepseek_r1_fp4_v2_grace_blackwell.yaml @@ -1,8 +1,12 @@ +metadata: + model_name: deepseek_r1_0528_fp4_v2 + supported_gpus: + - GB200 + - GB300 server_configs: # 1k1k configs - name: "r1_fp4_v2_dep4_mtp1_1k1k" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 tensor_parallel_size: 4 moe_expert_parallel_size: 4 pipeline_parallel_size: 1 @@ -37,7 +41,6 @@ server_configs: - name: "r1_fp4_v2_tep4_mtp3_1k1k" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 tensor_parallel_size: 4 moe_expert_parallel_size: 4 pipeline_parallel_size: 1 @@ -68,7 +71,6 @@ server_configs: - name: "r1_fp4_v2_tp4_mtp3_1k1k" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 tensor_parallel_size: 4 moe_expert_parallel_size: 1 pipeline_parallel_size: 1 @@ -100,7 +102,6 @@ server_configs: # 8k1k configs - name: "r1_fp4_v2_dep4_mtp1_8k1k" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 tensor_parallel_size: 4 moe_expert_parallel_size: 4 pipeline_parallel_size: 1 @@ -135,7 +136,6 @@ server_configs: - name: "r1_fp4_v2_tep4_mtp3_8k1k" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 tensor_parallel_size: 4 moe_expert_parallel_size: 4 pipeline_parallel_size: 1 @@ -166,7 +166,6 @@ server_configs: - name: "r1_fp4_v2_tp4_mtp3_8k1k" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 tensor_parallel_size: 4 moe_expert_parallel_size: 1 pipeline_parallel_size: 1 @@ -198,7 +197,6 @@ server_configs: # 1k8k configs - name: "r1_fp4_v2_dep4_mtp1_1k8k" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 tensor_parallel_size: 4 moe_expert_parallel_size: 4 pipeline_parallel_size: 1 @@ -233,7 +231,6 @@ server_configs: - name: "r1_fp4_v2_tep4_mtp3_1k8k" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 tensor_parallel_size: 4 moe_expert_parallel_size: 4 pipeline_parallel_size: 1 @@ -264,7 +261,6 @@ server_configs: - name: "r1_fp4_v2_tp4_mtp3_1k8k" model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 tensor_parallel_size: 4 moe_expert_parallel_size: 1 pipeline_parallel_size: 1 diff --git a/tests/scripts/perf-sanity/deepseek_r1_fp8_blackwell.yaml b/tests/scripts/perf-sanity/deepseek_r1_fp8_blackwell.yaml new file mode 100644 index 000000000000..6ee4fbb97285 --- /dev/null +++ b/tests/scripts/perf-sanity/deepseek_r1_fp8_blackwell.yaml @@ -0,0 +1,99 @@ +metadata: + model_name: deepseek_r1_0528_fp8 + supported_gpus: + - B200 + - B300 +server_configs: + - name: "r1_fp8_dep8_mtp1_1k1k" + model_name: "deepseek_r1_0528_fp8" + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 + moe_config: + backend: 'DEEPGEMM' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 1 + client_configs: + - name: "con4096_iter10_1k1k" + concurrency: 4096 + iterations: 10 + isl: 1024 + osl: 1024 + 
random_range_ratio: 0.2 + backend: "openai" + + - name: "r1_fp8_tep8_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp8" + tensor_parallel_size: 8 + moe_expert_parallel_size: 8 + pipeline_parallel_size: 1 + max_batch_size: 64 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'DEEPGEMM' + cuda_graph_config: + enable_padding: true + max_batch_size: 64 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con64_iter10_1k1k" + concurrency: 64 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.2 + backend: "openai" + + - name: "r1_fp8_tp8_mtp3_1k1k" + model_name: "deepseek_r1_0528_fp8" + tensor_parallel_size: 8 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 8 + max_num_tokens: 8192 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 8 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'MTP' + num_nextn_predict_layers: 3 + client_configs: + - name: "con8_iter10_1k1k" + concurrency: 8 + iterations: 10 + isl: 1024 + osl: 1024 + random_range_ratio: 0.2 + backend: "openai" diff --git a/tests/scripts/perf-sanity/gpt_oss_120b_fp4_blackwell.yaml b/tests/scripts/perf-sanity/gpt_oss_120b_fp4_blackwell.yaml new file mode 100644 index 000000000000..1696347f0fd8 --- /dev/null +++ b/tests/scripts/perf-sanity/gpt_oss_120b_fp4_blackwell.yaml @@ -0,0 +1,101 @@ +metadata: + model_name: gpt_oss_120b_fp4 + supported_gpus: + - B200 + - B300 +server_configs: + - name: "gpt_oss_fp4_dep2_1k1k" + model_name: "gpt_oss_120b_fp4" + tensor_parallel_size: 2 + moe_expert_parallel_size: 2 + pipeline_parallel_size: 1 + max_batch_size: 1024 + max_num_tokens: 20000 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + enable_balance: true + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 1024 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + num_postprocess_workers: 4 + stream_interval: 20 + client_configs: + - name: "con2048_iter5_1k1k" + concurrency: 2048 + iterations: 5 + isl: 1024 + osl: 1024 + random_range_ratio: 0.2 + backend: "openai" + + - name: "gpt_oss_fp4_dep4_1k1k" + model_name: "gpt_oss_120b_fp4" + tensor_parallel_size: 4 + moe_expert_parallel_size: 4 + pipeline_parallel_size: 1 + max_batch_size: 512 + max_num_tokens: 20000 + attn_backend: "TRTLLM" + enable_attention_dp: true + attention_dp_config: + enable_balance: true + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 512 + kv_cache_config: + dtype: 'fp8' + enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + num_postprocess_workers: 4 + stream_interval: 20 + client_configs: + - name: "con2048_iter5_1k1k" + concurrency: 2048 + iterations: 5 + isl: 1024 + osl: 1024 + random_range_ratio: 0.2 + backend: "openai" + + - name: "gpt_oss_fp4_tp4_eagle3_1k1k" + model_name: "gpt_oss_120b_fp4" + tensor_parallel_size: 4 + moe_expert_parallel_size: 1 + pipeline_parallel_size: 1 + max_batch_size: 1 + max_num_tokens: 20000 + attn_backend: "TRTLLM" + enable_attention_dp: false + moe_config: + backend: 'TRTLLM' + cuda_graph_config: + enable_padding: true + max_batch_size: 1 + kv_cache_config: + dtype: 'fp8' 
+ enable_block_reuse: false + free_gpu_memory_fraction: 0.8 + speculative_config: + decoding_type: 'Eagle' + eagle3_layers_to_capture: [-1] + max_draft_len: 3 + speculative_model_dir: "gpt_oss/gpt-oss-120b-Eagle3" + stream_interval: 20 + num_postprocess_workers: 4 + client_configs: + - name: "con1_iter32_1k1k" + concurrency: 1 + iterations: 32 + isl: 1024 + osl: 1024 + random_range_ratio: 0.2 + backend: "openai" diff --git a/tests/scripts/perf-sanity/l0_dgx_b200.yaml b/tests/scripts/perf-sanity/l0_dgx_b200.yaml deleted file mode 100644 index 3074bef6c1b8..000000000000 --- a/tests/scripts/perf-sanity/l0_dgx_b200.yaml +++ /dev/null @@ -1,293 +0,0 @@ -server_configs: - - name: "r1_fp8_dep8_mtp1_1k1k" - model_name: "deepseek_r1_0528_fp8" - gpus: 8 - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - pipeline_parallel_size: 1 - max_batch_size: 512 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: true - attention_dp_config: - batching_wait_iters: 0 - enable_balance: true - timeout_iters: 60 - moe_config: - backend: 'DEEPGEMM' - cuda_graph_config: - enable_padding: true - max_batch_size: 512 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 1 - client_configs: - - name: "con4096_iter10_1k1k" - concurrency: 4096 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp8_tep8_mtp3_1k1k" - model_name: "deepseek_r1_0528_fp8" - gpus: 8 - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: false - moe_config: - backend: 'DEEPGEMM' - cuda_graph_config: - enable_padding: true - max_batch_size: 64 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 3 - client_configs: - - name: "con64_iter10_1k1k" - concurrency: 64 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp8_tp8_mtp3_1k1k" - model_name: "deepseek_r1_0528_fp8" - gpus: 8 - tensor_parallel_size: 8 - moe_expert_parallel_size: 1 - pipeline_parallel_size: 1 - max_batch_size: 8 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: false - moe_config: - backend: 'TRTLLM' - cuda_graph_config: - enable_padding: true - max_batch_size: 8 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 3 - client_configs: - - name: "con8_iter10_1k1k" - concurrency: 8 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp4_v2_dep4_mtp1_1k1k" - model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - pipeline_parallel_size: 1 - max_batch_size: 512 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: true - attention_dp_config: - batching_wait_iters: 0 - enable_balance: true - timeout_iters: 60 - moe_config: - backend: 'CUTLASS' - cuda_graph_config: - enable_padding: true - max_batch_size: 512 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 1 - client_configs: - - name: "con2048_iter10_1k1k" - concurrency: 2048 - iterations: 10 - isl: 1024 
- osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp4_v2_tep4_mtp3_1k1k" - model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: false - moe_config: - backend: 'TRTLLM' - cuda_graph_config: - enable_padding: true - max_batch_size: 32 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 3 - client_configs: - - name: "con32_iter10_1k1k" - concurrency: 32 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp4_v2_tp4_mtp3_1k1k" - model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 - tensor_parallel_size: 4 - moe_expert_parallel_size: 1 - pipeline_parallel_size: 1 - max_batch_size: 4 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: false - moe_config: - backend: 'TRTLLM' - cuda_graph_config: - enable_padding: true - max_batch_size: 4 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 3 - client_configs: - - name: "con4_iter10_1k1k" - concurrency: 4 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "gpt_oss_fp4_dep2_1k1k" - model_name: "gpt_oss_120b_fp4" - gpus: 2 - tensor_parallel_size: 2 - moe_expert_parallel_size: 2 - pipeline_parallel_size: 1 - max_batch_size: 1024 - max_num_tokens: 20000 - attn_backend: "TRTLLM" - enable_attention_dp: true - attention_dp_config: - enable_balance: true - moe_config: - backend: 'TRTLLM' - cuda_graph_config: - enable_padding: true - max_batch_size: 1024 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - num_postprocess_workers: 4 - stream_interval: 20 - client_configs: - - name: "con2048_iter5_1k1k" - concurrency: 2048 - iterations: 5 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "gpt_oss_fp4_dep4_1k1k" - model_name: "gpt_oss_120b_fp4" - gpus: 4 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - pipeline_parallel_size: 1 - max_batch_size: 512 - max_num_tokens: 20000 - attn_backend: "TRTLLM" - enable_attention_dp: true - attention_dp_config: - enable_balance: true - moe_config: - backend: 'TRTLLM' - cuda_graph_config: - enable_padding: true - max_batch_size: 512 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - num_postprocess_workers: 4 - stream_interval: 20 - client_configs: - - name: "con2048_iter5_1k1k" - concurrency: 2048 - iterations: 5 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "gpt_oss_fp4_tp4_eagle3_1k1k" - model_name: "gpt_oss_120b_fp4" - gpus: 4 - tensor_parallel_size: 4 - moe_expert_parallel_size: 1 - pipeline_parallel_size: 1 - max_batch_size: 1 - max_num_tokens: 20000 - attn_backend: "TRTLLM" - enable_attention_dp: false - moe_config: - backend: 'TRTLLM' - cuda_graph_config: - enable_padding: true - max_batch_size: 1 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'Eagle' - eagle3_layers_to_capture: [-1] - max_draft_len: 3 - speculative_model_dir: "gpt_oss/gpt-oss-120b-Eagle3" - stream_interval: 20 - num_postprocess_workers: 4 - client_configs: - - name: 
"con1_iter32_1k1k" - concurrency: 1 - iterations: 32 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" diff --git a/tests/scripts/perf-sanity/l0_dgx_b300.yaml b/tests/scripts/perf-sanity/l0_dgx_b300.yaml deleted file mode 100644 index 0306ad25a8a8..000000000000 --- a/tests/scripts/perf-sanity/l0_dgx_b300.yaml +++ /dev/null @@ -1,194 +0,0 @@ -server_configs: - - name: "r1_fp8_dep8_mtp1_1k1k" - model_name: "deepseek_r1_0528_fp8" - gpus: 8 - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - pipeline_parallel_size: 1 - max_batch_size: 512 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: true - attention_dp_config: - batching_wait_iters: 0 - enable_balance: true - timeout_iters: 60 - moe_config: - backend: 'DEEPGEMM' - cuda_graph_config: - enable_padding: true - max_batch_size: 512 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 1 - client_configs: - - name: "con4096_iter10_1k1k" - concurrency: 4096 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp8_tep8_mtp3_1k1k" - model_name: "deepseek_r1_0528_fp8" - gpus: 8 - tensor_parallel_size: 8 - moe_expert_parallel_size: 8 - pipeline_parallel_size: 1 - max_batch_size: 64 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: false - moe_config: - backend: 'DEEPGEMM' - cuda_graph_config: - enable_padding: true - max_batch_size: 64 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 3 - client_configs: - - name: "con64_iter10_1k1k" - concurrency: 64 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp8_tp8_mtp3_1k1k" - model_name: "deepseek_r1_0528_fp8" - gpus: 8 - tensor_parallel_size: 8 - moe_expert_parallel_size: 1 - pipeline_parallel_size: 1 - max_batch_size: 8 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: false - moe_config: - backend: 'TRTLLM' - cuda_graph_config: - enable_padding: true - max_batch_size: 8 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 3 - client_configs: - - name: "con8_iter10_1k1k" - concurrency: 8 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp4_v2_dep4_mtp1_1k1k" - model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - pipeline_parallel_size: 1 - max_batch_size: 512 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: true - attention_dp_config: - batching_wait_iters: 0 - enable_balance: true - timeout_iters: 60 - moe_config: - backend: 'CUTLASS' - cuda_graph_config: - enable_padding: true - max_batch_size: 512 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 1 - client_configs: - - name: "con2048_iter10_1k1k" - concurrency: 2048 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp4_v2_tep4_mtp3_1k1k" - model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 - tensor_parallel_size: 4 - moe_expert_parallel_size: 4 - pipeline_parallel_size: 1 - max_batch_size: 32 - max_num_tokens: 8192 - attn_backend: 
"TRTLLM" - enable_attention_dp: false - moe_config: - backend: 'TRTLLM' - cuda_graph_config: - enable_padding: true - max_batch_size: 32 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 3 - client_configs: - - name: "con32_iter10_1k1k" - concurrency: 32 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" - - - name: "r1_fp4_v2_tp4_mtp3_1k1k" - model_name: "deepseek_r1_0528_fp4_v2" - gpus: 4 - tensor_parallel_size: 4 - moe_expert_parallel_size: 1 - pipeline_parallel_size: 1 - max_batch_size: 4 - max_num_tokens: 8192 - attn_backend: "TRTLLM" - enable_attention_dp: false - moe_config: - backend: 'TRTLLM' - cuda_graph_config: - enable_padding: true - max_batch_size: 4 - kv_cache_config: - dtype: 'fp8' - enable_block_reuse: false - free_gpu_memory_fraction: 0.8 - speculative_config: - decoding_type: 'MTP' - num_nextn_predict_layers: 3 - client_configs: - - name: "con4_iter10_1k1k" - concurrency: 4 - iterations: 10 - isl: 1024 - osl: 1024 - random_range_ratio: 0.2 - backend: "openai" diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_harmony.py b/tests/unittest/llmapi/apps/_test_openai_chat_harmony.py index 575cd2f0f138..fc550f082472 100644 --- a/tests/unittest/llmapi/apps/_test_openai_chat_harmony.py +++ b/tests/unittest/llmapi/apps/_test_openai_chat_harmony.py @@ -1,12 +1,18 @@ import json +import os import openai import pytest +from utils.llm_data import llm_datasets_root from ..test_llm import get_model_path from .openai_server import RemoteOpenAIServer pytestmark = pytest.mark.threadleak(enabled=False) +os.environ['TIKTOKEN_RS_CACHE_DIR'] = os.path.join(llm_datasets_root(), + 'tiktoken_vocab') +os.environ['TIKTOKEN_ENCODINGS_BASE'] = os.path.join(llm_datasets_root(), + 'tiktoken_vocab') @pytest.fixture(scope="module", ids=["GPT-OSS-20B"]) @@ -114,8 +120,10 @@ async def test_tool_calls(client: openai.AsyncOpenAI, model: str): model=model, messages=messages, tools=[tool_get_current_weather], + extra_body={"top_k": 1}, ) message = response.choices[0].message + print(message) assert response.choices[0].finish_reason == "tool_calls" assert message.content is None assert message.reasoning @@ -137,6 +145,7 @@ async def test_tool_calls(client: openai.AsyncOpenAI, model: str): response = await client.chat.completions.create( model=model, messages=messages, + extra_body={"top_k": 1}, ) message = response.choices[0].message assert message.content @@ -205,6 +214,7 @@ async def test_streaming_tool_call(client: openai.AsyncOpenAI, model: str): messages=messages, tools=[tool_get_current_weather], stream=True, + extra_body={"top_k": 1}, ) tool_name: str reasoning_chunks: list[str] = [] diff --git a/tests/unittest/llmapi/apps/_test_trtllm_serve_example.py b/tests/unittest/llmapi/apps/_test_trtllm_serve_example.py index 66677fcead3f..4b75e4c71ff1 100644 --- a/tests/unittest/llmapi/apps/_test_trtllm_serve_example.py +++ b/tests/unittest/llmapi/apps/_test_trtllm_serve_example.py @@ -1,17 +1,14 @@ import json import os import subprocess -import sys import tempfile import pytest import yaml +from ..test_llm import get_model_path from .openai_server import RemoteOpenAIServer -sys.path.append(os.path.join(os.path.dirname(__file__), '..')) -from test_llm import get_model_path - @pytest.fixture(scope="module", ids=["TinyLlama-1.1B-Chat"]) def model_name(): @@ -57,15 +54,19 @@ def example_root(): ("bash", "curl_completion_client.sh"), ("bash", 
"aiperf_client.sh"), ("bash", "curl_responses_client.sh")]) -def test_trtllm_serve_examples(exe: str, script: str, +def test_trtllm_serve_examples(exe: str, script: str, model_name: str, server: RemoteOpenAIServer, example_root: str): client_script = os.path.join(example_root, script) # CalledProcessError will be raised if any errors occur + custom_env = os.environ.copy() + if script.startswith("aiperf"): + custom_env[""] = get_model_path(model_name) result = subprocess.run([exe, client_script], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, - check=True) + check=True, + env=custom_env) if script.startswith("curl"): # For curl scripts, we expect a JSON response result_stdout = result.stdout.strip()