yingguo-trt · pull · Jan 19, 2026 · Jan 19, 2026 · Jan 19, 2026 · Jan 19, 2026
diff --git a/.devcontainer/docker-compose.override-example.yml b/.devcontainer/docker-compose.override-example.yml
@@ -5,4 +5,4 @@ services:
     volumes:
       # Uncomment the following lines to enable
       # # Mount TRTLLM data volume:
-      # - /home/scratch.trt_llm_data/:/home/scratch.trt_llm_data/:ro
+      # - /home/scratch.trt_llm_data_ci/:/home/scratch.trt_llm_data_ci/:ro
diff --git a/examples/disaggregated/slurm/benchmark/config.yaml b/examples/disaggregated/slurm/benchmark/config.yaml
@@ -37,7 +37,7 @@ environment:
   cuda_architectures: ""  # Optional CUDA architectures to build for (e.g. "90-real;100-real"). If empty, builds for all architectures
   trtllm_wheel_path: ""  # Path to pre-built TensorRT-LLM wheel. If provided, install from this wheel instead
   work_dir: "<full_path_to_work_dir>"
-  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
+  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes NCCL_GRAPH_MIXING_SUPPORT=0"
   server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
 
 # Profiling Configuration

diff --git a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
@@ -124,9 +124,16 @@ elif [ -d "${trtllm_repo}" ]; then
     echo "TensorRT-LLM installation completed successfully"
 else
     echo "trtllm_wheel_path and trtllm_repo are not provided, will use the installed TensorRT-LLM from the container"
-    if [ -v TRT_LLM_GIT_COMMIT ]; then
-        echo "TRT_LLM_GIT_COMMIT: ${TRT_LLM_GIT_COMMIT}"
+    # get_env.py is looked up under ${work_dir} (expected to be the directory that holds this script)
+    get_env_file=${work_dir}/get_env.py
+    if ! srun --container-name=${container_name} \
+        --container-mounts=${container_mount} --no-container-mount-home \
+        --mpi=pmix --overlap -N 1 --ntasks-per-node=1 \
+        bash -c "python ${get_env_file} -e ${full_logdir}/env_vars.json" \
+        &> ${full_logdir}/2_get_env.log; then
+        cleanup_on_failure "Failed to get TensorRT-LLM environment variables. Check ${full_logdir}/2_get_env.log for details"
     fi
+    echo "TensorRT-LLM environment variables saved to ${full_logdir}/env_vars.json"
 fi
 
 # Get node lists and replace the placeholder with the actual node names

diff --git a/examples/disaggregated/slurm/benchmark/get_env.py b/examples/disaggregated/slurm/benchmark/get_env.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import os
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Get TensorRT-LLM environment variables and save to JSON"
+    )
+    parser.add_argument("-e", "--env-file", required=True, help="Environment file path")
+    args = parser.parse_args()
+
+    # read env file, append new envs to it
+    with open(args.env_file, "r") as f:
+        env_data = json.load(f)
+
+    # Get environment variables
+    new_env_data = {
+        "TRT_LLM_GIT_COMMIT": os.environ.get("TRT_LLM_GIT_COMMIT", ""),
+        "TRT_LLM_VERSION": os.environ.get("TRT_LLM_VERSION", ""),
+    }
+    print(f"Environment variables: {new_env_data}")
+    env_data.update(new_env_data)
+    # Save to environment file
+    with open(args.env_file, "w") as f:
+        json.dump(env_data, f, indent=2)
+
+    print(f"Environment variables saved to {args.env_file}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/disaggregated/slurm/benchmark/run_benchmark.sh b/examples/disaggregated/slurm/benchmark/run_benchmark.sh
@@ -6,9 +6,9 @@ set -u
 trap 'echo "Error occurred at line $LINENO"; exit 1' ERR
 
 # Add parameter validation
-if [ "$#" -lt 9 ]; then
-    echo "Error: Missing required arguments"
-    echo "Usage: $0 model_name dataset_file multi_round concurrency_list streaming log_path hostname port"
+if [ "$#" -lt 10 ]; then
+    echo "Error: Missing required arguments, got $# arguments, args: $@"
+    echo "Usage: $0 model_name dataset_file multi_round num_gen_servers concurrency_list streaming log_path hostname port ucx_warmup_requests"
     exit 1
 fi
 
@@ -21,20 +21,115 @@ streaming=$6
 log_path=$7
 hostname=$8
 port=$9
+ucx_warmup_requests=${10}
 
 # check process id is not 0
 if [[ ${SLURM_PROCID} != "0" ]]; then
     echo "Process id is ${SLURM_PROCID} for loadgen, exiting"
     exit 0
 fi
 
+do_get_logs(){
+    local input_file=$1
+    local output_file=$2
+    local mode=$3
+    local start_line=$4
+    # check mode is ctx or gen
+    if [ "${mode}" = "ctx" ]; then
+        sed -n "${start_line},\$p" ${input_file} | grep -a "'num_generation_tokens': 0" > ${output_file} || true
+    elif [ "${mode}" = "gen" ]; then
+        sed -n "${start_line},\$p" ${input_file} | grep -a "'num_ctx_requests': 0, 'num_ctx_tokens': 0" > ${output_file} || true
+    else
+        echo "Invalid mode: ${mode}"
+        return 1
+    fi
+    return 0
+}
+
+do_process_all_logs(){
+    local input_folder=$1
+    local output_folder=$2
+    local mode=$3
+    if [ "${mode}" != "line" ] && [ "${mode}" != "log" ] && [ "${mode}" != "clean" ]; then
+        echo "Invalid mode: ${mode}"
+        exit 1
+    fi
+    local ctx_log
+    local ctx_num
+    local gen_log
+    local gen_num
+    local line_count
+    local start_line
+    for ctx_log in ${input_folder}/3_output_CTX_*.log; do
+        if [ -f "${ctx_log}" ]; then
+            ctx_num=$(basename "${ctx_log}" | sed 's/3_output_CTX_\([0-9]*\)\.log/\1/')
+            if [ "${mode}" = "line" ]; then
+                line_count=$(wc -l < ${ctx_log})
+                echo ${line_count} > ${output_folder}/ctx_only_line_${ctx_num}.txt
+            elif [ "${mode}" = "log" ]; then
+                if [ ! -f "${output_folder}/ctx_only_line_${ctx_num}.txt" ]; then
+                    start_line=0
+                else
+                    start_line=$(cat ${output_folder}/ctx_only_line_${ctx_num}.txt)
+                    rm -f ${output_folder}/ctx_only_line_${ctx_num}.txt
+                fi
+                do_get_logs ${ctx_log} ${output_folder}/ctx_only_${ctx_num}.txt "ctx" ${start_line}
+            elif [ "${mode}" = "clean" ]; then
+                rm -f ${ctx_log}
+            fi
+        fi
+    done
+    # process all the gen log files in the input folder
+    for gen_log in ${input_folder}/3_output_GEN_*.log; do
+        if [ -f "${gen_log}" ]; then
+            gen_num=$(basename "${gen_log}" | sed 's/3_output_GEN_\([0-9]*\)\.log/\1/')
+            if [ "${mode}" = "line" ]; then
+                line_count=$(wc -l < ${gen_log})
+                echo ${line_count} > ${output_folder}/gen_only_line_${gen_num}.txt
+            elif [ "${mode}" = "log" ]; then
+                if [ ! -f "${output_folder}/gen_only_line_${gen_num}.txt" ]; then
+                    start_line=0
+                else
+                    start_line=$(cat ${output_folder}/gen_only_line_${gen_num}.txt)
+                    rm -f ${output_folder}/gen_only_line_${gen_num}.txt
+                fi
+                do_get_logs ${gen_log} ${output_folder}/gen_only_${gen_num}.txt "gen" ${start_line}
+            elif [ "${mode}" = "clean" ]; then
+                rm -f ${gen_log}
+            fi
+        fi
+    done
+}
+
+mkdir -p ${log_path}/start_logs
+cp ${log_path}/3_output_CTX_*.log ${log_path}/start_logs/ 2>/dev/null || true
+cp ${log_path}/3_output_GEN_*.log ${log_path}/start_logs/ 2>/dev/null || true
+
+# warmup requests for ucx connections
+if [ "${ucx_warmup_requests}" -gt 0 ]; then
+    echo "warming up ucx connections with small requests... ${ucx_warmup_requests}"
+    python -m tensorrt_llm.serve.scripts.benchmark_serving \
+        --model ${model_name} \
+        --dataset-name random \
+        --random-ids \
+        --random-input-len 100 \
+        --random-output-len 10 \
+        --num-prompts ${ucx_warmup_requests} \
+        --host ${hostname} \
+        --port ${port} \
+        --ignore-eos \
+        --non-streaming
+    echo "UCX warmup done"
+fi
+
 echo "Hostname: ${hostname}, Port: ${port}"
 echo "Starting benchmark..."
 for concurrency in ${concurrency_list}; do
     concurrency=$((concurrency * num_gen_servers))
     num_prompts=$((concurrency * multi_round))
     echo "Benchmarking with concurrency ${concurrency} ... ${num_prompts} prompts"
     mkdir -p ${log_path}/concurrency_${concurrency}
+    do_process_all_logs ${log_path}/ ${log_path}/concurrency_${concurrency} "line"
     python -m tensorrt_llm.serve.scripts.benchmark_serving \
         --model ${model_name} \
         --backend openai \
@@ -53,4 +148,6 @@ for concurrency in ${concurrency_list}; do
         --percentile-metrics "ttft,tpot,itl,e2el" \
         $(if [ "${streaming}" = "false" ]; then echo "--non-streaming"; fi)
     echo "Benchmark with concurrency ${concurrency} done"
+    do_process_all_logs ${log_path}/ ${log_path}/concurrency_${concurrency} "log"
 done
+# do_process_all_logs ${log_path}/ ${log_path}/concurrency_${concurrency} "clean"
diff --git a/examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh b/examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh
@@ -42,6 +42,7 @@ streaming=$8
 log_path=$9
 hostname=${10}
 port=${11}
+ucx_warmup_requests=${12}
 
 # check process id is not 0
 if [[ ${SLURM_PROCID} != "0" ]]; then
@@ -59,14 +60,107 @@ fi
 echo "Cloning benchmark repository..."
 git clone "${BENCH_SERVING_REPO}" "${BENCH_SERVING_DIR}"
 
+do_get_logs(){
+    local input_file=$1
+    local output_file=$2
+    local mode=$3
+    local start_line=$4
+    # check mode is ctx or gen
+    if [ "${mode}" = "ctx" ]; then
+        sed -n "${start_line},\$p" ${input_file} | grep -a "'num_generation_tokens': 0" > ${output_file} || true
+    elif [ "${mode}" = "gen" ]; then
+        sed -n "${start_line},\$p" ${input_file} | grep -a "'num_ctx_requests': 0, 'num_ctx_tokens': 0" > ${output_file} || true
+    else
+        echo "Invalid mode: ${mode}"
+        return 1
+    fi
+    return 0
+}
+
+do_process_all_logs(){
+    local input_folder=$1
+    local output_folder=$2
+    local mode=$3
+    if [ "${mode}" != "line" ] && [ "${mode}" != "log" ] && [ "${mode}" != "clean" ]; then
+        echo "Invalid mode: ${mode}"
+        exit 1
+    fi
+    local ctx_log
+    local ctx_num
+    local gen_log
+    local gen_num
+    local line_count
+    local start_line
+    for ctx_log in ${input_folder}/output_ctx_*.log; do
+        if [ -f "${ctx_log}" ]; then
+            ctx_num=$(basename "${ctx_log}" | sed 's/output_ctx_\([0-9]*\)\.log/\1/')
+            if [ "${mode}" = "line" ]; then
+                line_count=$(wc -l < ${ctx_log})
+                echo ${line_count} > ${output_folder}/ctx_only_line_${ctx_num}.txt
+            elif [ "${mode}" = "log" ]; then
+                if [ ! -f "${output_folder}/ctx_only_line_${ctx_num}.txt" ]; then
+                    start_line=0
+                else
+                    start_line=$(cat ${output_folder}/ctx_only_line_${ctx_num}.txt)
+                    rm -f ${output_folder}/ctx_only_line_${ctx_num}.txt
+                fi
+                do_get_logs ${ctx_log} ${output_folder}/ctx_only_${ctx_num}.txt "ctx" ${start_line}
+            elif [ "${mode}" = "clean" ]; then
+                rm -f ${ctx_log}
+            fi
+        fi
+    done
+    # process all the gen log files in the input folder
+    for gen_log in ${input_folder}/output_gen_*.log; do
+        if [ -f "${gen_log}" ]; then
+            gen_num=$(basename "${gen_log}" | sed 's/output_gen_\([0-9]*\)\.log/\1/')
+            if [ "${mode}" = "line" ]; then
+                line_count=$(wc -l < ${gen_log})
+                echo ${line_count} > ${output_folder}/gen_only_line_${gen_num}.txt
+            elif [ "${mode}" = "log" ]; then
+                if [ ! -f "${output_folder}/gen_only_line_${gen_num}.txt" ]; then
+                    start_line=0
+                else
+                    start_line=$(cat ${output_folder}/gen_only_line_${gen_num}.txt)
+                    rm -f ${output_folder}/gen_only_line_${gen_num}.txt
+                fi
+                do_get_logs ${gen_log} ${output_folder}/gen_only_${gen_num}.txt "gen" ${start_line}
+            elif [ "${mode}" = "clean" ]; then
+                rm -f ${gen_log}
+            fi
+        fi
+    done
+}
+
+mkdir -p ${log_path}/start_logs
+cp ${log_path}/output_ctx_*.log ${log_path}/start_logs/ 2>/dev/null || true
+cp ${log_path}/output_gen_*.log ${log_path}/start_logs/ 2>/dev/null || true
+
+# warmup requests for ucx connections
+if [ "${ucx_warmup_requests}" -gt 0 ]; then
+    echo "warming up ucx connections with small requests... ${ucx_warmup_requests}"
+    python -m tensorrt_llm.serve.scripts.benchmark_serving \
+        --model ${model_name} \
+        --dataset-name random \
+        --random-ids \
+        --random-input-len 100 \
+        --random-output-len 10 \
+        --num-prompts ${ucx_warmup_requests} \
+        --host ${hostname} \
+        --port ${port} \
+        --ignore-eos \
+        --non-streaming
+    echo "UCX warmup done"
+fi
+
 # Run benchmarks
 echo "Starting benchmark..."
 for concurrency in ${concurrency_list}; do
     concurrency=$((concurrency * num_gen_servers))
     num_prompts=$((concurrency * multi_round))
     output_dir="${log_path}/concurrency_${concurrency}"
-
     echo "Benchmarking with concurrency ${concurrency} ... ${num_prompts} prompts"
+    do_process_all_logs ${log_path}/ ${log_path}/concurrency_${concurrency} "line"
     mkdir -p "${output_dir}"
 
     python "${BENCH_SCRIPT}" \
@@ -89,4 +183,6 @@ for concurrency in ${concurrency_list}; do
         $([ "${streaming}" = "false" ] && echo "--non-streaming")
 
     echo "Benchmark with concurrency ${concurrency} done"
+    do_process_all_logs ${log_path}/ ${log_path}/concurrency_${concurrency} "log"
 done
+# do_process_all_logs ${log_path}/ ${log_path}/concurrency_${concurrency} "clean"
diff --git a/examples/disaggregated/slurm/benchmark/start_worker.sh b/examples/disaggregated/slurm/benchmark/start_worker.sh
@@ -34,7 +34,6 @@ else
 fi
 
 if [ "${benchmark_mode}" = "gen_only" ]; then
-    export TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP=1
     export TLLM_BENCHMARK_REQ_QUEUES_SIZE=${concurrency}
 fi
-Original file line number
+Diff line change
@@ Expand Up / @@ -34,7 +34,6 @@ else @@
     fi
     if [ "${benchmark_mode}" = "gen_only" ]; then
-        export TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP=1
         export TLLM_BENCHMARK_REQ_QUEUES_SIZE=${concurrency}
     fi
@@ Expand Down @@