Skip to content

Commit 8e26c01

Browse files
authored
Merge pull request #745 from NVIDIA/am/dynamo-slurm
Simplify Dynamo slurm configuration
2 parents 18b5b9a + 925b53f commit 8e26c01

File tree

3 files changed

+49
-43
lines changed

3 files changed

+49
-43
lines changed

conf/experimental/ai_dynamo/test/vllm.toml

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,6 @@ docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1"
2828
prefill-cmd = 'python3 -m dynamo.vllm --is-prefill-worker'
2929
decode-cmd = 'python3 -m dynamo.vllm'
3030

31-
[cmd_args.dynamo.prefill_worker]
32-
pipeline-parallel-size = 1
33-
3431
[cmd_args.dynamo.decode_worker]
3532
pipeline-parallel-size = 1
3633

@@ -42,13 +39,14 @@ docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1"
4239
output-tokens-mean = 500
4340
output-tokens-stddev = 0
4441
random-seed = 123
45-
request-count = 128
42+
request-count = 50
4643
synthetic-input-tokens-mean = 300
4744
synthetic-input-tokens-stddev = 0
48-
warmup-request-count = 10
49-
concurrency = 1
45+
warmup-request-count = 5
46+
concurrency = 2
5047
extra-args = "--streaming -- -v --async"
5148

5249
[extra_env_vars]
5350
UCX_LOG_LEVEL = "warn"
5451
UCX_TLS = "cuda_copy,rc_x"
52+
DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"

conf/experimental/ai_dynamo/test_scenario/vllm_slurm.toml

Lines changed: 25 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -15,40 +15,36 @@
1515
# limitations under the License.
1616

1717
name = "dynamo-vllm-slurm"
18+
job_status_check = false
1819

1920
[[Tests]]
20-
id = "qwen3-0.6B"
21+
id = "test.disagg.single-node"
2122
test_name = "vLLM-Qwen3-0.6B"
22-
num_nodes = 4
23-
time_limit = "00:20:00"
23+
num_nodes = 2 # 1 prefill node + 1 decode node
24+
time_limit = "00:10:00"
2425

25-
[Tests.cmd_args]
26+
[Tests.cmd_args.dynamo.prefill_worker]
27+
num-nodes = 1
28+
tensor-parallel-size = 4
29+
pipeline-parallel-size = 1
2630

27-
[Tests.cmd_args.dynamo]
28-
decode-initialized-regex = 'VllmWorker.*has.been.initialized'
29-
etcd-cmd = "etcd --log-level debug"
30-
etcd-port = 2379
31-
genai-perf-cmd = 'genai-perf profile'
32-
ingress-cmd = "python -m dynamo.frontend --router-mode kv"
33-
nats-cmd = "nats-server -js"
34-
nats-port = 4222
35-
node-setup-cmd = "apt-get update -o APT::Sandbox::User=root && apt-get install -y curl libibverbs1 rdma-core ibverbs-utils libibumad3 libnuma1 librdmacm1 ibverbs-providers; /usr/local/ucx/bin/ucx_info -d |grep Transport | sort -u;"
36-
port = 8787
37-
prefill-initialized-regex = 'VllmWorker.*has.been.initialized'
31+
[Tests.cmd_args.dynamo.decode_worker]
32+
num-nodes = 1
33+
tensor-parallel-size = 4
34+
pipeline-parallel-size = 1
3835

39-
[Tests.cmd_args.dynamo.prefill_worker]
40-
gpu-memory-utilization = 0.90
41-
max_model_len = 19280
42-
num-nodes = 2
43-
tensor-parallel-size = 4
36+
[[Tests]]
37+
id = "test.disagg.multinode"
38+
test_name = "vLLM-Qwen3-0.6B"
39+
num_nodes = 4 # 2 prefill nodes + 2 decode nodes
40+
time_limit = "00:10:00"
4441

45-
[Tests.cmd_args.dynamo.decode_worker]
46-
gpu-memory-utilization = 0.90
47-
max_model_len = 19280
48-
num-nodes = 2
49-
tensor-parallel-size = 4
42+
[Tests.cmd_args.dynamo.prefill_worker]
43+
num-nodes = 2
44+
tensor-parallel-size = 4
45+
pipeline-parallel-size = 1
5046

51-
[Tests.extra_env_vars]
52-
UCX_LOG_LEVEL = "warn"
53-
UCX_TLS = "cuda_copy,rc_x"
54-
DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"
47+
[Tests.cmd_args.dynamo.decode_worker]
48+
num-nodes = 2
49+
tensor-parallel-size = 4
50+
pipeline-parallel-size = 1

src/cloudai/workloads/ai_dynamo/ai_dynamo.sh

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,6 @@ dynamo_args["ingress-cmd"]="python -m dynamo.frontend --router-mode kv"
3333
dynamo_args["port"]=8080
3434
dynamo_args["endpoint"]="v1/chat/completions"
3535
dynamo_args["model"]="deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
36-
dynamo_args["etcd-cmd"]="etcd --log-level debug"
37-
dynamo_args["nats-cmd"]="nats-server -js"
3836
dynamo_args["etcd-port"]=2379
3937
dynamo_args["nats-port"]=4222
4038
dynamo_args["workspace-path"]="/workspace"
@@ -47,8 +45,12 @@ dynamo_args["tp-arg-name"]="tensor-parallel-size"
4745
dynamo_args["pp-arg-name"]="pipeline-parallel-size"
4846
dynamo_args["multiple-prefill-workers-per-node"]="true"
4947
dynamo_args["multiple-decode-workers-per-node"]="true"
50-
dynamo_args["prefill-initialized-regex"]="prefill.*initialized"
51-
dynamo_args["decode-initialized-regex"]="decode.*initialized"
48+
dynamo_args["prefill-initialized-regex"]="Worker.*has.been.initialized"
49+
dynamo_args["decode-initialized-regex"]="Worker.*has.been.initialized"
50+
51+
dynamo_args["etcd-cmd"]="etcd --log-level debug"
52+
dynamo_args["nats-cmd"]="nats-server -js"
53+
dynamo_args["genai-perf-cmd"]="genai-perf profile"
5254

5355
# sglang-specific optional ports. Ignored by vllm.
5456
dynamo_args["sgl-http-port"]=9001
@@ -310,15 +312,21 @@ _compute_worker_allocation_vllm() {
310312
dynamo_args["decode-gpus-per-worker"]=$num_gpus
311313
fi
312314

315+
log "DECODE: num GPUs: $num_gpus, GPUs per worker: ${dynamo_args["decode-gpus-per-worker"]}"
316+
log "PREFILL: num GPUs: $num_gpus, GPUs per worker: ${dynamo_args["prefill-gpus-per-worker"]}"
313317
dynamo_args["prefill-workers-per-node"]=$(( num_gpus / dynamo_args["prefill-gpus-per-worker"] ))
314318
dynamo_args["decode-workers-per-node"]=$(( num_gpus / dynamo_args["decode-gpus-per-worker"] ))
319+
log "DECODE: workers per node: ${dynamo_args["decode-workers-per-node"]}"
320+
log "PREFILL: workers per node: ${dynamo_args["prefill-workers-per-node"]}"
315321

316322
if [[ -n "${prefill_args["--num-nodes"]}" ]]; then
317323
dynamo_args["num-prefill-nodes"]=${prefill_args["--num-nodes"]}
318324
fi
319325
if [[ -n "${decode_args["--num-nodes"]}" ]]; then
320326
dynamo_args["num-decode-nodes"]=${decode_args["--num-nodes"]}
321327
fi
328+
log "NUM PREFILL NODES: ${dynamo_args["num-prefill-nodes"]}"
329+
log "NUM DECODE NODES: ${dynamo_args["num-decode-nodes"]}"
322330
}
323331

324332
_compute_worker_allocation() {
@@ -597,7 +605,7 @@ validate_environment() {
597605

598606
function launch_etcd()
599607
{
600-
log "Launching etcd"
608+
log "Launching etcd with cmd: ${dynamo_args["etcd-cmd"]} --listen-client-urls http://0.0.0.0:${dynamo_args["etcd-port"]} --advertise-client-urls http://0.0.0.0:${dynamo_args["etcd-port"]}"
601609
${dynamo_args["etcd-cmd"]} \
602610
--listen-client-urls http://0.0.0.0:${dynamo_args["etcd-port"]} \
603611
--advertise-client-urls http://0.0.0.0:${dynamo_args["etcd-port"]} \
@@ -606,7 +614,7 @@ function launch_etcd()
606614

607615
function launch_nats()
608616
{
609-
log "Launching nats"
617+
log "Launching nats with cmd: ${dynamo_args["nats-cmd"]} -p ${dynamo_args["nats-port"]}"
610618
${dynamo_args["nats-cmd"]} -p ${dynamo_args["nats-port"]} > ${RESULTS_DIR}/nats.log 2>&1
611619
}
612620

@@ -633,12 +641,14 @@ function launch_decode()
633641
wait_for_etcd
634642

635643
local workers_per_node=${dynamo_args["decode-workers-per-node"]}
644+
log "Using workers per node: $workers_per_node"
636645

637646
for i in $(seq 0 $(( $workers_per_node - 1 ))); do
638647
local gpu_list=$(_gpu_list_for_worker "${dynamo_args["decode-gpus-per-worker"]}" "$i")
639648
local log_file=$(_log_file_for_worker "decode" "$i")
640649

641650
log "Launching decode worker $i on GPUs $gpu_list"
651+
log "Decode cmd: ${dynamo_args["decode-cmd"]} $(array_to_args decode_args) ${decode_args["--extra-args"]}"
642652
CUDA_VISIBLE_DEVICES=$gpu_list \
643653
${dynamo_args["decode-cmd"]} \
644654
$(array_to_args decode_args) ${decode_args["--extra-args"]} > $log_file 2>&1 &
@@ -665,6 +675,7 @@ function launch_prefill()
665675
local log_file=$(_log_file_for_worker "prefill" "$i")
666676

667677
log "Launching prefill worker $i on GPUs $gpu_list"
678+
log "Prefill cmd: ${dynamo_args["prefill-cmd"]} $(array_to_args prefill_args) ${prefill_args["--extra-args"]}"
668679
CUDA_VISIBLE_DEVICES=$gpu_list \
669680
${dynamo_args["prefill-cmd"]} \
670681
$(array_to_args prefill_args) ${prefill_args["--extra-args"]} > $log_file 2>&1 &
@@ -680,11 +691,12 @@ function wait_for_dynamo_frontend()
680691
local have_prefill=$(_count_initialized_prefill)
681692
local have_decode=$(_count_initialized_decode)
682693

694+
log "Initialized: prefill ${have_prefill}/${want_prefill}; decode ${have_decode}/${want_decode}"
695+
683696
if [[ $have_prefill -ge $want_prefill && $have_decode -ge $want_decode ]]; then
684697
break
685698
fi
686699

687-
log "Initialized: prefill ${have_prefill}/${want_prefill}; decode ${have_decode}/${want_decode}"
688700
exit_on_error
689701
sleep 30
690702
done
@@ -710,7 +722,7 @@ function launch_genai_perf()
710722
echo "Response: $resp"
711723

712724
local genai_perf_arguments=$(array_to_args genai_perf_args)
713-
log "Launching genai-perf with args: $genai_perf_arguments ${genai_perf_args["--extra-args"]}"
725+
log "Launching genai-perf with cmd: ${dynamo_args["genai-perf-cmd"]} $genai_perf_arguments ${genai_perf_args["--extra-args"]}"
714726

715727
${dynamo_args["genai-perf-cmd"]} ${genai_perf_arguments} ${genai_perf_args["--extra-args"]} > ${RESULTS_DIR}/genai_perf.log 2>&1
716728

0 commit comments

Comments (0)