@@ -56,8 +56,7 @@ function log()
5656 echo " [$( date --iso-8601=ns) $( hostname) ]: $@ "
5757}
5858
59- function parse_args()
60- {
59+ _parse_cli_pairs () {
6160 log " Parsing args:"
6261 while [[ $# -ge 2 ]]; do
6362 echo " $1 $2 "
@@ -78,8 +77,9 @@ function parse_args()
7877 esac
7978 shift ; shift ;
8079 done
80+ }
8181
82- # Patch Dynamo args
82+ _patch_dynamo_args () {
8383 if [[ -z " ${dynamo_args["decode-nodelist"]} " ]]; then
8484 dynamo_args[" decode-nodelist" ]=$( echo $DYNAMO_NODELIST | cut -d' ,' -f1-${dynamo_args["num-decode-nodes"]} )
8585 fi
@@ -93,19 +93,20 @@ function parse_args()
9393 fi
9494
9595 dynamo_args[" url" ]=" http://${dynamo_args["frontend-node"]} :${dynamo_args["port"]} "
96+ }
9697
97- # Patch Prefill/Decode args
# Copy values that were resolved into dynamo_args over to the per-tool
# argument maps.
# Globals read:    dynamo_args, RESULTS_DIR, GENAI_PERF_ARTIFACT_DIR,
#                  GENAI_PERF_PROFILE_EXPORT_FILE
# Globals written: prefill_args, decode_args, genai_perf_args
_patch_section_args() {
  local model_name="${dynamo_args["model"]}"

  # Every section is pointed at the same model.
  prefill_args["--model"]="${model_name}"
  decode_args["--model"]="${model_name}"
  genai_perf_args["--model"]="${model_name}"

  # GenAI-Perf connection settings come from the Dynamo section.
  genai_perf_args["--url"]="${dynamo_args["url"]}"
  genai_perf_args["--endpoint"]="${dynamo_args["endpoint"]}"

  # GenAI-Perf output locations come from the environment.
  genai_perf_args["--artifact-dir"]="${RESULTS_DIR}/${GENAI_PERF_ARTIFACT_DIR}/"
  genai_perf_args["--profile-export-file"]="${GENAI_PERF_PROFILE_EXPORT_FILE}"
}
107108
108- # Worker GPU allocation logic
109+ _compute_worker_allocation () {
109110 local tp_arg_name=" --${dynamo_args["tp-arg-name"]} "
110111 local pp_arg_name=" --${dynamo_args["pp-arg-name"]} "
111112
@@ -141,13 +142,24 @@ function parse_args()
141142 if [[ -n " ${decode_args["--num-nodes"]} " ]]; then
142143 dynamo_args[" num-decode-nodes" ]=${decode_args["--num-nodes"]}
143144 fi
145+ }
144146
# Log the contents of the four argument maps, one log line per map, as
# "<label>: key: value; key: value; ".
# Globals read: dynamo_args, prefill_args, decode_args, genai_perf_args
_dump_args() {
  local -a labels=("Dynamo args" "Prefill args" "Decode args" "GenAI perf args")
  local -a names=(dynamo_args prefill_args decode_args genai_perf_args)
  local i key rendered

  for i in "${!names[@]}"; do
    # Nameref lets one loop body serve every map.
    local -n section="${names[i]}"
    rendered=""
    for key in "${!section[@]}"; do
      rendered+="${key}: ${section[$key]}; "
    done
    log "${labels[i]}: ${rendered}"
    # Drop the nameref itself so the next iteration can re-bind it.
    unset -n section
  done
}
150153
# Entry point for argument handling: consume the CLI key/value pairs, then
# run the patch-up stages in their fixed order and log the result.
# Arguments: "$@" - forwarded unchanged to _parse_cli_pairs
function parse_args()
{
  _parse_cli_pairs "$@"

  # Order matters: each stage is invoked after the one before it, exactly
  # as listed here.
  local stage
  for stage in \
    _patch_dynamo_args \
    _patch_section_args \
    _compute_worker_allocation \
    _dump_args; do
    "${stage}"
  done
}
162+
151163function array_to_args()
152164{
153165 local -n arr=$1
@@ -222,16 +234,54 @@ function exit_on_error()
222234 fi
223235}
224236
# Print the total number of prefill workers: prefill node count multiplied
# by prefill workers per node.
# Globals read: dynamo_args
_total_workers_prefill() {
  local node_count="${dynamo_args["num-prefill-nodes"]}"
  local workers_each="${dynamo_args["prefill-workers-per-node"]}"
  # Bare names inside $(( )) are evaluated arithmetically; empty counts as 0.
  echo "$(( node_count * workers_each ))"
}
240+
# Print the total number of decode workers: decode node count multiplied
# by decode workers per node.
# Globals read: dynamo_args
_total_workers_decode() {
  local node_count="${dynamo_args["num-decode-nodes"]}"
  local workers_each="${dynamo_args["decode-workers-per-node"]}"
  # Bare names inside $(( )) are evaluated arithmetically; empty counts as 0.
  echo "$(( node_count * workers_each ))"
}
244+
# Print how many files matching "$RESULTS_DIR"/*prefill* contain a line
# matching the configured prefill-initialized regex (case-insensitive).
# Globals read: dynamo_args, RESULTS_DIR
# Outputs:      the count on stdout
_count_initialized_prefill() {
  local regex="${dynamo_args["prefill-initialized-regex"]}"

  # An empty pattern would match every file and report all workers as
  # initialized; treat "no regex configured" as "nothing initialized".
  if [[ -z "${regex}" ]]; then
    echo 0
    return 0
  fi

  # -e/-- keep a regex with spaces, glob characters or a leading '-' intact.
  # Options precede the operands so this does not depend on GNU argument
  # permutation. stderr is dropped on purpose: before any worker has
  # written a log the glob matches nothing and grep reports a missing file.
  grep -il -e "${regex}" -- "${RESULTS_DIR}"/*prefill* 2> /dev/null | wc -l
}
248+
# Print how many files matching "$RESULTS_DIR"/*decode* contain a line
# matching the configured decode-initialized regex (case-insensitive).
# Globals read: dynamo_args, RESULTS_DIR
# Outputs:      the count on stdout
_count_initialized_decode() {
  local regex="${dynamo_args["decode-initialized-regex"]}"

  # An empty pattern would match every file and report all workers as
  # initialized; treat "no regex configured" as "nothing initialized".
  if [[ -z "${regex}" ]]; then
    echo 0
    return 0
  fi

  # -e/-- keep a regex with spaces, glob characters or a leading '-' intact.
  # Options precede the operands so this does not depend on GNU argument
  # permutation. stderr is dropped on purpose: before any worker has
  # written a log the glob matches nothing and grep reports a missing file.
  grep -il -e "${regex}" -- "${RESULTS_DIR}"/*decode* 2> /dev/null | wc -l
}
252+
# Print the comma-separated slice of CUDA_VISIBLE_DEVICES that belongs to
# one worker on this node.
# Arguments: $1 - number of GPUs per worker
#            $2 - zero-based worker index on this node
# Globals read: CUDA_VISIBLE_DEVICES
# Outputs:   fields (idx*per_worker + 1) .. (idx*per_worker + per_worker)
#            of the device list, on stdout
# Returns:   the exit status of cut
_gpu_list_for_worker() {
  local per_worker=$1
  local idx=$2
  # cut numbers fields from 1.
  local start=$(( 1 + (idx * per_worker) ))
  local end=$(( start + per_worker - 1 ))
  # Feed the list through a here-string: no unquoted expansion and no
  # nested echo/subshell.
  cut -d',' -f"${start}-${end}" <<< "${CUDA_VISIBLE_DEVICES}"
}
260+
# Print the log file path for one worker:
#   <RESULTS_DIR>/dynamo_<role>_<SLURM_NODEID>_<idx>.log
# Arguments: $1 - worker role (the callers pass "prefill" or "decode")
#            $2 - worker index on this node
# Globals read: RESULTS_DIR, SLURM_NODEID
_log_file_for_worker() {
  local worker_kind="$1"
  local worker_index="$2"
  local file_name="dynamo_${worker_kind}_${SLURM_NODEID}_${worker_index}.log"
  printf '%s\n' "${RESULTS_DIR}/${file_name}"
}
266+
# Send one small chat-completion request to the frontend and print the raw
# response body.
# Globals read: dynamo_args ("model", "url")
# Outputs:      whatever curl writes to stdout
# Returns:      the exit status of curl
_probe_frontend_once() {
  local model="${dynamo_args["model"]}"
  local escaped="" ch i

  # The model id is embedded in a JSON string literal, so '"' and '\' must
  # be backslash-escaped or the payload is not valid JSON. Done character
  # by character to avoid the version-dependent backslash handling of
  # ${var//pat/rep}.
  for (( i = 0; i < ${#model}; i++ )); do
    ch="${model:i:1}"
    case "${ch}" in
      '"' | '\') escaped+="\\${ch}" ;;
      *) escaped+="${ch}" ;;
    esac
  done

  local json='{
    "model": "'"${escaped}"'",
    "messages": [{"role": "user", "content": "The color of sky is"}],
    "stream": false,
    "max_tokens": 10
  }'
  curl -s -X POST "${dynamo_args["url"]}/v1/chat/completions" -H "Content-Type: application/json" -d "${json}"
}
276+
225277function wait_for_dynamo_frontend()
226278{
227- local num_prefill_workers=$(( dynamo_args["num - prefill - nodes"] * dynamo_args["prefill - workers - per - node"] ) )
228- local num_decode_workers=$(( dynamo_args["num - decode - nodes"] * dynamo_args["decode - workers - per - node"] ) )
279+ local num_prefill_workers=$( _total_workers_prefill )
280+ local num_decode_workers=$( _total_workers_decode )
229281
230282 while [[ 1 ]]; do
231- num_initialized_prefill=$(
232- grep ${dynamo_args["prefill-initialized-regex"]} $RESULTS_DIR /* prefill* -il 2> /dev/null | wc -l)
233- num_initialized_decode=$(
234- grep ${dynamo_args["decode-initialized-regex"]} $RESULTS_DIR /* decode* -il 2> /dev/null | wc -l)
283+ num_initialized_prefill=$( _count_initialized_prefill)
284+ num_initialized_decode=$( _count_initialized_decode)
235285
236286 if [[ $num_initialized_prefill == $num_prefill_workers ]] && \
237287 [[ $num_initialized_decode == $num_decode_workers ]]; then
@@ -261,15 +311,8 @@ function launch_genai_perf()
261311{
262312 wait_for_dynamo_frontend
263313
264- JSON_PAYLOAD=' {
265- "model": "' ${dynamo_args["model"]} ' ",
266- "messages": [{"role": "user", "content": "The color of sky is"}],
267- "stream": false,
268- "max_tokens": 10
269- }'
270-
271- RESPONSE=$( curl -s -X POST ${dynamo_args["url"]} /v1/chat/completions -H " Content-Type: application/json" -d " $JSON_PAYLOAD " )
272- echo " Response: $RESPONSE "
314+ local resp=$( _probe_frontend_once)
315+ echo " Response: $resp "
273316
274317 local genai_perf_arguments=$( array_to_args genai_perf_args)
275318 log " Launching genai-perf with args: $genai_perf_arguments ${genai_perf_args["--extra-args"]} "
@@ -286,10 +329,8 @@ function launch_prefill()
286329 local workers_per_node=${dynamo_args["prefill-workers-per-node"]}
287330
288331 for i in $( seq 0 $(( $workers_per_node - 1 )) ) ; do
289- local start=$(( 1 + (i * dynamo_args["prefill- gpus- per- worker"]) ))
290- local end=$(( start + dynamo_args["prefill- gpus- per- worker"] - 1 ))
291- local gpu_list=$( echo $CUDA_VISIBLE_DEVICES | cut -d' ,' -f${start} -${end} )
292- local log_file=${RESULTS_DIR} /dynamo_prefill_${SLURM_NODEID} _${i} .log
332+ local gpu_list=$( _gpu_list_for_worker " ${dynamo_args["prefill-gpus-per-worker"]} " " $i " )
333+ local log_file=$( _log_file_for_worker " prefill" " $i " )
293334
294335 log " Launching prefill worker $i on GPUs $gpu_list "
295336 CUDA_VISIBLE_DEVICES=$gpu_list \
@@ -305,10 +346,8 @@ function launch_decode()
305346 local workers_per_node=${dynamo_args["decode-workers-per-node"]}
306347
307348 for i in $( seq 0 $(( $workers_per_node - 1 )) ) ; do
308- local start=$(( 1 + (i * dynamo_args["decode- gpus- per- worker"]) ))
309- local end=$(( start + dynamo_args["decode- gpus- per- worker"] - 1 ))
310- local gpu_list=$( echo $CUDA_VISIBLE_DEVICES | cut -d' ,' -f${start} -${end} )
311- local log_file=${RESULTS_DIR} /dynamo_decode_${SLURM_NODEID} _${i} .log
349+ local gpu_list=$( _gpu_list_for_worker " ${dynamo_args["decode-gpus-per-worker"]} " " $i " )
350+ local log_file=$( _log_file_for_worker " decode" " $i " )
312351
313352 log " Launching decode worker $i on GPUs $gpu_list "
314353 CUDA_VISIBLE_DEVICES=$gpu_list \
@@ -325,43 +364,64 @@ function log_node_role()
325364 echo " ${node_name} ,${role} " >> " $roles_file "
326365}
327366
328- function main()
329- {
# Print this node's name: SLURMD_NODENAME when it is set and non-empty,
# otherwise the output of hostname.
# Globals read: SLURMD_NODENAME
_current_node_name() {
  local node_name="${SLURMD_NODENAME:-}"
  if [[ -z "${node_name}" ]]; then
    node_name="$(hostname)"
  fi
  echo "${node_name}"
}
370+
# Succeed when the current node is the configured frontend node.
# Globals read: dynamo_args ("frontend-node")
# Returns:      0 when the node name equals an entry of the configured
#               value, 1 otherwise (including when the node name is empty)
_is_frontend_node() {
  local name
  name="$(_current_node_name)"
  # An empty name would otherwise match any list.
  [[ -n "${name}" ]] || return 1

  # Compare whole comma-delimited entries, not substrings, so "node1" does
  # not match "node10". Whitespace is folded into commas first.
  local nodes="${dynamo_args["frontend-node"]}"
  nodes="${nodes//[[:space:]]/,}"
  [[ ",${nodes}," == *",${name},"* ]]
}
375+
# Succeed when the current node appears in the decode node list.
# Globals read: dynamo_args ("decode-nodelist", comma-separated)
# Returns:      0 when the node name equals an entry of the list, 1
#               otherwise (including when the node name is empty)
_is_decode_node() {
  local name
  name="$(_current_node_name)"
  # An empty name would otherwise match any list.
  [[ -n "${name}" ]] || return 1

  # Compare whole comma-delimited entries, not substrings, so "node1" does
  # not match "node10". Whitespace is folded into commas first.
  local nodes="${dynamo_args["decode-nodelist"]}"
  nodes="${nodes//[[:space:]]/,}"
  [[ ",${nodes}," == *",${name},"* ]]
}
380+
# Succeed when the current node appears in the prefill node list.
# Globals read: dynamo_args ("prefill-nodelist", comma-separated)
# Returns:      0 when the node name equals an entry of the list, 1
#               otherwise (including when the node name is empty)
_is_prefill_node() {
  local name
  name="$(_current_node_name)"
  # An empty name would otherwise match any list.
  [[ -n "${name}" ]] || return 1

  # Compare whole comma-delimited entries, not substrings, so "node1" does
  # not match "node10". Whitespace is folded into commas first.
  local nodes="${dynamo_args["prefill-nodelist"]}"
  nodes="${nodes//[[:space:]]/,}"
  [[ ",${nodes}," == *",${name},"* ]]
}
385+
# Prepare the process environment used by the launch steps.
# Globals read:     dynamo_args, HUGGINGFACE_HOME, RESULTS_DIR, DONE_MARKER
# Globals exported: HF_HOME, NATS_SERVER, ETCD_ENDPOINTS, UCX_LOG_FILE
# Globals written:  DONE_MARKER (prefixed with RESULTS_DIR)
# Note: DONE_MARKER is rewritten relative to its previous value, so calling
# this function a second time prefixes RESULTS_DIR again.
_init_runtime_env() {
  local frontend_host="${dynamo_args["frontend-node"]}"

  HF_HOME="${HUGGINGFACE_HOME}"
  NATS_SERVER="nats://${frontend_host}:${dynamo_args["nats-port"]}"
  ETCD_ENDPOINTS="http://${frontend_host}:${dynamo_args["etcd-port"]}"
  UCX_LOG_FILE="${RESULTS_DIR}/ucx_log_%h.log"
  export HF_HOME NATS_SERVER ETCD_ENDPOINTS UCX_LOG_FILE

  DONE_MARKER="${RESULTS_DIR}/${DONE_MARKER}"
}
393+
394+ function main()
395+ {
396+ _init_runtime_env
336397
337398 launch_node_setup_cmd
338399
339400 cd ${dynamo_args["workspace-path"]}
340401
341- if [[ " ${dynamo_args["frontend-node"]} " == * " $SLURMD_NODENAME " * ]] ; then
402+ if _is_frontend_node ; then
342403 log " Node ID: $SLURM_NODEID , Role: frontend"
343- log_node_role " $SLURMD_NODENAME " " frontend"
404+ log_node_role " $( _current_node_name ) " " frontend"
344405 launch_etcd &
345406 launch_nats &
346407 wait_for_etcd
347408 launch_ingress &
348409 fi
349410
350- if [[ " ${dynamo_args["decode-nodelist"]} " == * " $SLURMD_NODENAME " * ]] ; then
411+ if _is_decode_node ; then
351412 log " Node ID: $SLURM_NODEID , Role: decode"
352- log_node_role " $SLURMD_NODENAME " " decode"
413+ log_node_role " $( _current_node_name ) " " decode"
353414 launch_decode &
354415 fi
355416
356- if [[ " ${dynamo_args["prefill-nodelist"]} " == * " $SLURMD_NODENAME " * ]] ; then
417+ if _is_prefill_node ; then
357418 log " Node ID: $SLURM_NODEID , Role: prefill"
358- log_node_role " $SLURMD_NODENAME " " prefill"
419+ log_node_role " $( _current_node_name ) " " prefill"
359420 launch_prefill &
360421 fi
361422
362- if [[ " ${dynamo_args["frontend-node"]} " == * " $SLURMD_NODENAME " * ]] ; then
423+ if _is_frontend_node ; then
363424 launch_genai_perf
364-
365425 touch " $DONE_MARKER "
366426 fi
367427
0 commit comments