Skip to content

Commit 500b209

Browse files
authored
Clean up src/cloudai/workloads/ai_dynamo/ai_dynamo.sh (#639)
1 parent e981b2c commit 500b209

File tree

1 file changed

+100
-40
lines changed

1 file changed

+100
-40
lines changed

src/cloudai/workloads/ai_dynamo/ai_dynamo.sh

Lines changed: 100 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,7 @@ function log()
5656
echo "[$(date --iso-8601=ns) $(hostname)]: $@"
5757
}
5858

59-
function parse_args()
60-
{
59+
_parse_cli_pairs() {
6160
log "Parsing args:"
6261
while [[ $# -ge 2 ]]; do
6362
echo " $1 $2"
@@ -78,8 +77,9 @@ function parse_args()
7877
esac
7978
shift; shift;
8079
done
80+
}
8181

82-
# Patch Dynamo args
82+
_patch_dynamo_args() {
8383
if [[ -z "${dynamo_args["decode-nodelist"]}" ]]; then
8484
dynamo_args["decode-nodelist"]=$(echo $DYNAMO_NODELIST | cut -d',' -f1-${dynamo_args["num-decode-nodes"]})
8585
fi
@@ -93,19 +93,20 @@ function parse_args()
9393
fi
9494

9595
dynamo_args["url"]="http://${dynamo_args["frontend-node"]}:${dynamo_args["port"]}"
96+
}
9697

97-
# Patch Prefill/Decode args
98+
# Copy values already resolved in dynamo_args into the per-section argument maps.
# Globals read:    dynamo_args (model, url, endpoint), RESULTS_DIR,
#                  GENAI_PERF_ARTIFACT_DIR, GENAI_PERF_PROFILE_EXPORT_FILE
# Globals written: prefill_args, decode_args, genai_perf_args
_patch_section_args() {
  local model_name="${dynamo_args["model"]}"

  # Prefill and decode workers are pointed at the same model.
  prefill_args["--model"]="${model_name}"
  decode_args["--model"]="${model_name}"

  # GenAI-Perf needs the model plus where to send its requests ...
  genai_perf_args["--model"]="${model_name}"
  genai_perf_args["--url"]="${dynamo_args["url"]}"
  genai_perf_args["--endpoint"]="${dynamo_args["endpoint"]}"

  # ... and where to write its artifacts and exported profile.
  genai_perf_args["--artifact-dir"]="${RESULTS_DIR}/${GENAI_PERF_ARTIFACT_DIR}/"
  genai_perf_args["--profile-export-file"]="${GENAI_PERF_PROFILE_EXPORT_FILE}"
}
107108

108-
# Worker GPU allocation logic
109+
_compute_worker_allocation() {
109110
local tp_arg_name="--${dynamo_args["tp-arg-name"]}"
110111
local pp_arg_name="--${dynamo_args["pp-arg-name"]}"
111112

@@ -141,13 +142,24 @@ function parse_args()
141142
if [[ -n "${decode_args["--num-nodes"]}" ]]; then
142143
dynamo_args["num-decode-nodes"]=${decode_args["--num-nodes"]}
143144
fi
145+
}
144146

147+
# Log the current contents of the four argument maps, one log line per map.
# Each line has the form "<Label> args: key: value; key: value; ".
# Globals read: dynamo_args, prefill_args, decode_args, genai_perf_args
_dump_args() {
  local key rendered

  rendered=""
  for key in "${!dynamo_args[@]}"; do
    rendered+="$key: ${dynamo_args[$key]}; "
  done
  log "Dynamo args: ${rendered}"

  rendered=""
  for key in "${!prefill_args[@]}"; do
    rendered+="$key: ${prefill_args[$key]}; "
  done
  log "Prefill args: ${rendered}"

  rendered=""
  for key in "${!decode_args[@]}"; do
    rendered+="$key: ${decode_args[$key]}; "
  done
  log "Decode args: ${rendered}"

  rendered=""
  for key in "${!genai_perf_args[@]}"; do
    rendered+="$key: ${genai_perf_args[$key]}; "
  done
  log "GenAI perf args: ${rendered}"
}
150153

154+
# Entry point for argument handling: consume the command-line pairs, then run
# the post-processing stages in a fixed order and finally log the result.
# Arguments: "$@" - forwarded unchanged to _parse_cli_pairs
parse_args() {
  _parse_cli_pairs "$@"

  # The remaining stages take no arguments; their order matches the original
  # sequence of calls and must not be changed.
  local __parse_args_stage
  for __parse_args_stage in \
    _patch_dynamo_args \
    _patch_section_args \
    _compute_worker_allocation \
    _dump_args; do
    "${__parse_args_stage}"
  done
}
162+
151163
function array_to_args()
152164
{
153165
local -n arr=$1
@@ -222,16 +234,54 @@ function exit_on_error()
222234
fi
223235
}
224236

237+
# Print the total number of prefill workers expected across the job:
# dynamo_args["num-prefill-nodes"] * dynamo_args["prefill-workers-per-node"].
# Outputs: the product on stdout.
_total_workers_prefill() {
  local total=$(( dynamo_args["num-prefill-nodes"] * dynamo_args["prefill-workers-per-node"] ))
  printf '%s\n' "${total}"
}
240+
241+
# Print the total number of decode workers expected across the job:
# dynamo_args["num-decode-nodes"] * dynamo_args["decode-workers-per-node"].
# Outputs: the product on stdout.
_total_workers_decode() {
  local total=$(( dynamo_args["num-decode-nodes"] * dynamo_args["decode-workers-per-node"] ))
  printf '%s\n' "${total}"
}
244+
245+
# Count the prefill worker logs that report successful initialization.
# Globals read: dynamo_args["prefill-initialized-regex"], RESULTS_DIR
# Outputs: the number of files matching ${RESULTS_DIR}/*prefill* that contain
#          at least one case-insensitive match of the regex; 0 when no file
#          matches or when no such file exists.
_count_initialized_prefill() {
  # The regex is passed quoted through -e so that a pattern containing
  # whitespace, glob characters or a leading '-' reaches grep as one pattern
  # instead of being split into a pattern plus spurious file operands.
  # grep's stderr is discarded on purpose: this is polled before the log
  # files exist, and "no such file" simply means nothing is initialized yet.
  grep -il -e "${dynamo_args["prefill-initialized-regex"]}" -- \
    "${RESULTS_DIR}"/*prefill* 2> /dev/null | wc -l
}
248+
249+
# Count the decode worker logs that report successful initialization.
# Globals read: dynamo_args["decode-initialized-regex"], RESULTS_DIR
# Outputs: the number of files matching ${RESULTS_DIR}/*decode* that contain
#          at least one case-insensitive match of the regex; 0 when no file
#          matches or when no such file exists.
_count_initialized_decode() {
  # The regex is passed quoted through -e so that a pattern containing
  # whitespace, glob characters or a leading '-' reaches grep as one pattern
  # instead of being split into a pattern plus spurious file operands.
  # grep's stderr is discarded on purpose: this is polled before the log
  # files exist, and "no such file" simply means nothing is initialized yet.
  grep -il -e "${dynamo_args["decode-initialized-regex"]}" -- \
    "${RESULTS_DIR}"/*decode* 2> /dev/null | wc -l
}
252+
253+
# Print the slice of CUDA_VISIBLE_DEVICES assigned to one worker.
# Arguments:
#   $1 - number of GPUs per worker
#   $2 - zero-based worker index on this node
# Globals read: CUDA_VISIBLE_DEVICES (comma-separated device list)
# Outputs: comma-separated fields [idx*per_worker + 1, (idx+1)*per_worker]
#          of CUDA_VISIBLE_DEVICES, as selected by cut(1).
_gpu_list_for_worker() {
  local per_worker=$1
  local idx=$2
  # cut(1) fields are 1-based, hence the "1 +".
  local start=$(( 1 + (idx * per_worker) ))
  local end=$(( start + per_worker - 1 ))
  # Feed the list to cut verbatim. An unquoted expansion would be word-split
  # under a non-default IFS (e.g. IFS=,), destroying the delimiters cut needs.
  cut -d',' -f"${start}-${end}" <<< "${CUDA_VISIBLE_DEVICES}"
}
260+
261+
# Print the log file path for one worker on this node.
# Arguments:
#   $1 - worker role, used verbatim in the file name
#   $2 - worker index on this node
# Globals read: RESULTS_DIR, SLURM_NODEID
# Outputs: ${RESULTS_DIR}/dynamo_<role>_<SLURM_NODEID>_<idx>.log
_log_file_for_worker() {
  local worker_role="$1" worker_idx="$2"
  local file_name="dynamo_${worker_role}_${SLURM_NODEID}_${worker_idx}.log"
  printf '%s\n' "${RESULTS_DIR}/${file_name}"
}
266+
267+
# Send a single small chat-completion request to the frontend.
# Globals read: dynamo_args["model"], dynamo_args["url"]
# Outputs: whatever curl prints for the response body (curl runs with -s, so
#          progress output and error messages are suppressed).
_probe_frontend_once() {
  local endpoint="${dynamo_args["url"]}/v1/chat/completions"

  # Assemble the request body; the model name is the only variable part.
  local payload
  printf -v payload '{\n"model": "%s",\n"messages": [{"role": "user", "content": "The color of sky is"}],\n"stream": false,\n"max_tokens": 10\n}' \
    "${dynamo_args["model"]}"

  curl -s -X POST "${endpoint}" -H "Content-Type: application/json" -d "${payload}"
}
276+
225277
function wait_for_dynamo_frontend()
226278
{
227-
local num_prefill_workers=$(( dynamo_args["num-prefill-nodes"] * dynamo_args["prefill-workers-per-node"] ))
228-
local num_decode_workers=$(( dynamo_args["num-decode-nodes"] * dynamo_args["decode-workers-per-node"] ))
279+
local num_prefill_workers=$(_total_workers_prefill)
280+
local num_decode_workers=$(_total_workers_decode)
229281

230282
while [[ 1 ]]; do
231-
num_initialized_prefill=$(
232-
grep ${dynamo_args["prefill-initialized-regex"]} $RESULTS_DIR/*prefill* -il 2> /dev/null |wc -l)
233-
num_initialized_decode=$(
234-
grep ${dynamo_args["decode-initialized-regex"]} $RESULTS_DIR/*decode* -il 2> /dev/null |wc -l)
283+
num_initialized_prefill=$(_count_initialized_prefill)
284+
num_initialized_decode=$(_count_initialized_decode)
235285

236286
if [[ $num_initialized_prefill == $num_prefill_workers ]] && \
237287
[[ $num_initialized_decode == $num_decode_workers ]]; then
@@ -261,15 +311,8 @@ function launch_genai_perf()
261311
{
262312
wait_for_dynamo_frontend
263313

264-
JSON_PAYLOAD='{
265-
"model": "'${dynamo_args["model"]}'",
266-
"messages": [{"role": "user", "content": "The color of sky is"}],
267-
"stream": false,
268-
"max_tokens": 10
269-
}'
270-
271-
RESPONSE=$(curl -s -X POST ${dynamo_args["url"]}/v1/chat/completions -H "Content-Type: application/json" -d "$JSON_PAYLOAD")
272-
echo "Response: $RESPONSE"
314+
local resp=$(_probe_frontend_once)
315+
echo "Response: $resp"
273316

274317
local genai_perf_arguments=$(array_to_args genai_perf_args)
275318
log "Launching genai-perf with args: $genai_perf_arguments ${genai_perf_args["--extra-args"]}"
@@ -286,10 +329,8 @@ function launch_prefill()
286329
local workers_per_node=${dynamo_args["prefill-workers-per-node"]}
287330

288331
for i in $(seq 0 $(( $workers_per_node - 1 ))); do
289-
local start=$(( 1 + (i * dynamo_args["prefill-gpus-per-worker"]) ))
290-
local end=$(( start + dynamo_args["prefill-gpus-per-worker"] - 1 ))
291-
local gpu_list=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f${start}-${end})
292-
local log_file=${RESULTS_DIR}/dynamo_prefill_${SLURM_NODEID}_${i}.log
332+
local gpu_list=$(_gpu_list_for_worker "${dynamo_args["prefill-gpus-per-worker"]}" "$i")
333+
local log_file=$(_log_file_for_worker "prefill" "$i")
293334

294335
log "Launching prefill worker $i on GPUs $gpu_list"
295336
CUDA_VISIBLE_DEVICES=$gpu_list \
@@ -305,10 +346,8 @@ function launch_decode()
305346
local workers_per_node=${dynamo_args["decode-workers-per-node"]}
306347

307348
for i in $(seq 0 $(( $workers_per_node - 1 ))); do
308-
local start=$(( 1 + (i * dynamo_args["decode-gpus-per-worker"]) ))
309-
local end=$(( start + dynamo_args["decode-gpus-per-worker"] - 1 ))
310-
local gpu_list=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f${start}-${end})
311-
local log_file=${RESULTS_DIR}/dynamo_decode_${SLURM_NODEID}_${i}.log
349+
local gpu_list=$(_gpu_list_for_worker "${dynamo_args["decode-gpus-per-worker"]}" "$i")
350+
local log_file=$(_log_file_for_worker "decode" "$i")
312351

313352
log "Launching decode worker $i on GPUs $gpu_list"
314353
CUDA_VISIBLE_DEVICES=$gpu_list \
@@ -325,43 +364,64 @@ function log_node_role()
325364
echo "${node_name},${role}" >> "$roles_file"
326365
}
327366

328-
function main()
329-
{
367+
# Print the name of the node this script is running on.
# Globals read: SLURMD_NODENAME
# Outputs: SLURMD_NODENAME when it is set and non-empty, otherwise the output
#          of hostname.
_current_node_name() {
  local node="${SLURMD_NODENAME:-}"
  if [[ -z "${node}" ]]; then
    node="$(hostname)"
  fi
  printf '%s\n' "${node}"
}
370+
371+
# Succeed when the current node is the frontend node.
# Globals read: dynamo_args["frontend-node"] (treated as a comma-separated list)
# Returns: 0 if the name printed by _current_node_name equals one of the
#          entries, or is the host part of an entry written as "<name>.<domain>";
#          1 otherwise, including when the current node name is empty.
_is_frontend_node() {
  local name
  name="$(_current_node_name)"
  # An empty name would match every list, so treat it as "not a member".
  [[ -n "${name}" ]] || return 1

  # Wrap the list in commas (ignoring stray whitespace) so entries can be
  # matched whole: a plain substring test lets "node1" match "node10".
  local members=",${dynamo_args["frontend-node"]//[[:space:]]/},"
  [[ "${members}" == *",${name},"* || "${members}" == *",${name}."* ]]
}
375+
376+
# Succeed when the current node is listed as a decode node.
# Globals read: dynamo_args["decode-nodelist"] (comma-separated node names)
# Returns: 0 if the name printed by _current_node_name equals one of the
#          entries, or is the host part of an entry written as "<name>.<domain>";
#          1 otherwise, including when the current node name is empty.
_is_decode_node() {
  local name
  name="$(_current_node_name)"
  # An empty name would match every list, so treat it as "not a member".
  [[ -n "${name}" ]] || return 1

  # Wrap the list in commas (ignoring stray whitespace) so entries can be
  # matched whole: a plain substring test lets "node1" match "node10".
  local members=",${dynamo_args["decode-nodelist"]//[[:space:]]/},"
  [[ "${members}" == *",${name},"* || "${members}" == *",${name}."* ]]
}
380+
381+
# Succeed when the current node is listed as a prefill node.
# Globals read: dynamo_args["prefill-nodelist"] (comma-separated node names)
# Returns: 0 if the name printed by _current_node_name equals one of the
#          entries, or is the host part of an entry written as "<name>.<domain>";
#          1 otherwise, including when the current node name is empty.
_is_prefill_node() {
  local name
  name="$(_current_node_name)"
  # An empty name would match every list, so treat it as "not a member".
  [[ -n "${name}" ]] || return 1

  # Wrap the list in commas (ignoring stray whitespace) so entries can be
  # matched whole: a plain substring test lets "node1" match "node10".
  local members=",${dynamo_args["prefill-nodelist"]//[[:space:]]/},"
  [[ "${members}" == *",${name},"* || "${members}" == *",${name}."* ]]
}
385+
386+
# Prepare the environment for the launched services.
# Globals read:     HUGGINGFACE_HOME, RESULTS_DIR, DONE_MARKER,
#                   dynamo_args (frontend-node, nats-port, etcd-port)
# Globals exported: HF_HOME, NATS_SERVER, ETCD_ENDPOINTS, UCX_LOG_FILE
# Globals written:  DONE_MARKER (prefixed with RESULTS_DIR)
_init_runtime_env() {
  local frontend_host="${dynamo_args["frontend-node"]}"

  HF_HOME="${HUGGINGFACE_HOME}"
  # NATS and etcd endpoints both use the frontend node as their host.
  NATS_SERVER="nats://${frontend_host}:${dynamo_args["nats-port"]}"
  ETCD_ENDPOINTS="http://${frontend_host}:${dynamo_args["etcd-port"]}"
  UCX_LOG_FILE="${RESULTS_DIR}/ucx_log_%h.log"
  export HF_HOME NATS_SERVER ETCD_ENDPOINTS UCX_LOG_FILE

  # Turn the marker name into a path under RESULTS_DIR. The variable is
  # rewritten in place, so a second call would prepend RESULTS_DIR again.
  DONE_MARKER="${RESULTS_DIR}/${DONE_MARKER}"
}
393+
394+
function main()
395+
{
396+
_init_runtime_env
336397

337398
launch_node_setup_cmd
338399

339400
cd ${dynamo_args["workspace-path"]}
340401

341-
if [[ "${dynamo_args["frontend-node"]}" == *"$SLURMD_NODENAME"* ]]; then
402+
if _is_frontend_node; then
342403
log "Node ID: $SLURM_NODEID, Role: frontend"
343-
log_node_role "$SLURMD_NODENAME" "frontend"
404+
log_node_role "$(_current_node_name)" "frontend"
344405
launch_etcd &
345406
launch_nats &
346407
wait_for_etcd
347408
launch_ingress &
348409
fi
349410

350-
if [[ "${dynamo_args["decode-nodelist"]}" == *"$SLURMD_NODENAME"* ]]; then
411+
if _is_decode_node; then
351412
log "Node ID: $SLURM_NODEID, Role: decode"
352-
log_node_role "$SLURMD_NODENAME" "decode"
413+
log_node_role "$(_current_node_name)" "decode"
353414
launch_decode &
354415
fi
355416

356-
if [[ "${dynamo_args["prefill-nodelist"]}" == *"$SLURMD_NODENAME"* ]]; then
417+
if _is_prefill_node; then
357418
log "Node ID: $SLURM_NODEID, Role: prefill"
358-
log_node_role "$SLURMD_NODENAME" "prefill"
419+
log_node_role "$(_current_node_name)" "prefill"
359420
launch_prefill &
360421
fi
361422

362-
if [[ "${dynamo_args["frontend-node"]}" == *"$SLURMD_NODENAME"* ]]; then
423+
if _is_frontend_node; then
363424
launch_genai_perf
364-
365425
touch "$DONE_MARKER"
366426
fi
367427

0 commit comments

Comments
 (0)