@@ -10,7 +10,7 @@ OCF_SERVICE_NAME="llm"
1010OCF_SERVICE_PORT=8080
1111OCF_BOOTSTRAP_ADDR=" /ip4/148.187.108.178/tcp/43905/p2p/QmbUKJkCfotDzbFE5uoTsXD4GRyPHjzZC1f2yAGLoeBMn9"
1212
13- if [ -n " $TELEMETRY_ENDPOINT " ]; then
13+ if [[ -n " $TELEMETRY_ENDPOINT " ] ]; then
1414 curl -sf -X POST " $TELEMETRY_ENDPOINT " \
1515 -H " Content-Type: application/json" \
1616 -d ' {"user": "' " ${SLURM_JOB_USER} " ' ", "job_id": "' " ${SLURM_JOB_ID} " ' ", "slurm_nodes": ' " ${SLURM_NNODES} " ' , "slurm_job_name": "' " ${SLURM_JOB_NAME} " ' ", "slurm_partition": "' " ${SLURM_JOB_PARTITION} " ' ", "slurm_time": "' " ${SML_TIME} " ' ", "slurm_account": "' " ${SLURM_JOB_ACCOUNT} " ' ", "slurm_environment": "' " ${SML_ENVIRONMENT} " ' ", "interactive": false, "serving_framework": "' " ${FRAMEWORK} " ' ", "framework_args": "' " ${FRAMEWORK_ARGS} " ' ", "pre_launch_cmds": "' " ${PRE_LAUNCH_CMDS} " ' ", "model_name": "' " ${SERVED_MODEL_NAME} " ' ", "workers": ' " ${WORKERS} " ' , "nodes_per_worker": ' " ${NODES_PER_WORKER} " ' , "worker_port": ' " ${WORKER_PORT} " ' , "use_router": ' " ${USE_ROUTER} " ' , "router_environment": "' " ${ROUTER_ENVIRONMENT} " ' ", "router_port": 30000, "router_args": "' " ${ROUTER_ARGS} " ' ", "ocf_enabled": ' " ${USE_OCF} " ' , "ocf_bootstrap_addr": "' " ${OCF_BOOTSTRAP_ADDR} " ' ", "ocf_service_name": "llm", "ocf_service_port": 8080}' || true
@@ -52,7 +52,7 @@ _sglang_setup() {
5252_sglang_worker_cmd () {
5353 local local_rank=$1 worker_host_ip=$2
5454 local dist_args=" "
55- if [ " $NODES_PER_WORKER " -gt 1 ]; then
55+ if [[ " $NODES_PER_WORKER " -gt 1 ] ]; then
5656 dist_args=" --dist-init-addr ${worker_host_ip} :5757 --nnodes ${NODES_PER_WORKER} --node-rank ${local_rank} "
5757 fi
5858 FRAMEWORK_CMD=" $FRAMEWORK_LAUNCH $dist_args $FRAMEWORK_ARGS "
@@ -69,18 +69,18 @@ _vllm_worker_cmd() {
6969 local local_rank=$1 worker_host_ip=$2
7070 local ray_port=6379 num_gpus=4
7171
72- if [ " $NODES_PER_WORKER " -gt 1 ]; then
72+ if [[ " $NODES_PER_WORKER " -gt 1 ] ]; then
7373 # For multi-node: only the head node runs the API server via Ray;
7474 # follower nodes join the Ray cluster and block.
75- if [ " $local_rank " -eq 0 ]; then
75+ if [[ " $local_rank " -eq 0 ] ]; then
7676 FRAMEWORK_CMD=" ray start --head --port=${ray_port} --num-gpus=${num_gpus} --block &
7777
7878echo 'Waiting for all Ray worker nodes to connect...'
7979EXPECTED_GPUS=\$ ((${NODES_PER_WORKER} * ${num_gpus} ))
8080while true; do
8181 AVAILABLE_GPUS=\$ (python3 -c 'import ray; ray.init(address=\" auto\" ); print(int(ray.available_resources().get(\" GPU\" , 0)))' 2>/dev/null || echo 0)
8282 echo \" Available GPUs: \$ {AVAILABLE_GPUS} / \$ {EXPECTED_GPUS}\"
83- if [ \"\$ {AVAILABLE_GPUS}\" -ge \"\$ {EXPECTED_GPUS}\" ]; then
83+ if [[ \"\$ {AVAILABLE_GPUS}\" -ge \"\$ {EXPECTED_GPUS}\" ] ]; then
8484 echo 'All Ray workers connected!'
8585 break
8686 fi
@@ -104,7 +104,7 @@ case "$FRAMEWORK" in
104104esac
105105
106106EXPECTED_NODES=$(( WORKERS * NODES_PER_WORKER))
107- if [ " $TOTAL_NODES " -ne " $EXPECTED_NODES " ]; then
107+ if [[ " $TOTAL_NODES " -ne " $EXPECTED_NODES " ] ]; then
108108 echo " Warning: Total nodes ($TOTAL_NODES ) doesn't match WORKERS($WORKERS ) * NODES_PER_WORKER($NODES_PER_WORKER ) = $EXPECTED_NODES "
109109 echo " Adjusting to use all available nodes with WORKERS workers"
110110 NODES_PER_WORKER=$(( TOTAL_NODES / WORKERS))
@@ -118,7 +118,7 @@ for ((worker_id=0; worker_id<WORKERS; worker_id++)); do
118118 worker_host_node=${nodes[$start_node]}
119119 worker_host_ip=$( srun --nodes=1 --ntasks=1 -w " ${worker_host_node} " hostname -i)
120120
121- if [ -z " $worker_host_ip " ]; then
121+ if [[ -z " $worker_host_ip " ] ]; then
122122 echo " Error: Could not retrieve IP address for worker $worker_id host ${worker_host_node} "
123123 exit 1
124124 fi
@@ -143,7 +143,7 @@ for ((worker_id=0; worker_id<WORKERS; worker_id++)); do
143143 vllm) _vllm_worker_cmd " $local_rank " " $worker_host_ip " ;;
144144 esac
145145
146- if [ " $USE_OCF " = " true" ] && [ " $local_rank " -eq 0 ]; then
146+ if [[ " $USE_OCF " = " true" ]] && [[ " $local_rank " -eq 0 ] ]; then
147147 FRAMEWORK_CMD=" \$ OCF_BIN start --bootstrap.addr \" $OCF_BOOTSTRAP_ADDR \" --service.name $OCF_SERVICE_NAME --service.port $OCF_SERVICE_PORT --subprocess \" $FRAMEWORK_CMD \" "
148148 fi
149149
@@ -153,7 +153,7 @@ for ((worker_id=0; worker_id<WORKERS; worker_id++)); do
153153 bash --norc --noprofile -c " \
154154set -ex
155155$FRAMEWORK_ENV_SETUP
156- if [ -n \" $PRE_LAUNCH_CMDS \" ]; then
156+ if [[ -n \" $PRE_LAUNCH_CMDS \" ] ]; then
157157 echo \" Running pre-launch commands...\"
158158 eval \" $PRE_LAUNCH_CMDS \"
159159fi
164164# vmagent runs on the batch node rather than inside a container: pyxis containers
165165# share the host network namespace, so the framework's API server is reachable
166166# at localhost:8080 from here without any extra networking.
167- if [ -n " $METRICS_REMOTE_WRITE_URL " ] && [ -x " $METRICS_AGENT_BIN " ]; then
167+ if [[ -n " $METRICS_REMOTE_WRITE_URL " ]] && [[ -x " $METRICS_AGENT_BIN " ] ]; then
168168 " $METRICS_AGENT_BIN " \
169169 -promscrape.config=/capstor/store/cscs/swissai/infra01/ocf-share/vmagent-scrape.yaml \
170170 -remoteWrite.url=" ${METRICS_REMOTE_WRITE_URL} " \
@@ -174,11 +174,11 @@ if [ -n "$METRICS_REMOTE_WRITE_URL" ] && [ -x "$METRICS_AGENT_BIN" ]; then
174174 -remoteWrite.label=" user=${SLURM_JOB_USER} " \
175175 " -remoteWrite.tmpDataPath=/tmp/vmagent-data-${SLURM_JOB_ID} " \
176176 > " /tmp/vmagent-${SLURM_JOB_ID} .log" 2>&1 &
177- elif [ -n " $METRICS_REMOTE_WRITE_URL " ]; then
177+ elif [[ -n " $METRICS_REMOTE_WRITE_URL " ] ]; then
178178 echo " metrics: $METRICS_AGENT_BIN not found, skipping push" >&2
179179fi
180180
181- if [ " $USE_ROUTER " = " true" ] && [ " $WORKERS " -gt 1 ]; then
181+ if [[ " $USE_ROUTER " = " true" ]] && [[ " $WORKERS " -gt 1 ] ]; then
182182 router_host_node=${nodes[0]}
183183 router_host_ip=${worker_head_ips[0]}
184184 worker_urls_str=" ${worker_urls[*]} "
@@ -199,7 +199,7 @@ unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY all_proxy ALL_PROXY
199199echo \" Waiting for all workers to fully initialize the GPU engine before starting router...\"
200200for worker_ip in ${worker_head_ips[*]} ; do
201201 echo \" Checking worker at \$ worker_ip...\"
202- while [ \"\$ (curl --noproxy \" *\" -s -o /dev/null -w '%{http_code}' http://\$ {worker_ip}:${WORKER_PORT} /health)\" != \" 200\" ]; do # NOSONAR
202+ while [[ \"\$ (curl --noproxy \" *\" -s -o /dev/null -w '%{http_code}' http://\$ {worker_ip}:${WORKER_PORT} /health)\" != \" 200\" ] ]; do # NOSONAR
203203 sleep 10
204204 done
205205 echo \" Worker \$ worker_ip is fully ready!\"
0 commit comments