Skip to content

Commit d7e0cf0

Browse files
committed
Refactor conditional checks to use [[ ]] instead of [ ] for improved readability and consistency
1 parent 8198ed4 commit d7e0cf0

1 file changed

Lines changed: 13 additions & 13 deletions

File tree

src/swiss_ai_model_launch/assets/script.sh

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ OCF_SERVICE_NAME="llm"
1010
OCF_SERVICE_PORT=8080
1111
OCF_BOOTSTRAP_ADDR="/ip4/148.187.108.178/tcp/43905/p2p/QmbUKJkCfotDzbFE5uoTsXD4GRyPHjzZC1f2yAGLoeBMn9"
1212

13-
if [ -n "$TELEMETRY_ENDPOINT" ]; then
13+
if [[ -n "$TELEMETRY_ENDPOINT" ]]; then
1414
curl -sf -X POST "$TELEMETRY_ENDPOINT" \
1515
-H "Content-Type: application/json" \
1616
-d '{"user": "'"${SLURM_JOB_USER}"'", "job_id": "'"${SLURM_JOB_ID}"'", "slurm_nodes": '"${SLURM_NNODES}"', "slurm_job_name": "'"${SLURM_JOB_NAME}"'", "slurm_partition": "'"${SLURM_JOB_PARTITION}"'", "slurm_time": "'"${SML_TIME}"'", "slurm_account": "'"${SLURM_JOB_ACCOUNT}"'", "slurm_environment": "'"${SML_ENVIRONMENT}"'", "interactive": false, "serving_framework": "'"${FRAMEWORK}"'", "framework_args": "'"${FRAMEWORK_ARGS}"'", "pre_launch_cmds": "'"${PRE_LAUNCH_CMDS}"'", "model_name": "'"${SERVED_MODEL_NAME}"'", "workers": '"${WORKERS}"', "nodes_per_worker": '"${NODES_PER_WORKER}"', "worker_port": '"${WORKER_PORT}"', "use_router": '"${USE_ROUTER}"', "router_environment": "'"${ROUTER_ENVIRONMENT}"'", "router_port": 30000, "router_args": "'"${ROUTER_ARGS}"'", "ocf_enabled": '"${USE_OCF}"', "ocf_bootstrap_addr": "'"${OCF_BOOTSTRAP_ADDR}"'", "ocf_service_name": "llm", "ocf_service_port": 8080}' || true
@@ -52,7 +52,7 @@ _sglang_setup() {
5252
_sglang_worker_cmd() {
5353
local local_rank=$1 worker_host_ip=$2
5454
local dist_args=""
55-
if [ "$NODES_PER_WORKER" -gt 1 ]; then
55+
if [[ "$NODES_PER_WORKER" -gt 1 ]]; then
5656
dist_args="--dist-init-addr ${worker_host_ip}:5757 --nnodes ${NODES_PER_WORKER} --node-rank ${local_rank}"
5757
fi
5858
FRAMEWORK_CMD="$FRAMEWORK_LAUNCH $dist_args $FRAMEWORK_ARGS"
@@ -69,18 +69,18 @@ _vllm_worker_cmd() {
6969
local local_rank=$1 worker_host_ip=$2
7070
local ray_port=6379 num_gpus=4
7171

72-
if [ "$NODES_PER_WORKER" -gt 1 ]; then
72+
if [[ "$NODES_PER_WORKER" -gt 1 ]]; then
7373
# For multi-node: only the head node runs the API server via Ray;
7474
# follower nodes join the Ray cluster and block.
75-
if [ "$local_rank" -eq 0 ]; then
75+
if [[ "$local_rank" -eq 0 ]]; then
7676
FRAMEWORK_CMD="ray start --head --port=${ray_port} --num-gpus=${num_gpus} --block &
7777
7878
echo 'Waiting for all Ray worker nodes to connect...'
7979
EXPECTED_GPUS=\$((${NODES_PER_WORKER} * ${num_gpus}))
8080
while true; do
8181
AVAILABLE_GPUS=\$(python3 -c 'import ray; ray.init(address=\"auto\"); print(int(ray.available_resources().get(\"GPU\", 0)))' 2>/dev/null || echo 0)
8282
echo \"Available GPUs: \${AVAILABLE_GPUS} / \${EXPECTED_GPUS}\"
83-
if [ \"\${AVAILABLE_GPUS}\" -ge \"\${EXPECTED_GPUS}\" ]; then
83+
if [[ \"\${AVAILABLE_GPUS}\" -ge \"\${EXPECTED_GPUS}\" ]]; then
8484
echo 'All Ray workers connected!'
8585
break
8686
fi
@@ -104,7 +104,7 @@ case "$FRAMEWORK" in
104104
esac
105105

106106
EXPECTED_NODES=$((WORKERS * NODES_PER_WORKER))
107-
if [ "$TOTAL_NODES" -ne "$EXPECTED_NODES" ]; then
107+
if [[ "$TOTAL_NODES" -ne "$EXPECTED_NODES" ]]; then
108108
echo "Warning: Total nodes ($TOTAL_NODES) doesn't match WORKERS($WORKERS) * NODES_PER_WORKER($NODES_PER_WORKER) = $EXPECTED_NODES"
109109
echo "Adjusting to use all available nodes with WORKERS workers"
110110
NODES_PER_WORKER=$((TOTAL_NODES / WORKERS))
@@ -118,7 +118,7 @@ for ((worker_id=0; worker_id<WORKERS; worker_id++)); do
118118
worker_host_node=${nodes[$start_node]}
119119
worker_host_ip=$(srun --nodes=1 --ntasks=1 -w "${worker_host_node}" hostname -i)
120120

121-
if [ -z "$worker_host_ip" ]; then
121+
if [[ -z "$worker_host_ip" ]]; then
122122
echo "Error: Could not retrieve IP address for worker $worker_id host ${worker_host_node}"
123123
exit 1
124124
fi
@@ -143,7 +143,7 @@ for ((worker_id=0; worker_id<WORKERS; worker_id++)); do
143143
vllm) _vllm_worker_cmd "$local_rank" "$worker_host_ip" ;;
144144
esac
145145

146-
if [ "$USE_OCF" = "true" ] && [ "$local_rank" -eq 0 ]; then
146+
if [[ "$USE_OCF" = "true" ]] && [[ "$local_rank" -eq 0 ]]; then
147147
FRAMEWORK_CMD="\$OCF_BIN start --bootstrap.addr \"$OCF_BOOTSTRAP_ADDR\" --service.name $OCF_SERVICE_NAME --service.port $OCF_SERVICE_PORT --subprocess \"$FRAMEWORK_CMD\""
148148
fi
149149

@@ -153,7 +153,7 @@ for ((worker_id=0; worker_id<WORKERS; worker_id++)); do
153153
bash --norc --noprofile -c "\
154154
set -ex
155155
$FRAMEWORK_ENV_SETUP
156-
if [ -n \"$PRE_LAUNCH_CMDS\" ]; then
156+
if [[ -n \"$PRE_LAUNCH_CMDS\" ]]; then
157157
echo \"Running pre-launch commands...\"
158158
eval \"$PRE_LAUNCH_CMDS\"
159159
fi
@@ -164,7 +164,7 @@ done
164164
# vmagent runs on the batch node rather than inside a container: pyxis containers
165165
# share the host network namespace, so the framework's API server is reachable
166166
# at localhost:8080 from here without any extra networking.
167-
if [ -n "$METRICS_REMOTE_WRITE_URL" ] && [ -x "$METRICS_AGENT_BIN" ]; then
167+
if [[ -n "$METRICS_REMOTE_WRITE_URL" ]] && [[ -x "$METRICS_AGENT_BIN" ]]; then
168168
"$METRICS_AGENT_BIN" \
169169
-promscrape.config=/capstor/store/cscs/swissai/infra01/ocf-share/vmagent-scrape.yaml \
170170
-remoteWrite.url="${METRICS_REMOTE_WRITE_URL}" \
@@ -174,11 +174,11 @@ if [ -n "$METRICS_REMOTE_WRITE_URL" ] && [ -x "$METRICS_AGENT_BIN" ]; then
174174
-remoteWrite.label="user=${SLURM_JOB_USER}" \
175175
"-remoteWrite.tmpDataPath=/tmp/vmagent-data-${SLURM_JOB_ID}" \
176176
> "/tmp/vmagent-${SLURM_JOB_ID}.log" 2>&1 &
177-
elif [ -n "$METRICS_REMOTE_WRITE_URL" ]; then
177+
elif [[ -n "$METRICS_REMOTE_WRITE_URL" ]]; then
178178
echo "metrics: $METRICS_AGENT_BIN not found, skipping push" >&2
179179
fi
180180

181-
if [ "$USE_ROUTER" = "true" ] && [ "$WORKERS" -gt 1 ]; then
181+
if [[ "$USE_ROUTER" = "true" ]] && [[ "$WORKERS" -gt 1 ]]; then
182182
router_host_node=${nodes[0]}
183183
router_host_ip=${worker_head_ips[0]}
184184
worker_urls_str="${worker_urls[*]}"
@@ -199,7 +199,7 @@ unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY all_proxy ALL_PROXY
199199
echo \"Waiting for all workers to fully initialize the GPU engine before starting router...\"
200200
for worker_ip in ${worker_head_ips[*]}; do
201201
echo \"Checking worker at \$worker_ip...\"
202-
while [ \"\$(curl --noproxy \"*\" -s -o /dev/null -w '%{http_code}' http://\${worker_ip}:${WORKER_PORT}/health)\" != \"200\" ]; do # NOSONAR
202+
while [[ \"\$(curl --noproxy \"*\" -s -o /dev/null -w '%{http_code}' http://\${worker_ip}:${WORKER_PORT}/health)\" != \"200\" ]]; do # NOSONAR
203203
sleep 10
204204
done
205205
echo \"Worker \$worker_ip is fully ready!\"

0 commit comments

Comments
 (0)