-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinit_cluster.sh
More file actions
93 lines (77 loc) · 3.07 KB
/
init_cluster.sh
File metadata and controls
93 lines (77 loc) · 3.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/bin/bash
# 1. Get the node names
# The allocation is identified by SLURM_JOB_ID and its node list is read from
# the job record via scontrol. The script is written to be sourced (it exports
# variables and uses `return`), but every fatal path below also works when the
# file is executed directly.
if [[ -z "${SLURM_JOB_ID:-}" ]]; then
  echo "Error: Not running in a SLURM job." >&2
  echo "Please allocate nodes first: salloc --nodes=1 --ntasks-per-node=1 --gres=gpu:4" >&2
  # `return` is only legal in a sourced file; when executed it fails and the
  # script would otherwise keep going, so fall back to `exit`.
  return 1 2>/dev/null || exit 1
fi

# Extract NodeList from the job: use the first line containing " NodeList="
# (the leading space rejects field names that merely end in "NodeList") and
# keep the value up to the next space. 10 is the length of " NodeList=".
nodelist=$(scontrol show job "$SLURM_JOB_ID" \
  | awk 'match($0, / NodeList=[^ ]+/) { print substr($0, RSTART + 10, RLENGTH - 10); exit }')

# A literal "(null)" is not a usable node list; treat it as "nothing found".
if [[ "$nodelist" == "(null)" ]]; then
  nodelist=""
fi

# Fall back to the node list SLURM exports into the job environment, if any.
nodelist=${nodelist:-${SLURM_JOB_NODELIST:-}}

if [[ -z "$nodelist" ]]; then
  echo "Error: Could not detect node allocation from job $SLURM_JOB_ID" >&2
  return 1 2>/dev/null || exit 1
fi
echo "Detected node allocation: $nodelist"

# Expand the compact node list into one hostname per line.
nodes=$(scontrol show hostnames "$nodelist")
if [[ -z "$nodes" ]]; then
  echo "Error: Could not expand node list '$nodelist' into hostnames" >&2
  return 1 2>/dev/null || exit 1
fi

# One array element per line; avoids unquoted word splitting/globbing.
mapfile -t nodes_array <<<"$nodes"
num_nodes=${#nodes_array[@]}
echo "Number of nodes: $num_nodes"
echo "Nodes: ${nodes_array[*]}"
# Detect number of GPUs per node from the SLURM allocation.
# Sources, in priority order:
#   1. SLURM_GPUS_ON_NODE   - GPU count on the current node
#   2. SLURM_GPUS_PER_NODE  - requested GPUs per node; may carry a type prefix
#                             ("a100:4") or be a comma-separated list
#   3. CUDA_VISIBLE_DEVICES - comma-separated device IDs, which are counted
#   4. scontrol show job    - per-node GPU GRES parsed from the job record
#   5. a hard-coded default of 4

#######################################
# Sum the counts in a "[type:]count[,[type:]count...]" GPU spec.
# Arguments: $1 - spec string, e.g. "4", "a100:4", "a100:2,v100:1"
# Outputs:   total count on stdout
# Returns:   0 on success; 1 if the spec is empty or any item does not end
#            in a non-negative integer
#######################################
_sum_gpu_spec() {
  local spec=$1 item count total=0
  local -a items=()

  IFS=',' read -r -a items <<<"$spec"
  (( ${#items[@]} > 0 )) || return 1

  for item in "${items[@]}"; do
    count=${item##*:}                      # drop any "type:" prefix
    [[ "$count" =~ ^[0-9]+$ ]] || return 1
    total=$(( total + 10#$count ))         # 10# avoids octal on leading zeros
  done
  printf '%s\n' "$total"
}

#######################################
# Print the number of GPUs per node, trying each source listed above.
# Globals:   SLURM_GPUS_ON_NODE, SLURM_GPUS_PER_NODE, CUDA_VISIBLE_DEVICES,
#            SLURM_JOB_ID (all read, all optional)
# Outputs:   the GPU count on stdout; a warning on stderr when defaulting
# Returns:   0 always
#######################################
_detect_gpus_per_node() {
  local count
  local -a device_ids=()

  if count=$(_sum_gpu_spec "${SLURM_GPUS_ON_NODE:-}") \
    || count=$(_sum_gpu_spec "${SLURM_GPUS_PER_NODE:-}"); then
    printf '%s\n' "$count"
    return 0
  fi

  if [[ -n "${CUDA_VISIBLE_DEVICES:-}" ]]; then
    IFS=',' read -r -a device_ids <<<"$CUDA_VISIBLE_DEVICES"
    printf '%s\n' "${#device_ids[@]}"
    return 0
  fi

  # Default fallback - query from scontrol. The field carrying the per-node
  # GPU request is matched in several spellings (GRES=gpu:N, Gres=gpu:type:N,
  # TresPerNode=[gres:|gres/]gpu[:type]:N).
  # NOTE(review): exact field names depend on the Slurm version - confirm
  # against the target cluster's `scontrol show job` output.
  if [[ -n "${SLURM_JOB_ID:-}" ]] && command -v scontrol >/dev/null 2>&1; then
    count=$(scontrol show job "$SLURM_JOB_ID" 2>/dev/null \
      | grep -oiP '(TresPerNode=(gres[:/])?|GRES=)gpu(:[^:,()\s]+)?:\K\d+' \
      | head -n 1)
    if [[ -n "$count" ]]; then
      printf '%s\n' "$count"
      return 0
    fi
  fi

  # Ultimate fallback
  echo "Warning: Could not detect GPU count, defaulting to 4" >&2
  printf '4\n'
}

num_gpus=$(_detect_gpus_per_node)
echo "GPUs per node: $num_gpus"
# The first node of the allocation acts as the Ray head.
head_node=${nodes_array[0]}
export VLLM_HEAD_NODE=$head_node
echo "Head Node: $head_node"

# 2. Get the IP address of the head node.
# This is the first address `hostname --ip-address` reports on that node.
# NOTE(review): nothing here selects a specific (e.g. high-speed) interface -
# confirm that the hostname resolves to the intended network.
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address | awk '{print $1}')
if [[ -z "$head_node_ip" ]]; then
  echo "Error: Could not determine IP address of head node '$head_node'" >&2
  # `return` when sourced, `exit` when executed.
  return 1 2>/dev/null || exit 1
fi
port=6379

# `ray start --head` below runs in the current shell, i.e. on whatever host
# this script runs on, so that host is expected to be the head node.
if [[ "$(hostname -s 2>/dev/null)" != "${head_node%%.*}" ]]; then
  echo "Warning: this host does not appear to be the head node ($head_node); the Ray head is started locally" >&2
fi

echo "Starting Ray Head on $head_node ($head_node_ip) with $num_gpus GPUs..."

# 3. Start Ray on the HEAD node (current node)
# We export VLLM_HOST_IP so vLLM binds to the correct network interface
export VLLM_HOST_IP=$head_node_ip
ray start --head --node-ip-address="$head_node_ip" --port="$port" --num-gpus="$num_gpus" --block &
ray_head_pid=$!
sleep 10 # Give it a moment to initialize

# With --block the process stays in the foreground of its job for as long as
# Ray runs, so a missing process at this point means startup failed.
if ! kill -0 "$ray_head_pid" 2>/dev/null; then
  echo "Error: Ray head process exited during startup" >&2
  return 1 2>/dev/null || exit 1
fi
# 4. Start Ray on WORKER nodes (if any)

#######################################
# Launch one blocking Ray worker per non-head node, each in the background.
# Globals:   nodes_array (read; index 0 is the head and is skipped),
#            num_nodes, head_node_ip, port, num_gpus (read)
# Outputs:   progress messages on stdout
# Returns:   0
#######################################
_start_ray_workers() {
  local i worker_node
  local worker_count=$(( num_nodes - 1 ))

  if (( num_nodes > 1 )); then
    echo "Starting Ray Workers on ${worker_count} worker node(s)..."
    for (( i = 1; i < num_nodes; i++ )); do
      worker_node=${nodes_array[i]}
      echo " Starting worker on $worker_node..."
      # The remote script is a fixed single-quoted string; the head address
      # and GPU count are passed as positional arguments ($1, $2) instead of
      # being spliced into the command text. VLLM_HOST_IP is computed on the
      # worker itself from its own hostname.
      srun --nodes=1 --ntasks=1 -w "$worker_node" \
        bash -c 'export VLLM_HOST_IP=$(hostname --ip-address | awk "{print \$1}") && ray start --address="$1" --num-gpus="$2" --block' \
        _ "${head_node_ip}:${port}" "$num_gpus" &
      sleep 2
    done
    sleep 5
  else
    echo "Single node setup - no worker nodes to start"
  fi
}

_start_ray_workers
#######################################
# Print a human-readable summary of the cluster that was just set up.
# Globals:   head_node, head_node_ip, num_nodes, num_gpus, nodes_array (read)
# Outputs:   the summary on stdout; the "Worker Nodes" line appears only
#            when there is more than one node
# Returns:   0
#######################################
_print_cluster_summary() {
  printf '\n'
  printf '%s\n' "========== Ray Cluster Summary =========="
  printf '%s\n' "Head Node: $head_node ($head_node_ip)"
  printf '%s\n' "Total Nodes: $num_nodes"
  printf '%s\n' "GPUs per Node: $num_gpus"
  # Arithmetic context treats an empty/unset num_nodes as 0 instead of
  # raising a test-syntax error.
  if (( num_nodes > 1 )); then
    # [*] joins the worker names into one space-separated string.
    printf '%s\n' "Worker Nodes: ${nodes_array[*]:1}"
  fi
  printf '%s\n' "========================================="
}

_print_cluster_summary