|
24 | 24 | # $SLURM_NODEID: |
25 | 25 | # nodes == 1 -> co-located: vllm serve on $SERVE_GPU, trainer on the rest of |
26 | 26 | # the local GPUs (original single-node behavior). |
27 | | -# nodes >= 2 -> split across nodes: node 0 runs vllm serve on all its GPUs, |
28 | | -# node 1 runs the trainer on all its GPUs. The two roles |
29 | | -# rendezvous through the shared /scratchspace mount (node 0 |
30 | | -# publishes its address; node 1 signals completion). For large |
31 | | -# models whose serve needs a whole node (e.g. Kimi-K2.5 TP=8), |
32 | | -# allocate exactly 2 nodes. |
| 27 | +# nodes == 2 -> split: node 0 runs vllm serve on all its GPUs, node 1 runs |
| 28 | +# the trainer on all its GPUs. Roles rendezvous through the |
| 29 | +# shared /scratchspace mount (node 0 publishes its serve |
| 30 | +# address; the trainer signals completion). |
| 31 | +# nodes >= 3 -> 1 serve node (node 0) + N trainer nodes (nodes 1..NNODES-1) |
| 32 | +# doing multi-node DDP. The head trainer (node 1, accelerate |
| 33 | +# machine_rank 0) publishes its IP for accelerate's c10d |
| 34 | +# rendezvous; all trainer nodes read both the serve address and |
| 35 | +# the head-trainer address from /scratchspace. NOTE: only global |
| 36 | +# rank 0 fetches hidden states from the single serve and |
| 37 | +# broadcasts to the rest (DataLoaderDispatcher), so the single |
| 38 | +# serve is the throughput ceiling — adding trainer nodes scales |
| 39 | +# effective batch / compute, not data-production throughput. |
33 | 40 | # |
34 | 41 | # Env vars (required): |
35 | 42 | # HF_MODEL_CKPT Target model path. Used by both vllm serve (as the |
|
56 | 63 | # TRAIN_GPUS single-node only: CUDA_VISIBLE_DEVICES for the trainer. |
57 | 64 | # default = all local GPUs except SERVE_GPU. |
58 | 65 | # SERVE_ADVERTISE_IP multi-node only: address the trainer node(s) should dial. default is |
59 | | -# node 0's first `hostname -I` IP. |
| 66 | +# node 0's routable IP (its resolved Slurm node name, else |
| 67 | +# its first non-loopback / non-link-local IP). |
60 | 68 | # |
61 | 69 | # All script args are forwarded to launch_train.sh (typically: --config <yaml> |
62 | 70 | # plus OmegaConf dotlist overrides). |
@@ -112,7 +120,7 @@ export PATH=$PATH:/workspace/.local/bin |
112 | 120 |
|
113 | 121 | ################################################################################################### |
114 | 122 |
|
115 | | -trap 'error_handler $0 $LINENO' ERR # ERROR HANDLER |
| 123 | +trap 'error_handler $0 $LINENO' ERR |
116 | 124 |
|
117 | 125 | if [ -z "$HF_MODEL_CKPT" ]; then |
118 | 126 | echo "ERROR: HF_MODEL_CKPT must be set." >&2; exit 1 |
@@ -154,11 +162,9 @@ launch_vllm() { |
154 | 162 | # would expose *zero* GPUs (not all), so leave it unset to use the whole node. |
155 | 163 | local -a gpu_env=() |
156 | 164 | [ -n "$cvd" ] && gpu_env=(env "CUDA_VISIBLE_DEVICES=$cvd") |
157 | | - # Optional single-value memory knobs (each a space-free env value, so they |
158 | | - # survive nemo_run's unquoted `export FOO=value`; assembled into --flag value |
159 | | - # pairs here). --cpu-offload-gb spills N GB of weights/GPU to host RAM, the |
160 | | - # key lever for fitting a large model on too-few GPUs (slower, prefill-only |
161 | | - # use tolerates it). --max-model-len / --max-num-seqs trim KV/activation. |
| 165 | + # Optional single-value memory knobs (see header), assembled into --flag |
| 166 | + # value pairs. Each is a space-free env value so it survives nemo_run's |
| 167 | + # unquoted `export FOO=value`. |
162 | 168 | local -a opt_args=() |
163 | 169 | [ -n "${SERVE_CPU_OFFLOAD_GB:-}" ] && opt_args+=(--cpu-offload-gb "$SERVE_CPU_OFFLOAD_GB") |
164 | 170 | [ -n "${SERVE_MAX_MODEL_LEN:-}" ] && opt_args+=(--max-model-len "$SERVE_MAX_MODEL_LEN") |
@@ -222,28 +228,52 @@ wait_vllm_ready() { |
222 | 228 | # per process; multiple workers would duplicate requests against the server. |
223 | 229 | run_trainer_and_export() { |
224 | 230 | local url="$1" cvd="$2" |
225 | | - echo "Launching trainer (server=${url}, CUDA_VISIBLE_DEVICES=${cvd:-all})..." |
| 231 | + # Optional multi-node trainer routing (see dispatch section). Defaults keep |
| 232 | + # the original single-trainer-node behavior: no --num_nodes, export on rank 0. |
| 233 | + local num_tnodes="${3:-1}" head_ip="${4:-}" mrank="${5:-0}" |
| 234 | + echo "Launching trainer (server=${url}, CUDA_VISIBLE_DEVICES=${cvd:-all}, trainer_nodes=${num_tnodes}, machine_rank=${mrank})..." |
226 | 235 | # Empty cvd -> use all GPUs on the node (don't set the var; "" would hide all). |
227 | 236 | local -a gpu_env=() |
228 | 237 | [ -n "$cvd" ] && gpu_env=(env "CUDA_VISIBLE_DEVICES=$cvd") |
| 238 | + # Engage accelerate multi-node routing only when >1 trainer node; a single |
| 239 | + # trainer node keeps the original invocation (no --num_nodes) verbatim. |
| 240 | + local -a mn_args=() |
| 241 | + if [ "${num_tnodes}" -gt 1 ]; then |
| 242 | + mn_args=(--num_nodes "$num_tnodes" --head_node_ip "$head_ip" --machine_rank "$mrank") |
| 243 | + fi |
229 | 244 | "${gpu_env[@]}" bash modules/Model-Optimizer/examples/speculative_decoding/launch_train.sh \ |
230 | 245 | "${SCRIPT_ARGS[@]}" \ |
| 246 | + "${mn_args[@]}" \ |
231 | 247 | data.streaming_server_url="$url" \ |
232 | 248 | data.streaming_model_name="$HF_MODEL_CKPT" \ |
233 | 249 | data.streaming_shared_storage_path="$SERVE_SCRATCH" \ |
234 | 250 | training.dataloader_num_workers=0 || { echo "ERROR: trainer failed." >&2; return 1; } |
235 | 251 |
|
| 252 | + # Export only on the head trainer (machine_rank 0); non-head trainer nodes |
| 253 | + # would race writing the same export dir. The export reads the saved |
| 254 | + # checkpoint (training.output_dir), not the serve, so it is serve-independent. |
| 255 | + if [ "${mrank}" -ne 0 ]; then |
| 256 | + echo "machine_rank=${mrank}: training done, skipping export (head trainer handles it)." |
| 257 | + return 0 |
| 258 | + fi |
| 259 | + |
| 260 | + # Export the trained draft to HF format. Derive the checkpoint dir from the |
| 261 | + # forwarded `training.output_dir=` dotlist (defaulting to the EAGLE |
| 262 | + # convention) so EAGLE and DFlash runs each export their own output_dir. |
| 263 | + # EXPORT_EXTRA_ARGS lets DFlash on a custom-modeling base (e.g. Kimi) pass |
| 264 | + # --trust_remote_code; empty by default so EAGLE behavior is unchanged. |
| 265 | + local out_dir |
| 266 | + out_dir=$(printf '%s\n' "${SCRIPT_ARGS[@]}" | sed -n 's/^training\.output_dir=//p' | tail -1) |
| 267 | + out_dir="${out_dir:-/scratchspace/eagle3}" |
236 | 268 | python3 modules/Model-Optimizer/examples/speculative_decoding/scripts/export_hf_checkpoint.py \ |
237 | | - --model_path /scratchspace/eagle3 \ |
238 | | - --export_path /scratchspace/export |
| 269 | + --model_path "$out_dir" \ |
| 270 | + --export_path "${EXPORT_PATH:-/scratchspace/export}" \ |
| 271 | + ${EXPORT_EXTRA_ARGS:-} |
239 | 272 | } |
240 | 273 |
|
241 | 274 | # --------------------------------------------------------------------------- |
242 | | -# Topology dispatch (driven by the Slurm allocation, i.e. the yaml `nodes:`): |
243 | | -# SLURM_NNODES == 1 -> co-located: vllm on $SERVE_GPU, trainer on the rest. |
244 | | -# SLURM_NNODES >= 2 -> split: node 0 serves on all its GPUs, node 1 trains on |
245 | | -# all its GPUs; they rendezvous via /scratchspace. |
246 | | -# nemo_run runs this script once per node, so we branch on $SLURM_NODEID. |
| 275 | +# Topology dispatch (see header): nemo_run runs this script once per node, so |
| 276 | +# branch on $SLURM_NNODES / $SLURM_NODEID. Per-branch detail in section heads. |
247 | 277 | # --------------------------------------------------------------------------- |
248 | 278 | NNODES="${SLURM_NNODES:-1}" |
249 | 279 | NODEID="${SLURM_NODEID:-0}" |
@@ -299,27 +329,55 @@ elif [ "$NODEID" -eq 0 ]; then |
299 | 329 | while [ ! -f "$DONE_FILE" ]; do sleep 10; done |
300 | 330 | echo "Training-done sentinel seen; serve node exiting (EXIT trap stops vllm)." |
301 | 331 |
|
302 | | -elif [ "$NODEID" -eq 1 ]; then |
303 | | - # ---------------------- multi-node: trainer node ----------------------- |
304 | | - # Release the serve node on any exit (success or failure) so it doesn't hang. |
305 | | - trap 'touch "$DONE_FILE" 2>/dev/null || true' EXIT |
| 332 | +elif [ "$NODEID" -ge 1 ]; then |
| 333 | + # -------------------- multi-node: trainer node(s) ---------------------- |
| 334 | + # Node 0 is the vllm serve; trainer nodes are SLURM nodes 1..NNODES-1, which |
| 335 | + # map to 0-based accelerate machine ranks (head trainer = SLURM node 1). |
| 336 | + NUM_TRAINER_NODES=$(( NNODES - 1 )) |
| 337 | + TRAINER_RANK=$(( NODEID - 1 )) |
| 338 | + TRAINER_ADDR_FILE="/scratchspace/.trainer_addr" |
| 339 | + |
| 340 | + # Only the head trainer (rank 0) signals the serve node to release on exit; |
| 341 | + # a non-head node exiting first must NOT tear the serve down early. |
| 342 | + if [ "$TRAINER_RANK" -eq 0 ]; then |
| 343 | + trap 'touch "$DONE_FILE" 2>/dev/null || true' EXIT |
| 344 | + rm -f "$TRAINER_ADDR_FILE" # clear stale rendezvous state |
| 345 | + fi |
306 | 346 |
|
307 | | - echo "Trainer node waiting (up to ${SERVE_READY_TIMEOUT}s) for the serve address..." |
| 347 | + echo "Trainer node (rank ${TRAINER_RANK}/${NUM_TRAINER_NODES}) waiting for the serve address..." |
308 | 348 | for ((i = 0; i < SERVE_READY_TIMEOUT; i++)); do |
309 | 349 | [ -f "$SERVE_ADDR_FILE" ] && break |
310 | 350 | sleep 1 |
311 | 351 | done |
312 | 352 | [ -f "$SERVE_ADDR_FILE" ] || { echo "ERROR: serve node never published its address." >&2; exit 1; } |
313 | 353 | URL="http://$(cat "$SERVE_ADDR_FILE"):${SERVE_PORT}" |
314 | | - |
315 | 354 | wait_vllm_ready "$URL" || exit 1 |
316 | | - run_trainer_and_export "$URL" "" || exit 1 |
317 | 355 |
|
318 | | -else |
319 | | - # ------------- multi-node: extra nodes (unused by default) ------------- |
320 | | - echo "Node rank ${NODEID} idle: the default split uses node 0 = vllm serve, node 1 = trainer." |
321 | | - echo "Multi-node *training* (>1 trainer node) is not wired up yet; allocate exactly 2 nodes." |
322 | | - while [ ! -f "$DONE_FILE" ]; do sleep 10; done |
| 356 | + if [ "$NUM_TRAINER_NODES" -le 1 ]; then |
| 357 | + # Original 1-serve + 1-trainer topology: single-node DDP, unchanged. |
| 358 | + run_trainer_and_export "$URL" "" || exit 1 |
| 359 | + else |
| 360 | + # >1 trainer node: head (rank 0) publishes its routable IP for accelerate's |
| 361 | + # c10d rendezvous (port 29500); all trainer nodes read it and join. Mirror |
| 362 | + # the serve node's IP-resolution logic (avoid link-local / loopback). |
| 363 | + if [ "$TRAINER_RANK" -eq 0 ]; then |
| 364 | + head_addr="${TRAINER_ADVERTISE_IP:-}" |
| 365 | + [ -z "$head_addr" ] && head_addr=$(getent hosts "${SLURMD_NODENAME:-$(hostname)}" 2>/dev/null | awk '{print $1}' | head -1) |
| 366 | + [ -z "$head_addr" ] && head_addr=$(hostname -I | tr ' ' '\n' | grep -vE '^(127\.|169\.254\.|fe80:|::1)' | head -1) |
| 367 | + [ -z "$head_addr" ] && head_addr=$(hostname -I | awk '{print $1}') |
| 368 | + echo "$head_addr" > "$TRAINER_ADDR_FILE" |
| 369 | + echo "Head trainer (rank 0) published ${head_addr} for c10d rendezvous." |
| 370 | + else |
| 371 | + echo "Trainer rank ${TRAINER_RANK} waiting for head-trainer address..." |
| 372 | + for ((i = 0; i < SERVE_READY_TIMEOUT; i++)); do |
| 373 | + [ -f "$TRAINER_ADDR_FILE" ] && break |
| 374 | + sleep 1 |
| 375 | + done |
| 376 | + [ -f "$TRAINER_ADDR_FILE" ] || { echo "ERROR: head trainer never published its address." >&2; exit 1; } |
| 377 | + fi |
| 378 | + HEAD_IP=$(cat "$TRAINER_ADDR_FILE") |
| 379 | + run_trainer_and_export "$URL" "" "$NUM_TRAINER_NODES" "$HEAD_IP" "$TRAINER_RANK" || exit 1 |
| 380 | + fi |
323 | 381 | fi |
324 | 382 |
|
325 | 383 | ################################################################################################### |
|
0 commit comments