@@ -486,7 +486,10 @@ def srun_call(node_index: int, script: str, args: str, comment: str) -> str:
486486 # per-srun basis instead of mutating the user's env toml.
487487 f' --container-mounts="$RANKS_DIR:$RANKS_DIR" \\ \n '
488488 f' --environment="{ env } " \\ \n '
489- f' bash "$RANKS_DIR/{ script } " { args } &'
489+ f' bash "$RANKS_DIR/{ script } " { args } &\n '
490+ # Track this srun's PID so the footer's `wait -n` exits as soon
491+ # as the first critical bg job dies (and the trap kills the rest).
492+ f"critical_pids+=($!)"
490493 )
491494
492495 blocks = []
@@ -523,6 +526,9 @@ def _render_vmagent(launch_args: LaunchArgs) -> str:
523526 return (
524527 "# vmagent runs on the batch node; pyxis containers share the host network\n "
525528 "# namespace so the framework API server is reachable at localhost:8080.\n "
529+ "# vmagent is non-critical: disowned so it's not in `wait -n`'s scope, and\n "
530+ "# the EXIT trap in the footer kills it when master.sh terminates so the\n "
531+ "# allocation can be released as soon as the framework process is gone.\n "
526532 'if [[ -x "$metrics_agent_bin" ]]; then\n '
527533 ' "$metrics_agent_bin" \\ \n '
528534 f" -promscrape.config={ _VMAGENT_SCRAPE_CONFIG } \\ \n "
@@ -533,6 +539,8 @@ def _render_vmagent(launch_args: LaunchArgs) -> str:
533539 ' -remoteWrite.label="user=${SLURM_JOB_USER}" \\ \n '
534540 ' "-remoteWrite.tmpDataPath=/tmp/vmagent-data-${SLURM_JOB_ID}" \\ \n '
535541 ' > "/tmp/vmagent-${SLURM_JOB_ID}.log" 2>&1 &\n '
542+ " vmagent_pid=$!\n "
543+ ' disown "$vmagent_pid"\n '
536544 "else\n "
537545 ' echo "metrics: $metrics_agent_bin not found, skipping push" >&2\n '
538546 "fi"
@@ -555,6 +563,7 @@ def _render_router_launch(launch_args: LaunchArgs) -> str:
555563 f' --environment="{ launch_args .environment } " \\ \n '
556564 " --overlap \\ \n "
557565 f' bash "$RANKS_DIR/router.sh" { ip_args } &\n '
566+ "critical_pids+=($!)\n "
558567 "\n "
559568 "echo\n "
560569 f'echo "Router URL: http://$router_host_ip:{ SGLANG_ROUTER_PORT } " # NOSONAR'
@@ -571,8 +580,29 @@ def _render_footer() -> str:
571580 'echo "Make sure to cancel the job at the end:"\n '
572581 'echo "scancel $SLURM_JOB_ID"\n '
573582 "\n "
574- "wait\n "
575- 'echo "Script finished at $(date)"'
583+ # Tear down as soon as the first critical bg job (head / follower /
584+ # router srun) exits. A healthy launch keeps those running until the
585+ # SLURM time limit; any exit means the inference server is gone, so
586+ # vmagent has nothing to scrape and SLURM should release the nodes.
587+ # The previous `wait` with no args waited for *all* bg jobs including
588+ # vmagent — which never exits — so a failed head srun left the job
589+ # RUNNING until time-limit.
590+ "cleanup() {\n "
591+ ' if [[ -n "$vmagent_pid" ]]; then\n '
592+ ' kill "$vmagent_pid" 2>/dev/null || true\n '
593+ " fi\n "
594+ " if (( ${#critical_pids[@]} > 0 )); then\n "
595+ ' kill "${critical_pids[@]}" 2>/dev/null || true\n '
596+ " fi\n "
597+ "}\n "
598+ "trap cleanup EXIT\n "
599+ "trap 'exit 143' TERM\n "
600+ "trap 'exit 130' INT\n "
601+ "\n "
602+ "rc=0\n "
603+ "wait -n || rc=$?\n "
604+ 'echo "Master finished at $(date) with code $rc"\n '
605+ 'exit "$rc"'
576606 )
577607
578608
@@ -623,6 +653,13 @@ def render_master(launch_args: LaunchArgs) -> str:
623653 sections : list [str ] = [
624654 "# shellcheck shell=bash" ,
625655 "set -euo pipefail" ,
656+ # Lifecycle tracking. critical_pids collects the head / follower /
657+ # router srun PIDs; the footer's `wait -n` exits as soon as the first
658+ # one dies. vmagent_pid (if metrics are enabled) is held separately
659+ # so it stays out of `wait -n`'s scope but is still killed by the
660+ # EXIT trap. Initialised here so `set -u` is happy even when no
661+ # vmagent is rendered, or if cleanup runs before launches start.
662+ 'critical_pids=()\n vmagent_pid=""' ,
626663 _render_self_extracting_ranks (render_rank_scripts (launch_args )),
627664 ]
628665
0 commit comments