Skip to content

Commit 72bbe27

Browse files
committed
Job Termination on Error
1 parent d4d41a3 commit 72bbe27

1 file changed

Lines changed: 40 additions & 3 deletions

File tree

src/swiss_ai_model_launch/launchers/framework.py

Lines changed: 40 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -486,7 +486,10 @@ def srun_call(node_index: int, script: str, args: str, comment: str) -> str:
486486
# per-srun basis instead of mutating the user's env toml.
487487
f' --container-mounts="$RANKS_DIR:$RANKS_DIR" \\\n'
488488
f' --environment="{env}" \\\n'
489-
f' bash "$RANKS_DIR/{script}" {args} &'
489+
f' bash "$RANKS_DIR/{script}" {args} &\n'
490+
# Record this srun's PID so the footer's EXIT trap can kill the other
491+
# critical bg jobs once `wait -n` returns for the first one that exits.
492+
f"critical_pids+=($!)"
490493
)
491494

492495
blocks = []
@@ -523,6 +526,9 @@ def _render_vmagent(launch_args: LaunchArgs) -> str:
523526
return (
524527
"# vmagent runs on the batch node; pyxis containers share the host network\n"
525528
"# namespace so the framework API server is reachable at localhost:8080.\n"
529+
"# vmagent is non-critical: disowned so it's not in `wait -n`'s scope, and\n"
530+
"# the EXIT trap in the footer kills it when master.sh terminates so the\n"
531+
"# allocation can be released as soon as the framework process is gone.\n"
526532
'if [[ -x "$metrics_agent_bin" ]]; then\n'
527533
' "$metrics_agent_bin" \\\n'
528534
f" -promscrape.config={_VMAGENT_SCRAPE_CONFIG} \\\n"
@@ -533,6 +539,8 @@ def _render_vmagent(launch_args: LaunchArgs) -> str:
533539
' -remoteWrite.label="user=${SLURM_JOB_USER}" \\\n'
534540
' "-remoteWrite.tmpDataPath=/tmp/vmagent-data-${SLURM_JOB_ID}" \\\n'
535541
' > "/tmp/vmagent-${SLURM_JOB_ID}.log" 2>&1 &\n'
542+
" vmagent_pid=$!\n"
543+
' disown "$vmagent_pid"\n'
536544
"else\n"
537545
' echo "metrics: $metrics_agent_bin not found, skipping push" >&2\n'
538546
"fi"
@@ -555,6 +563,7 @@ def _render_router_launch(launch_args: LaunchArgs) -> str:
555563
f' --environment="{launch_args.environment}" \\\n'
556564
" --overlap \\\n"
557565
f' bash "$RANKS_DIR/router.sh" {ip_args} &\n'
566+
"critical_pids+=($!)\n"
558567
"\n"
559568
"echo\n"
560569
f'echo "Router URL: http://$router_host_ip:{SGLANG_ROUTER_PORT}" # NOSONAR'
@@ -571,8 +580,29 @@ def _render_footer() -> str:
571580
'echo "Make sure to cancel the job at the end:"\n'
572581
'echo "scancel $SLURM_JOB_ID"\n'
573582
"\n"
574-
"wait\n"
575-
'echo "Script finished at $(date)"'
583+
# Tear down as soon as the first critical bg job (head / follower /
584+
# router srun) exits. A healthy launch keeps those running until the
585+
# SLURM time limit; any exit means the inference server is gone, so
586+
# vmagent has nothing to scrape and SLURM should release the nodes.
587+
# The previous `wait` with no args waited for *all* bg jobs including
588+
# vmagent — which never exits — so a failed head srun left the job
589+
# RUNNING until time-limit.
590+
"cleanup() {\n"
591+
' if [[ -n "$vmagent_pid" ]]; then\n'
592+
' kill "$vmagent_pid" 2>/dev/null || true\n'
593+
" fi\n"
594+
" if (( ${#critical_pids[@]} > 0 )); then\n"
595+
' kill "${critical_pids[@]}" 2>/dev/null || true\n'
596+
" fi\n"
597+
"}\n"
598+
"trap cleanup EXIT\n"
599+
"trap 'exit 143' TERM\n"
600+
"trap 'exit 130' INT\n"
601+
"\n"
602+
"rc=0\n"
603+
"wait -n || rc=$?\n"
604+
'echo "Master finished at $(date) with code $rc"\n'
605+
'exit "$rc"'
576606
)
577607

578608

@@ -623,6 +653,13 @@ def render_master(launch_args: LaunchArgs) -> str:
623653
sections: list[str] = [
624654
"# shellcheck shell=bash",
625655
"set -euo pipefail",
656+
# Lifecycle tracking. critical_pids collects the head / follower /
657+
# router srun PIDs; the footer's `wait -n` exits as soon as the first
658+
# one dies. vmagent_pid (set only if vmagent was started) is kept apart:
659+
# vmagent is disowned so `wait -n` ignores it, yet it is killed by the
660+
# EXIT trap. Initialised here so `set -u` is happy even when no
661+
# vmagent is rendered, or if cleanup runs before launches start.
662+
'critical_pids=()\nvmagent_pid=""',
626663
_render_self_extracting_ranks(render_rank_scripts(launch_args)),
627664
]
628665

0 commit comments

Comments
 (0)