Skip to content

Commit a52fc51

Browse files
make job_id extraction robust
1 parent 28435e5 commit a52fc51

File tree

1 file changed

+17
-4
lines changed

1 file changed

+17
-4
lines changed

src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py

Lines changed: 17 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -116,28 +116,41 @@ def _wrap_launcher_for_job_id_and_quiet_output(self, launcher_cmd: str) -> str:
116116

117117
script_lines = [
118118
"#!/usr/bin/env bash",
119-
"set -euo pipefail",
119+
"set -uo pipefail",
120120
"",
121121
f'export NEMORUN_HOME="{output_dir}"',
122122
'mkdir -p "$NEMORUN_HOME"',
123123
f'LOG="{log_path}"',
124+
f'WRAPPER_STDOUT="{output_dir / "cloudai_megatron_bridge_wrapper.stdout"}"',
125+
f'WRAPPER_STDERR="{output_dir / "cloudai_megatron_bridge_wrapper.stderr"}"',
126+
# Mirror wrapper stdout/stderr to files for debugging while still emitting to the parent process.
127+
'exec > >(tee -a "$WRAPPER_STDOUT") 2> >(tee -a "$WRAPPER_STDERR" >&2)',
124128
"",
125129
# Launch Megatron-Bridge (log stdout/stderr to file)
126130
"",
127131
': >"$LOG"',
128-
f'{launcher_cmd} >>"$LOG" 2>&1',
132+
"LAUNCH_RC=0",
133+
f'{launcher_cmd} >>"$LOG" 2>&1 || LAUNCH_RC=$?',
129134
"",
130-
# Parse job id from Megatron-Bridge output (format: 'Job id: <num>')
135+
# Parse job id from Megatron-Bridge output (multiple possible formats)
131136
"",
132137
'JOB_ID=""',
133-
'JOB_ID=$(grep -Eio "Job id[: ]+[0-9]+" "$LOG" | tail -n1 | grep -Eo "[0-9]+" | tail -n1 || true)',
138+
'JOB_ID=$(grep -Eio "(^|[^a-zA-Z])Job id[: ]+[0-9]+" "$LOG" | '
139+
'tail -n1 | grep -Eo "[0-9]+" | tail -n1 || true)',
134140
"",
135141
# Emit a canonical line for CloudAI to parse
136142
"",
137143
'if [ -n "${JOB_ID}" ]; then',
144+
' if [ "${LAUNCH_RC}" -ne 0 ]; then',
145+
' echo "Megatron-Bridge launcher exited non-zero (${LAUNCH_RC}) after submitting job ${JOB_ID}." >&2',
146+
' tail -n 200 "$LOG" >&2 || true',
147+
" fi",
138148
' echo "Submitted batch job ${JOB_ID}"',
139149
"else",
140150
' echo "Failed to retrieve job ID." >&2',
151+
' if [ "${LAUNCH_RC}" -ne 0 ]; then',
152+
' echo "Launcher exit code: ${LAUNCH_RC}" >&2',
153+
" fi",
141154
' tail -n 200 "$LOG" >&2 || true',
142155
" exit 1",
143156
"fi",

0 commit comments

Comments
 (0)