@@ -116,28 +116,41 @@ def _wrap_launcher_for_job_id_and_quiet_output(self, launcher_cmd: str) -> str:
116116
117117 script_lines = [
118118 "#!/usr/bin/env bash" ,
119- "set -euo pipefail" ,
119+ "set -uo pipefail" ,
120120 "" ,
121121 f'export NEMORUN_HOME="{ output_dir } "' ,
122122 'mkdir -p "$NEMORUN_HOME"' ,
123123 f'LOG="{ log_path } "' ,
124+ f'WRAPPER_STDOUT="{ output_dir / "cloudai_megatron_bridge_wrapper.stdout" } "' ,
125+ f'WRAPPER_STDERR="{ output_dir / "cloudai_megatron_bridge_wrapper.stderr" } "' ,
126+ # Mirror wrapper stdout/stderr to files for debugging while still emitting to the parent process.
127+ 'exec > >(tee -a "$WRAPPER_STDOUT") 2> >(tee -a "$WRAPPER_STDERR" >&2)' ,
124128 "" ,
125129 # Launch Megatron-Bridge (log stdout/stderr to file)
126130 "" ,
127131 ': >"$LOG"' ,
128- f'{ launcher_cmd } >>"$LOG" 2>&1' ,
132+ "LAUNCH_RC=0" ,
133+ f'{ launcher_cmd } >>"$LOG" 2>&1 || LAUNCH_RC=$?' ,
129134 "" ,
130- # Parse job id from Megatron-Bridge output (format: 'Job id: <num>' )
135+ # Parse job id from Megatron-Bridge output (multiple possible formats )
131136 "" ,
132137 'JOB_ID=""' ,
133- 'JOB_ID=$(grep -Eio "Job id[: ]+[0-9]+" "$LOG" | tail -n1 | grep -Eo "[0-9]+" | tail -n1 || true)' ,
138+ 'JOB_ID=$(grep -Eio "(^|[^a-zA-Z])Job id[: ]+[0-9]+" "$LOG" | '
139+ 'tail -n1 | grep -Eo "[0-9]+" | tail -n1 || true)' ,
134140 "" ,
135141 # Emit a canonical line for CloudAI to parse
136142 "" ,
137143 'if [ -n "${JOB_ID}" ]; then' ,
144+ ' if [ "${LAUNCH_RC}" -ne 0 ]; then' ,
145+ ' echo "Megatron-Bridge launcher exited non-zero (${LAUNCH_RC}) after submitting job ${JOB_ID}." >&2' ,
146+ ' tail -n 200 "$LOG" >&2 || true' ,
147+ " fi" ,
138148 ' echo "Submitted batch job ${JOB_ID}"' ,
139149 "else" ,
140150 ' echo "Failed to retrieve job ID." >&2' ,
151+ ' if [ "${LAUNCH_RC}" -ne 0 ]; then' ,
152+ ' echo "Launcher exit code: ${LAUNCH_RC}" >&2' ,
153+ " fi" ,
141154 ' tail -n 200 "$LOG" >&2 || true' ,
142155 " exit 1" ,
143156 "fi" ,
0 commit comments