Skip to content

Commit 12046aa

Browse files
yonromaiclaude
andcommitted
Preserve underlying exception across executor step-failure wrap
Marin's StepRunner used to wrap every pipeline failure into a generic `RuntimeError("N step(s) failed")` at the end of its orchestration loop. That wrap threw away the concrete exception type and pushed the real traceback two levels deep — in #5026 the outer log said only `RuntimeError: 1 step(s) failed`, burying the actual `LeaseLostError` that triage needed to see. When exactly one step fails, re-raise the per-step `RuntimeError("Step failed: <name>")` directly; its `__cause__` is already the original exception, so the real type and traceback now surface at the top of the log. With multiple independent failures we keep the summary wrap (callers need the count) but still chain via `from failures[0]` as before. Tests updated: the executor and step-runner tests previously asserted on the old `"1 step(s) failed"` message — they now match on the per-step message directly and keep validating that the original exception is preserved via `__cause__`. Added a new test for the multi-failure summary path. Refs #5026 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 5624419 commit 12046aa

3 files changed

Lines changed: 53 additions & 8 deletions

File tree

lib/marin/src/marin/execution/step_runner.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,12 @@ def _do_launch(step: StepSpec) -> None:
232232
_flush_waiting()
233233

234234
if failures:
235+
# Preserve the original failing exception so triage can see its real
236+
# type and traceback instead of a generic `RuntimeError: N step(s)
237+
# failed` wrapper. See marin-community/marin#5026 for the incident
238+
# where a LeaseLostError was buried under this wrap.
239+
if len(failures) == 1:
240+
raise failures[0]
235241
raise RuntimeError(f"{len(failures)} step(s) failed") from failures[0]
236242

237243
def _launch_step(

tests/execution/test_executor.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,10 @@ def fn_pass(config: MyConfig | None):
234234

235235
with tempfile.TemporaryDirectory(prefix="executor-") as temp_dir:
236236
executor_initial = Executor(prefix=temp_dir, executor_info_base_path=temp_dir)
237-
with pytest.raises(RuntimeError, match=r"1 step\(s\) failed"):
237+
# Single-failure runs re-raise the per-step error directly so the
238+
# original exception type/traceback stays visible; see
239+
# marin-community/marin#5026.
240+
with pytest.raises(RuntimeError, match=r"Step failed: b_"):
238241
executor_initial.run(steps=[a])
239242

240243
with pytest.raises(FileNotFoundError):

tests/execution/test_step_runner.py

Lines changed: 43 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -398,7 +398,14 @@ def test_runner_raises_clear_error_for_unmet_deps(tmp_path: Path):
398398

399399

400400
def test_runner_preserves_underlying_step_exception(tmp_path: Path):
401-
"""The top-level runner error should retain the original failing exception as a cause."""
401+
"""The top-level runner error should retain the original failing exception as a cause.
402+
403+
With a single failed step, the runner re-raises the per-step RuntimeError
404+
directly rather than wrapping it in a generic "N step(s) failed" — the
405+
per-step error already has the original exception chained via __cause__,
406+
so the real exception type and traceback surface at the top of the report.
407+
See marin-community/marin#5026.
408+
"""
402409

403410
def failing_step(_output_path: str) -> None:
404411
raise ValueError("sentinel step failure")
@@ -410,14 +417,43 @@ def failing_step(_output_path: str) -> None:
410417
)
411418

412419
runner = StepRunner()
413-
with pytest.raises(RuntimeError, match=r"1 step\(s\) failed") as exc_info:
420+
with pytest.raises(RuntimeError, match=r"Step failed: failing_step") as exc_info:
414421
runner.run([step])
415422

416-
step_failure = exc_info.value.__cause__
417-
assert isinstance(step_failure, RuntimeError)
418-
assert "Step failed: failing_step" in str(step_failure)
419-
assert isinstance(step_failure.__cause__, ValueError)
420-
assert "sentinel step failure" in str(step_failure.__cause__)
423+
# Original ValueError is preserved as the __cause__ of the per-step error.
424+
assert isinstance(exc_info.value.__cause__, ValueError)
425+
assert "sentinel step failure" in str(exc_info.value.__cause__)
426+
427+
428+
def test_runner_summarizes_multiple_step_failures(tmp_path: Path):
429+
"""With multiple independent failures, the runner summarizes and chains the first."""
430+
431+
def failing_step_a(_output_path: str) -> None:
432+
raise ValueError("failure A")
433+
434+
def failing_step_b(_output_path: str) -> None:
435+
raise RuntimeError("failure B")
436+
437+
step_a = StepSpec(
438+
name="step_a",
439+
override_output_path=(tmp_path / "step_a").as_posix(),
440+
fn=failing_step_a,
441+
)
442+
step_b = StepSpec(
443+
name="step_b",
444+
override_output_path=(tmp_path / "step_b").as_posix(),
445+
fn=failing_step_b,
446+
)
447+
448+
runner = StepRunner()
449+
with pytest.raises(RuntimeError, match=r"2 step\(s\) failed") as exc_info:
450+
runner.run([step_a, step_b])
451+
452+
# The first failure is chained so its original exception type/traceback
453+
# remains reachable via __cause__.__cause__.
454+
first = exc_info.value.__cause__
455+
assert isinstance(first, RuntimeError)
456+
assert "Step failed:" in str(first)
421457

422458

423459
# ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)