Skip to content

Commit a4b8454

Browse files
yoblinclaude
andcommitted
fix(zephyr): add lifecycle logging to coordinator thread
The coordinator thread had no entry, exit, or error logging, making production hangs impossible to diagnose. Wrap the loop in try/except with full traceback logging, and have _wait_for_stage fail fast when the coordinator thread is dead instead of spinning forever. Closes #4004 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c5a00c7 commit a4b8454

1 file changed

Lines changed: 23 additions & 8 deletions

File tree

lib/zephyr/src/zephyr/execution.py

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -619,18 +619,25 @@ def register_worker(self, worker_id: str, worker_handle: ActorHandle) -> None:
619619

620620
def _coordinator_loop(self) -> None:
621621
"""Background loop for heartbeat checking and worker job monitoring."""
622+
logger.info("Coordinator loop started (thread=%s)", threading.current_thread().name)
622623
last_log_time = 0.0
623624

624-
while not self._shutdown_event.is_set():
625-
self.check_heartbeats()
626-
self._check_worker_group()
625+
try:
626+
while not self._shutdown_event.is_set():
627+
self.check_heartbeats()
628+
self._check_worker_group()
629+
630+
now = time.monotonic()
631+
if self._has_active_execution() and now - last_log_time > 5.0:
632+
self._log_status()
633+
last_log_time = now
627634

628-
now = time.monotonic()
629-
if self._has_active_execution() and now - last_log_time > 5.0:
630-
self._log_status()
631-
last_log_time = now
635+
self._shutdown_event.wait(timeout=0.5)
632636

633-
self._shutdown_event.wait(timeout=0.5)
637+
logger.info("Coordinator loop exiting: shutdown event set")
638+
except Exception:
639+
logger.error("Coordinator loop crashed with unhandled exception", exc_info=True)
640+
self._fatal_error = "Coordinator thread crashed — see logs for traceback"
634641

635642
def _check_worker_group(self) -> None:
636643
"""Abort the pipeline if the worker job has permanently terminated."""
@@ -861,6 +868,14 @@ def _wait_for_stage(self) -> None:
861868
# Workers are alive — reset the dead timer
862869
all_dead_since = None
863870

871+
# Checked after completion so a clean shutdown racing the final
872+
# task can never false-positive — only true crashes reach here.
873+
if not self._coordinator_thread.is_alive():
874+
raise ZephyrWorkerError(
875+
"Coordinator thread is no longer alive. "
876+
"Check logs for 'Coordinator loop crashed' for the root cause."
877+
)
878+
864879
if completed != last_log_completed:
865880
logger.info("[%s] %d/%d tasks completed", self._stage_name, completed, total)
866881
last_log_completed = completed

0 commit comments

Comments
 (0)