fix(zephyr): add lifecycle logging to coordinator thread

yoblin · claude · yoblin · commit a4b84548c3a0 · 2026-03-23T20:05:00.000Z
The coordinator thread had no entry, exit, or error logging, making production hangs impossible to diagnose. Wrap the loop in try/except with full traceback logging, and have _wait_for_stage fail fast when the coordinator thread is dead instead of spinning forever.

Closes #4004
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/lib/zephyr/src/zephyr/execution.py b/lib/zephyr/src/zephyr/execution.py
@@ -619,18 +619,25 @@ def register_worker(self, worker_id: str, worker_handle: ActorHandle) -> None:
 
     def _coordinator_loop(self) -> None:
         """Background loop for heartbeat checking and worker job monitoring."""
+        logger.info("Coordinator loop started (thread=%s)", threading.current_thread().name)
         last_log_time = 0.0
 
-        while not self._shutdown_event.is_set():
-            self.check_heartbeats()
-            self._check_worker_group()
+        try:
+            while not self._shutdown_event.is_set():
+                self.check_heartbeats()
+                self._check_worker_group()
+
+                now = time.monotonic()
+                if self._has_active_execution() and now - last_log_time > 5.0:
+                    self._log_status()
+                    last_log_time = now
 
-            now = time.monotonic()
-            if self._has_active_execution() and now - last_log_time > 5.0:
-                self._log_status()
-                last_log_time = now
+                self._shutdown_event.wait(timeout=0.5)
 
-            self._shutdown_event.wait(timeout=0.5)
+            logger.info("Coordinator loop exiting: shutdown event set")
+        except Exception:
+            logger.error("Coordinator loop crashed with unhandled exception", exc_info=True)
+            self._fatal_error = "Coordinator thread crashed — see logs for traceback"
 
     def _check_worker_group(self) -> None:
         """Abort the pipeline if the worker job has permanently terminated."""
@@ -861,6 +868,14 @@ def _wait_for_stage(self) -> None:
                     # Workers are alive — reset the dead timer
                     all_dead_since = None
 
+            # Checked after completion so a clean shutdown racing the final
+            # task can never false-positive — only true crashes reach here.
+            if not self._coordinator_thread.is_alive():
+                raise ZephyrWorkerError(
+                    "Coordinator thread is no longer alive. "
+                    "Check logs for 'Coordinator loop crashed' for the root cause."
+                )
+
             if completed != last_log_completed:
                 logger.info("[%s] %d/%d tasks completed", self._stage_name, completed, total)
                 last_log_completed = completed