Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 16 additions & 9 deletions lib/zephyr/src/zephyr/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,13 +622,18 @@ def _coordinator_loop(self) -> None:
last_log_time = 0.0

while not self._shutdown_event.is_set():
self.check_heartbeats()
self._check_worker_group()

now = time.monotonic()
if self._has_active_execution() and now - last_log_time > 5.0:
self._log_status()
last_log_time = now
try:
self.check_heartbeats()
self._check_worker_group()

now = time.monotonic()
if self._has_active_execution() and now - last_log_time > 5.0:
self._log_status()
last_log_time = now
except Exception:
logger.exception("Coordinator loop crashed, aborting pipeline")
self.abort("Coordinator loop crashed unexpectedly")
return

self._shutdown_event.wait(timeout=0.5)

Expand All @@ -649,8 +654,10 @@ def _has_active_execution(self) -> bool:
return self._execution_id != "" and self._total_shards > 0 and self._completed_shards < self._total_shards

def _log_status(self) -> None:
alive = sum(1 for s in self._worker_states.values() if s in {WorkerState.READY, WorkerState.BUSY})
dead = sum(1 for s in self._worker_states.values() if s in {WorkerState.FAILED, WorkerState.DEAD})
with self._lock:
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI @rjpower : Claude told me this ^ race condition killed my tokenize job, and after I told it that it's very unlikely to trigger, it accepted that the cause of the tokenization job failing was likely something else (hence additional logging in #4006 ). It still insisted this race condition can occur (see test) and bullied me into opening this PR.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems legit. if we have 2000 workers, and log_status happens to trigger around when a worker pings?

states = list(self._worker_states.values())
alive = sum(1 for s in states if s in {WorkerState.READY, WorkerState.BUSY})
dead = sum(1 for s in states if s in {WorkerState.FAILED, WorkerState.DEAD})
logger.info(
"[%s] [%s] %d/%d complete, %d in-flight, %d queued, %d/%d workers alive, %d dead",
self._execution_id,
Expand Down
25 changes: 25 additions & 0 deletions lib/zephyr/tests/test_execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,31 @@ def test_pull_task_returns_shutdown_on_last_stage_empty_queue(actor_context, tmp
assert result == "SHUTDOWN"


def test_coordinator_loop_crash_aborts_pipeline(actor_context, tmp_path):
    """An unexpected exception inside the coordinator loop must abort the
    pipeline (leaving ``_fatal_error`` set) instead of silently killing the
    loop thread. Regression test for #3996."""
    from zephyr.execution import ZephyrCoordinator

    coordinator = ZephyrCoordinator()
    coordinator.set_chunk_config(str(tmp_path / "chunks"), "test-exec")

    fired = threading.Event()
    real_check = coordinator.check_heartbeats

    def check_heartbeats_once_crashing(*args, **kwargs):
        # First invocation simulates the concurrent-mutation race; every
        # subsequent call passes through to the real implementation.
        if fired.is_set():
            return real_check(*args, **kwargs)
        fired.set()
        raise RuntimeError("dictionary changed size during iteration")

    coordinator.check_heartbeats = check_heartbeats_once_crashing

    loop_thread = threading.Thread(
        target=coordinator._coordinator_loop,
        daemon=True,
        name="zephyr-coordinator-loop",
    )
    loop_thread.start()
    assert fired.wait(timeout=5.0)
    loop_thread.join(timeout=2.0)
    assert coordinator._fatal_error is not None


def test_run_pipeline_rejects_concurrent_calls(actor_context, tmp_path):
"""Calling run_pipeline while another is already running raises RuntimeError."""
from unittest.mock import MagicMock
Expand Down
Loading