Skip to content

Commit e4121ee

Browse files
yonromai, yoblin, claude
authored
fix: worker processes exit cleanly after SHUTDOWN (#4099)
## Summary Fixes #4098 — worker processes now exit cleanly after receiving SHUTDOWN instead of blocking forever. - **Root cause**: `_host_actor()` called `threading.Event().wait()` on an anonymous Event with no external reference — nothing could ever call `.set()`. The non-daemon `ActorServer` thread independently kept the process alive. - **Fix**: Add `request_shutdown()` to the fray actor API. It sets a module-level shutdown event that `_host_actor` waits on. When signaled, `_host_actor` calls `server.stop()` for clean teardown. `ZephyrWorker` calls `request_shutdown()` when its polling loop ends (after SHUTDOWN or coordinator death). No-op on Ray/local backends. Uses a module-level global (not ContextVar) so child threads can reach it. ## Test plan - [x] `test_request_shutdown_unblocks_wait` — sets the event from the same thread - [x] `test_request_shutdown_noop_outside_actor` — no-op on Ray/local backends - [x] `test_host_actor_shutdown_stops_server` — full lifecycle: start server, signal shutdown, verify server + threads exit - [x] `test_request_shutdown_works_from_child_thread` — cross-thread: child thread signals shutdown event - [ ] Integration: run a Zephyr pipeline and verify workers exit after SHUTDOWN without external kill --- 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: yoblin <268258002+yoblin@users.noreply.github.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3de7918 commit e4121ee

5 files changed

Lines changed: 136 additions & 5 deletions

File tree

lib/fray/src/fray/v2/__init__.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,15 @@
33

44
"""Fray v2: minimal job and actor scheduling interface."""
55

6-
from fray.v2.actor import ActorContext, ActorFuture, ActorGroup, ActorHandle, ActorMethod, current_actor
6+
from fray.v2.actor import (
7+
ActorContext,
8+
ActorFuture,
9+
ActorGroup,
10+
ActorHandle,
11+
ActorMethod,
12+
current_actor,
13+
request_shutdown,
14+
)
715
from fray.v2.client import Client, JobAlreadyExists, JobFailed, JobHandle, current_client, set_current_client, wait_all
816
from fray.v2.local_backend import LocalActorHandle, LocalActorMethod, LocalClient, LocalJobHandle
917
from fray.v2.types import (
@@ -61,6 +69,7 @@
6169
"current_actor",
6270
"current_client",
6371
"get_tpu_topology",
72+
"request_shutdown",
6473
"set_current_client",
6574
"wait_all",
6675
]

lib/fray/src/fray/v2/actor.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
from __future__ import annotations
1212

13+
import threading
1314
from contextvars import ContextVar
1415
from dataclasses import dataclass
1516
from typing import Any, Protocol
@@ -50,6 +51,10 @@ def shutdown(self) -> None:
5051

5152
_current_actor_ctx: ContextVar[ActorContext | None] = ContextVar("actor_context", default=None)
5253

54+
# Module-level (not ContextVar) so child threads spawned by the actor can call
55+
# request_shutdown(). _host_actor runs one actor per process, so global scope is correct.
56+
_actor_shutdown_event: threading.Event | None = None
57+
5358

5459
def current_actor() -> ActorContext:
5560
"""Get the current actor's context. Must be called from within an actor.
@@ -75,6 +80,27 @@ def _reset_current_actor(token):
7580
_current_actor_ctx.reset(token)
7681

7782

83+
# Module-level (not ContextVar) so child threads spawned by the actor can call
# request_shutdown(). _host_actor runs one actor per process, so global scope is correct.
_actor_shutdown_event: threading.Event | None = None


def request_shutdown() -> None:
    """Signal that the hosting actor process should exit.

    Call from within an actor (e.g. after receiving SHUTDOWN from a coordinator)
    to unblock _host_actor and trigger a clean server teardown. No-op when
    running under a backend that doesn't use _host_actor (Ray, LocalClient).
    """
    # Bind the global to a local before using it: _clear_shutdown_event() runs
    # concurrently on the host thread (just before server.stop()), so a
    # check-then-call on the global itself races — the global could become None
    # between the `is not None` test and the .set() call, raising AttributeError.
    # Setting an already-cleared event via the stale local is harmless.
    event = _actor_shutdown_event
    if event is not None:
        event.set()


def _set_shutdown_event(event: threading.Event) -> None:
    """Install *event* as the process-wide shutdown signal (called by _host_actor)."""
    global _actor_shutdown_event
    _actor_shutdown_event = event


def _clear_shutdown_event() -> None:
    """Uninstall the shutdown event; request_shutdown() becomes a no-op afterwards."""
    global _actor_shutdown_event
    _actor_shutdown_event = None
78104
class ActorFuture(Protocol):
79105
"""Future for an actor method call."""
80106

lib/fray/src/fray/v2/iris_backend.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,16 @@
3535
from iris.cluster.types import Entrypoint as IrisEntrypoint
3636
from iris.rpc import cluster_pb2
3737

38-
from fray.v2.actor import ActorContext, ActorFuture, ActorHandle, HostedActor, _reset_current_actor, _set_current_actor
38+
from fray.v2.actor import (
39+
ActorContext,
40+
ActorFuture,
41+
ActorHandle,
42+
HostedActor,
43+
_clear_shutdown_event,
44+
_reset_current_actor,
45+
_set_current_actor,
46+
_set_shutdown_event,
47+
)
3948
from fray.v2.client import JobAlreadyExists as FrayJobAlreadyExists
4049
from fray.v2.types import (
4150
ActorConfig,
@@ -216,6 +225,11 @@ def _host_actor(actor_class: type, args: tuple, kwargs: dict, name_prefix: str)
216225
actor_name = f"{ctx.job_id}/{name_prefix}-{job_info.task_index}"
217226
logger.info(f"Starting actor: {actor_name} (job_id={ctx.job_id})")
218227

228+
# Shutdown event lets the actor signal that the hosting process should exit.
229+
# request_shutdown() sets this event, unblocking the wait below.
230+
shutdown_event = threading.Event()
231+
_set_shutdown_event(shutdown_event)
232+
219233
# Create handle BEFORE instance so actor can access it during __init__
220234
handle = IrisActorHandle(actor_name)
221235
actor_ctx = ActorContext(handle=handle, index=job_info.task_index, group_name=name_prefix)
@@ -236,8 +250,11 @@ def _host_actor(actor_class: type, args: tuple, kwargs: dict, name_prefix: str)
236250
ctx.registry.register(actor_name, address)
237251
logger.info(f"Actor {actor_name} ready and listening")
238252

239-
# Block forever — job termination kills the process
240-
threading.Event().wait()
253+
# Block until the actor signals shutdown via request_shutdown()
254+
shutdown_event.wait()
255+
logger.info(f"Actor {actor_name} shutting down")
256+
_clear_shutdown_event()
257+
server.stop()
241258

242259

243260
class IrisActorHandle:

lib/zephyr/src/zephyr/execution.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import pyarrow as pa
3535
from iris.marin_fs import open_url, url_to_fs
3636
from fray.v2 import ActorConfig, ActorFuture, ActorHandle, Client, ResourceConfig
37+
from fray.v2.actor import request_shutdown
3738
from fray.v2.client import JobHandle
3839
from fray.v2.types import Entrypoint, JobRequest
3940
from iris.marin_fs import marin_temp_bucket
@@ -928,7 +929,8 @@ def _run_polling(self, coordinator: ActorHandle) -> None:
928929
finally:
929930
self._shutdown_event.set()
930931
heartbeat_thread.join(timeout=5.0)
931-
logger.debug("[%s] Polling loop ended", self._worker_id)
932+
logger.debug("[%s] Polling loop ended, requesting host shutdown", self._worker_id)
933+
request_shutdown()
932934

933935
def _heartbeat_loop(
934936
self, coordinator: ActorHandle, interval: float = 5.0, max_consecutive_failures: int = 5

tests/test_host_actor_shutdown.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# Copyright The Marin Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Regression tests for #4098: _host_actor blocks forever after SHUTDOWN."""
5+
6+
import threading
7+
8+
from iris.actor.server import ActorServer
9+
from iris.managed_thread import thread_container_scope
10+
11+
from fray.v2.actor import _clear_shutdown_event, _set_shutdown_event, request_shutdown
12+
13+
14+
class _Noop:
    """Trivial actor with a single method, used to give ActorServer something to host."""

    def ping(self) -> str:
        """Return a fixed response so callers can confirm the actor is reachable."""
        return "pong"
17+
18+
19+
def test_request_shutdown_unblocks_wait():
    """request_shutdown() must set the installed event so _host_actor's wait returns."""
    shutdown_event = threading.Event()
    _set_shutdown_event(shutdown_event)
    try:
        assert not shutdown_event.is_set()
        request_shutdown()
        assert shutdown_event.is_set()
    finally:
        # Restore module-level state so later tests see no installed event.
        _clear_shutdown_event()
28+
29+
30+
def test_request_shutdown_noop_outside_actor():
    """Calling request_shutdown() with no hosted actor must be a silent no-op.

    Ray and local backends never install a shutdown event, so the call must
    simply return without raising.
    """
    request_shutdown()
33+
34+
35+
def test_host_actor_shutdown_stops_server():
    """A shutdown signal must unblock the host thread and tear the ActorServer down."""
    with thread_container_scope("test-shutdown") as threads:
        server = ActorServer(host="127.0.0.1", port=0, threads=threads)
        server.register("test-actor", _Noop())
        server.serve_background()

        shutdown_event = threading.Event()
        host_done = threading.Event()

        # Mirror _host_actor's tail: block on the event, then stop the server.
        def run_host():
            shutdown_event.wait()
            server.stop()
            host_done.set()

        hosting = threading.Thread(target=run_host, daemon=True)
        hosting.start()
        assert threads.is_alive

        # Signal shutdown and verify the host thread and server threads wind down.
        shutdown_event.set()
        assert host_done.wait(timeout=5.0)
        hosting.join(timeout=2.0)
        assert not hosting.is_alive()
        assert not threads.is_alive
59+
60+
61+
def test_request_shutdown_works_from_child_thread():
    """A thread spawned by the actor (e.g. its polling thread) can signal shutdown."""
    shutdown_event = threading.Event()
    _set_shutdown_event(shutdown_event)
    try:
        signalled = threading.Event()

        # The child thread — not the installer — calls request_shutdown().
        def worker():
            request_shutdown()
            signalled.set()

        child = threading.Thread(target=worker, daemon=True)
        child.start()
        assert signalled.wait(timeout=2.0)
        assert shutdown_event.is_set()
    finally:
        # Restore module-level state for subsequent tests.
        _clear_shutdown_event()

0 commit comments

Comments
 (0)