Skip to content

Commit 493c9bb

Browse files
authored
[iris] Reap TPU hosts that keep failing launches; fire on_stop on natural return (#5038)
Three linked fixes for a TPU co-schedule loop. ASSIGNED->WORKER_FAILED now bumps the worker health tracker so a host that repeatedly fails to bring up a task (e.g. iommu/vfio group already held) gets reaped instead of looping forever. Excludes reservation-holder tasks from PollTasksRequest.expected_tasks, since holders are virtual and polling them produced bogus WORKER_FAILEDs that drained preemption budget. ManagedThread now fires on_stop on the natural-return path so docker kill+rm actually runs, releasing the TPU vfio group for the next task. Adds regression tests for each.
1 parent 9fd3b66 commit 493c9bb

5 files changed

Lines changed: 199 additions & 2 deletions

File tree

lib/iris/src/iris/cluster/controller/transitions.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1878,6 +1878,13 @@ def _apply_task_transitions(
18781878
if update.new_state == job_pb2.TASK_STATE_WORKER_FAILED and prior_state == job_pb2.TASK_STATE_ASSIGNED:
18791879
task_state = job_pb2.TASK_STATE_PENDING
18801880
terminal_ms = None
1881+
# ASSIGNED -> WORKER_FAILED means the worker accepted the task but
1882+
# couldn't bring it up (e.g. TPU iommu/vfio already held by another
1883+
# process on the VM). Attribute the failure to the worker so a host
1884+
# that keeps failing launches gets reaped; otherwise the task loops
1885+
# forever without draining preemption budget.
1886+
if worker_id is not None:
1887+
self._health.build_failed(WorkerId(str(worker_id)))
18811888
if update.new_state == job_pb2.TASK_STATE_FAILED and failure_count <= int(
18821889
task_row["max_retries_failure"]
18831890
):
@@ -3128,10 +3135,17 @@ def get_running_tasks_for_poll(
31283135
return {}, {}
31293136

31303137
placeholders = ",".join("?" for _ in worker_ids)
3138+
# Reservation holders are virtual — they live on ``current_worker_id``
3139+
# only as a scheduling anchor and never get a RunTaskRequest. Sending
3140+
# them in PollTasksRequest.expected_tasks makes the worker reconcile
3141+
# against its _tasks dict, miss, and return WORKER_FAILED every cycle,
3142+
# which drains the holder's preemption budget and (post the build-
3143+
# failure health hook) reaps the claimed worker for a harmless miss.
31313144
task_rows = snap.fetchall(
31323145
f"SELECT t.task_id, t.current_attempt_id, t.current_worker_id "
3133-
f"FROM tasks t "
3146+
f"FROM tasks t JOIN jobs j ON j.job_id = t.job_id "
31343147
f"WHERE t.current_worker_id IN ({placeholders}) AND t.state IN (?, ?, ?) "
3148+
f"AND j.is_reservation_holder = 0 "
31353149
f"ORDER BY t.task_id ASC",
31363150
(*worker_ids, *ACTIVE_TASK_STATES),
31373151
)

lib/iris/src/iris/managed_thread.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,14 @@ def _watch_stop() -> None:
110110
raise
111111
finally:
112112
if watcher:
113-
watcher.join(timeout=1.0)
113+
# Wake the watcher regardless of how the target exited so
114+
# on_stop runs on the natural-completion path too. Otherwise
115+
# cleanup (e.g. docker kill+rm for task containers) is
116+
# silently skipped whenever the target returns without an
117+
# explicit stop() — leaving wedged containers that keep
118+
# holding TPU vfio/iommu groups and breaking subsequent tasks.
119+
self._stop_event.set()
120+
watcher.join(timeout=5.0)
114121
if watcher.is_alive():
115122
logger.warning("on_stop callback for %s did not complete", name)
116123

lib/iris/tests/cluster/controller/test_reservation.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,15 @@
5353
from tests.cluster.controller.conftest import (
5454
FakeProvider,
5555
hydrate_worker_attributes as _with_attrs,
56+
make_job_request,
5657
query_job as _query_job,
5758
query_job_row as _query_job_row,
5859
query_task as _query_task,
5960
query_task_with_attempts as _query_task_with_attempts,
6061
query_tasks_for_job as _query_tasks_for_job,
6162
query_worker as _query_worker,
6263
schedulable_tasks as _schedulable_tasks,
64+
submit_job as _submit_job_tasks,
6365
worker_running_tasks as _worker_running_tasks,
6466
)
6567

@@ -1436,6 +1438,49 @@ def test_holder_task_worker_death_no_failure_record(state):
14361438
assert task_row_can_be_scheduled(holder_task), "holder task must be schedulable again"
14371439

14381440

1441+
def test_get_running_tasks_for_poll_excludes_reservation_holders(state):
1442+
"""get_running_tasks_for_poll must filter reservation-holder tasks.
1443+
1444+
Regression: the ping/poll loop feeds its output directly into
1445+
PollTasksRequest.expected_tasks. Holders are virtual — they never reach
1446+
the worker's _tasks dict — so including them makes the worker reconcile,
1447+
miss, and return WORKER_FAILED("Task not found on worker") every cycle.
1448+
That drains the holder's preemption budget and (with the ASSIGNED→
1449+
WORKER_FAILED health hook) reaps the claimed worker every few minutes.
1450+
1451+
Observed in production: ~51 attempts/hour per holder.
1452+
"""
1453+
request = _make_job_request_with_reservation(
1454+
reservation_entries=[_make_reservation_entry(_cpu_device())],
1455+
)
1456+
parent_job_id = _submit_job(state, "res-job", request)
1457+
holder_job_id = parent_job_id.child(RESERVATION_HOLDER_JOB_NAME)
1458+
1459+
holder_tasks = _query_tasks_for_job(state, holder_job_id)
1460+
assert len(holder_tasks) == 1
1461+
holder_task = holder_tasks[0]
1462+
1463+
real_request = make_job_request("real-job")
1464+
(real_task,) = _submit_job_tasks(state, "real-job", real_request)
1465+
1466+
worker_id = _register_worker(state, "w1")
1467+
state.queue_assignments(
1468+
[
1469+
Assignment(task_id=holder_task.task_id, worker_id=worker_id),
1470+
Assignment(task_id=real_task.task_id, worker_id=worker_id),
1471+
]
1472+
)
1473+
1474+
running, _addresses = state.get_running_tasks_for_poll()
1475+
1476+
task_ids = {entry.task_id for entry in running.get(worker_id, [])}
1477+
assert real_task.task_id in task_ids, "real task must still appear for polling"
1478+
assert holder_task.task_id not in task_ids, (
1479+
"reservation holder must be excluded — worker has no in-memory state "
1480+
"for virtual holders, so polling them produces bogus WORKER_FAILEDs"
1481+
)
1482+
1483+
14391484
def test_holder_task_removed_from_worker_when_parent_succeeds(state):
14401485
"""Holder task is cleaned from worker.running_tasks when the parent job succeeds.
14411486

lib/iris/tests/cluster/controller/test_transitions.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2225,6 +2225,40 @@ def test_worker_failed_from_building_counts_as_preemption(state):
22252225
assert _query_task(state, task.task_id).failure_count == 0
22262226

22272227

2228+
def test_worker_failed_from_assigned_bumps_health_tracker(state):
2229+
"""ASSIGNED -> WORKER_FAILED attributes the failure to the worker.
2230+
2231+
Regression for the TPU-iommu co-schedule loop: the task retries to PENDING
2232+
(no preemption-budget cost) but the health tracker must still bump so that
2233+
a host that repeatedly fails launches eventually crosses the threshold and
2234+
gets reaped.
2235+
"""
2236+
worker_id = register_worker(state, "w1", "host:8080", make_worker_metadata())
2237+
req = make_job_request("job1")
2238+
req.max_retries_preemption = 5
2239+
tasks = submit_job(state, "j1", req)
2240+
task = tasks[0]
2241+
2242+
state.queue_assignments([Assignment(task_id=task.task_id, worker_id=worker_id)])
2243+
assert _query_task(state, task.task_id).state == job_pb2.TASK_STATE_ASSIGNED
2244+
assert state._health.snapshot().get(worker_id) is None
2245+
2246+
transition_task(
2247+
state,
2248+
task.task_id,
2249+
job_pb2.TASK_STATE_WORKER_FAILED,
2250+
error='TPU init failure ("Couldn\'t open iommu group")',
2251+
)
2252+
2253+
# Task retries without consuming preemption budget...
2254+
t = _query_task(state, task.task_id)
2255+
assert t.state == job_pb2.TASK_STATE_PENDING
2256+
assert t.preemption_count == 0
2257+
# ...but the worker is charged a build failure.
2258+
_, build_failures = state._health.snapshot()[worker_id]
2259+
assert build_failures == 1
2260+
2261+
22282262
def test_failed_from_building_bumps_health_tracker(state):
22292263
"""FAILED originating from BUILDING increments the build failure counter.
22302264
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
# Copyright The Marin Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Regression tests for ManagedThread lifecycle.
5+
6+
Focus: on_stop callback must run both when stop() is called externally AND
7+
when the thread target returns on its own. A missed on_stop on the natural-
8+
completion path left task containers un-reaped in production — the container
9+
process stayed wedged on the TPU vfio/iommu group, poisoning the VM for
10+
subsequent tasks.
11+
"""
12+
13+
import threading
14+
import time
15+
16+
from iris.managed_thread import ManagedThread
17+
18+
19+
def test_on_stop_runs_when_stop_is_called():
20+
stopped = threading.Event()
21+
released = threading.Event()
22+
23+
def target(stop_event: threading.Event) -> None:
24+
stop_event.wait(timeout=5.0)
25+
26+
def on_stop() -> None:
27+
stopped.set()
28+
released.set()
29+
30+
t = ManagedThread(target=target, name="stop-called", on_stop=on_stop)
31+
t.start()
32+
t.stop()
33+
t.join()
34+
assert stopped.is_set()
35+
36+
37+
def test_on_stop_runs_when_target_returns_naturally():
38+
"""Regression: target returning on its own must still fire on_stop.
39+
40+
Before the fix, on_stop was only invoked when an explicit stop() set the
41+
stop event. When the target returned naturally (e.g. a task container
42+
exited and the monitoring loop finished), the watcher stayed parked on
43+
stop_event.wait() and the finally block timed out silently, skipping
44+
on_stop. For task threads this meant docker kill + docker rm never ran,
45+
leaving wedged containers holding TPU vfio groups.
46+
"""
47+
on_stop_ran = threading.Event()
48+
49+
def target(_stop_event: threading.Event) -> None:
50+
# Return immediately without touching the stop event.
51+
return
52+
53+
def on_stop() -> None:
54+
on_stop_ran.set()
55+
56+
t = ManagedThread(target=target, name="natural-return", on_stop=on_stop)
57+
t.start()
58+
t.join()
59+
assert on_stop_ran.is_set(), "on_stop must run when target completes naturally"
60+
61+
62+
def test_on_stop_runs_when_target_raises():
63+
"""on_stop must also fire when the target raises — exception path."""
64+
on_stop_ran = threading.Event()
65+
66+
class _Boom(Exception):
67+
pass
68+
69+
def target(_stop_event: threading.Event) -> None:
70+
raise _Boom("task blew up")
71+
72+
def on_stop() -> None:
73+
on_stop_ran.set()
74+
75+
t = ManagedThread(target=target, name="raising-target", on_stop=on_stop)
76+
t.start()
77+
t.join()
78+
assert on_stop_ran.is_set(), "on_stop must run even when target raises"
79+
80+
81+
def test_on_stop_runs_only_once():
82+
"""on_stop must not double-fire when both stop() and natural return occur."""
83+
calls = []
84+
lock = threading.Lock()
85+
86+
def target(stop_event: threading.Event) -> None:
87+
stop_event.wait(timeout=0.2)
88+
89+
def on_stop() -> None:
90+
with lock:
91+
calls.append(time.monotonic())
92+
93+
t = ManagedThread(target=target, name="no-double-fire", on_stop=on_stop)
94+
t.start()
95+
t.stop()
96+
t.join()
97+
assert len(calls) == 1, f"on_stop fired {len(calls)} times, expected 1"

0 commit comments

Comments
 (0)