Skip to content

Commit 641d625

Browse files
authored
perf(iris): reduce drain_dispatch_all lock hold time from 80ms to <5ms (#4222)
## Summary

- `drain_dispatch_all`: move the running-tasks 3-way JOIN out of the write lock into a `read_snapshot()`, and drop the `JOIN jobs` (filter `is_reservation_holder` in Python instead). The write lock now only covers the dispatch_queue SELECT + DELETE.
- `prune_old_data`: add a read-snapshot pre-check to skip the write lock when nothing needs pruning.
- `db.py`: add the missing `decode_task` method to `ControllerDB`.

Benchmark on a production checkpoint (4208 jobs, 148K tasks, 225 workers):

- `drain_dispatch_all` lock hold: **86ms → <5ms**
- `prune_old_data` (0 deletions): **427ms → <1ms**

Fixes #4220
1 parent 58149ad commit 641d625

File tree

4 files changed

+193
-65
lines changed

4 files changed

+193
-65
lines changed

lib/iris/scripts/benchmark_db_queries.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -373,15 +373,14 @@ def benchmark_heartbeat(db: ControllerDB, iterations: int) -> list[tuple[str, fl
373373
sample_worker_id = str(workers[0].worker_id)
374374
active_states = tuple(ACTIVE_TASK_STATES)
375375

376-
# Single-worker running tasks query (simulates drain_dispatch inner query)
376+
# Single-worker running tasks query (simulates drain_dispatch inner query, 2-way JOIN)
377377
def _single_worker_running_tasks():
378378
with db.read_snapshot() as q:
379379
q.raw(
380-
"SELECT t.task_id, t.current_attempt_id "
380+
"SELECT t.task_id, t.current_attempt_id, t.job_id "
381381
"FROM tasks t "
382382
"JOIN task_attempts ta ON t.task_id = ta.task_id AND t.current_attempt_id = ta.attempt_id "
383-
"JOIN jobs j ON j.job_id = t.job_id "
384-
"WHERE ta.worker_id = ? AND t.state IN (?, ?, ?) AND j.is_reservation_holder = 0 "
383+
"WHERE ta.worker_id = ? AND t.state IN (?, ?, ?) "
385384
"ORDER BY t.task_id ASC",
386385
(sample_worker_id, *active_states),
387386
)
@@ -390,16 +389,15 @@ def _single_worker_running_tasks():
390389
results.append(("drain_dispatch running_tasks (1 worker)", p50, p95))
391390
print_result("drain_dispatch running_tasks (1 worker)", p50, p95)
392391

393-
# Full loop: running tasks for ALL workers (simulates phase 1)
392+
# Full loop: running tasks for ALL workers (simulates phase 1, 2-way JOIN)
394393
def _all_workers_running_tasks():
395394
for w in workers:
396395
with db.read_snapshot() as q:
397396
q.raw(
398-
"SELECT t.task_id, t.current_attempt_id "
397+
"SELECT t.task_id, t.current_attempt_id, t.job_id "
399398
"FROM tasks t "
400399
"JOIN task_attempts ta ON t.task_id = ta.task_id AND t.current_attempt_id = ta.attempt_id "
401-
"JOIN jobs j ON j.job_id = t.job_id "
402-
"WHERE ta.worker_id = ? AND t.state IN (?, ?, ?) AND j.is_reservation_holder = 0 "
400+
"WHERE ta.worker_id = ? AND t.state IN (?, ?, ?) "
403401
"ORDER BY t.task_id ASC",
404402
(str(w.worker_id), *active_states),
405403
)

lib/iris/src/iris/cluster/controller/db.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -706,6 +706,10 @@ def read_snapshot(self) -> Iterator[QuerySnapshot]:
706706
logging.getLogger(__name__).warning("read_snapshot rollback failed", exc_info=True)
707707
self._read_pool.put(conn)
708708

709+
@staticmethod
710+
def decode_task(row: sqlite3.Row) -> Task:
711+
return _decode_row(Task, row)
712+
709713
def apply_migrations(self) -> None:
710714
"""Apply pending migrations from the migrations/ directory.
711715

lib/iris/src/iris/cluster/controller/transitions.py

Lines changed: 115 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1629,15 +1629,25 @@ def drain_dispatch(self, worker_id: WorkerId) -> DispatchBatch | None:
16291629
).fetchall()
16301630
if dispatch_rows:
16311631
cur.execute("DELETE FROM dispatch_queue WHERE worker_id = ?", (str(worker_id),))
1632-
running_rows = cur.execute(
1633-
"SELECT t.task_id, t.current_attempt_id "
1632+
running_rows_raw = cur.execute(
1633+
"SELECT t.task_id, t.current_attempt_id, t.job_id "
16341634
"FROM tasks t "
16351635
"JOIN task_attempts ta ON t.task_id = ta.task_id AND t.current_attempt_id = ta.attempt_id "
1636-
"JOIN jobs j ON j.job_id = t.job_id "
1637-
"WHERE ta.worker_id = ? AND t.state IN (?, ?, ?) AND j.is_reservation_holder = 0 "
1636+
"WHERE ta.worker_id = ? AND t.state IN (?, ?, ?) "
16381637
"ORDER BY t.task_id ASC",
16391638
(str(worker_id), *ACTIVE_TASK_STATES),
16401639
).fetchall()
1640+
running_job_ids = {str(row["job_id"]) for row in running_rows_raw}
1641+
if running_job_ids:
1642+
holder_placeholders = ",".join("?" for _ in running_job_ids)
1643+
holder_rows = cur.execute(
1644+
f"SELECT job_id FROM jobs WHERE job_id IN ({holder_placeholders}) AND is_reservation_holder = 1",
1645+
tuple(running_job_ids),
1646+
).fetchall()
1647+
holder_ids = {str(r["job_id"]) for r in holder_rows}
1648+
else:
1649+
holder_ids = set()
1650+
running_rows = [r for r in running_rows_raw if str(r["job_id"]) not in holder_ids]
16411651
tasks_to_run: list[cluster_pb2.Worker.RunTaskRequest] = []
16421652
tasks_to_kill: list[str] = []
16431653
for row in dispatch_rows:
@@ -1662,16 +1672,47 @@ def drain_dispatch(self, worker_id: WorkerId) -> DispatchBatch | None:
16621672
)
16631673

16641674
def drain_dispatch_all(self) -> list[DispatchBatch]:
1665-
"""Drain buffered dispatches and snapshot running tasks for all healthy active workers in one transaction."""
1666-
with self._db.transaction() as cur:
1667-
worker_rows = cur.execute(
1675+
"""Drain buffered dispatches and snapshot running tasks for all healthy active workers.
1676+
1677+
Reads (workers, running tasks, reservation filter) use a read snapshot
1678+
to avoid holding the write lock. The write lock is only held for the
1679+
dispatch_queue SELECT + DELETE.
1680+
"""
1681+
# -- Phase 1: read-only queries (no write lock) --
1682+
with self._db.read_snapshot() as snap:
1683+
worker_rows = snap.fetchall(
16681684
"SELECT worker_id, address, metadata_proto FROM workers WHERE active = 1 AND healthy = 1"
1669-
).fetchall()
1685+
)
16701686
if not worker_rows:
16711687
return []
16721688

16731689
worker_id_set = {str(row["worker_id"]) for row in worker_rows}
1674-
placeholders = ",".join("?" for _ in worker_id_set)
1690+
1691+
running_rows = snap.fetchall(
1692+
"SELECT ta.worker_id, t.task_id, t.current_attempt_id, t.job_id "
1693+
"FROM tasks t "
1694+
"JOIN task_attempts ta ON t.task_id = ta.task_id AND t.current_attempt_id = ta.attempt_id "
1695+
"WHERE t.state IN (?, ?, ?) "
1696+
"ORDER BY t.task_id ASC",
1697+
tuple(ACTIVE_TASK_STATES),
1698+
)
1699+
1700+
# Batch-check reservation holders instead of joining the jobs table
1701+
running_job_ids = {str(row["job_id"]) for row in running_rows}
1702+
reservation_holder_ids: set[str] = set()
1703+
if running_job_ids:
1704+
job_placeholders = ",".join("?" for _ in running_job_ids)
1705+
res_rows = snap.fetchall(
1706+
f"SELECT job_id FROM jobs WHERE job_id IN ({job_placeholders}) AND is_reservation_holder = 1",
1707+
tuple(running_job_ids),
1708+
)
1709+
reservation_holder_ids = {str(row["job_id"]) for row in res_rows}
1710+
1711+
running_rows = [row for row in running_rows if str(row["job_id"]) not in reservation_holder_ids]
1712+
1713+
# -- Phase 2: write lock only for dispatch_queue drain --
1714+
placeholders = ",".join("?" for _ in worker_id_set)
1715+
with self._db.transaction() as cur:
16751716
dispatch_rows = cur.execute(
16761717
f"SELECT worker_id, id, kind, payload_proto, task_id FROM dispatch_queue "
16771718
f"WHERE worker_id IN ({placeholders}) ORDER BY id ASC",
@@ -1683,57 +1724,48 @@ def drain_dispatch_all(self) -> list[DispatchBatch]:
16831724
tuple(worker_id_set),
16841725
)
16851726

1686-
running_rows = cur.execute(
1687-
"SELECT ta.worker_id, t.task_id, t.current_attempt_id "
1688-
"FROM tasks t "
1689-
"JOIN task_attempts ta ON t.task_id = ta.task_id AND t.current_attempt_id = ta.attempt_id "
1690-
"JOIN jobs j ON j.job_id = t.job_id "
1691-
"WHERE t.state IN (?, ?, ?) AND j.is_reservation_holder = 0 "
1692-
"ORDER BY t.task_id ASC",
1693-
(*ACTIVE_TASK_STATES,),
1694-
).fetchall()
1727+
# -- Phase 3: build results (pure Python, no lock) --
1728+
dispatch_by_worker: dict[str, list[Any]] = defaultdict(list)
1729+
for row in dispatch_rows:
1730+
dispatch_by_worker[str(row["worker_id"])].append(row)
16951731

1696-
dispatch_by_worker: dict[str, list[Any]] = defaultdict(list)
1697-
for row in dispatch_rows:
1698-
dispatch_by_worker[str(row["worker_id"])].append(row)
1732+
running_by_worker: dict[str, list[Any]] = defaultdict(list)
1733+
for row in running_rows:
1734+
running_by_worker[str(row["worker_id"])].append(row)
16991735

1700-
running_by_worker: dict[str, list[Any]] = defaultdict(list)
1701-
for row in running_rows:
1702-
running_by_worker[str(row["worker_id"])].append(row)
1703-
1704-
batches: list[DispatchBatch] = []
1705-
for worker_row in worker_rows:
1706-
wid = str(worker_row["worker_id"])
1707-
w_dispatch = dispatch_by_worker.get(wid, [])
1708-
w_running = running_by_worker.get(wid, [])
1709-
1710-
tasks_to_run: list[cluster_pb2.Worker.RunTaskRequest] = []
1711-
tasks_to_kill: list[str] = []
1712-
for row in w_dispatch:
1713-
if str(row["kind"]) == "run" and row["payload_proto"] is not None:
1714-
req = cluster_pb2.Worker.RunTaskRequest()
1715-
req.ParseFromString(bytes(row["payload_proto"]))
1716-
tasks_to_run.append(req)
1717-
elif row["task_id"] is not None:
1718-
tasks_to_kill.append(str(row["task_id"]))
1719-
1720-
batches.append(
1721-
DispatchBatch(
1722-
worker_id=WorkerId(wid),
1723-
worker_address=str(worker_row["address"]),
1724-
running_tasks=[
1725-
RunningTaskEntry(
1726-
task_id=JobName.from_wire(str(row["task_id"])),
1727-
attempt_id=int(row["current_attempt_id"]),
1728-
)
1729-
for row in w_running
1730-
],
1731-
tasks_to_run=tasks_to_run,
1732-
tasks_to_kill=tasks_to_kill,
1733-
)
1736+
batches: list[DispatchBatch] = []
1737+
for worker_row in worker_rows:
1738+
wid = str(worker_row["worker_id"])
1739+
w_dispatch = dispatch_by_worker.get(wid, [])
1740+
w_running = running_by_worker.get(wid, [])
1741+
1742+
tasks_to_run: list[cluster_pb2.Worker.RunTaskRequest] = []
1743+
tasks_to_kill: list[str] = []
1744+
for row in w_dispatch:
1745+
if str(row["kind"]) == "run" and row["payload_proto"] is not None:
1746+
req = cluster_pb2.Worker.RunTaskRequest()
1747+
req.ParseFromString(bytes(row["payload_proto"]))
1748+
tasks_to_run.append(req)
1749+
elif row["task_id"] is not None:
1750+
tasks_to_kill.append(str(row["task_id"]))
1751+
1752+
batches.append(
1753+
DispatchBatch(
1754+
worker_id=WorkerId(wid),
1755+
worker_address=str(worker_row["address"]),
1756+
running_tasks=[
1757+
RunningTaskEntry(
1758+
task_id=JobName.from_wire(str(row["task_id"])),
1759+
attempt_id=int(row["current_attempt_id"]),
1760+
)
1761+
for row in w_running
1762+
],
1763+
tasks_to_run=tasks_to_run,
1764+
tasks_to_kill=tasks_to_kill,
17341765
)
1766+
)
17351767

1736-
return batches
1768+
return batches
17371769

17381770
def requeue_dispatch(self, batch: DispatchBatch) -> None:
17391771
"""Re-queue drained dispatch payloads for later delivery."""
@@ -1819,11 +1851,37 @@ def prune_old_data(
18191851
txn_cutoff_ms = now_ms - txn_action_retention.to_ms()
18201852

18211853
terminal_states = tuple(TERMINAL_JOB_STATES)
1854+
placeholders = ",".join("?" * len(terminal_states))
1855+
1856+
# Cheap pre-check via read snapshot: skip the write lock when nothing is old enough
1857+
with self._db.read_snapshot() as snap:
1858+
has_work = (
1859+
snap.fetchone(
1860+
f"SELECT 1 FROM jobs WHERE state IN ({placeholders})"
1861+
" AND finished_at_ms IS NOT NULL AND finished_at_ms < ? LIMIT 1",
1862+
(*terminal_states, job_cutoff_ms),
1863+
)
1864+
or snap.fetchone(
1865+
"SELECT 1 FROM workers WHERE (active = 0 OR healthy = 0) AND last_heartbeat_ms < ? LIMIT 1",
1866+
(worker_cutoff_ms,),
1867+
)
1868+
or snap.fetchone(
1869+
"SELECT 1 FROM logs WHERE epoch_ms < ? LIMIT 1",
1870+
(log_cutoff_ms,),
1871+
)
1872+
or snap.fetchone(
1873+
"SELECT 1 FROM txn_actions WHERE created_at_ms < ? LIMIT 1",
1874+
(txn_cutoff_ms,),
1875+
)
1876+
)
1877+
1878+
if not has_work:
1879+
return PruneResult(jobs_deleted=0, workers_deleted=0, logs_deleted=0, txn_actions_deleted=0)
1880+
18221881
actions: list[tuple[str, str, dict[str, object]]] = []
18231882

18241883
with self._db.transaction() as cur:
18251884
# 1. Terminal jobs finished before the cutoff
1826-
placeholders = ",".join("?" * len(terminal_states))
18271885
job_rows = cur.execute(
18281886
f"SELECT job_id FROM jobs WHERE state IN ({placeholders})"
18291887
" AND finished_at_ms IS NOT NULL AND finished_at_ms < ?",

lib/iris/tests/cluster/controller/test_transitions.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3144,6 +3144,74 @@ def test_prune_noop_when_nothing_old(state):
31443144
assert result.total == 0
31453145

31463146

3147+
# =============================================================================
3148+
# drain_dispatch_all Tests
3149+
# =============================================================================
3150+
3151+
3152+
def test_drain_dispatch_all_excludes_reservation_holders(state):
3153+
"""drain_dispatch_all returns running tasks but filters out reservation-holder tasks."""
3154+
wid = register_worker(state, "w1", "host:8080", make_worker_metadata())
3155+
3156+
normal_req = make_job_request("normal-job")
3157+
normal_tasks = submit_job(state, "normal-job", normal_req)
3158+
dispatch_task(state, normal_tasks[0], wid)
3159+
3160+
holder_req = make_job_request("holder-job")
3161+
holder_tasks = submit_job(state, "holder-job", holder_req)
3162+
holder_job_id = JobName.root("test-user", "holder-job")
3163+
state._db.execute(
3164+
"UPDATE jobs SET is_reservation_holder = 1 WHERE job_id = ?",
3165+
(holder_job_id.to_wire(),),
3166+
)
3167+
dispatch_task(state, holder_tasks[0], wid)
3168+
3169+
batches = state.drain_dispatch_all()
3170+
assert len(batches) == 1
3171+
batch = batches[0]
3172+
running_task_ids = {entry.task_id for entry in batch.running_tasks}
3173+
3174+
assert normal_tasks[0].task_id in running_task_ids
3175+
assert holder_tasks[0].task_id not in running_task_ids
3176+
3177+
3178+
def test_drain_dispatch_all_drains_dispatch_queue(state):
3179+
"""drain_dispatch_all drains queued dispatches and deletes them from the queue."""
3180+
wid = register_worker(state, "w1", "host:8080", make_worker_metadata())
3181+
3182+
req = make_job_request("j1")
3183+
tasks = submit_job(state, "j1", req)
3184+
state.queue_assignments([Assignment(task_id=tasks[0].task_id, worker_id=wid)])
3185+
3186+
rows_before = state._db.fetchall("SELECT * FROM dispatch_queue WHERE worker_id = ?", (str(wid),))
3187+
assert len(rows_before) > 0
3188+
3189+
batches = state.drain_dispatch_all()
3190+
assert len(batches) == 1
3191+
assert len(batches[0].tasks_to_run) > 0
3192+
3193+
rows_after = state._db.fetchall("SELECT * FROM dispatch_queue WHERE worker_id = ?", (str(wid),))
3194+
assert len(rows_after) == 0
3195+
3196+
3197+
def test_prune_old_data_short_circuits_when_nothing_prunable(state):
3198+
"""prune_old_data skips the write lock when a read_snapshot shows nothing to prune."""
3199+
wid = register_worker(state, "w1", "host:8080", make_worker_metadata())
3200+
req = make_job_request("active-job")
3201+
tasks = submit_job(state, "active-job", req)
3202+
dispatch_task(state, tasks[0], wid)
3203+
3204+
result = state.prune_old_data(
3205+
job_retention=Duration.from_seconds(86400),
3206+
worker_retention=Duration.from_seconds(86400),
3207+
log_retention=Duration.from_seconds(86400),
3208+
txn_action_retention=Duration.from_seconds(86400),
3209+
)
3210+
3211+
assert result == PruneResult()
3212+
assert result.total == 0
3213+
3214+
31473215
# =============================================================================
31483216
# Direct Provider Transition Tests
31493217
# =============================================================================

0 commit comments

Comments (0)