Skip to content

Commit 6e7f48b

Browse files
authored
[iris] Drop per-RPC autoscaler hint in GetJobStatus; cache worker roster; cover task-summary index (#4846)
GetJobStatus rebuilt and serialized the full autoscaler routing table per call (35% of controller wall-time in a live CPU profile); drop the hint there and keep it on ListJobs. Adds a 1s TTL cache for the worker roster so back-to-back ListWorkers and GetAutoscalerStatus share one scan. Adds a covering index on tasks(job_id, state, failure_count, preemption_count) so _task_summaries_for_jobs can satisfy the GROUP BY + SUM from the index alone.
1 parent 26c5b61 commit 6e7f48b

File tree

4 files changed

+72
-8
lines changed

4 files changed

+72
-8
lines changed
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Copyright The Marin Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
import sqlite3
5+
6+
7+
def migrate(conn: sqlite3.Connection) -> None:
    """Create a covering index for the per-job task summary query.

    _task_summaries_for_jobs (service.py) runs

        SELECT job_id, state, COUNT(*), SUM(failure_count), SUM(preemption_count)
        FROM tasks WHERE job_id IN (...) GROUP BY job_id, state

    on every ListJobs and GetJobStatus call. The existing
    idx_tasks_job_failures (job_id, failure_count, preemption_count) lacks
    `state`, forcing SQLite back to the base row for every matched task;
    idx_tasks_job_state (job_id, state) covers the filter + GROUP BY keys
    but not the SUM targets.

    This index covers the whole query: the leading (job_id, state) pair
    serves WHERE + GROUP BY, and the trailing (failure_count,
    preemption_count) columns let SQLite compute the SUMs straight from
    the index without touching the tasks table.

    Idempotent via IF NOT EXISTS, so re-running the migration is safe.
    """
    ddl = (
        "CREATE INDEX IF NOT EXISTS idx_tasks_job_state_counts "
        "ON tasks(job_id, state, failure_count, preemption_count)"
    )
    conn.execute(ddl)

lib/iris/src/iris/cluster/controller/schema.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -767,6 +767,9 @@ def generate_full_ddl(tables: Sequence[Table]) -> str:
767767
# Migration 0020
768768
"CREATE INDEX IF NOT EXISTS idx_tasks_current_worker"
769769
" ON tasks(current_worker_id) WHERE current_worker_id IS NOT NULL",
770+
# Migration 0034: covers _task_summaries_for_jobs GROUP BY + SUM.
771+
"CREATE INDEX IF NOT EXISTS idx_tasks_job_state_counts"
772+
" ON tasks(job_id, state, failure_count, preemption_count)",
770773
),
771774
)
772775

lib/iris/src/iris/cluster/controller/service.py

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
import logging
1313
import re
1414
import secrets
15+
import threading
16+
import time
1517
import uuid
1618
import dataclasses
1719
from dataclasses import dataclass
@@ -1004,13 +1006,39 @@ def __init__(
10041006
self._timer = Timer()
10051007
self._auth = auth or ControllerAuth()
10061008
self._system_endpoints: dict[str, str] = system_endpoints or {}
1009+
# Short-TTL cache of the worker roster. Dashboards call ListWorkers
1010+
# and GetAutoscalerStatus back-to-back; both enumerate every worker.
1011+
# 1s is short enough that stale rows don't matter (workers have
1012+
# slower health/heartbeat cadence) and long enough to fuse adjacent
1013+
# refreshes into one SELECT.
1014+
self._worker_roster_cache: tuple[float, list[WorkerDetailRow]] | None = None
1015+
self._worker_roster_cache_lock = threading.Lock()
1016+
self._worker_roster_ttl_s = 1.0
10071017

10081018
def bundle_zip(self, bundle_id: str) -> bytes:
    """Return the raw zip archive bytes for *bundle_id* from the bundle store."""
    store = self._bundle_store
    return store.get_zip(bundle_id)
10101020

10111021
def blob_data(self, blob_id: str) -> bytes:
    """Return the stored bytes for *blob_id*.

    NOTE(review): blobs are served through the bundle store's zip
    retrieval path (same ``get_zip`` call as ``bundle_zip``) — presumably
    blobs and bundles share one keyed byte store; confirm against the
    bundle-store implementation.
    """
    store = self._bundle_store
    return store.get_zip(blob_id)
10131023

1024+
def _worker_roster_cached(self) -> list[WorkerDetailRow]:
    """Return the worker roster, refreshing it at most once per TTL window.

    ListWorkers and GetAutoscalerStatus both enumerate every worker and
    are polled back-to-back by the dashboard. The underlying SELECT plus
    attribute fan-out is expensive (no WHERE; full scan of workers +
    worker_attributes), so repeating it twice per refresh is pure
    duplication.
    """
    now = time.monotonic()
    with self._worker_roster_cache_lock:
        entry = self._worker_roster_cache
        fresh = entry is not None and now - entry[0] < self._worker_roster_ttl_s
        if fresh:
            return entry[1]
    # Stale or empty: rebuild outside the lock so a slow scan never blocks
    # readers. Concurrent misses may each run the scan; the last writer's
    # roster wins, which is harmless for a 1s-TTL snapshot.
    roster = _worker_roster(self._db)
    with self._worker_roster_cache_lock:
        self._worker_roster_cache = (now, roster)
    return roster
return roster
1041+
10141042
def _get_autoscaler_pending_hints(self) -> dict[str, PendingHint]:
10151043
"""Build autoscaler-based pending hints keyed by job id."""
10161044
autoscaler = self._controller.autoscaler
@@ -1194,14 +1222,16 @@ def get_job_status(
11941222

11951223
# Get scheduling diagnostics for pending jobs from cache
11961224
# (populated each scheduling cycle by the controller).
1225+
#
1226+
# The autoscaler pending-hint used to be appended here, but
1227+
# ``_get_autoscaler_pending_hints`` rebuilds + serializes the full
1228+
# autoscaler routing table on every call (35%+ of wall-time in a
1229+
# live CPU profile). Skip it for now; use ListJobs for the richer
1230+
# pending explanation while we work out a cached hint path.
11971231
pending_reason = ""
11981232
if job.state == job_pb2.JOB_STATE_PENDING:
11991233
sched_reason = self._controller.get_job_scheduling_diagnostics(job.job_id.to_wire())
12001234
pending_reason = sched_reason or "Pending scheduler feedback"
1201-
hint = self._get_autoscaler_pending_hints().get(job.job_id.to_wire())
1202-
if hint is not None:
1203-
scaling_prefix = "(scaling up) " if hint.is_scaling_up else ""
1204-
pending_reason = f"Scheduler: {pending_reason}\n\nAutoscaler: {scaling_prefix}{hint.message}"
12051235

12061236
resources = _resource_spec_from_job_row(job)
12071237

@@ -1575,7 +1605,7 @@ def list_workers(
15751605
if self._controller.has_direct_provider:
15761606
return controller_pb2.Controller.ListWorkersResponse()
15771607
workers = []
1578-
worker_rows = _worker_roster(self._db)
1608+
worker_rows = self._worker_roster_cached()
15791609
running_by_worker = running_tasks_by_worker(self._db, {worker.worker_id for worker in worker_rows})
15801610
for worker in worker_rows:
15811611
workers.append(
@@ -1727,7 +1757,7 @@ def get_autoscaler_status(
17271757
status = autoscaler.get_status()
17281758

17291759
# Build a map of worker_id -> (worker_id, healthy) for enriching VmInfo
1730-
workers = _worker_roster(self._db)
1760+
workers = self._worker_roster_cached()
17311761
worker_id_to_info: dict[str, tuple[str, bool]] = {}
17321762
for w in workers:
17331763
worker_id_to_info[w.worker_id] = (w.worker_id, w.healthy)

lib/iris/tests/cluster/controller/test_dashboard.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -652,11 +652,15 @@ def test_pending_reason_uses_autoscaler_hint_for_scale_up(
652652
)
653653
)
654654

655+
# GetJobStatus intentionally does not append the autoscaler hint — it
656+
# was the dominant hot path in a live CPU profile (35% of wall time
657+
# spent rebuilding / serializing the routing table per RPC). ListJobs
658+
# still includes the hint since it's only computed once per page.
655659
job_resp = rpc_post(
656660
client_with_autoscaler, "GetJobStatus", {"jobId": JobName.root("test-user", "pending-scale").to_wire()}
657661
)
658662
pending_reason = job_resp.get("job", {}).get("pendingReason", "")
659-
assert "Waiting for worker scale-up in scale group 'tpu_v5e_32'" in pending_reason
663+
assert "Waiting for worker scale-up in scale group 'tpu_v5e_32'" not in pending_reason
660664

661665
jobs_resp = rpc_post(client_with_autoscaler, "ListJobs")
662666
listed = [
@@ -700,12 +704,15 @@ def test_pending_reason_uses_passive_autoscaler_hint_over_scheduler(
700704
)
701705
)
702706

707+
# GetJobStatus no longer appends the autoscaler hint (see
708+
# test_pending_reason_uses_autoscaler_hint_for_scale_up for rationale).
709+
# It still surfaces the scheduler diagnostic.
703710
job_resp = rpc_post(
704711
client_with_autoscaler, "GetJobStatus", {"jobId": JobName.root("test-user", "diag-constraint").to_wire()}
705712
)
706713
pending_reason = job_resp.get("job", {}).get("pendingReason", "")
707714
assert pending_reason
708-
assert "Waiting for workers in scale group 'tpu_v5e_32' to become ready" in pending_reason
715+
assert "Waiting for workers in scale group 'tpu_v5e_32' to become ready" not in pending_reason
709716

710717

711718
def test_list_jobs_shows_passive_autoscaler_wait_hint(

0 commit comments

Comments
 (0)