Skip to content

Commit 3a2e498

Browse files
[iris] Cache autoscaler pending hints per evaluate() cycle
GetJobStatus rebuilt the full AutoscalerStatus proto on every dashboard poll, re-running routing_decision_to_proto over every demand entry and unmet entry. Cache the routing-decision proto and the derived pending-hint dict on the Autoscaler, invalidating both in evaluate(). Fixes #4844
1 parent c342900 commit 3a2e498

File tree

4 files changed

+95
-37
lines changed

4 files changed

+95
-37
lines changed

lib/iris/src/iris/cluster/controller/autoscaler/runtime.py

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
)
4848
from iris.cluster.controller.autoscaler.routing import job_feasibility, route_demand
4949
from iris.cluster.controller.autoscaler.scaling_group import ScalingGroup
50-
from iris.cluster.controller.autoscaler.status import routing_decision_to_proto
50+
from iris.cluster.controller.autoscaler.status import PendingHint, build_job_pending_hints, routing_decision_to_proto
5151
from iris.cluster.controller.autoscaler.worker_registry import TrackedWorker, WorkerRegistry
5252
from iris.cluster.controller.db import ControllerDB
5353
from iris.cluster.types import WorkerStatusMap
@@ -118,6 +118,13 @@ def __init__(
118118
self._last_scale_plan: ScalePlan | None = None
119119
self._last_evaluation: Timestamp = Timestamp.from_ms(0)
120120

121+
# Derived views of _last_scale_plan, built lazily and invalidated by
122+
# evaluate(). Dashboard polls (GetJobStatus, ListJobs) hit these on
123+
# every pending job; building them per request was the bottleneck
124+
# described in #4844.
125+
self._last_routing_decision_proto: vm_pb2.RoutingDecision | None = None
126+
self._last_pending_hints: dict[str, PendingHint] | None = None
127+
121128
# Thread management
122129
self._threads = threads if threads is not None else get_thread_container()
123130

@@ -246,6 +253,8 @@ def evaluate(
246253
routing_decision = route_demand(list(self._groups.values()), demand_entries, ts)
247254
scale_plan = build_scale_plan(self._groups, routing_decision, ts)
248255
self._last_scale_plan = scale_plan
256+
self._last_routing_decision_proto = None
257+
self._last_pending_hints = None
249258

250259
if routing_decision.unmet_entries:
251260
logger.debug(
@@ -555,6 +564,32 @@ def job_feasibility(
555564
result = job_feasibility(self._groups.values(), constraints, replicas=replicas)
556565
return result.reason
557566

567+
def get_last_routing_decision_proto(self) -> vm_pb2.RoutingDecision | None:
568+
"""Return the last routing decision as a proto, lazily built and cached.
569+
570+
The routing decision only changes in evaluate(); intermediate callers
571+
(GetJobStatus, ListJobs) reuse the cached proto without paying the
572+
per-entry conversion cost.
573+
"""
574+
if self._last_scale_plan is None:
575+
return None
576+
if self._last_routing_decision_proto is None:
577+
self._last_routing_decision_proto = routing_decision_to_proto(
578+
self._last_scale_plan.routing_decision,
579+
group_to_launch=self._last_scale_plan.launch_counts(),
580+
)
581+
return self._last_routing_decision_proto
582+
583+
def get_pending_hints(self) -> dict[str, PendingHint]:
584+
"""Return autoscaler pending hints keyed by job id.
585+
586+
Cached per evaluate() cycle so repeated GetJobStatus calls don't
587+
rebuild the hint dict (see #4844).
588+
"""
589+
if self._last_pending_hints is None:
590+
self._last_pending_hints = build_job_pending_hints(self.get_last_routing_decision_proto())
591+
return self._last_pending_hints
592+
558593
def get_status(self) -> vm_pb2.AutoscalerStatus:
559594
"""Build status for the status API."""
560595
status = vm_pb2.AutoscalerStatus(
@@ -563,13 +598,9 @@ def get_status(self) -> vm_pb2.AutoscalerStatus:
563598
last_evaluation=timestamp_to_proto(self._last_evaluation),
564599
recent_actions=list(self._action_log),
565600
)
566-
if self._last_scale_plan is not None:
567-
status.last_routing_decision.CopyFrom(
568-
routing_decision_to_proto(
569-
self._last_scale_plan.routing_decision,
570-
group_to_launch=self._last_scale_plan.launch_counts(),
571-
)
572-
)
601+
routing_proto = self.get_last_routing_decision_proto()
602+
if routing_proto is not None:
603+
status.last_routing_decision.CopyFrom(routing_proto)
573604
return status
574605

575606
def get_group(self, name: str) -> ScalingGroup | None:

lib/iris/src/iris/cluster/controller/service.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@
8686
WorkerRow,
8787
tasks_with_attempts,
8888
)
89-
from iris.cluster.controller.autoscaler.status import PendingHint, build_job_pending_hints
89+
from iris.cluster.controller.autoscaler.status import PendingHint
9090
from iris.cluster.controller.query import execute_raw_query
9191
from iris.rpc import query_pb2
9292
from iris.cluster.controller.scheduler import SchedulingContext
@@ -900,6 +900,10 @@ def get_status(self) -> vm_pb2.AutoscalerStatus:
900900
"""Get autoscaler status."""
901901
...
902902

903+
def get_pending_hints(self) -> dict[str, PendingHint]:
904+
"""Get cached pending-hint dict keyed by job id."""
905+
...
906+
903907
def get_vm(self, vm_id: str) -> vm_pb2.VmInfo | None:
904908
"""Get info for a specific VM."""
905909
...
@@ -1044,10 +1048,10 @@ def _get_autoscaler_pending_hints(self) -> dict[str, PendingHint]:
10441048
autoscaler = self._controller.autoscaler
10451049
if autoscaler is None:
10461050
return {}
1047-
status = autoscaler.get_status()
1048-
if not status.HasField("last_routing_decision"):
1049-
return {}
1050-
return build_job_pending_hints(status.last_routing_decision)
1051+
# Autoscaler caches the hint dict per evaluate() cycle; this avoids
1052+
# rebuilding the full AutoscalerStatus proto on every GetJobStatus
1053+
# RPC (#4844).
1054+
return autoscaler.get_pending_hints()
10511055

10521056
def _authorize_job_owner(self, job_id: JobName) -> None:
10531057
"""Raise PERMISSION_DENIED if the authenticated user doesn't own this job.

lib/iris/tests/cluster/controller/test_autoscaler.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -645,6 +645,33 @@ def test_get_status_includes_last_routing_decision(self):
645645
assert status.HasField("last_routing_decision")
646646
assert "test-group" in status.last_routing_decision.routed_entries
647647

648+
def test_pending_hints_and_routing_proto_are_cached_between_evaluates(self):
649+
"""Dashboard polls reuse one proto + hint dict per evaluate() (#4844).
650+
651+
get_job_status calls this per pending job on every dashboard refresh.
652+
Rebuilding the status proto each time was measurably slow on busy
653+
clusters; repeated calls should return the same cached objects, and a
654+
new evaluate() must invalidate the cache.
655+
"""
656+
config = make_scale_group_config(name="test-group", buffer_slices=0, max_slices=5)
657+
group = ScalingGroup(config, make_mock_platform())
658+
autoscaler = make_autoscaler({"test-group": group})
659+
660+
autoscaler.evaluate(make_demand_entries(2, device_type=DeviceType.TPU, device_variant="v5p-8"))
661+
662+
# Cached: repeated reads return the same objects without rebuilding.
663+
proto_first = autoscaler.get_last_routing_decision_proto()
664+
hints_first = autoscaler.get_pending_hints()
665+
assert proto_first is autoscaler.get_last_routing_decision_proto()
666+
assert hints_first is autoscaler.get_pending_hints()
667+
# get_status() reuses the same cached routing-decision proto.
668+
assert autoscaler.get_status().last_routing_decision == proto_first
669+
670+
# Invalidated on next evaluate().
671+
autoscaler.evaluate(make_demand_entries(3, device_type=DeviceType.TPU, device_variant="v5p-8"))
672+
assert autoscaler.get_last_routing_decision_proto() is not proto_first
673+
assert autoscaler.get_pending_hints() is not hints_first
674+
648675

649676
class TestAutoscalerBootstrapLogs:
650677
"""Tests for bootstrap log reporting."""

lib/iris/tests/cluster/controller/test_dashboard.py

Lines changed: 20 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from starlette.testclient import TestClient
1515

1616
from iris.cluster.bundle import BundleStore
17+
from iris.cluster.controller.autoscaler.status import PendingHint
1718
from iris.cluster.controller.codec import constraints_from_json, resource_spec_from_scalars
1819
from iris.cluster.controller.dashboard import ControllerDashboard
1920
from iris.log_server.server import LogServiceImpl
@@ -532,6 +533,7 @@ def test_get_autoscaler_status_returns_disabled_when_no_autoscaler(client):
532533
def mock_autoscaler():
533534
"""Create a mock autoscaler that returns a status proto."""
534535
autoscaler = Mock()
536+
autoscaler.get_pending_hints.return_value = {}
535537
autoscaler.get_status.return_value = vm_pb2.AutoscalerStatus(
536538
groups=[
537539
vm_pb2.ScaleGroupStatus(
@@ -642,15 +644,13 @@ def test_pending_reason_uses_autoscaler_hint_for_scale_up(
642644
"""Pending jobs surface autoscaler scale-up wait hints in job/detail APIs."""
643645
submit_job(state, "pending-scale", job_request)
644646

645-
task_id = JobName.root("test-user", "pending-scale").task(0).to_wire()
646-
mock_autoscaler.get_status.return_value = vm_pb2.AutoscalerStatus(
647-
last_routing_decision=vm_pb2.RoutingDecision(
648-
group_to_launch={"tpu_v5e_32": 1},
649-
routed_entries={
650-
"tpu_v5e_32": vm_pb2.DemandEntryStatusList(entries=[vm_pb2.DemandEntryStatus(task_ids=[task_id])])
651-
},
647+
job_wire = JobName.root("test-user", "pending-scale").to_wire()
648+
mock_autoscaler.get_pending_hints.return_value = {
649+
job_wire: PendingHint(
650+
message="Waiting for worker scale-up in scale group 'tpu_v5e_32' (1 slice(s) requested)",
651+
is_scaling_up=True,
652652
)
653-
)
653+
}
654654

655655
# GetJobStatus intentionally does not append the autoscaler hint — it
656656
# was the dominant hot path in a live CPU profile (35% of wall time
@@ -693,16 +693,14 @@ def test_pending_reason_uses_passive_autoscaler_hint_over_scheduler(
693693
],
694694
)
695695
submit_job(state, "diag-constraint", request)
696-
task_id = JobName.root("test-user", "diag-constraint").task(0).to_wire()
696+
job_wire = JobName.root("test-user", "diag-constraint").to_wire()
697697

698-
mock_autoscaler.get_status.return_value = vm_pb2.AutoscalerStatus(
699-
last_routing_decision=vm_pb2.RoutingDecision(
700-
group_to_launch={"tpu_v5e_32": 0},
701-
routed_entries={
702-
"tpu_v5e_32": vm_pb2.DemandEntryStatusList(entries=[vm_pb2.DemandEntryStatus(task_ids=[task_id])])
703-
},
698+
mock_autoscaler.get_pending_hints.return_value = {
699+
job_wire: PendingHint(
700+
message="Waiting for workers in scale group 'tpu_v5e_32' to become ready",
701+
is_scaling_up=False,
704702
)
705-
)
703+
}
706704

707705
# GetJobStatus no longer appends the autoscaler hint (see
708706
# test_pending_reason_uses_autoscaler_hint_for_scale_up for rationale).
@@ -723,16 +721,14 @@ def test_list_jobs_shows_passive_autoscaler_wait_hint(
723721
):
724722
"""ListJobs should show passive autoscaler wait hints for pending jobs."""
725723
submit_job(state, "pending-no-launch", job_request)
726-
task_id = JobName.root("test-user", "pending-no-launch").task(0).to_wire()
724+
job_wire = JobName.root("test-user", "pending-no-launch").to_wire()
727725

728-
mock_autoscaler.get_status.return_value = vm_pb2.AutoscalerStatus(
729-
last_routing_decision=vm_pb2.RoutingDecision(
730-
group_to_launch={"tpu_v5e_32": 0},
731-
routed_entries={
732-
"tpu_v5e_32": vm_pb2.DemandEntryStatusList(entries=[vm_pb2.DemandEntryStatus(task_ids=[task_id])])
733-
},
726+
mock_autoscaler.get_pending_hints.return_value = {
727+
job_wire: PendingHint(
728+
message="Waiting for workers in scale group 'tpu_v5e_32' to become ready",
729+
is_scaling_up=False,
734730
)
735-
)
731+
}
736732

737733
jobs_resp = rpc_post(client_with_autoscaler, "ListJobs")
738734
listed = [

0 commit comments

Comments
 (0)