Skip to content

Commit 66fdc8d

Browse files
committed
[iris] Include per-job autoscaler hint in GetJobStatus
The hint was dropped from GetJobStatus because rebuilding and serializing the full autoscaler routing table on every call accounted for 35%+ of wall time in a live CPU profile. #4848 now caches the per-job hint dict once per evaluate() cycle, so the lookup is a single dict get. This commit re-attaches only this job's hint to the pending reason, without serializing the full routing decision.
1 parent e58b955 commit 66fdc8d

File tree

2 files changed

+14
-17
lines changed

2 files changed

+14
-17
lines changed

lib/iris/src/iris/cluster/controller/service.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1225,17 +1225,18 @@ def get_job_status(
12251225
)
12261226

12271227
# Get scheduling diagnostics for pending jobs from cache
1228-
# (populated each scheduling cycle by the controller).
1229-
#
1230-
# The autoscaler pending-hint used to be appended here, but
1231-
# ``_get_autoscaler_pending_hints`` rebuilds + serializes the full
1232-
# autoscaler routing table on every call (35%+ of wall-time in a
1233-
# live CPU profile). Skip it for now; use ListJobs for the richer
1234-
# pending explanation while we work out a cached hint path.
1228+
# (populated each scheduling cycle by the controller). The autoscaler
1229+
# hint dict is cached per evaluate() cycle (#4848), so the lookup here
1230+
# is a single dict get — we only attach this job's hint, never the
1231+
# full routing decision.
12351232
pending_reason = ""
12361233
if job.state == job_pb2.JOB_STATE_PENDING:
12371234
sched_reason = self._controller.get_job_scheduling_diagnostics(job.job_id.to_wire())
12381235
pending_reason = sched_reason or "Pending scheduler feedback"
1236+
hint = self._get_autoscaler_pending_hints().get(job.job_id.to_wire())
1237+
if hint is not None:
1238+
scaling_prefix = "(scaling up) " if hint.is_scaling_up else ""
1239+
pending_reason = f"Scheduler: {pending_reason}\n\nAutoscaler: {scaling_prefix}{hint.message}"
12391240

12401241
resources = _resource_spec_from_job_row(job)
12411242

lib/iris/tests/cluster/controller/test_dashboard.py

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -652,15 +652,14 @@ def test_pending_reason_uses_autoscaler_hint_for_scale_up(
652652
)
653653
}
654654

655-
# GetJobStatus intentionally does not append the autoscaler hint — it
656-
# was the dominant hot path in a live CPU profile (35% of wall time
657-
# spent rebuilding / serializing the routing table per RPC). ListJobs
658-
# still includes the hint since it's only computed once per page.
655+
# GetJobStatus appends this job's autoscaler hint via the per-cycle hint
656+
# cache (#4848) — a single dict lookup, no routing-table serialization.
659657
job_resp = rpc_post(
660658
client_with_autoscaler, "GetJobStatus", {"jobId": JobName.root("test-user", "pending-scale").to_wire()}
661659
)
662660
pending_reason = job_resp.get("job", {}).get("pendingReason", "")
663-
assert "Waiting for worker scale-up in scale group 'tpu_v5e_32'" not in pending_reason
661+
assert "Waiting for worker scale-up in scale group 'tpu_v5e_32'" in pending_reason
662+
assert "(scaling up)" in pending_reason
664663

665664
jobs_resp = rpc_post(client_with_autoscaler, "ListJobs")
666665
listed = [
@@ -702,15 +701,12 @@ def test_pending_reason_uses_passive_autoscaler_hint_over_scheduler(
702701
)
703702
}
704703

705-
# GetJobStatus no longer appends the autoscaler hint (see
706-
# test_pending_reason_uses_autoscaler_hint_for_scale_up for rationale).
707-
# It still surfaces the scheduler diagnostic.
704+
# GetJobStatus appends this job's autoscaler passive-wait hint.
708705
job_resp = rpc_post(
709706
client_with_autoscaler, "GetJobStatus", {"jobId": JobName.root("test-user", "diag-constraint").to_wire()}
710707
)
711708
pending_reason = job_resp.get("job", {}).get("pendingReason", "")
712-
assert pending_reason
713-
assert "Waiting for workers in scale group 'tpu_v5e_32' to become ready" not in pending_reason
709+
assert "Waiting for workers in scale group 'tpu_v5e_32' to become ready" in pending_reason
714710

715711

716712
def test_list_jobs_shows_passive_autoscaler_wait_hint(

0 commit comments

Comments
 (0)