6767from iris .cluster .controller .dashboard import ControllerDashboard
6868from iris .cluster .controller .db import (
6969 ControllerDB ,
70+ SchedulableWorker ,
7071 healthy_active_workers_with_attributes ,
7172 insert_task_profile ,
7273 job_scheduling_deadline ,
8081 Scheduler ,
8182 SchedulingContext ,
8283 WorkerCapacity ,
83- WorkerSnapshot ,
8484)
8585from iris .cluster .controller .schema import (
8686 ATTEMPT_PROJECTION ,
9696 TaskDetailRow ,
9797 TaskRow ,
9898 WorkerDetailRow ,
99- WorkerRow ,
10099 proto_decoder ,
101100 tasks_with_attempts ,
102101)
114113 TaskUpdate ,
115114 log_event ,
116115)
117- from iris .cluster .controller .worker_health import WorkerCommitTracker , WorkerHealthTracker
116+ from iris .cluster .controller .worker_health import WorkerHealthTracker
118117from iris .cluster .log_store_helpers import CONTROLLER_LOG_KEY
119118from iris .cluster .providers .k8s .tasks import K8sTaskProvider
120119from iris .cluster .providers .types import find_free_port , resolve_external_host
@@ -201,7 +200,7 @@ class _SchedulingStateRead:
201200 """Snapshot of pending tasks and workers read at the start of a scheduling cycle."""
202201
203202 pending_tasks : list [TaskRow ]
204- workers : list [WorkerRow ]
203+ workers : list [SchedulableWorker ]
205204 state_read_ms : int
206205
207206
@@ -245,7 +244,7 @@ def job_requirements_from_job(job: JobSchedulingRow) -> JobRequirements:
245244def compute_demand_entries (
246245 queries : ControllerDB ,
247246 scheduler : Scheduler | None = None ,
248- workers : list [WorkerSnapshot ] | None = None ,
247+ workers : list [SchedulableWorker ] | None = None ,
249248 reservation_claims : dict [WorkerId , ReservationClaim ] | None = None ,
250249) -> list [DemandEntry ]:
251250 """Compute demand entries for the autoscaler from controller state.
@@ -708,7 +707,7 @@ def _tasks_by_ids_with_attempts(queries: ControllerDB, task_ids: set[JobName]) -
708707 return {task .task_id : task for task in tasks_with_attempts (tasks , attempts )}
709708
710709
711- def _building_counts (queries : ControllerDB , workers : list [WorkerRow ]) -> dict [WorkerId , int ]:
710+ def _building_counts (queries : ControllerDB , workers : list [SchedulableWorker ]) -> dict [WorkerId , int ]:
712711 """Count tasks in BUILDING or ASSIGNED state per worker, excluding reservation-holder jobs."""
713712 if not workers :
714713 return {}
@@ -763,7 +762,7 @@ def _task_worker_mapping(queries: ControllerDB, task_ids: set[JobName]) -> dict[
763762
764763
765764def _worker_matches_reservation_entry (
766- worker : WorkerRow ,
765+ worker : SchedulableWorker ,
767766 res_entry : job_pb2 .ReservationEntry ,
768767) -> bool :
769768 """Check if a worker is eligible for a reservation entry.
@@ -785,9 +784,9 @@ def _worker_matches_reservation_entry(
785784
786785
787786def _inject_reservation_taints (
788- workers : list [WorkerRow ],
787+ workers : list [SchedulableWorker ],
789788 claims : dict [WorkerId , ReservationClaim ],
790- ) -> list [WorkerRow ]:
789+ ) -> list [SchedulableWorker ]:
791790 """Create modified worker copies with reservation taints and prioritization.
792791
793792 Claimed workers receive a ``reservation-job`` attribute set to the claiming
@@ -800,8 +799,8 @@ def _inject_reservation_taints(
800799 if not claims :
801800 return workers
802801
803- claimed : list [WorkerRow ] = []
804- unclaimed : list [WorkerRow ] = []
802+ claimed : list [SchedulableWorker ] = []
803+ unclaimed : list [SchedulableWorker ] = []
805804 for worker in workers :
806805 claim = claims .get (worker .worker_id )
807806 if claim is not None :
@@ -882,7 +881,6 @@ def _reservation_region_constraints(
882881 claims : dict [WorkerId , ReservationClaim ],
883882 queries : ControllerDB ,
884883 health : WorkerHealthTracker ,
885- committed : WorkerCommitTracker ,
886884 existing_constraints : list [Constraint ],
887885) -> list [Constraint ]:
888886 """Derive region constraints from claimed reservation workers.
@@ -899,7 +897,7 @@ def _reservation_region_constraints(
899897 claimed_worker_ids = {worker_id for worker_id , claim in claims .items () if claim .job_id == job_id_wire }
900898 workers_by_id = {
901899 worker .worker_id : worker
902- for worker in healthy_active_workers_with_attributes (queries , health , committed )
900+ for worker in healthy_active_workers_with_attributes (queries , health )
903901 if worker .worker_id in claimed_worker_ids
904902 }
905903 regions : set [str ] = set ()
@@ -1632,13 +1630,13 @@ def _profile_all_running_tasks(self) -> None:
16321630 Memory profiling via memray is currently disabled because memray attach
16331631 has been triggering segfaults in target processes.
16341632 """
1635- workers = healthy_active_workers_with_attributes (self ._db , self ._health , self . _store . committed )
1633+ workers = healthy_active_workers_with_attributes (self ._db , self ._health )
16361634 if not workers :
16371635 return
16381636 workers_by_id = {w .worker_id : w for w in workers }
16391637 tasks_by_worker = running_tasks_by_worker (self ._db , set (workers_by_id .keys ()))
16401638
1641- profile_targets : list [tuple [JobName , WorkerRow ]] = []
1639+ profile_targets : list [tuple [JobName , SchedulableWorker ]] = []
16421640 for worker_id , task_ids in tasks_by_worker .items ():
16431641 worker = workers_by_id [worker_id ]
16441642 for task_id in task_ids :
@@ -1656,7 +1654,7 @@ def _profile_all_running_tasks(self) -> None:
16561654
16571655 def _dispatch_profiles (
16581656 self ,
1659- targets : list [tuple [JobName , WorkerRow ]],
1657+ targets : list [tuple [JobName , SchedulableWorker ]],
16601658 profile_type : job_pb2 .ProfileType ,
16611659 profile_kind : str ,
16621660 duration : int ,
@@ -1674,7 +1672,7 @@ def _dispatch_profiles(
16741672 def _capture_one_profile (
16751673 self ,
16761674 task_id : JobName ,
1677- worker : WorkerRow ,
1675+ worker : SchedulableWorker ,
16781676 profile_type : job_pb2 .ProfileType ,
16791677 profile_kind : str ,
16801678 duration : int ,
@@ -1776,7 +1774,7 @@ def _claim_workers_for_reservations(self, claims: dict[WorkerId, ReservationClai
17761774 persisted = True
17771775 claimed_entries : set [tuple [str , int ]] = {(c .job_id , c .entry_idx ) for c in claims .values ()}
17781776 claimed_worker_ids : set [WorkerId ] = set (claims .keys ())
1779- all_workers = healthy_active_workers_with_attributes (self ._db , self ._health , self . _store . committed )
1777+ all_workers = healthy_active_workers_with_attributes (self ._db , self ._health )
17801778 changed = False
17811779
17821780 reservable_states = (
@@ -1794,8 +1792,6 @@ def _claim_workers_for_reservations(self, claims: dict[WorkerId, ReservationClai
17941792 for worker in all_workers :
17951793 if worker .worker_id in claimed_worker_ids :
17961794 continue
1797- if not worker .healthy :
1798- continue
17991795 if not _worker_matches_reservation_entry (worker , res_entry ):
18001796 continue
18011797
@@ -1914,7 +1910,7 @@ def _read_scheduling_state(self) -> _SchedulingStateRead:
19141910 timer = Timer ()
19151911 with slow_log (logger , "scheduling state reads" , threshold_ms = 50 ):
19161912 pending_tasks = _schedulable_tasks (self ._db )
1917- workers = healthy_active_workers_with_attributes (self ._db , self ._health , self . _store . committed )
1913+ workers = healthy_active_workers_with_attributes (self ._db , self ._health )
19181914 return _SchedulingStateRead (
19191915 pending_tasks = pending_tasks ,
19201916 workers = workers ,
@@ -2238,7 +2234,7 @@ def _mark_task_unschedulable(self, task: TaskRow) -> None:
22382234 if result .tasks_to_kill :
22392235 self .kill_tasks_on_workers (result .tasks_to_kill , result .task_kill_workers )
22402236
2241- def create_scheduling_context (self , workers : list [WorkerRow ]) -> SchedulingContext :
2237+ def create_scheduling_context (self , workers : list [SchedulableWorker ]) -> SchedulingContext :
22422238 """Create a scheduling context for the given workers."""
22432239 building_counts = _building_counts (self ._db , workers )
22442240 return self ._scheduler .create_scheduling_context (
@@ -2376,7 +2372,7 @@ def _stop_tasks_direct(
23762372
23772373 def _get_active_worker_addresses (self ) -> list [tuple [WorkerId , str | None ]]:
23782374 """Get healthy active workers as (worker_id, address) tuples for ping."""
2379- workers = healthy_active_workers_with_attributes (self ._db , self ._health , self . _store . committed )
2375+ workers = healthy_active_workers_with_attributes (self ._db , self ._health )
23802376 return [(w .worker_id , w .address ) for w in workers ]
23812377
23822378 def _run_ping_loop (self , stop_event : threading .Event ) -> None :
@@ -2531,7 +2527,7 @@ def _run_autoscaler_once(self) -> None:
25312527
25322528 worker_status_map = self ._build_worker_status_map ()
25332529 self ._autoscaler .refresh (worker_status_map )
2534- workers = healthy_active_workers_with_attributes (self ._db , self ._health , self . _store . committed )
2530+ workers = healthy_active_workers_with_attributes (self ._db , self ._health )
25352531 demand_entries = compute_demand_entries (
25362532 self ._db ,
25372533 self ._scheduler ,
0 commit comments