@@ -2236,9 +2236,13 @@ def _run_ping_loop(self, stop_event: threading.Event) -> None:
22362236 worker. This is safe: fail_workers_batch, on_worker_failed, and
22372237 terminate_slices_for_workers are all idempotent.
22382238 """
2239- limiter = RateLimiter (interval_seconds = self ._config .heartbeat_interval .to_seconds ())
2239+ ping_interval_s = self ._config .heartbeat_interval .to_seconds ()
2240+ limiter = RateLimiter (interval_seconds = ping_interval_s )
22402241 ping_failures : dict [str , int ] = {}
22412242 threshold = self ._config .heartbeat_failure_threshold
2243+ # Refresh resource snapshots roughly every 60s (rounded to a whole number of ping cycles, minimum one); other cycles only record liveness.
2244+ resource_update_every = max (1 , round (60.0 / ping_interval_s ))
2245+ cycle = 0
22422246
22432247 while not stop_event .is_set ():
22442248 if not limiter .wait (cancel = stop_event ):
@@ -2249,8 +2253,11 @@ def _run_ping_loop(self, stop_event: threading.Event) -> None:
22492253 self ._reap_stale_workers ()
22502254 workers = self ._get_active_worker_addresses ()
22512255 results = self ._provider .ping_workers (workers )
2256+ update_resources = cycle % resource_update_every == 0
2257+ cycle += 1
22522258
22532259 dead_workers : list [str ] = []
2260+ liveness_ids : list [WorkerId ] = []
22542261 for result in results :
22552262 wid_str = str (result .worker_id )
22562263 if result .error is not None :
@@ -2264,8 +2271,13 @@ def _run_ping_loop(self, stop_event: threading.Event) -> None:
22642271 )
22652272 else :
22662273 ping_failures .pop (wid_str , None )
2267- if result .resource_snapshot :
2274+ if update_resources and result .resource_snapshot :
22682275 self ._transitions .update_worker_ping_success (result .worker_id , result .resource_snapshot )
2276+ else :
2277+ liveness_ids .append (result .worker_id )
2278+
2279+ if liveness_ids :
2280+ self ._transitions .touch_worker_liveness (liveness_ids )
22692281
22702282 if dead_workers :
22712283 failure_result = self ._transitions .fail_workers_batch (
@@ -2302,10 +2314,10 @@ def _run_ping_loop(self, stop_event: threading.Event) -> None:
23022314 def _run_poll_loop (self , stop_event : threading .Event ) -> None :
23032315 """Periodic full-state reconciliation for split heartbeat mode.
23042316
2305- Polls all workers via PollTasks every 30s and feeds results into the
2317+ Polls all workers via PollTasks every 60s and feeds results into the
23062318 task-updater queue for batched application.
23072319 """
2308- limiter = RateLimiter (interval_seconds = 30 .0 )
2320+ limiter = RateLimiter (interval_seconds = 60 .0 )
23092321 while not stop_event .is_set ():
23102322 if not limiter .wait (cancel = stop_event ):
23112323 break
0 commit comments