logger = logging.getLogger(__name__)

HEAD_NODE_RESOURCE_LABEL = "node:__internal_head__"
# Label key the cluster autoscaler uses to bucket nodes by subcluster.
# Hardcoded so all components agree without per-Dataset configuration.
SUBCLUSTER_LABEL_KEY = "__subcluster__"
# Sentinel for "no subcluster": used both as the fallback when a node has
# no ``SUBCLUSTER_LABEL_KEY`` label and as the bucket key for such
# unlabeled nodes in ``_cluster_node_resources``.
DEFAULT_SUBCLUSTER: Optional[str] = None
28+
2229
2330RAY_DATA_AUTOSCALING_COORDINATOR_LOG_TRACEBACK = env_bool (
2431 "RAY_DATA_AUTOSCALING_COORDINATOR_LOG_TRACEBACK" , True
@@ -71,8 +78,12 @@ def __init__(
7178 self ,
7279 requester_id : str ,
7380 autoscaling_coordinator_actor = None , # For testing only: injects an actor instead of using the shared named singleton.
81+ subcluster_selector : Optional [Dict [str , str ]] = None ,
7482 ):
7583 self ._requester_id = requester_id
84+ # Label selector keyed by ``SUBCLUSTER_LABEL_KEY`` pinning this
85+ # requester to a single subcluster.
86+ self ._subcluster_selector = subcluster_selector
7687 self ._cached_allocated_resources : List [ResourceDict ] = []
7788 # In-flight get_allocated_resources ref, or None if no request is pending.
7889 self ._pending_allocated_resources : Optional [ray .ObjectRef ] = None
@@ -83,7 +94,7 @@ def __init__(
8394
@functools.cached_property
def _autoscaling_coordinator(self):
    """Return the autoscaling coordinator, obtaining it on first access.

    Declared as a ``cached_property`` so that
    ``get_or_create_autoscaling_coordinator()`` is only called when the
    attribute is first read, which avoids creating the actor in
    ``__init__``.
    """
    return get_or_create_autoscaling_coordinator()
8899
89100 def request_resources (
@@ -105,6 +116,7 @@ def request_resources(
105116 request_remaining = request_remaining ,
106117 priority = priority ,
107118 label_selectors = label_selectors ,
119+ subcluster_selector = self ._subcluster_selector ,
108120 )
109121
110122 def cancel_request (self ) -> None :
@@ -190,7 +202,11 @@ def __init__(
190202 self ._get_cluster_nodes = get_cluster_nodes
191203
192204 self ._ongoing_reqs : Dict [str , OngoingRequest ] = {}
193- self ._cluster_node_resources : List [ResourceDict ] = []
205+ # Map from requester id to its subcluster selector.
206+ self ._subcluster_selectors : Dict [str , Optional [Dict [str , str ]]] = {}
207+ # Node resources bucketed by their ``SUBCLUSTER_LABEL_KEY`` value.
208+ # Nodes without the key fall under ``DEFAULT_SUBCLUSTER``.
209+ self ._cluster_node_resources : Dict [Optional [str ], List [ResourceDict ]] = {}
194210 # Lock for thread-safe access to shared state from the background
195211 self ._lock = threading .Lock ()
196212 self ._update_cluster_node_resources ()
@@ -223,12 +239,15 @@ def request_resources(
223239 request_remaining : bool = False ,
224240 priority : ResourceRequestPriority = ResourceRequestPriority .MEDIUM ,
225241 label_selectors : Optional [List [Dict [str , str ]]] = None ,
242+ subcluster_selector : Optional [Dict [str , str ]] = None ,
226243 ) -> None :
227244 logger .debug (
228- "Received request from %s: %s (label_selectors=%s)." ,
245+ "Received request from %s: %s "
246+ "(label_selectors=%s, subcluster_selector=%s)." ,
229247 requester_id ,
230248 resources ,
231249 label_selectors ,
250+ subcluster_selector ,
232251 )
233252 if label_selectors is None :
234253 label_selectors = [{} for _ in resources ]
@@ -237,6 +256,20 @@ def request_resources(
237256 f"label_selectors length ({ len (label_selectors )} ) must match "
238257 f"resources length ({ len (resources )} )."
239258 )
259+ if subcluster_selector and label_selectors :
260+ req_subcluster = subcluster_selector .get (SUBCLUSTER_LABEL_KEY )
261+ for i , sel in enumerate (label_selectors ):
262+ bundle_subcluster = sel .get (SUBCLUSTER_LABEL_KEY )
263+ if (
264+ bundle_subcluster is not None
265+ and bundle_subcluster != req_subcluster
266+ ):
267+ raise ValueError (
268+ f"Bundle { i } label_selector targets subcluster "
269+ f"{ bundle_subcluster !r} , but requester is registered to "
270+ f"{ req_subcluster !r} . Per-bundle cross-subcluster "
271+ f"allocation is not supported."
272+ )
240273 with self ._lock :
241274 now = self ._get_current_time ()
242275 request_updated = False
@@ -248,6 +281,15 @@ def request_resources(
248281 )
249282 if priority .value != old_req .priority :
250283 raise ValueError ("Cannot change priority of an ongoing request." )
284+ if (
285+ requester_id in self ._subcluster_selectors
286+ and self ._subcluster_selectors [requester_id ] != subcluster_selector
287+ ):
288+ raise ValueError (
289+ "Cannot change subcluster_selector of an ongoing request "
290+ f"from { self ._subcluster_selectors [requester_id ]!r} to "
291+ f"{ subcluster_selector !r} ."
292+ )
251293
252294 request_updated = (
253295 resources != old_req .requested_resources
@@ -267,6 +309,9 @@ def request_resources(
267309 expiration_time = now + expire_after_s ,
268310 allocated_resources = [],
269311 )
312+ # Write subcluster after all validations so a rejected call
313+ # never leaves the registry on a new subcluster.
314+ self ._subcluster_selectors [requester_id ] = subcluster_selector
270315 if request_updated :
271316 # If the request has updated, immediately send
272317 # a new request and reallocate resources.
@@ -282,25 +327,38 @@ def cancel_request(
282327 if requester_id not in self ._ongoing_reqs :
283328 return
284329 del self ._ongoing_reqs [requester_id ]
330+ self ._subcluster_selectors .pop (requester_id , None )
285331 self ._merge_and_send_requests ()
286332 self ._reallocate_resources ()
287333
288334 def _purge_expired_requests (self ):
289335 now = self ._get_current_time ()
290- self . _ongoing_reqs = {
336+ live = {
291337 requester_id : req
292338 for requester_id , req in self ._ongoing_reqs .items ()
293339 if req .expiration_time > now
294340 }
341+ for expired_id in self ._ongoing_reqs .keys () - live .keys ():
342+ self ._subcluster_selectors .pop (expired_id , None )
343+ self ._ongoing_reqs = live
295344
296345 def _merge_and_send_requests (self ):
297- """Merge requests and send them to Ray Autoscaler."""
346+ """Merge requests and send them to Ray Autoscaler.
347+
348+ Each bundle's forwarded selector is the union of its per-bundle
349+ ``requested_label_selectors`` entry and the requester's
350+ ``subcluster_selector``. The subcluster pin wins on key conflict,
351+ so the autoscaler always sees the correct subcluster regardless
352+ of what the per-bundle selectors contain.
353+ """
298354 self ._purge_expired_requests ()
299355 merged_req : List [ResourceDict ] = []
300356 merged_selectors : List [Dict [str , str ]] = []
301- for req in self ._ongoing_reqs .values ():
357+ for requester_id , req in self ._ongoing_reqs .items ():
302358 merged_req .extend (req .requested_resources )
303- merged_selectors .extend (req .requested_label_selectors )
359+ subcluster_selector = self ._subcluster_selectors .get (requester_id ) or {}
360+ for per_bundle in req .requested_label_selectors :
361+ merged_selectors .append ({** per_bundle , ** subcluster_selector })
304362 if any (merged_selectors ):
305363 self ._send_resources_request (merged_req , label_selectors = merged_selectors )
306364 else :
@@ -324,7 +382,7 @@ def _maybe_subtract_resources(self, res1: ResourceDict, res2: ResourceDict) -> b
324382 return True
325383
326384 def _update_cluster_node_resources (self ) -> bool :
327- """Update cluster's total resources. Return True if changed."""
385+ """Update cluster resources bucketed by subcluster . Return True if changed."""
328386
329387 def _is_node_eligible (node ):
330388 # Exclude dead nodes.
@@ -341,47 +399,69 @@ def _is_node_eligible(node):
341399
342400 nodes = list (filter (_is_node_eligible , self ._get_cluster_nodes ()))
343401 nodes = sorted (nodes , key = lambda node : node .get ("NodeID" , "" ))
344- cluster_node_resources = [node ["Resources" ] for node in nodes ]
402+ cluster_node_resources : Dict [Optional [str ], List [ResourceDict ]] = {}
403+ for node in nodes :
404+ # Safeguard against case where the value of Labels is None.
405+ labels = node .get ("Labels" ) or {}
406+ subcluster = labels .get (SUBCLUSTER_LABEL_KEY , DEFAULT_SUBCLUSTER )
407+ cluster_node_resources .setdefault (subcluster , []).append (node ["Resources" ])
345408 if cluster_node_resources == self ._cluster_node_resources :
346409 return False
347- else :
348- logger .debug ("Cluster resources updated: %s." , cluster_node_resources )
349- self ._cluster_node_resources = cluster_node_resources
350- return True
410+ logger .debug ("Cluster resources updated: %s." , cluster_node_resources )
411+ self ._cluster_node_resources = cluster_node_resources
412+ return True
351413
352414 def _reallocate_resources (self ):
353- """Reallocate cluster resources."""
415+ """Reallocate cluster resources.
416+
417+ Each requester's subcluster comes from its ``subcluster_selector``.
418+ A requester without one is eligible only for the ``None`` bucket.
419+ """
354420 now = self ._get_current_time ()
355- cluster_node_resources = copy .deepcopy (self ._cluster_node_resources )
356- ongoing_reqs = sorted (
357- [req for req in self ._ongoing_reqs .values () if req .expiration_time >= now ]
421+ cluster_node_resources : Dict [Optional [str ], List [ResourceDict ]] = copy .deepcopy (
422+ self ._cluster_node_resources
358423 )
359- # Allocate resources to ongoing requests.
424+ live_items = [
425+ (req_id , req )
426+ for req_id , req in self ._ongoing_reqs .items ()
427+ if req .expiration_time >= now
428+ ]
429+ live_items .sort (key = lambda item : item [1 ])
430+
431+ def _subcluster_of (requester_id : str ) -> Optional [str ]:
432+ selector = self ._subcluster_selectors .get (requester_id )
433+ return (selector or {}).get (SUBCLUSTER_LABEL_KEY , DEFAULT_SUBCLUSTER )
434+
360435 # TODO(hchen): Optimize the following triple loop.
361- for ongoing_req in ongoing_reqs :
436+ for requester_id , ongoing_req in live_items :
362437 ongoing_req .allocated_resources = []
363- for req in ongoing_req .requested_resources :
364- for node_resource in cluster_node_resources :
365- if self ._maybe_subtract_resources (node_resource , req ):
366- ongoing_req .allocated_resources .append (req )
438+ subcluster = _subcluster_of (requester_id )
439+ for bundle in ongoing_req .requested_resources :
440+ for node_resource in cluster_node_resources .get (subcluster , []):
441+ if self ._maybe_subtract_resources (node_resource , bundle ):
442+ ongoing_req .allocated_resources .append (bundle )
367443 break
368- # Allocate remaining resources.
369- # NOTE: to handle the case where multiple datasets are running concurrently,
370- # we divide remaining resources equally to all requesters with `request_remaining=True` .
371- remaining_resource_requesters = [
372- req for req in ongoing_reqs if req .request_remaining
444+
445+ # Allocate remaining resources. Multiple concurrent requesters in
446+ # the same subcluster split that subcluster's leftovers equally .
447+ remaining_items = [
448+ ( req_id , req ) for req_id , req in live_items if req .request_remaining
373449 ]
374- num_remaining_requesters = len (remaining_resource_requesters )
375- if num_remaining_requesters > 0 :
376- for node_resource in cluster_node_resources :
377- # Divide remaining resources equally among requesters.
378- # NOTE: Integer division may leave some resources unallocated.
379- divided_resource = {
380- k : v // num_remaining_requesters for k , v in node_resource .items ()
381- }
382- for ongoing_req in remaining_resource_requesters :
383- if any (v > 0 for v in divided_resource .values ()):
384- ongoing_req .allocated_resources .append (divided_resource )
450+ for subcluster , node_resources in cluster_node_resources .items ():
451+ eligible = [
452+ req
453+ for req_id , req in remaining_items
454+ if _subcluster_of (req_id ) == subcluster
455+ ]
456+ if not eligible :
457+ continue
458+ for node_resource in node_resources :
459+ # Integer division may leave some resources unallocated.
460+ divided = {k : v // len (eligible ) for k , v in node_resource .items ()}
461+ if not any (v > 0 for v in divided .values ()):
462+ continue
463+ for r in eligible :
464+ r .allocated_resources .append (divided )
385465
386466 if logger .isEnabledFor (logging .DEBUG ):
387467 msg = "Allocated resources:\n "
0 commit comments