1414from iris .cluster .constraints import (
1515 ConstraintIndex ,
1616 DeviceType ,
17+ PlacementRequirements ,
1718 extract_placement_requirements ,
1819 get_device_type_enum ,
1920 routing_constraints ,
@@ -228,52 +229,6 @@ def _format_variants(variants: frozenset[str] | None) -> str:
228229 return "," .join (sorted (variants ))
229230
230231
231- def _diagnose_no_matching_group (entry : DemandEntry , groups : list [ScalingGroup ]) -> str :
232- """Produce a concise, actionable reason when no group matches a demand entry."""
233-
234- normalized = entry .normalized
235- device_type = normalized .device_type or DeviceType .CPU
236- device_matches = []
237- for group in groups :
238- if group .matches_device_requirement (device_type , normalized .device_variants ):
239- device_matches .append (group )
240-
241- variants_str = _format_variants (normalized .device_variants )
242- if not device_matches :
243- return f"no_matching_group: no groups with device { device_type .value } :{ variants_str } "
244-
245- if normalized .preemptible is not None :
246- preempt_matches = [
247- group
248- for group in device_matches
249- if (group .config .resources .capacity_type == config_pb2 .CAPACITY_TYPE_PREEMPTIBLE ) == normalized .preemptible
250- ]
251- if not preempt_matches :
252- want = "preemptible" if normalized .preemptible else "non-preemptible"
253- return f"no_matching_group: no { want } groups for device { device_type .value } :{ variants_str } "
254- device_matches = preempt_matches
255-
256- if normalized .required_zones :
257- available_zones = {group .zone for group in device_matches } - {None }
258- requested = sorted (normalized .required_zones )
259- message = f"no_matching_group: no groups in zone { ', ' .join (requested )} "
260- for requested_zone in requested :
261- close = difflib .get_close_matches (requested_zone , available_zones , n = 1 , cutoff = 0.7 )
262- if close :
263- message += f" (did you mean { close [0 ]} ?)"
264- return message
265-
266- if normalized .required_regions :
267- requested = sorted (normalized .required_regions )
268- region_message = f"no_matching_group: no groups in region { ', ' .join (requested )} "
269- return region_message
270-
271- return (
272- "no_matching_group: no groups match device="
273- f"{ device_type .value } :{ _format_variants (normalized .device_variants )} "
274- )
275-
276-
277232# GCP zones end with -{single letter}, e.g. us-central1-a.
278233_ZONE_PATTERN = re .compile (r".+-[a-z]$" )
279234
@@ -282,42 +237,41 @@ def _looks_like_zone(value: str) -> bool:
282237 return bool (_ZONE_PATTERN .fullmatch (value ))
283238
284239
285- def diagnose_unsatisfiable_constraints (
286- constraints : Sequence [ job_pb2 . Constraint ] ,
287- groups : list [ScalingGroup ],
240+ def _diagnose (
241+ placement : PlacementRequirements ,
242+ groups : Sequence [ScalingGroup ],
288243) -> str :
289- """Produce a user-facing error when no scaling group can satisfy constraints .
244+ """Explain why no scaling group satisfies a placement requirement .
290245
291- Performs layered diagnosis (device, preemptible, zone, region) and
292- detects zone/region value confusion.
246+ Layered analysis (device → preemptible → zone → region) with zone/region
247+ confusion heuristics and fuzzy-match hints. Returned string has no prefix;
248+ callers prepend their own (e.g. "no_matching_group: ") when needed.
293249 """
294- normalized = extract_placement_requirements (constraints )
295- device_type = normalized .device_type or DeviceType .CPU
250+ device_type = placement .device_type or DeviceType .CPU
251+ device_matches = [g for g in groups if g .matches_device_requirement (device_type , placement .device_variants )]
252+ variants_str = _format_variants (placement .device_variants )
296253
297- device_matches = [g for g in groups if g .matches_device_requirement (device_type , normalized .device_variants )]
298- variants_str = _format_variants (normalized .device_variants )
299254 if not device_matches :
300255 available = ", " .join (g .name for g in groups )
301256 return f"no scaling group provides device { device_type .value } :{ variants_str } (available: { available } )"
302257
303- if normalized .preemptible is not None :
258+ if placement .preemptible is not None :
304259 preempt_matches = [
305260 g
306261 for g in device_matches
307- if (g .config .resources .capacity_type == config_pb2 .CAPACITY_TYPE_PREEMPTIBLE ) == normalized .preemptible
262+ if (g .config .resources .capacity_type == config_pb2 .CAPACITY_TYPE_PREEMPTIBLE ) == placement .preemptible
308263 ]
309264 if not preempt_matches :
310- want = "preemptible" if normalized .preemptible else "non-preemptible"
265+ want = "preemptible" if placement .preemptible else "non-preemptible"
311266 return f"no { want } group provides device { device_type .value } :{ variants_str } "
312267 device_matches = preempt_matches
313268
314- if normalized .required_zones :
269+ if placement .required_zones :
315270 available_zones = {g .zone for g in device_matches } - {None }
316271 available_regions = {g .region for g in device_matches } - {None }
317- requested = sorted (normalized .required_zones )
272+ requested = sorted (placement .required_zones )
318273 parts = [f"no groups in zone { ', ' .join (requested )} " ]
319274 for z in requested :
320- # Prioritize zone/region confusion over fuzzy match
321275 if not _looks_like_zone (z ) and z in available_regions :
322276 parts .append (f"'{ z } ' looks like a region, not a zone; use a region constraint instead" )
323277 else :
@@ -326,13 +280,12 @@ def diagnose_unsatisfiable_constraints(
326280 parts .append (f"did you mean { close [0 ]} ?" )
327281 return "; " .join (parts )
328282
329- if normalized .required_regions :
283+ if placement .required_regions :
330284 available_regions = {g .region for g in device_matches } - {None }
331285 available_zones = {g .zone for g in device_matches } - {None }
332- requested = sorted (normalized .required_regions )
286+ requested = sorted (placement .required_regions )
333287 parts = [f"no groups in region { ', ' .join (requested )} " ]
334288 for r in requested :
335- # Prioritize zone/region confusion over fuzzy match
336289 if _looks_like_zone (r ) and r in available_zones :
337290 parts .append (f"'{ r } ' looks like a zone, not a region; use a zone constraint instead" )
338291 else :
@@ -345,6 +298,70 @@ def diagnose_unsatisfiable_constraints(
345298 return f"no scaling group matches constraints (available: { available } )"
346299
347300
@dataclass(frozen=True)
class GroupFeasibility:
    """Outcome of the ``job_feasibility`` predicate for one job shape.

    A non-empty ``feasible`` means the job can, in principle, be scheduled;
    an autoscaler tick may still need to grow a group before capacity
    actually appears.

    ``reason`` is ``None`` whenever ``feasible`` is non-empty. When
    ``feasible`` is empty because the job was rejected, ``reason`` carries a
    user-facing explanation suitable for failing the job at submit time.
    Note that ``job_feasibility`` also returns an empty ``feasible`` with
    ``reason=None`` when it is given no groups at all, so callers should
    treat a set ``reason`` (not an empty ``feasible``) as the rejection
    signal.
    """

    # Groups whose hard routing constraints match and, for coscheduled jobs,
    # whose num_vms is compatible with the requested replica count.
    feasible: list[ScalingGroup]
    # User-facing explanation of a rejection; None when nothing was rejected.
    reason: str | None
316+
317+
def job_feasibility(
    groups: Sequence[ScalingGroup],
    constraints: Sequence[job_pb2.Constraint],
    replicas: int | None = None,
) -> GroupFeasibility:
    """Decide whether any scaling group could ever host a job of this shape.

    Only static properties are checked: hard routing constraints and, for
    coscheduled jobs, group size. Runtime availability (quota, cooldown,
    in-flight capacity) is deliberately not considered here; that is left to
    the autoscaler on each tick. The predicate is meant to gate job
    submission so that jobs which can never be scheduled fail fast.

    Args:
        groups: scaling groups to consider.
        constraints: the job's hard + soft routing constraints. Only the hard
            ones are used for matching.
        replicas: for coscheduled jobs, the required replica count; None for
            non-coscheduled jobs. When set, a group only qualifies if its
            num_vms is positive and divides replicas evenly.

    Returns:
        A GroupFeasibility whose ``feasible`` lists the qualifying groups in
        input order. When no group qualifies, ``feasible`` is empty and
        ``reason`` explains why. When ``groups`` itself is empty, ``feasible``
        is empty and ``reason`` is None.
    """
    candidates = list(groups)
    if not candidates:
        # No groups to evaluate: report an empty result without a rejection
        # reason. NOTE(review): presumably so deployments without scaling
        # groups are not rejected at submit time; confirm with callers.
        return GroupFeasibility(feasible=[], reason=None)

    # Match the hard routing constraints against each group's attributes.
    attrs_by_name = {group.name: group.to_attributes() for group in candidates}
    index = ConstraintIndex.build(attrs_by_name)
    hard_constraints, _soft = split_hard_soft(routing_constraints(constraints))
    eligible_names = index.matching_entities(hard_constraints)
    eligible = [group for group in candidates if group.name in eligible_names]

    if not eligible:
        # Nothing matched; derive a layered, user-facing explanation.
        requirements = extract_placement_requirements(constraints)
        return GroupFeasibility(feasible=[], reason=_diagnose(requirements, candidates))

    if replicas is None:
        # Not coscheduled: constraint matching alone decides feasibility.
        return GroupFeasibility(feasible=eligible, reason=None)

    # Coscheduled: replicas must split evenly across a group's VMs.
    right_sized = [group for group in eligible if group.num_vms > 0 and replicas % group.num_vms == 0]
    if right_sized:
        return GroupFeasibility(feasible=right_sized, reason=None)

    sizes = {group.name: group.num_vms for group in eligible}
    explanation = (
        f"job requires {replicas} coscheduled replicas but no matching scaling group "
        f"has a compatible size (replicas must be an exact multiple of num_vms); "
        f"matching group sizes: {sizes}"
    )
    return GroupFeasibility(feasible=[], reason=explanation)
363+
364+
348365def _diagnose_no_capacity (
349366 entry : DemandEntry ,
350367 matching_groups : list [ScalingGroup ],
@@ -493,7 +510,7 @@ def route_demand(
493510 reason = (
494511 f"tier_blocked: { pre_tier_count } matching group(s) blocked by quota-pool tier monotonicity"
495512 if pre_tier_count > 0
496- else _diagnose_no_matching_group (entry , sorted_groups )
513+ else f"no_matching_group: { _diagnose (entry . normalized , sorted_groups )} "
497514 )
498515 unmet .append (UnmetDemand (entry = entry , reason = reason ))
499516 continue
0 commit comments