Skip to content

Commit a9109a1

Browse files
[iris] Reject jobs with unsatisfiable routing constraints at submit time
Adds check_routing_feasibility() to the autoscaler, called from launch_job for all jobs. Produces actionable diagnostics: device type/variant mismatch, region/zone typo suggestions via fuzzy matching, and zone-vs-region confusion detection (e.g. passing a zone value as a region constraint). Soft constraints are excluded from the check so they don't prevent submission. Fixes #4679
1 parent f8d4889 commit a9109a1

File tree

6 files changed

+223
-19
lines changed

6 files changed

+223
-19
lines changed

lib/iris/src/iris/cluster/controller/autoscaler/routing.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,20 @@
77

88
import difflib
99
import math
10+
import re
11+
from collections.abc import Sequence
1012
from dataclasses import dataclass
1113

1214
from iris.cluster.constraints import (
1315
ConstraintIndex,
1416
DeviceType,
17+
extract_placement_requirements,
1518
get_device_type_enum,
1619
routing_constraints,
1720
soft_constraint_score,
1821
split_hard_soft,
1922
)
23+
from iris.rpc import job_pb2
2024
from iris.cluster.controller.autoscaler.models import (
2125
AdditiveReq,
2226
DemandEntry,
@@ -270,6 +274,77 @@ def _diagnose_no_matching_group(entry: DemandEntry, groups: list[ScalingGroup])
270274
)
271275

272276

277+
# GCP zones end with -{single letter}, e.g. us-central1-a.
278+
_ZONE_PATTERN = re.compile(r".+-[a-z]$")
279+
280+
281+
def _looks_like_zone(value: str) -> bool:
282+
return bool(_ZONE_PATTERN.fullmatch(value))
283+
284+
285+
def diagnose_unsatisfiable_constraints(
286+
constraints: Sequence[job_pb2.Constraint],
287+
groups: list[ScalingGroup],
288+
) -> str:
289+
"""Produce a user-facing error when no scaling group can satisfy constraints.
290+
291+
Performs layered diagnosis (device, preemptible, zone, region) and
292+
detects zone/region value confusion.
293+
"""
294+
normalized = extract_placement_requirements(constraints)
295+
device_type = normalized.device_type or DeviceType.CPU
296+
297+
device_matches = [g for g in groups if g.matches_device_requirement(device_type, normalized.device_variants)]
298+
variants_str = _format_variants(normalized.device_variants)
299+
if not device_matches:
300+
available = ", ".join(g.name for g in groups)
301+
return f"no scaling group provides device {device_type.value}:{variants_str} (available: {available})"
302+
303+
if normalized.preemptible is not None:
304+
preempt_matches = [
305+
g
306+
for g in device_matches
307+
if (g.config.resources.capacity_type == config_pb2.CAPACITY_TYPE_PREEMPTIBLE) == normalized.preemptible
308+
]
309+
if not preempt_matches:
310+
want = "preemptible" if normalized.preemptible else "non-preemptible"
311+
return f"no {want} group provides device {device_type.value}:{variants_str}"
312+
device_matches = preempt_matches
313+
314+
if normalized.required_zones:
315+
available_zones = {g.zone for g in device_matches} - {None}
316+
available_regions = {g.region for g in device_matches} - {None}
317+
requested = sorted(normalized.required_zones)
318+
parts = [f"no groups in zone {', '.join(requested)}"]
319+
for z in requested:
320+
# Prioritize zone/region confusion over fuzzy match
321+
if not _looks_like_zone(z) and z in available_regions:
322+
parts.append(f"'{z}' looks like a region, not a zone; use a region constraint instead")
323+
else:
324+
close = difflib.get_close_matches(z, available_zones, n=1, cutoff=0.7)
325+
if close:
326+
parts.append(f"did you mean {close[0]}?")
327+
return "; ".join(parts)
328+
329+
if normalized.required_regions:
330+
available_regions = {g.region for g in device_matches} - {None}
331+
available_zones = {g.zone for g in device_matches} - {None}
332+
requested = sorted(normalized.required_regions)
333+
parts = [f"no groups in region {', '.join(requested)}"]
334+
for r in requested:
335+
# Prioritize zone/region confusion over fuzzy match
336+
if _looks_like_zone(r) and r in available_zones:
337+
parts.append(f"'{r}' looks like a zone, not a region; use a zone constraint instead")
338+
else:
339+
close = difflib.get_close_matches(r, available_regions, n=1, cutoff=0.7)
340+
if close:
341+
parts.append(f"did you mean {close[0]}?")
342+
return "; ".join(parts)
343+
344+
available = ", ".join(g.name for g in groups)
345+
return f"no scaling group matches constraints (available: {available})"
346+
347+
273348
def _diagnose_no_capacity(
274349
entry: DemandEntry,
275350
matching_groups: list[ScalingGroup],

lib/iris/src/iris/cluster/controller/autoscaler/runtime.py

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
RemoteWorkerHandle,
3131
SliceHandle,
3232
)
33-
from iris.cluster.constraints import ConstraintIndex, routing_constraints
33+
from iris.cluster.constraints import ConstraintIndex, routing_constraints, split_hard_soft
3434
from iris.cluster.controller.autoscaler.models import (
3535
DemandEntry,
3636
ScalingAction,
@@ -45,7 +45,7 @@
4545
load_autoscaler_checkpoint,
4646
restore_autoscaler_state,
4747
)
48-
from iris.cluster.controller.autoscaler.routing import route_demand
48+
from iris.cluster.controller.autoscaler.routing import diagnose_unsatisfiable_constraints, route_demand
4949
from iris.cluster.controller.autoscaler.scaling_group import ScalingGroup
5050
from iris.cluster.controller.autoscaler.status import routing_decision_to_proto
5151
from iris.cluster.controller.autoscaler.worker_registry import TrackedWorker, WorkerRegistry
@@ -530,6 +530,30 @@ def get_init_log(self, vm_id: str, tail: int | None = None) -> str:
530530
"""Get bootstrap log for a VM by platform worker ID."""
531531
return self._worker_registry.init_log(vm_id, tail)
532532

533+
def check_routing_feasibility(
534+
self,
535+
constraints: list[job_pb2.Constraint],
536+
) -> str | None:
537+
"""Check if any scaling group can satisfy the job's hard routing constraints.
538+
539+
Returns None if at least one group matches, or a human-readable error
540+
message explaining why no group can satisfy the constraints.
541+
"""
542+
groups = list(self._groups.values())
543+
if not groups:
544+
return None
545+
546+
group_attrs = {g.name: g.to_attributes() for g in groups}
547+
group_index = ConstraintIndex.build(group_attrs)
548+
routing_cs = routing_constraints(constraints)
549+
hard_cs, _ = split_hard_soft(routing_cs)
550+
matching_names = group_index.matching_entities(hard_cs)
551+
552+
if matching_names:
553+
return None
554+
555+
return diagnose_unsatisfiable_constraints(constraints, groups)
556+
533557
def check_coscheduling_feasibility(
534558
self,
535559
replicas: int,
@@ -543,6 +567,10 @@ def check_coscheduling_feasibility(
543567
Returns None if feasible, or a human-readable error message if no scaling
544568
group can accommodate the replica count.
545569
"""
570+
routing_error = self.check_routing_feasibility(constraints)
571+
if routing_error:
572+
return routing_error
573+
546574
groups = list(self._groups.values())
547575
if not groups:
548576
return None
@@ -553,9 +581,6 @@ def check_coscheduling_feasibility(
553581
matching_names = group_index.matching_entities(routing_cs)
554582
matching_groups = [g for g in groups if g.name in matching_names]
555583

556-
if not matching_groups:
557-
return f"no scaling group matches the job constraints; " f"available groups: {[g.name for g in groups]}"
558-
559584
if any(replicas % g.num_vms == 0 for g in matching_groups):
560585
return None
561586

lib/iris/src/iris/cluster/controller/service.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1154,21 +1154,25 @@ def launch_job(
11541154
# device-variant, etc.) replace auto-generated ones.
11551155
request = _inject_resource_constraints(request)
11561156

1157-
# Reject coscheduled jobs that can never be scheduled: if no scaling
1158-
# group has num_vms matching the replica count, the job would sit in
1159-
# the queue forever.
1160-
if request.HasField("coscheduling"):
1161-
autoscaler = self._controller.autoscaler
1162-
if autoscaler is not None:
1157+
# Reject jobs whose hard routing constraints cannot be satisfied by
1158+
# any scaling group. Without this check the job sits in the queue
1159+
# forever. For coscheduled jobs, also verify the replica count is
1160+
# compatible with a matching group's num_vms.
1161+
autoscaler = self._controller.autoscaler
1162+
if autoscaler is not None:
1163+
error = autoscaler.check_routing_feasibility(
1164+
constraints=list(request.constraints),
1165+
)
1166+
if not error and request.HasField("coscheduling"):
11631167
error = autoscaler.check_coscheduling_feasibility(
11641168
replicas=request.replicas,
11651169
constraints=list(request.constraints),
11661170
)
1167-
if error:
1168-
raise ConnectError(
1169-
Code.FAILED_PRECONDITION,
1170-
f"Job is unschedulable: {error}",
1171-
)
1171+
if error:
1172+
raise ConnectError(
1173+
Code.FAILED_PRECONDITION,
1174+
f"Job is unschedulable: {error}",
1175+
)
11721176

11731177
self._transitions.submit_job(job_id, request, Timestamp.now())
11741178
self._controller.wake()

lib/iris/src/iris/cluster/providers/local/cluster.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,9 @@ def make_local_cluster_config(max_workers: int) -> config_pb2.IrisClusterConfig:
333333
device_type=config_pb2.ACCELERATOR_TYPE_CPU,
334334
),
335335
)
336+
# Match the region that the fake GCP provider sets on local workers so
337+
# that inherited region constraints from child jobs are satisfiable.
338+
sg.worker.attributes["region"] = "local"
336339
base_config.scale_groups["local-cpu"].CopyFrom(sg)
337340

338341
return make_local_config(base_config)

lib/iris/tests/cluster/controller/test_demand_routing.py

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,13 @@
1717
)
1818
from iris.cluster.controller.autoscaler.scaling_group import ScalingGroup
1919
from iris.cluster.constraints import (
20+
Constraint,
21+
ConstraintOp,
2022
DeviceType,
2123
PlacementRequirements,
2224
WellKnownAttribute,
25+
region_constraint,
26+
zone_constraint,
2327
)
2428
from iris.rpc import config_pb2
2529
from iris.rpc import job_pb2
@@ -1332,14 +1336,103 @@ def test_infeasible_no_group_matches_constraints(self):
13321336
autoscaler = self._make_autoscaler({"gpu-group": ScalingGroup(config, make_mock_platform())})
13331337
result = autoscaler.check_coscheduling_feasibility(8, self._make_constraints())
13341338
assert result is not None
1335-
assert "no scaling group matches" in result
1339+
assert "no scaling group provides" in result
13361340

13371341
def test_no_groups_returns_none(self):
13381342
"""Returns None when there are no groups (no validation possible)."""
13391343
autoscaler = self._make_autoscaler({})
13401344
assert autoscaler.check_coscheduling_feasibility(8, []) is None
13411345

13421346

1347+
# ---------------------------------------------------------------------------
1348+
# Routing feasibility (submit-time constraint validation)
1349+
# ---------------------------------------------------------------------------
1350+
1351+
1352+
class TestCheckRoutingFeasibility:
1353+
"""Tests for Autoscaler.check_routing_feasibility()."""
1354+
1355+
def _make_autoscaler(self, groups):
1356+
from iris.cluster.controller.autoscaler import Autoscaler
1357+
1358+
return Autoscaler(
1359+
scale_groups=groups,
1360+
evaluation_interval=Duration.from_seconds(0.1),
1361+
platform=make_mock_platform(),
1362+
)
1363+
1364+
def _tpu_constraints(self, variant: str = "v5p-8") -> list[job_pb2.Constraint]:
1365+
return make_demand_entries(1, device_type=DeviceType.TPU, device_variant=variant)[0].constraints
1366+
1367+
def test_feasible_matching_group(self):
1368+
"""Returns None when a group matches the constraints."""
1369+
config = make_scale_group_config(name="tpu-group", max_slices=5, num_vms=4)
1370+
autoscaler = self._make_autoscaler({"tpu-group": ScalingGroup(config, make_mock_platform())})
1371+
assert autoscaler.check_routing_feasibility(self._tpu_constraints()) is None
1372+
1373+
def test_infeasible_wrong_device_type(self):
1374+
"""Rejects when no group has the requested device type."""
1375+
config = make_scale_group_config(
1376+
name="gpu-group", max_slices=5, num_vms=4, accelerator_type=config_pb2.ACCELERATOR_TYPE_GPU
1377+
)
1378+
autoscaler = self._make_autoscaler({"gpu-group": ScalingGroup(config, make_mock_platform())})
1379+
result = autoscaler.check_routing_feasibility(self._tpu_constraints())
1380+
assert result is not None
1381+
assert "tpu" in result
1382+
1383+
def test_infeasible_wrong_region(self):
1384+
"""Rejects when no group is in the requested region."""
1385+
config = make_scale_group_config(name="tpu-group", max_slices=5, num_vms=4, zones=["us-central1-a"])
1386+
constraints = self._tpu_constraints()
1387+
constraints.append(region_constraint(["europe-west4"]).to_proto())
1388+
autoscaler = self._make_autoscaler({"tpu-group": ScalingGroup(config, make_mock_platform())})
1389+
result = autoscaler.check_routing_feasibility(constraints)
1390+
assert result is not None
1391+
assert "region" in result
1392+
1393+
def test_infeasible_zone_used_as_region(self):
1394+
"""Detects when a zone value is specified as a region constraint."""
1395+
config = make_scale_group_config(name="tpu-group", max_slices=5, num_vms=4, zones=["us-central1-a"])
1396+
constraints = self._tpu_constraints()
1397+
# User mistakenly passes a zone value as a region constraint
1398+
constraints.append(region_constraint(["us-central1-a"]).to_proto())
1399+
autoscaler = self._make_autoscaler({"tpu-group": ScalingGroup(config, make_mock_platform())})
1400+
result = autoscaler.check_routing_feasibility(constraints)
1401+
assert result is not None
1402+
assert "looks like a zone" in result
1403+
1404+
def test_infeasible_region_used_as_zone(self):
1405+
"""Detects when a region value is specified as a zone constraint."""
1406+
config = make_scale_group_config(name="tpu-group", max_slices=5, num_vms=4, zones=["us-central1-a"])
1407+
constraints = self._tpu_constraints()
1408+
# User mistakenly passes a region value as a zone constraint
1409+
constraints.append(zone_constraint("us-central1").to_proto())
1410+
autoscaler = self._make_autoscaler({"tpu-group": ScalingGroup(config, make_mock_platform())})
1411+
result = autoscaler.check_routing_feasibility(constraints)
1412+
assert result is not None
1413+
assert "looks like a region" in result
1414+
1415+
def test_soft_constraint_does_not_reject(self):
1416+
"""Soft constraints that don't match any group should not cause rejection."""
1417+
config = make_scale_group_config(name="tpu-group", max_slices=5, num_vms=4, zones=["us-central1-a"])
1418+
constraints = self._tpu_constraints()
1419+
# Add a soft region constraint for a region that doesn't exist
1420+
soft = Constraint(
1421+
key=WellKnownAttribute.REGION,
1422+
op=ConstraintOp.EQ,
1423+
value="europe-west4",
1424+
mode=job_pb2.CONSTRAINT_MODE_PREFERRED,
1425+
)
1426+
constraints.append(soft.to_proto())
1427+
autoscaler = self._make_autoscaler({"tpu-group": ScalingGroup(config, make_mock_platform())})
1428+
assert autoscaler.check_routing_feasibility(constraints) is None
1429+
1430+
def test_no_groups_returns_none(self):
1431+
"""Returns None when there are no groups (no validation possible)."""
1432+
autoscaler = self._make_autoscaler({})
1433+
assert autoscaler.check_routing_feasibility([]) is None
1434+
1435+
13431436
# ---------------------------------------------------------------------------
13441437
# Allocation tier blocking
13451438
# ---------------------------------------------------------------------------

lib/iris/tests/e2e/test_smoke.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -366,10 +366,14 @@ def test_dashboard_task_logs(smoke_cluster, verbose_job, smoke_page, smoke_scree
366366

367367
def test_dashboard_constraints(smoke_cluster, smoke_page, smoke_screenshot):
368368
"""Constraint chips rendered on job detail."""
369+
# Use soft constraints to avoid submit-time routing feasibility rejection;
370+
# the test only checks that constraint chips render on the dashboard.
369371
constraints = [
370-
Constraint(key="region", op=ConstraintOp.EQ, value="local"),
372+
Constraint(key="region", op=ConstraintOp.EQ, value="local", mode=job_pb2.CONSTRAINT_MODE_PREFERRED),
371373
Constraint(key="env-tag", op=ConstraintOp.EXISTS),
372-
Constraint(key="device-variant", op=ConstraintOp.IN, values=("v5p-8", "v6e-4")),
374+
Constraint(
375+
key="device-variant", op=ConstraintOp.IN, values=("v5p-8", "v6e-4"), mode=job_pb2.CONSTRAINT_MODE_PREFERRED
376+
),
373377
]
374378
with smoke_cluster.launched_job(TestJobs.quick, "smoke-constraints", constraints=constraints) as job:
375379
time.sleep(3)

0 commit comments

Comments
 (0)