Skip to content

Commit dbbfe56

Browse files
[iris] Add --preemptible/--no-preemptible flag to job run
Lets callers force scheduling on (non-)preemptible workers instead of relying on the small-CPU-only executor heuristic. The flag threads through run_iris_job() and appends a preemptible_constraint before infer_preemptible_constraint runs; the heuristic already short-circuits when any preemptible constraint is present, so the explicit flag wins.

Also extracts constraint assembly into build_job_constraints() to keep the new behavior pure and directly unit-testable without mocking the controller.

Fixes #4540

Co-authored-by: Russell Power <rjpower@users.noreply.github.com>
1 parent 9c8fcad commit dbbfe56

2 files changed

Lines changed: 108 additions & 15 deletions

File tree

lib/iris/src/iris/cli/job.py

Lines changed: 61 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
WellKnownAttribute,
3131
device_variant_constraint,
3232
infer_preemptible_constraint,
33+
preemptible_constraint,
3334
region_constraint,
3435
zone_constraint,
3536
)
@@ -532,6 +533,41 @@ def resolve_multinode_defaults(
532533
return replicas, coscheduling
533534

534535

536+
def build_job_constraints(
    resources_proto: job_pb2.ResourceSpecProto,
    tpu_variants: list[str],
    replicas: int,
    regions: tuple[str, ...] | None = None,
    zone: str | None = None,
    preemptible: bool | None = None,
) -> list[Constraint]:
    """Build the scheduling constraints attached to a submitted job.

    Constraints are appended in a fixed order: region, zone, TPU device
    variant (only when more than one variant is given), then the explicit
    preemptible choice. The executor heuristic runs last and sees everything
    appended before it.

    Args:
        resources_proto: Resource spec handed to the executor heuristic.
        tpu_variants: Candidate TPU variants; a device-variant constraint is
            added only when there is more than one.
        replicas: Replica count handed to the executor heuristic.
        regions: If non-empty, restrict the job to these regions.
        zone: If non-empty, restrict the job to this zone.
        preemptible: ``True``/``False`` adds an explicit preemptible
            constraint; ``None`` leaves the decision to the heuristic.

    Returns:
        The assembled constraint list, including the heuristic's constraint
        when it produced one.
    """
    assembled: list[Constraint] = []

    # Placement restrictions requested by the caller.
    if regions:
        assembled.append(region_constraint(list(regions)))
    if zone:
        assembled.append(zone_constraint(zone))

    # A single variant (or none) needs no device-variant constraint.
    has_alternatives = len(tpu_variants) > 1
    if has_alternatives:
        assembled.append(device_variant_constraint(tpu_variants))

    # The user's explicit choice goes in *before* the heuristic runs:
    # infer_preemptible_constraint short-circuits when a preemptible
    # constraint is already present, so --preemptible / --no-preemptible wins.
    if preemptible is not None:
        assembled.append(preemptible_constraint(preemptible))

    # Executor heuristic: small CPU-only CLI jobs (no accelerators, 1 replica,
    # CPU ≤ 0.5 cores, RAM ≤ 4 GiB) are auto-tagged as non-preemptible so
    # coordinators survive spot reclamation.
    heuristic_tag = infer_preemptible_constraint(resources_proto, replicas, assembled)
    if heuristic_tag is None:
        return assembled

    assembled.append(heuristic_tag)
    logger.info("Executor heuristic: auto-tagging job as non-preemptible")
    return assembled
569+
570+
535571
def run_iris_job(
536572
command: list[str],
537573
env_vars: dict[str, str],
@@ -553,6 +589,7 @@ def run_iris_job(
553589
user: str | None = None,
554590
reserve: tuple[str, ...] | None = None,
555591
priority: str | None = None,
592+
preemptible: bool | None = None,
556593
token_provider: TokenProvider | None = None,
557594
submit_argv: list[str] | None = None,
558595
) -> int:
@@ -565,6 +602,8 @@ def run_iris_job(
565602
regions: If provided, restrict the job to workers in these regions.
566603
zone: If provided, restrict the job to workers in this zone.
567604
reserve: Reservation specs (e.g., ("4:H100x8", "v5litepod-16")).
605+
preemptible: If True/False, force scheduling on (non-)preemptible workers
606+
and bypass the executor heuristic. If None (default), the heuristic runs.
568607
569608
Returns:
570609
Exit code: 0 for success, 1 for failure
@@ -579,22 +618,15 @@ def run_iris_job(
579618

580619
replicas, coscheduling = resolve_multinode_defaults(primary_tpu, gpu, replicas)
581620

582-
constraints: list[Constraint] = []
583-
if regions:
584-
constraints.append(region_constraint(list(regions)))
585-
if zone:
586-
constraints.append(zone_constraint(zone))
587-
if len(tpu_variants) > 1:
588-
constraints.append(device_variant_constraint(tpu_variants))
589-
590-
# Executor heuristic: small CPU-only CLI jobs (no accelerators, 1 replica,
591-
# CPU ≤ 0.5 cores, RAM ≤ 4 GiB) are auto-tagged as non-preemptible so
592-
# coordinators survive spot reclamation.
593621
resources_proto = resources.to_proto()
594-
preemptible = infer_preemptible_constraint(resources_proto, replicas, constraints)
595-
if preemptible is not None:
596-
constraints.append(preemptible)
597-
logger.info("Executor heuristic: auto-tagging job as non-preemptible")
622+
constraints = build_job_constraints(
623+
resources_proto=resources_proto,
624+
tpu_variants=tpu_variants,
625+
replicas=replicas,
626+
regions=regions,
627+
zone=zone,
628+
preemptible=preemptible,
629+
)
598630

599631
reservation: list[ReservationEntry] | None = None
600632
if reserve:
@@ -632,6 +664,8 @@ def run_iris_job(
632664
logger.info(f"Region constraint: {', '.join(regions)}")
633665
if zone:
634666
logger.info(f"Zone constraint: {zone}")
667+
if preemptible is not None:
668+
logger.info(f"Preemptible constraint: {preemptible}")
635669
if reservation:
636670
logger.info(f"Reservation: {len(reservation)} entries")
637671

@@ -831,6 +865,16 @@ def job() -> None:
831865
default=None,
832866
help="Priority band for scheduling (default: interactive). Lower bands run first; batch jobs yield to interactive.",
833867
)
868+
@click.option(
869+
"--preemptible/--no-preemptible",
870+
"preemptible",
871+
default=None,
872+
help=(
873+
"Force scheduling on preemptible (--preemptible) or non-preemptible "
874+
"(--no-preemptible) workers. Overrides the executor heuristic. "
875+
"Default: heuristic-based (small CPU-only jobs pinned to non-preemptible)."
876+
),
877+
)
834878
@click.option(
835879
"--terminate-on-exit/--no-terminate-on-exit",
836880
default=True,
@@ -858,6 +902,7 @@ def run(
858902
extra: tuple[str, ...],
859903
reserve: tuple[str, ...],
860904
priority: str | None,
905+
preemptible: bool | None,
861906
terminate_on_exit: bool,
862907
cmd: tuple[str, ...],
863908
):
@@ -907,6 +952,7 @@ def run(
907952
zone=zone,
908953
reserve=reserve or None,
909954
priority=priority,
955+
preemptible=preemptible,
910956
token_provider=ctx.obj.get("token_provider"),
911957
submit_argv=submit_argv,
912958
)

lib/iris/tests/cli/test_job.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from iris.cli.job import (
1010
_parse_tpu_alternatives,
1111
_render_job_summary_text,
12+
build_job_constraints,
1213
build_job_summary,
1314
build_resources,
1415
build_tpu_alternatives,
@@ -138,6 +139,52 @@ def test_executor_heuristic_with_region_constraint():
138139
assert preemptible.values[0].value == "false"
139140

140141

142+
# ---------------------------------------------------------------------------
143+
# build_job_constraints — --preemptible / --no-preemptible wiring (#4540)
144+
# ---------------------------------------------------------------------------
145+
146+
147+
def _preemptible_values(constraints: list[Constraint]) -> list[str]:
    """Return the first value of each preemptible-keyed constraint, in order."""
    found: list[str] = []
    for constraint in constraints:
        if constraint.key == WellKnownAttribute.PREEMPTIBLE:
            found.append(constraint.values[0].value)
    return found
149+
150+
151+
def test_build_job_constraints_preemptible_true_emits_true_constraint():
    """--preemptible emits a preemptible=true constraint on a large (non-executor) job.

    Uses a 4-core / 16GB job so this case is distinct from
    ``test_build_job_constraints_preemptible_true_overrides_heuristic``, which
    covers the small-CPU job that the executor heuristic would otherwise tag.
    """
    resources_proto = build_resources(tpu=None, gpu=None, cpu=4.0, memory="16GB", disk="5GB").to_proto()

    constraints = build_job_constraints(resources_proto, tpu_variants=[], replicas=1, preemptible=True)

    # Exactly one preemptible constraint, carrying the user's explicit choice.
    assert _preemptible_values(constraints) == ["true"]
158+
159+
160+
def test_build_job_constraints_preemptible_false_emits_false_constraint():
    """--no-preemptible yields preemptible=false even on a large (non-executor) job."""
    large_job = build_resources(tpu=None, gpu=None, cpu=4.0, memory="16GB", disk="5GB")

    result = build_job_constraints(
        large_job.to_proto(),
        tpu_variants=[],
        replicas=1,
        preemptible=False,
    )

    assert _preemptible_values(result) == ["false"]
167+
168+
169+
def test_build_job_constraints_preemptible_none_runs_heuristic():
    """With no explicit flag (None), a small CPU job still gets the heuristic's tag."""
    small_job = build_resources(tpu=None, gpu=None, cpu=0.5, memory="1GB", disk="5GB")

    result = build_job_constraints(
        small_job.to_proto(),
        tpu_variants=[],
        replicas=1,
        preemptible=None,
    )

    # The executor heuristic supplies the non-preemptible constraint.
    assert _preemptible_values(result) == ["false"]
176+
177+
178+
def test_build_job_constraints_preemptible_true_overrides_heuristic():
    """--preemptible beats the heuristic that would tag a small CPU job non-preemptible."""
    small_job = build_resources(tpu=None, gpu=None, cpu=0.5, memory="1GB", disk="5GB")

    result = build_job_constraints(
        small_job.to_proto(),
        tpu_variants=[],
        replicas=1,
        preemptible=True,
    )

    # A single preemptible constraint survives, and it is the user's choice.
    assert _preemptible_values(result) == ["true"]
186+
187+
141188
# --tpu multi-variant parsing
142189
# ---------------------------------------------------------------------------
143190

0 commit comments

Comments
 (0)