3030 WellKnownAttribute ,
3131 device_variant_constraint ,
3232 infer_preemptible_constraint ,
33+ preemptible_constraint ,
3334 region_constraint ,
3435 zone_constraint ,
3536)
@@ -532,6 +533,41 @@ def resolve_multinode_defaults(
532533 return replicas , coscheduling
533534
534535
def build_job_constraints(
    resources_proto: job_pb2.ResourceSpecProto,
    tpu_variants: list[str],
    replicas: int,
    regions: tuple[str, ...] | None = None,
    zone: str | None = None,
    preemptible: bool | None = None,
) -> list[Constraint]:
    """Collect the scheduling constraints for a job about to be submitted.

    Constraints are emitted in a fixed order: region, zone, device variant,
    the caller's explicit preemptible choice, and finally whatever the
    executor heuristic infers.

    The explicit ``preemptible`` value is appended *before* the heuristic is
    consulted on purpose: ``infer_preemptible_constraint`` short-circuits
    when a preemptible constraint is already in the list, so the caller's
    choice always takes precedence over the inferred one.

    Args:
        resources_proto: Resource spec of the job, handed to the heuristic.
        tpu_variants: Candidate TPU variants; a device-variant constraint is
            added only when more than one is given.
        replicas: Replica count of the job, handed to the heuristic.
        regions: If non-empty, restrict the job to these regions.
        zone: If non-empty, restrict the job to this zone.
        preemptible: ``True``/``False`` pins the job to (non-)preemptible
            workers; ``None`` leaves the decision to the heuristic.

    Returns:
        The assembled list of constraints (possibly empty).
    """
    job_constraints: list[Constraint] = []

    # Placement restrictions requested by the caller.
    if regions:
        job_constraints.append(region_constraint(list(regions)))
    if zone:
        job_constraints.append(zone_constraint(zone))

    # A single variant needs no constraint; only a choice between several does.
    has_variant_choice = len(tpu_variants) > 1
    if has_variant_choice:
        job_constraints.append(device_variant_constraint(tpu_variants))

    # Must precede the heuristic call so an explicit choice suppresses it.
    user_pinned_preemptibility = preemptible is not None
    if user_pinned_preemptibility:
        job_constraints.append(preemptible_constraint(preemptible))

    # Executor heuristic: small CPU-only CLI jobs (no accelerators, 1 replica,
    # CPU ≤ 0.5 cores, RAM ≤ 4 GiB) get tagged non-preemptible so that
    # coordinators survive spot reclamation. It does not fire when
    # --preemptible / --no-preemptible was supplied.
    heuristic_tag = infer_preemptible_constraint(resources_proto, replicas, job_constraints)
    if heuristic_tag is None:
        return job_constraints

    job_constraints.append(heuristic_tag)
    logger.info("Executor heuristic: auto-tagging job as non-preemptible")
    return job_constraints
569+
570+
535571def run_iris_job (
536572 command : list [str ],
537573 env_vars : dict [str , str ],
@@ -553,6 +589,7 @@ def run_iris_job(
553589 user : str | None = None ,
554590 reserve : tuple [str , ...] | None = None ,
555591 priority : str | None = None ,
592+ preemptible : bool | None = None ,
556593 token_provider : TokenProvider | None = None ,
557594 submit_argv : list [str ] | None = None ,
558595) -> int :
@@ -565,6 +602,8 @@ def run_iris_job(
565602 regions: If provided, restrict the job to workers in these regions.
566603 zone: If provided, restrict the job to workers in this zone.
567604 reserve: Reservation specs (e.g., ("4:H100x8", "v5litepod-16")).
605+ preemptible: If True/False, force scheduling on (non-)preemptible workers
606+ and bypass the executor heuristic. If None (default), the heuristic runs.
568607
569608 Returns:
570609 Exit code: 0 for success, 1 for failure
@@ -579,22 +618,15 @@ def run_iris_job(
579618
580619 replicas , coscheduling = resolve_multinode_defaults (primary_tpu , gpu , replicas )
581620
582- constraints : list [Constraint ] = []
583- if regions :
584- constraints .append (region_constraint (list (regions )))
585- if zone :
586- constraints .append (zone_constraint (zone ))
587- if len (tpu_variants ) > 1 :
588- constraints .append (device_variant_constraint (tpu_variants ))
589-
590- # Executor heuristic: small CPU-only CLI jobs (no accelerators, 1 replica,
591- # CPU ≤ 0.5 cores, RAM ≤ 4 GiB) are auto-tagged as non-preemptible so
592- # coordinators survive spot reclamation.
593621 resources_proto = resources .to_proto ()
594- preemptible = infer_preemptible_constraint (resources_proto , replicas , constraints )
595- if preemptible is not None :
596- constraints .append (preemptible )
597- logger .info ("Executor heuristic: auto-tagging job as non-preemptible" )
622+ constraints = build_job_constraints (
623+ resources_proto = resources_proto ,
624+ tpu_variants = tpu_variants ,
625+ replicas = replicas ,
626+ regions = regions ,
627+ zone = zone ,
628+ preemptible = preemptible ,
629+ )
598630
599631 reservation : list [ReservationEntry ] | None = None
600632 if reserve :
@@ -632,6 +664,8 @@ def run_iris_job(
632664 logger .info (f"Region constraint: { ', ' .join (regions )} " )
633665 if zone :
634666 logger .info (f"Zone constraint: { zone } " )
667+ if preemptible is not None :
668+ logger .info (f"Preemptible constraint: { preemptible } " )
635669 if reservation :
636670 logger .info (f"Reservation: { len (reservation )} entries" )
637671
@@ -831,6 +865,16 @@ def job() -> None:
831865 default = None ,
832866 help = "Priority band for scheduling (default: interactive). Lower bands run first; batch jobs yield to interactive." ,
833867)
868+ @click .option (
869+ "--preemptible/--no-preemptible" ,
870+ "preemptible" ,
871+ default = None ,
872+ help = (
873+ "Force scheduling on preemptible (--preemptible) or non-preemptible "
874+ "(--no-preemptible) workers. Overrides the executor heuristic. "
875+ "Default: heuristic-based (small CPU-only jobs pinned to non-preemptible)."
876+ ),
877+ )
834878@click .option (
835879 "--terminate-on-exit/--no-terminate-on-exit" ,
836880 default = True ,
@@ -858,6 +902,7 @@ def run(
858902 extra : tuple [str , ...],
859903 reserve : tuple [str , ...],
860904 priority : str | None ,
905+ preemptible : bool | None ,
861906 terminate_on_exit : bool ,
862907 cmd : tuple [str , ...],
863908):
@@ -907,6 +952,7 @@ def run(
907952 zone = zone ,
908953 reserve = reserve or None ,
909954 priority = priority ,
955+ preemptible = preemptible ,
910956 token_provider = ctx .obj .get ("token_provider" ),
911957 submit_argv = submit_argv ,
912958 )
0 commit comments