Skip to content

Commit 7dd69d7

Browse files
authored
[Iris] Add manual slice CLI (create-slice / delete-slice) (#5078)
Add iris cluster create-slice --scale-group <name>, which allocates a slice tagged iris-{prefix}-manual=true and bound to the running controller, and iris cluster delete-slice <slice_id> to terminate it. The autoscaler filters manual slices out of list_all_slices and reconcile, so manual slices do not count toward demand, do not get scaled down on idle, and survive iris cluster stop. Fixes #5069
1 parent 7635b2c commit 7dd69d7

8 files changed

Lines changed: 250 additions & 39 deletions

File tree

lib/iris/src/iris/cli/cluster.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@
2222
)
2323
from iris.cli.main import IRIS_CLUSTER_CONFIG_DIRS, require_controller_url, rpc_client
2424
from iris.cluster.config import IrisConfig, clear_remote_state, make_local_config
25+
from iris.cluster.controller.autoscaler.scaling_group import (
26+
build_worker_config_for_group,
27+
prepare_slice_config,
28+
)
29+
from iris.cluster.providers.types import Labels
30+
from iris.rpc import config_pb2
2531
from iris.rpc import vm_pb2
2632
from iris.rpc import job_pb2
2733
from iris.rpc import controller_pb2
@@ -409,6 +415,112 @@ def cluster_restart(ctx):
409415
ctx.invoke(cluster_start)
410416

411417

418+
@cluster.command("create-slice")
419+
@click.option("--scale-group", "scale_group_name", required=True, help="Scale group whose template to use")
420+
@click.pass_context
421+
def cluster_create_slice(ctx, scale_group_name: str):
422+
"""Create an operator-managed slice bound to the running controller.
423+
424+
Allocates a slice using the named scale group's template, tags it with
425+
``iris-{prefix}-manual=true``, and bootstraps workers so they connect to
426+
the controller. The autoscaler ignores manual slices: they don't count
427+
toward demand, won't be scaled down on idle, and survive
428+
``iris cluster stop``. Remove with ``iris cluster delete-slice``.
429+
"""
430+
config = ctx.obj.get("config")
431+
if not config:
432+
raise click.ClickException("--config is required for cluster create-slice")
433+
if config.controller.WhichOneof("controller") == "local":
434+
raise click.ClickException("create-slice is not supported for local clusters")
435+
436+
sg_config = config.scale_groups.get(scale_group_name)
437+
if sg_config is None:
438+
available = ", ".join(sorted(config.scale_groups.keys())) or "(none)"
439+
raise click.ClickException(f"Unknown scale group '{scale_group_name}'. Available: {available}")
440+
441+
# Verify the controller is reachable before creating the slice. The
442+
# returned URL may be a tunnel endpoint that's only reachable from the CLI
443+
# host; workers need the cluster-internal address instead, resolved below.
444+
require_controller_url(ctx)
445+
iris_config = IrisConfig(config)
446+
bundle = ctx.obj.get("provider_bundle") or iris_config.provider_bundle()
447+
448+
# Resolve the address workers will connect to. Prefer an explicit value in
449+
# defaults.worker.controller_address, then discover it via the provider
450+
# (e.g., GCE label lookup). Never pass the CLI-local tunnel URL here.
451+
worker_controller_address = iris_config.controller_address()
452+
if not worker_controller_address:
453+
worker_controller_address = bundle.controller.discover_controller(config.controller)
454+
455+
label_prefix = config.platform.label_prefix or "iris"
456+
labels = Labels(label_prefix)
457+
458+
slice_config = prepare_slice_config(sg_config.slice_template, sg_config, label_prefix)
459+
slice_config.labels[labels.iris_manual] = "true"
460+
461+
base_worker_config = config_pb2.WorkerConfig()
462+
base_worker_config.CopyFrom(config.defaults.worker)
463+
if not base_worker_config.controller_address:
464+
base_worker_config.controller_address = worker_controller_address
465+
base_worker_config.platform.CopyFrom(config.platform)
466+
if config.storage.remote_state_dir:
467+
base_worker_config.storage_prefix = config.storage.remote_state_dir
468+
469+
worker_config = build_worker_config_for_group(base_worker_config, sg_config)
470+
471+
click.echo(f"Creating manual slice from scale group '{scale_group_name}'...")
472+
try:
473+
handle = bundle.workers.create_slice(slice_config, worker_config=worker_config)
474+
except Exception as e:
475+
click.echo(f"Failed to create slice: {e}", err=True)
476+
raise SystemExit(1) from e
477+
478+
click.echo(f"Created manual slice: {handle.slice_id}")
479+
click.echo(f" Scale group: {handle.scale_group or scale_group_name}")
480+
click.echo(f" Zone: {handle.zone}")
481+
click.echo("Workers will register with the controller as they bootstrap.")
482+
click.echo(f"Terminate with: iris cluster delete-slice {handle.slice_id}")
483+
484+
485+
@cluster.command("delete-slice")
486+
@click.argument("slice_id")
487+
@click.pass_context
488+
def cluster_delete_slice(ctx, slice_id: str):
489+
"""Terminate an operator-managed slice created via ``create-slice``.
490+
491+
Only slices tagged ``iris-{prefix}-manual=true`` are eligible —
492+
autoscaler-managed slices must go through the autoscaler.
493+
"""
494+
config = ctx.obj.get("config")
495+
if not config:
496+
raise click.ClickException("--config is required for cluster delete-slice")
497+
if config.controller.WhichOneof("controller") == "local":
498+
raise click.ClickException("delete-slice is not supported for local clusters")
499+
500+
iris_config = IrisConfig(config)
501+
bundle = ctx.obj.get("provider_bundle") or iris_config.provider_bundle()
502+
503+
label_prefix = config.platform.label_prefix or "iris"
504+
labels = Labels(label_prefix)
505+
506+
manual_slices = bundle.workers.list_slices(zones=[], labels={labels.iris_manual: "true"})
507+
match = next((s for s in manual_slices if s.slice_id == slice_id), None)
508+
if match is None:
509+
raise click.ClickException(
510+
f"No manual slice found with id '{slice_id}'. "
511+
"List manual slices with the controller dashboard or "
512+
"`iris cluster status` to find the correct id."
513+
)
514+
515+
click.echo(f"Terminating manual slice {slice_id}...")
516+
try:
517+
match.terminate()
518+
except Exception as e:
519+
click.echo(f"Failed to terminate slice: {e}", err=True)
520+
raise SystemExit(1) from e
521+
click.echo("Terminated.")
522+
523+
412524
@cluster.command("status")
413525
@click.pass_context
414526
def cluster_status_cmd(ctx):

lib/iris/src/iris/cluster/controller/autoscaler/runtime.py

Lines changed: 11 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from collections import deque
2424
from collections.abc import Sequence
2525

26-
from iris.cluster.constraints import Constraint, WellKnownAttribute
26+
from iris.cluster.constraints import Constraint
2727
from iris.cluster.providers.protocols import WorkerInfraProvider
2828
from iris.cluster.providers.types import (
2929
CloudSliceState,
@@ -46,7 +46,7 @@
4646
restore_autoscaler_state,
4747
)
4848
from iris.cluster.controller.autoscaler.routing import job_feasibility, route_demand
49-
from iris.cluster.controller.autoscaler.scaling_group import ScalingGroup
49+
from iris.cluster.controller.autoscaler.scaling_group import ScalingGroup, build_worker_config_for_group
5050
from iris.cluster.controller.autoscaler.status import PendingHint, build_job_pending_hints, routing_decision_to_proto
5151
from iris.cluster.controller.autoscaler.worker_registry import TrackedWorker, WorkerRegistry
5252
from iris.cluster.controller.db import ControllerDB
@@ -228,6 +228,14 @@ def _log_action(
228228
status=status,
229229
)
230230
self._action_log.append(action)
231+
logger.info(
232+
"event=autoscaler action=%s entity=%s trigger=- group=%s status=%s reason=%s",
233+
action_type,
234+
slice_id or scale_group,
235+
scale_group,
236+
status,
237+
reason,
238+
)
231239
return action
232240

233241
def evaluate(
@@ -372,39 +380,7 @@ def _do_scale_up(self, group: ScalingGroup, ts: Timestamp, reason: str = "") ->
372380

373381
def _per_group_worker_config(self, group: ScalingGroup) -> config_pb2.WorkerConfig | None:
374382
"""Build per-group WorkerConfig by merging base config with scale group overrides."""
375-
if not self._base_worker_config:
376-
return None
377-
378-
wc = config_pb2.WorkerConfig()
379-
wc.CopyFrom(self._base_worker_config)
380-
381-
# Accelerator config from scale group resources
382-
resources = group.config.resources if group.config.HasField("resources") else None
383-
if resources is not None:
384-
wc.accelerator_type = resources.device_type
385-
if resources.device_variant:
386-
wc.accelerator_variant = resources.device_variant
387-
if resources.device_type == config_pb2.ACCELERATOR_TYPE_GPU and resources.device_count > 0:
388-
wc.gpu_count = resources.device_count
389-
wc.capacity_type = resources.capacity_type
390-
391-
# Worker attributes from scale group
392-
if group.config.HasField("worker"):
393-
for k, v in group.config.worker.attributes.items():
394-
wc.worker_attributes[k] = v
395-
396-
region = group.region
397-
if region and not wc.worker_attributes.get(WellKnownAttribute.REGION):
398-
wc.worker_attributes[WellKnownAttribute.REGION] = region
399-
400-
zone = group.zone
401-
if zone and not wc.worker_attributes.get(WellKnownAttribute.ZONE):
402-
wc.worker_attributes[WellKnownAttribute.ZONE] = zone
403-
404-
if group.config.name:
405-
wc.worker_attributes["scale-group"] = group.config.name
406-
407-
return wc
383+
return build_worker_config_for_group(self._base_worker_config, group.config)
408384

409385
def _register_slice_workers(
410386
self,

lib/iris/src/iris/cluster/controller/autoscaler/scaling_group.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,66 @@ def prepare_slice_config(
152152
return config
153153

154154

155+
def _region_from_template(template: config_pb2.SliceConfig) -> str | None:
156+
"""Region derived from a scale group's slice template."""
157+
if template.HasField("gcp") and template.gcp.zone:
158+
return template.gcp.zone.rsplit("-", 1)[0]
159+
if template.HasField("coreweave") and template.coreweave.region:
160+
return template.coreweave.region
161+
return None
162+
163+
164+
def _zone_from_template(template: config_pb2.SliceConfig) -> str | None:
165+
"""Zone derived from a scale group's slice template."""
166+
if template.HasField("gcp") and template.gcp.zone:
167+
return template.gcp.zone
168+
if template.HasField("coreweave") and template.coreweave.region:
169+
return template.coreweave.region
170+
return None
171+
172+
173+
def build_worker_config_for_group(
174+
base_worker_config: config_pb2.WorkerConfig | None,
175+
group_config: config_pb2.ScaleGroupConfig,
176+
) -> config_pb2.WorkerConfig | None:
177+
"""Merge base worker config with per-scale-group overrides.
178+
179+
Returns None when base_worker_config is None (test/local mode).
180+
"""
181+
if not base_worker_config:
182+
return None
183+
184+
wc = config_pb2.WorkerConfig()
185+
wc.CopyFrom(base_worker_config)
186+
187+
resources = group_config.resources if group_config.HasField("resources") else None
188+
if resources is not None:
189+
wc.accelerator_type = resources.device_type
190+
if resources.device_variant:
191+
wc.accelerator_variant = resources.device_variant
192+
if resources.device_type == config_pb2.ACCELERATOR_TYPE_GPU and resources.device_count > 0:
193+
wc.gpu_count = resources.device_count
194+
wc.capacity_type = resources.capacity_type
195+
196+
if group_config.HasField("worker"):
197+
for k, v in group_config.worker.attributes.items():
198+
wc.worker_attributes[k] = v
199+
200+
template = group_config.slice_template
201+
region = _region_from_template(template)
202+
if region and not wc.worker_attributes.get(WellKnownAttribute.REGION):
203+
wc.worker_attributes[WellKnownAttribute.REGION] = region
204+
205+
zone = _zone_from_template(template)
206+
if zone and not wc.worker_attributes.get(WellKnownAttribute.ZONE):
207+
wc.worker_attributes[WellKnownAttribute.ZONE] = zone
208+
209+
if group_config.name:
210+
wc.worker_attributes["scale-group"] = group_config.name
211+
212+
return wc
213+
214+
155215
def _zones_from_config(config: config_pb2.ScaleGroupConfig) -> list[str]:
156216
"""Extract zones from ScaleGroupConfig's slice_template.
157217
@@ -500,12 +560,16 @@ def reconcile(self) -> None:
500560
501561
Used in tests to populate a scaling group with pre-injected slices.
502562
Production restore uses prepare_for_restore() + restore_scaling_group().
563+
Skips operator-created manual slices (iris_manual=true), which the
564+
autoscaler must not track or scale down.
503565
"""
504566
zones = _zones_from_config(self._config)
505567
labels = {self._labels.iris_scale_group: self._config.name}
506568
slice_handles = self._platform.list_slices(zones, labels)
507569
with self._slices_lock:
508570
for handle in slice_handles:
571+
if handle.labels.get(self._labels.iris_manual) == "true":
572+
continue
509573
state = SliceState(handle=handle)
510574
self._slices[handle.slice_id] = state
511575
self._db_upsert_slice(handle.slice_id, state)

lib/iris/src/iris/cluster/providers/gcp/workers.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -648,15 +648,20 @@ def list_slices(
648648
return handles
649649

650650
def list_all_slices(self) -> list[GcpSliceHandle | GcpVmSliceHandle]:
651-
"""List all slices managed by this cluster.
651+
"""List all autoscaler-managed slices for this cluster.
652652
653653
Uses project-wide queries (empty zones = all zones) via GcpService,
654-
filtered by iris-{prefix}-managed=true.
654+
filtered by iris-{prefix}-managed=true. Slices tagged
655+
iris-{prefix}-manual=true (operator-created via `iris cluster
656+
create-slice`) are excluded: the autoscaler and `cluster stop` must
657+
not see or terminate them.
655658
"""
656659
managed_labels = {self._iris_labels.iris_managed: "true"}
660+
manual_label = self._iris_labels.iris_manual
657661

658662
if self._gcp.mode == ServiceMode.LOCAL:
659-
return self._gcp.get_local_slices(managed_labels) # type: ignore[return-value]
663+
local_handles = self._gcp.get_local_slices(managed_labels)
664+
return [h for h in local_handles if h.labels.get(manual_label) != "true"] # type: ignore[return-value]
660665

661666
tpu_infos = self._gcp.tpu_list(zones=[], labels=managed_labels)
662667
vm_infos = self._gcp.vm_list(zones=[], labels=managed_labels)
@@ -666,6 +671,8 @@ def list_all_slices(self) -> list[GcpSliceHandle | GcpVmSliceHandle]:
666671
for tpu in tpu_infos:
667672
if tpu.state not in ("READY", "CREATING"):
668673
continue
674+
if tpu.labels.get(manual_label) == "true":
675+
continue
669676
handles.append(
670677
GcpSliceHandle(
671678
_slice_id=tpu.name,
@@ -693,6 +700,8 @@ def list_all_slices(self) -> list[GcpSliceHandle | GcpVmSliceHandle]:
693700
continue
694701
if qr.state in ("FAILED", "SUSPENDED", "DELETING"):
695702
continue
703+
if qr.labels.get(manual_label) == "true":
704+
continue
696705
handles.append(
697706
GcpSliceHandle(
698707
_slice_id=qr.name,
@@ -715,6 +724,8 @@ def list_all_slices(self) -> list[GcpSliceHandle | GcpVmSliceHandle]:
715724
slice_id = vm.labels.get(self._iris_labels.iris_slice_id, "")
716725
if not slice_id:
717726
continue
727+
if vm.labels.get(manual_label) == "true":
728+
continue
718729
handles.append(
719730
GcpVmSliceHandle(
720731
_slice_id=slice_id,

lib/iris/src/iris/cluster/providers/manual/provider.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,15 @@ def list_slices(
334334
return results
335335

336336
def list_all_slices(self) -> list[ManualSliceHandle]:
337-
return self.list_slices(zones=[], labels={self._iris_labels.iris_managed: "true"})
337+
"""List autoscaler-managed slices.
338+
339+
Excludes slices tagged iris_manual=true (operator-created via
340+
`iris cluster create-slice`), which the autoscaler and
341+
`iris cluster stop` must not see or terminate.
342+
"""
343+
all_managed = self.list_slices(zones=[], labels={self._iris_labels.iris_managed: "true"})
344+
manual_label = self._iris_labels.iris_manual
345+
return [s for s in all_managed if s.labels.get(manual_label) != "true"]
338346

339347
def list_vms(
340348
self,

lib/iris/src/iris/cluster/providers/types.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,10 @@ def __init__(self, prefix: str):
5151
self.iris_controller = f"iris-{prefix}-controller"
5252
self.iris_controller_address = f"iris-{prefix}-controller-address"
5353
self.iris_slice_id = f"iris-{prefix}-slice-id"
54+
# Marks a slice as operator-created via `iris cluster create-slice`.
55+
# The autoscaler ignores these: they don't count toward demand, don't
56+
# participate in scale-down, and survive `iris cluster stop`.
57+
self.iris_manual = f"iris-{prefix}-manual"
5458

5559

5660
def find_free_port(start: int = -1) -> int:

lib/iris/tests/cluster/providers/gcp/test_platform.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -732,6 +732,27 @@ def test_list_all_slices_returns_all_managed(platform_env: PlatformEnv):
732732
assert len(all_slices) == 2
733733

734734

735+
def test_list_all_slices_excludes_manual_slices(platform_env: PlatformEnv):
736+
"""list_all_slices drops slices labeled iris_manual=true so the autoscaler ignores them."""
737+
labels = Labels(platform_env.label_prefix)
738+
739+
cfg_auto = _make_slice_config(platform_env, "auto-group")
740+
handle_auto = platform_env.platform.create_slice(cfg_auto)
741+
742+
cfg_manual = _make_slice_config(platform_env, "auto-group")
743+
cfg_manual.labels[labels.iris_manual] = "true"
744+
handle_manual = platform_env.platform.create_slice(cfg_manual)
745+
746+
all_slices = platform_env.platform.list_all_slices()
747+
slice_ids = {s.slice_id for s in all_slices}
748+
assert handle_auto.slice_id in slice_ids
749+
assert handle_manual.slice_id not in slice_ids
750+
751+
# Manual slices are still discoverable when explicitly asked for (delete-slice path).
752+
manual_only = platform_env.platform.list_slices(zones=[platform_env.zone], labels={labels.iris_manual: "true"})
753+
assert {s.slice_id for s in manual_only} == {handle_manual.slice_id}
754+
755+
735756
def test_gcp_list_all_slices_multi_zone():
736757
"""GcpWorkerProvider.list_all_slices returns slices across multiple zones."""
737758
gcp_service = InMemoryGcpService(mode=ServiceMode.DRY_RUN, project_id="test-project")

0 commit comments

Comments (0)