|
22 | 22 | ) |
23 | 23 | from iris.cli.main import IRIS_CLUSTER_CONFIG_DIRS, require_controller_url, rpc_client |
24 | 24 | from iris.cluster.config import IrisConfig, clear_remote_state, make_local_config |
| 25 | +from iris.cluster.controller.autoscaler.scaling_group import ( |
| 26 | + build_worker_config_for_group, |
| 27 | + prepare_slice_config, |
| 28 | +) |
| 29 | +from iris.cluster.providers.types import Labels |
| 30 | +from iris.rpc import config_pb2 |
25 | 31 | from iris.rpc import vm_pb2 |
26 | 32 | from iris.rpc import job_pb2 |
27 | 33 | from iris.rpc import controller_pb2 |
@@ -409,6 +415,112 @@ def cluster_restart(ctx): |
409 | 415 | ctx.invoke(cluster_start) |
410 | 416 |
|
411 | 417 |
|
@cluster.command("create-slice")
@click.option("--scale-group", "scale_group_name", required=True, help="Scale group whose template to use")
@click.pass_context
def cluster_create_slice(ctx, scale_group_name: str):
    """Create an operator-managed slice bound to the running controller.

    Allocates a slice using the named scale group's template, tags it with
    ``iris-{prefix}-manual=true``, and bootstraps workers so they connect to
    the controller. The autoscaler ignores manual slices: they don't count
    toward demand, won't be scaled down on idle, and survive
    ``iris cluster stop``. Remove with ``iris cluster delete-slice``.
    """
    # NOTE: the docstring above doubles as the click ``--help`` text, so it is
    # user-facing output rather than a plain comment.
    cluster_cfg = ctx.obj.get("config")
    if not cluster_cfg:
        raise click.ClickException("--config is required for cluster create-slice")
    controller_kind = cluster_cfg.controller.WhichOneof("controller")
    if controller_kind == "local":
        raise click.ClickException("create-slice is not supported for local clusters")

    group_cfg = cluster_cfg.scale_groups.get(scale_group_name)
    if group_cfg is None:
        known_groups = sorted(cluster_cfg.scale_groups.keys())
        available = ", ".join(known_groups) or "(none)"
        raise click.ClickException(f"Unknown scale group '{scale_group_name}'. Available: {available}")

    # Called only as a reachability gate; the URL it returns is deliberately
    # discarded because it may be a tunnel endpoint that works from the CLI
    # host but not from inside the cluster.
    require_controller_url(ctx)
    wrapped_cfg = IrisConfig(cluster_cfg)
    providers = ctx.obj.get("provider_bundle")
    if not providers:
        providers = wrapped_cfg.provider_bundle()

    # Address the workers should dial: the configured value wins, otherwise
    # ask the provider to discover it (e.g., GCE label lookup). The CLI-local
    # tunnel URL must never end up here.
    controller_addr_for_workers = wrapped_cfg.controller_address() or providers.controller.discover_controller(
        cluster_cfg.controller
    )

    prefix = cluster_cfg.platform.label_prefix
    if not prefix:
        prefix = "iris"
    label_names = Labels(prefix)

    # Start from the scale group's slice template, then mark the slice as manual.
    manual_slice_cfg = prepare_slice_config(group_cfg.slice_template, group_cfg, prefix)
    manual_slice_cfg.labels[label_names.iris_manual] = "true"

    # Copy the cluster-wide worker defaults, then fill in what is still missing.
    worker_defaults = config_pb2.WorkerConfig()
    worker_defaults.CopyFrom(cluster_cfg.defaults.worker)
    if not worker_defaults.controller_address:
        worker_defaults.controller_address = controller_addr_for_workers
    worker_defaults.platform.CopyFrom(cluster_cfg.platform)
    remote_state_dir = cluster_cfg.storage.remote_state_dir
    if remote_state_dir:
        worker_defaults.storage_prefix = remote_state_dir

    group_worker_cfg = build_worker_config_for_group(worker_defaults, group_cfg)

    click.echo(f"Creating manual slice from scale group '{scale_group_name}'...")
    try:
        created = providers.workers.create_slice(manual_slice_cfg, worker_config=group_worker_cfg)
    except Exception as exc:
        # Report any provider failure on stderr and exit with status 1.
        click.echo(f"Failed to create slice: {exc}", err=True)
        raise SystemExit(1) from exc

    summary = (
        f"Created manual slice: {created.slice_id}",
        f"  Scale group: {created.scale_group or scale_group_name}",
        f"  Zone: {created.zone}",
        "Workers will register with the controller as they bootstrap.",
        f"Terminate with: iris cluster delete-slice {created.slice_id}",
    )
    for line in summary:
        click.echo(line)
| 483 | + |
| 484 | + |
@cluster.command("delete-slice")
@click.argument("slice_id")
@click.pass_context
def cluster_delete_slice(ctx, slice_id: str):
    """Terminate an operator-managed slice created via ``create-slice``.

    Only slices tagged ``iris-{prefix}-manual=true`` are eligible —
    autoscaler-managed slices must go through the autoscaler.
    """
    # NOTE: the docstring above doubles as the click ``--help`` text, so it is
    # user-facing output rather than a plain comment.
    cluster_cfg = ctx.obj.get("config")
    if not cluster_cfg:
        raise click.ClickException("--config is required for cluster delete-slice")
    controller_kind = cluster_cfg.controller.WhichOneof("controller")
    if controller_kind == "local":
        raise click.ClickException("delete-slice is not supported for local clusters")

    wrapped_cfg = IrisConfig(cluster_cfg)
    providers = ctx.obj.get("provider_bundle")
    if not providers:
        providers = wrapped_cfg.provider_bundle()

    prefix = cluster_cfg.platform.label_prefix
    if not prefix:
        prefix = "iris"
    label_names = Labels(prefix)

    # Query only slices carrying the manual label, so a slice without that
    # label can never be selected for termination by this command.
    manual_only = {label_names.iris_manual: "true"}
    candidates = providers.workers.list_slices(zones=[], labels=manual_only)

    # Pick the first candidate whose id matches the requested one.
    target = None
    for candidate in candidates:
        if candidate.slice_id == slice_id:
            target = candidate
            break
    if target is None:
        raise click.ClickException(
            f"No manual slice found with id '{slice_id}'. "
            "List manual slices with the controller dashboard or "
            "`iris cluster status` to find the correct id."
        )

    click.echo(f"Terminating manual slice {slice_id}...")
    try:
        target.terminate()
    except Exception as exc:
        # Report any provider failure on stderr and exit with status 1.
        click.echo(f"Failed to terminate slice: {exc}", err=True)
        raise SystemExit(1) from exc
    click.echo("Terminated.")
| 522 | + |
| 523 | + |
412 | 524 | @cluster.command("status") |
413 | 525 | @click.pass_context |
414 | 526 | def cluster_status_cmd(ctx): |
|
0 commit comments