Commit 4300d82

Adds pool command group to CLI for node pool management (#58)
* Adds pool command group to CLI for node pool management
* address reviews
* address reviews
* fix console outputs
* update agent docs
1 parent 786604d commit 4300d82

File tree

15 files changed: +1071 −246 lines changed


.gemini/styleguide.md

Lines changed: 35 additions & 2 deletions
@@ -120,22 +120,55 @@ This prevents confusing situations where a user sets an env var that works in on
 
 ---
 
+## CLI commands must be idempotent and follow the reconciliation pattern.
+
+Every mutating CLI command (`up`, `pool add`, `pool remove`, etc.) must follow the refresh-read-merge-apply pattern:
+
+1. `stack.refresh()` — sync local state with cloud reality
+2. `get_current_node_pools()` — read current pools from stack exports
+3. Build `InfraConfig` — merge existing state with desired changes
+4. `stack.up()` — apply only the diff
+
+This ensures:
+
+- Re-running after partial failure is always safe
+- Existing resources are never accidentally recreated (Pulumi tracks by URN)
+- External drift is detected and corrected
+
+When adding a new CLI command that modifies infrastructure, follow this pattern rather than directly creating or deleting resources.
+
+---
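The refresh-read-merge-apply steps above can be sketched with toy stand-ins. `FakeStack` and `reconcile` below are hypothetical, not the real Pulumi Automation API; they only illustrate why the pattern is idempotent by construction:

```python
from dataclasses import dataclass, field


@dataclass
class FakeStack:
    """Toy stand-in for a Pulumi stack: holds a set of pool names."""

    pools: set = field(default_factory=set)

    def refresh(self):
        pass  # in the real CLI this syncs local state with cloud reality

    def up(self, desired: set):
        # Diff desired state against current state and apply only the delta.
        added = desired - self.pools
        removed = self.pools - desired
        self.pools = set(desired)
        return {"create": len(added), "delete": len(removed)}


def reconcile(stack: FakeStack, add=(), remove=()):
    """Refresh-read-merge-apply: re-running with the same args is a no-op."""
    stack.refresh()                               # 1. sync with reality
    current = set(stack.pools)                    # 2. read current pools
    desired = (current | set(add)) - set(remove)  # 3. merge desired changes
    return stack.up(desired)                      # 4. apply only the diff


stack = FakeStack({"cpu-default"})
first = reconcile(stack, add=["gpu-l4"])   # creates one pool
second = reconcile(stack, add=["gpu-l4"])  # re-run: nothing to change
```

Because step 3 always merges against freshly read state, a second run produces an empty diff rather than a duplicate pool.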
+
+## Prefer graceful degradation over hard failures in CLI operations.
+
+Partial failures in multi-step CLI operations should not abort the entire flow:
+
+- If `stack.refresh()` fails, log a warning and continue with stale state
+- If `stack.up()` fails, set a failure flag but still run post-deploy steps
+- If a post-deploy step fails (kubectl, LWS, GPU drivers), log a warning and continue with remaining steps
+
+The user can always re-run the same command to recover, since all operations are idempotent.
+
+---
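A minimal sketch of the warn-and-continue behavior described above (`run_post_deploy` is a hypothetical helper, not code from this repo):

```python
import logging


def run_post_deploy(steps):
    """Run each named post-deploy step; log and continue on failure."""
    failures = []
    for name, step in steps:
        try:
            step()
        except Exception as e:
            # Graceful degradation: record the failure, keep going.
            logging.warning("Step %s failed: %s; continuing", name, e)
            failures.append(name)
    return failures


def ok():
    pass


def boom():
    raise RuntimeError("GPU driver install failed")


failed = run_post_deploy([("kubectl", ok), ("gpu-drivers", boom), ("lws", ok)])
```

Here `lws` still runs even though `gpu-drivers` failed; since the whole command is idempotent, re-running it retries only what is still missing.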
+
 ## Don't neglect error messages, docstrings, and documentation.
 
 - **Catch user errors early.** Validate GCP project existence and quota before starting a long build.
 - **Provide detailed feedback.**
-- Bad: `Error: 403 Forbidden`
-- Good: `Permission denied. Please ensure your account 'user@example.com' has the 'Storage Object Admin' role on bucket 'gs://my-bucket'.`
+  - Bad: `Error: 403 Forbidden`
+  - Good: `Permission denied. Please ensure your account 'user@example.com' has the 'Storage Object Admin' role on bucket 'gs://my-bucket'.`
 - **Show, don't tell.** Documentation should show code examples of running functions, not just list arguments.
 
 ### Error messages: a case study
 
 Bad:
+
 ```
 RuntimeError: Job failed.
 ```
 
 Good:
+
 ```
 RuntimeError: The remote job failed with exit code 1.
 Logs from the worker:
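The "Good" message style above can be produced by validating early and raising an actionable error. This is an illustrative sketch (the function name and parameters are hypothetical, not from the repo):

```python
def check_bucket_access(account: str, bucket: str, has_role: bool):
    """Raise an actionable error instead of surfacing a bare 403."""
    if not has_role:
        # Name the account, the missing role, and the resource,
        # so the user knows exactly what to fix.
        raise PermissionError(
            f"Permission denied. Please ensure your account '{account}' has "
            f"the 'Storage Object Admin' role on bucket '{bucket}'."
        )


try:
    check_bucket_access("user@example.com", "gs://my-bucket", has_role=False)
except PermissionError as e:
    msg = str(e)
```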

AGENTS.md

Lines changed: 37 additions & 9 deletions
@@ -14,8 +14,8 @@ keras_remote/
 ├── runner/          # Remote worker entrypoint (runs inside container)
 ├── utils/           # Serialization (packager) and Cloud Storage helpers
 ├── cli/             # CLI for infrastructure provisioning (Pulumi-based)
-│   ├── commands/    # up, down, status, config
-│   └── infra/       # Pulumi programs and stack management
+│   ├── commands/    # up, down, status, config, pool (add/remove/list)
+│   └── infra/       # Pulumi programs, stack management, post-deploy steps
 ├── credentials.py   # Credential verification & auto-setup (shared by core & CLI)
 └── constants.py     # Zone/region utilities
 ```
@@ -49,14 +49,17 @@ keras_remote/
 | `utils/packager.py` | `save_payload()` (cloudpickle), `zip_working_dir()` |
 | `utils/storage.py` | GCS upload/download/cleanup for job artifacts |
 | `runner/remote_runner.py` | Runs inside container: deserialize, execute, upload result |
+| `cli/commands/pool.py` | Node pool add/remove/list commands |
+| `cli/infra/post_deploy.py` | kubectl, LWS CRD, GPU driver setup after stack.up() |
+| `cli/constants.py` | CLI defaults, paths, API list |
 | `cli/main.py` | CLI entry point (`keras-remote` command) |
 
 ## Key Abstractions
 
 - **`JobContext`** (`backend/execution.py`): Mutable dataclass carrying all job state through the pipeline — inputs, generated IDs, artifact paths, image URI.
 - **`BaseK8sBackend`** (`backend/execution.py`): Base class with `submit_job`, `wait_for_job`, `cleanup_job`. Subclassed by `GKEBackend` and `PathwaysBackend`.
 - **`GpuConfig` / `TpuConfig`** (`core/accelerators.py`): Frozen dataclasses for accelerator metadata. Single source of truth used by runtime, container builder, and CLI.
-- **`InfraConfig`** (`cli/config.py`): CLI provisioning configuration (project, zone, cluster, accelerator).
+- **`InfraConfig` / `NodePoolConfig`** (`cli/config.py`): CLI provisioning configuration. `InfraConfig` holds project, zone, cluster name, and a list of `NodePoolConfig` entries. `NodePoolConfig` pairs a unique pool name (e.g., `gpu-l4-a3f2`) with a `GpuConfig` or `TpuConfig`.
 
 ## Conventions
 
@@ -74,15 +74,40 @@ Every customizable resource name must follow the same resolution model across al
 - **CLI commands**: `--flag` (with `envvar=`) → env var → interactive prompt or default
 - **`config show`**: displays current value and source for every configurable name
 
-| Env Var | `@run()` param | CLI flag | `config show` | Default |
-| --- | --- | --- | --- | --- |
-| `KERAS_REMOTE_PROJECT` | `project=` | `--project` | Yes | *(required)* |
-| `KERAS_REMOTE_ZONE` | `zone=` | `--zone` | Yes | `us-central1-a` |
-| `KERAS_REMOTE_CLUSTER` | `cluster=` | `--cluster` | Yes | `keras-remote-cluster` |
-| `KERAS_REMOTE_GKE_NAMESPACE` | `namespace=` | *(runtime only)* | Yes | `default` |
+| Env Var | `@run()` param | CLI flag | `config show` | Default |
+| ---------------------------- | -------------- | ---------------- | ------------- | ---------------------- |
+| `KERAS_REMOTE_PROJECT` | `project=` | `--project` | Yes | _(required)_ |
+| `KERAS_REMOTE_ZONE` | `zone=` | `--zone` | Yes | `us-central1-a` |
+| `KERAS_REMOTE_CLUSTER` | `cluster=` | `--cluster` | Yes | `keras-remote-cluster` |
+| `KERAS_REMOTE_GKE_NAMESPACE` | `namespace=` | _(runtime only)_ | Yes | `default` |
 
 When adding a new configurable resource name, ensure it is wired into **all three paths** (decorator, CLI flags on every relevant command, and `config show`). The `GOOGLE_CLOUD_PROJECT` env var is also accepted as a fallback for project ID (after `KERAS_REMOTE_PROJECT`).
 
+Additional CLI-only env vars:
+
+| Env Var | Default | Description |
+| ------------------------ | ------------------------ | ---------------------------- |
+| `KERAS_REMOTE_STATE_DIR` | `~/.keras-remote/pulumi` | Pulumi local state directory |
+
+### CLI State Management
+
+The CLI manages three layers of state: in-memory config (`InfraConfig`), Pulumi local state files (`~/.keras-remote/pulumi/`), and GCP cloud resources. Each GCP project gets its own Pulumi stack (stack name = project ID).
+
+Every mutating command (`up`, `pool add`, `pool remove`, etc.) follows this reconciliation pattern:
+
+1. `stack.refresh()` — pull cloud reality into local state
+2. `get_current_node_pools()` — read current pools from stack exports
+3. Build new `InfraConfig` — merge existing pools with desired changes
+4. `create_program(config)` — generate Pulumi program from desired state
+5. `stack.up()` — diff desired vs current, apply only changes
+
+Key behaviors:
+
+- **`up` re-runs** preserve existing pools and ignore `--accelerator` (defer to `pool add/remove`)
+- **All commands are idempotent** — safe to re-run after partial failure
+- **Graceful degradation** — partial failures (refresh, post-deploy steps) log warnings but don't abort the operation
+- **Pool state round-trips** through Pulumi stack exports (`accelerators` key) as a list of dicts, reconstructed via `_export_to_node_pool()`
+
 ### Testing
 
 - **Framework**: `absl.testing` (not pytest)
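The "pool state round-trips through stack exports" behavior in the diff above amounts to serializing each pool to a plain dict and rebuilding it later. A self-contained sketch with toy stand-ins (`FakeGpuConfig`, `FakePool`, and both helper functions are hypothetical, mirroring but not copying `_export_to_node_pool()`):

```python
from dataclasses import asdict, dataclass


@dataclass(frozen=True)
class FakeGpuConfig:
    """Toy stand-in for the frozen GpuConfig dataclass."""

    gpu_type: str
    count: int


@dataclass
class FakePool:
    """Toy stand-in for NodePoolConfig: a unique name plus an accelerator."""

    name: str
    accelerator: FakeGpuConfig


def pools_to_export(pools):
    """Serialize pools into a list of plain dicts, as stack exports require."""
    return [{"name": p.name, **asdict(p.accelerator)} for p in pools]


def export_to_pool(d):
    """Rebuild one pool from an exported dict (the round-trip direction)."""
    return FakePool(d["name"], FakeGpuConfig(d["gpu_type"], d["count"]))


pools = [FakePool("gpu-l4-a3f2", FakeGpuConfig("nvidia-l4", 1))]
round_tripped = [export_to_pool(d) for d in pools_to_export(pools)]
```

Keeping the export format to plain dicts is what lets pool state survive in Pulumi's JSON-based stack outputs between CLI invocations.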

keras_remote/cli/commands/pool.py

Lines changed: 245 additions & 0 deletions
@@ -0,0 +1,245 @@
```python
"""keras-remote pool commands — add, remove, and list accelerator node pools."""

import click
import pulumi.automation as auto

from keras_remote.cli.config import InfraConfig, NodePoolConfig
from keras_remote.cli.constants import DEFAULT_CLUSTER_NAME, DEFAULT_ZONE
from keras_remote.cli.infra.program import create_program
from keras_remote.cli.infra.stack_manager import (
    get_current_node_pools,
    get_stack,
)
from keras_remote.cli.output import (
    banner,
    console,
    infrastructure_state,
    success,
    warning,
)
from keras_remote.cli.prerequisites_check import check_all
from keras_remote.cli.prompts import resolve_project
from keras_remote.core import accelerators
from keras_remote.core.accelerators import generate_pool_name


def _common_options(f):
    """Shared options for pool subcommands."""
    f = click.option(
        "--project",
        envvar="KERAS_REMOTE_PROJECT",
        default=None,
        help="GCP project ID [env: KERAS_REMOTE_PROJECT]",
    )(f)
    f = click.option(
        "--zone",
        envvar="KERAS_REMOTE_ZONE",
        default=None,
        help=f"GCP zone [env: KERAS_REMOTE_ZONE, default: {DEFAULT_ZONE}]",
    )(f)
    f = click.option(
        "--cluster",
        "cluster_name",
        envvar="KERAS_REMOTE_CLUSTER",
        default=None,
        help="GKE cluster name [default: keras-remote-cluster]",
    )(f)
    return f


def _resolve_common(project, zone, cluster_name):
    """Resolve common options to concrete values."""
    return (
        project or resolve_project(),
        zone or DEFAULT_ZONE,
        cluster_name or DEFAULT_CLUSTER_NAME,
    )


@click.group()
def pool():
    """Manage accelerator node pools."""


def _load_pools(project, zone, cluster_name):
    """Check prerequisites, refresh stack state, and return existing pools."""
    check_all()
    project, zone, cluster_name = _resolve_common(project, zone, cluster_name)

    base_config = InfraConfig(
        project=project, zone=zone, cluster_name=cluster_name
    )
    try:
        program = create_program(base_config)
        stack = get_stack(program, base_config)
    except auto.errors.CommandError as e:
        raise click.ClickException(
            f"No Pulumi stack found for project '{project}': {e}\n"
            "Run 'keras-remote up' to provision infrastructure first."
        ) from e

    console.print("\nRefreshing state...\n")
    try:
        stack.refresh(on_output=print)
    except auto.errors.CommandError as e:
        warning(f"Failed to refresh stack state: {e}")

    existing_pools = get_current_node_pools(stack)
    return project, zone, cluster_name, existing_pools


def _apply_pool_update(project, zone, cluster_name, node_pools):
    """Run a Pulumi update with the given node pool list.

    Returns:
        True if the update succeeded, False if it encountered an error.
    """
    config = InfraConfig(
        project=project,
        zone=zone,
        cluster_name=cluster_name,
        node_pools=node_pools,
    )
    program = create_program(config)
    stack = get_stack(program, config)

    console.print("\n[bold]Updating infrastructure...[/bold]\n")
    try:
        result = stack.up(on_output=print)
        console.print()
        success(f"Pulumi update complete. {result.summary.resource_changes}")
        return True
    except auto.errors.CommandError as e:
        console.print()
        warning(f"Pulumi update encountered an issue: {e}")
        return False


@pool.command("add")
@_common_options
@click.option(
    "--accelerator",
    required=True,
    help="Accelerator spec: t4, l4, a100, a100-80gb, h100, "
    "v5litepod, v5p, v6e, v3 (with optional count/topology)",
)
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation prompt")
def pool_add(project, zone, cluster_name, accelerator, yes):
    """Add an accelerator node pool to the cluster."""
    banner("keras-remote Pool Add")

    # Parse the accelerator spec first to fail fast on bad input.
    try:
        accel_config = accelerators.parse_accelerator(accelerator)
    except ValueError as e:
        raise click.BadParameter(str(e), param_hint="--accelerator") from e

    if accel_config is None:
        raise click.BadParameter(
            "Cannot add a CPU node pool. Use 'keras-remote up' instead.",
            param_hint="--accelerator",
        )

    new_pool_name = generate_pool_name(accel_config)
    new_pool = NodePoolConfig(new_pool_name, accel_config)

    project, zone, cluster_name, existing_pools = _load_pools(
        project, zone, cluster_name
    )
    all_pools = existing_pools + [new_pool]

    console.print(f"\nAdding pool [bold]{new_pool_name}[/bold] ({accelerator})")
    console.print(f"Total pools after add: {len(all_pools)}\n")

    if not yes:
        click.confirm("Proceed?", abort=True)

    update_succeeded = _apply_pool_update(project, zone, cluster_name, all_pools)

    console.print()
    if update_succeeded:
        banner("Pool Added")
    else:
        banner("Pool Update Failed")
        console.print()
        console.print(
            "You may re-run the command to retry, or use"
            " [bold]keras-remote pool list[/bold] to check current state."
        )
    console.print()


@pool.command("remove")
@_common_options
@click.argument("pool_name")
@click.option("--yes", "-y", is_flag=True, help="Skip confirmation prompt")
def pool_remove(project, zone, cluster_name, pool_name, yes):
    """Remove an accelerator node pool from the cluster."""
    banner("keras-remote Pool Remove")

    project, zone, cluster_name, existing_pools = _load_pools(
        project, zone, cluster_name
    )

    remaining = [p for p in existing_pools if p.name != pool_name]
    if len(remaining) == len(existing_pools):
        existing_names = [p.name for p in existing_pools]
        raise click.ClickException(
            f"Node pool '{pool_name}' not found. "
            f"Existing pools: {', '.join(existing_names) or '(none)'}"
        )

    console.print(f"\nRemoving pool [bold]{pool_name}[/bold]")
    console.print(f"Remaining pools after remove: {len(remaining)}\n")

    if not yes:
        click.confirm("Proceed?", abort=True)

    update_succeeded = _apply_pool_update(project, zone, cluster_name, remaining)

    console.print()
    if update_succeeded:
        banner("Pool Removed")
    else:
        banner("Pool Update Failed")
        console.print()
        console.print(
            "You may re-run the command to retry, or use"
            " [bold]keras-remote pool list[/bold] to check current state."
        )
    console.print()


@pool.command("list")
@_common_options
def pool_list(project, zone, cluster_name):
    """List accelerator node pools on the cluster."""
    banner("keras-remote Node Pools")

    check_all()
    project, zone, cluster_name = _resolve_common(project, zone, cluster_name)

    base_config = InfraConfig(
        project=project, zone=zone, cluster_name=cluster_name
    )

    try:
        program = create_program(base_config)
        stack = get_stack(program, base_config)
    except auto.errors.CommandError as e:
        warning(f"No Pulumi stack found for project '{project}': {e}")
        console.print("Run 'keras-remote up' to provision infrastructure.")
        return

    console.print("\nRefreshing state...\n")
    try:
        stack.refresh(on_output=print)
    except auto.errors.CommandError as e:
        warning(f"Failed to refresh stack state: {e}")

    outputs = stack.outputs()
    if not outputs:
        warning("No infrastructure found. Run 'keras-remote up' first.")
        return

    infrastructure_state(outputs)
```
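The `_common_options` helper above shares flags across subcommands by calling each decorator on the function directly instead of stacking `@` syntax. A dependency-free sketch of that composition trick (the `option` decorator here is a toy stand-in for `click.option`, not click itself):

```python
def option(name):
    """Toy option decorator: records the option name on the function."""
    def deco(f):
        f.options = getattr(f, "options", []) + [name]
        return f
    return deco


def common_options(f):
    # Apply shared options by invoking each decorator on f,
    # mirroring how _common_options wraps click.option.
    f = option("--project")(f)
    f = option("--zone")(f)
    f = option("--cluster")(f)
    return f


@common_options
def cmd():
    pass
```

Because each decorator returns the (annotated) function, one `@common_options` line gives every pool subcommand the same `--project`/`--zone`/`--cluster` set without repeating the definitions.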

0 commit comments
