
Commit 2f52e4c

rjpower and claude authored
[iris] Reclaim dead cloud slices and purge orphan slice rows at boot (#5720)
list_all_slices now returns (handle, state) pairs across every cloud state, and restore_autoscaler_state partitions on that state: live slices feed the autoscaler, dead ones are terminated asynchronously. Discarded checkpoint slices are also deleted from the slices table, so SQLite no longer accumulates ghost rows that the autoscaler cannot see.

Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>

1 parent 1f577fa commit 2f52e4c

9 files changed

Lines changed: 164 additions & 98 deletions
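
The diffs below construct and consume a ListedSlice value imported from iris.cluster.providers.types; that module is among the nine changed files but is not reproduced on this page. A minimal sketch of the shape the new provider API appears to assume (the committed definition may differ):

# Hypothetical sketch, not the committed code: inferred from how the diffs on
# this page build and consume ListedSlice.
from dataclasses import dataclass
from enum import Enum, auto


class CloudSliceState(Enum):
    CREATING = auto()
    READY = auto()
    REPAIRING = auto()
    FAILED = auto()
    DELETING = auto()
    UNKNOWN = auto()


@dataclass(frozen=True)
class ListedSlice:
    handle: object            # provider slice handle (slice_id, zone, scale_group, terminate(), ...)
    state: CloudSliceState    # best-effort cloud state observed at list time

With this pairing, restore_autoscaler_state keeps handles whose state is in _LIVE_CLOUD_STATES (CREATING/READY/REPAIRING) and hands everything else to _reclaim_dead_slice for asynchronous, best-effort termination.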


lib/iris/src/iris/cluster/controller/autoscaler/recovery.py

Lines changed: 28 additions & 4 deletions
@@ -6,6 +6,7 @@
 from __future__ import annotations
 
 import logging
+import threading
 from dataclasses import dataclass
 
 from sqlalchemy import select
@@ -20,7 +21,9 @@
 from iris.cluster.controller.db import ControllerDB
 from iris.cluster.controller.schema import scaling_groups_table, slices_table, workers_table
 from iris.cluster.providers.protocols import WorkerInfraProvider
-from iris.cluster.providers.types import SliceHandle
+from iris.cluster.providers.types import CloudSliceState, SliceHandle
+
+_LIVE_CLOUD_STATES = frozenset({CloudSliceState.CREATING, CloudSliceState.READY, CloudSliceState.REPAIRING})
 
 logger = logging.getLogger(__name__)
 
@@ -112,10 +115,12 @@ def restore_autoscaler_state(
 ) -> dict[str, TrackedWorker]:
     """Restore scaling groups and tracked workers from a checkpoint."""
 
-    all_cloud_slices = platform.list_all_slices()
     cloud_by_group: dict[str, list[SliceHandle]] = {}
-    for handle in all_cloud_slices:
-        cloud_by_group.setdefault(handle.scale_group, []).append(handle)
+    for listed in platform.list_all_slices():
+        if listed.state not in _LIVE_CLOUD_STATES:
+            _reclaim_dead_slice(listed.handle, listed.state)
+            continue
+        cloud_by_group.setdefault(listed.handle.scale_group, []).append(listed.handle)
 
     for group_snapshot in checkpoint.group_snapshots.values():
         group = groups.get(group_snapshot.name)
@@ -135,5 +140,24 @@ def restore_autoscaler_state(
             last_scale_up=restore_result.last_scale_up,
             last_scale_down=restore_result.last_scale_down,
         )
+        group.purge_persisted_slice_rows(restore_result.discarded_slice_ids)
 
     return restore_tracked_workers(checkpoint.tracked_worker_rows)
+
+
+def _reclaim_dead_slice(handle: SliceHandle, state: CloudSliceState) -> None:
+    """Best-effort terminate of a dead slice in a daemon thread.
+
+    Boot recovery must not block on or fail because of a stale cloud resource:
+    terminate() can hit transient API errors and is not guaranteed to be fast.
+    Errors are logged; on the next restart the slice will surface again.
+    """
+    logger.info("Reclaiming dead slice %s (state=%s, zone=%s)", handle.slice_id, state, handle.zone)
+
+    def _run() -> None:
+        try:
+            handle.terminate()
+        except Exception as e:
+            logger.warning("Failed to terminate dead slice %s: %s", handle.slice_id, e)
+
+    threading.Thread(target=_run, name=f"reclaim-{handle.slice_id}", daemon=True).start()

lib/iris/src/iris/cluster/controller/autoscaler/scaling_group.py

Lines changed: 10 additions & 3 deletions
@@ -409,6 +409,13 @@ def _db_clear_slices(self) -> None:
         with self._db.transaction() as cur:
             cur.execute(delete(slices_table).where(slices_table.c.scale_group == self.name))
 
+    def purge_persisted_slice_rows(self, slice_ids: Sequence[str]) -> None:
+        """Delete the named slice rows from the slices table in a single transaction."""
+        if self._db is None or not slice_ids:
+            return
+        with self._db.transaction() as cur:
+            cur.execute(delete(slices_table).where(slices_table.c.slice_id.in_(list(slice_ids))))
+
     @property
     def platform(self) -> WorkerInfraProvider:
         """Worker infrastructure provider for this scale group."""
@@ -1213,7 +1220,7 @@ class ScalingGroupRestoreResult:
     """Result of restoring a single scaling group from checkpoint metadata."""
 
     slices: dict[str, SliceState] = field(default_factory=dict)
-    discarded_count: int = 0
+    discarded_slice_ids: list[str] = field(default_factory=list)
     adopted_count: int = 0
     last_scale_up: Timestamp = field(default_factory=lambda: Timestamp.from_ms(0))
     last_scale_down: Timestamp = field(default_factory=lambda: Timestamp.from_ms(0))
@@ -1234,7 +1241,7 @@ def restore_scaling_group(
         cloud_handle = cloud_by_id.get(slice_id)
         if cloud_handle is None:
             logger.info("Scaling group %s: discarding slice %s (missing from cloud)", group_snapshot.name, slice_id)
-            result.discarded_count += 1
+            result.discarded_slice_ids.append(slice_id)
             continue
 
         try:
@@ -1274,7 +1281,7 @@ def restore_scaling_group(
         "Restored scaling group %s: %d slices (%d discarded, %d adopted)",
         group_snapshot.name,
         len(result.slices),
-        result.discarded_count,
+        len(result.discarded_slice_ids),
         result.adopted_count,
     )
     return result

lib/iris/src/iris/cluster/providers/gcp/handles.py

Lines changed: 16 additions & 3 deletions
@@ -41,7 +41,9 @@
 
 logger = logging.getLogger(__name__)
 
-# GCP TPU state mapping
+# GCP TPU state mapping. States not in this map collapse to UNKNOWN; the boot
+# reconciler treats anything outside the alive set (CREATING/READY/REPAIRING)
+# as a candidate for reclaim.
 _TPU_STATE_MAP: dict[str, CloudSliceState] = {
     "CREATING": CloudSliceState.CREATING,
     "READY": CloudSliceState.READY,
@@ -58,6 +60,19 @@
 }
 
 _ACTIVE_VM_SLICE_STATES = frozenset({"PROVISIONING", "STAGING", "RUNNING"})
+
+# Queued-resource (reserved TPU) state mapping. Non-live states must surface so
+# the boot reconciler can reclaim them; ACTIVE is transient (the matching TPU
+# VM is what list_all_slices normally returns) and only appears here briefly.
+_QR_STATE_MAP: dict[str, CloudSliceState] = {
+    "QUEUED": CloudSliceState.CREATING,
+    "WAITING_FOR_RESOURCES": CloudSliceState.CREATING,
+    "PROVISIONING": CloudSliceState.CREATING,
+    "ACTIVE": CloudSliceState.READY,
+    "FAILED": CloudSliceState.FAILED,
+    "SUSPENDED": CloudSliceState.FAILED,
+    "DELETING": CloudSliceState.DELETING,
+}
 _GCE_NAME_MAX_LEN = 63
 _GCE_NAME_RE = re.compile(r"[^a-z0-9-]+")
 _GCE_NAME_EDGE_RE = re.compile(r"^-+|-+$")
@@ -222,7 +237,6 @@ def __init__(
         _gcp_service: GcpService,
         _ssh_config: config_pb2.SshConfig | None = None,
         _service_account: str | None = None,
-        _state: str = "READY",
         _bootstrapping: bool = False,
         _is_queued_resource: bool = False,
     ):
@@ -237,7 +251,6 @@ def __init__(
        self._accelerator_variant = _accelerator_variant
        self._ssh_config = _ssh_config
        self._service_account = _service_account
-        self._state = _state
        self.is_queued_resource: bool = _is_queued_resource
        self._bootstrap_state: CloudSliceState | None = None if _bootstrapping else CloudSliceState.READY
        self._bootstrap_lock = threading.Lock()

lib/iris/src/iris/cluster/providers/gcp/workers.py

Lines changed: 63 additions & 63 deletions
@@ -27,6 +27,9 @@
 )
 from iris.cluster.providers.gcp.handles import (
     _ACTIVE_VM_SLICE_STATES,
+    _QR_STATE_MAP,
+    _TPU_STATE_MAP,
+    _VM_STATE_MAP,
     CloudSliceState,
     GcpSliceHandle,
     GcpStandaloneWorkerHandle,
@@ -46,6 +49,7 @@
 from iris.cluster.providers.types import (
     InfraError,
     Labels,
+    ListedSlice,
     SliceHandle,
     generate_slice_suffix,
 )
@@ -628,7 +632,6 @@ def list_slices(
                     _gcp_service=self._gcp,
                     _ssh_config=self._ssh_config,
                     _service_account=tpu.service_account,
-                    _state=tpu.state,
                 )
             )
 
@@ -657,102 +660,99 @@ def list_slices(
 
         return handles
 
-    def list_all_slices(self) -> list[GcpSliceHandle | GcpVmSliceHandle]:
-        """List all autoscaler-managed slices for this cluster.
+    def list_all_slices(self) -> list[ListedSlice]:
+        """List every autoscaler-managed slice for this cluster, regardless of cloud state.
 
         Uses project-wide queries (empty zones = all zones) via GcpService,
         filtered by iris-{prefix}-managed=true. Slices tagged
-        iris-{prefix}-manual=true (operator-created via `iris cluster
-        create-slice`) are excluded: the autoscaler and `cluster stop` must
-        not see or terminate them.
+        iris-{prefix}-manual=true are excluded — those are operator-created
+        and never autoscaler-owned.
         """
         managed_labels = {self._iris_labels.iris_managed: "true"}
         manual_label = self._iris_labels.iris_manual
 
         if self._gcp.mode == ServiceMode.LOCAL:
             local_handles = self._gcp.get_local_slices(managed_labels)
-            return [h for h in local_handles if h.labels.get(manual_label) != "true"]  # type: ignore[return-value]
+            return [
+                ListedSlice(handle=h, state=CloudSliceState.READY)
+                for h in local_handles
+                if h.labels.get(manual_label) != "true"
+            ]
 
         tpu_infos = self._gcp.tpu_list(zones=[], labels=managed_labels)
         vm_infos = self._gcp.vm_list(zones=[], labels=managed_labels)
 
-        handles: list[GcpSliceHandle | GcpVmSliceHandle] = []
+        listed: list[ListedSlice] = []
 
         for tpu in tpu_infos:
-            if tpu.state not in ("READY", "CREATING"):
-                continue
             if tpu.labels.get(manual_label) == "true":
                 continue
-            handles.append(
-                GcpSliceHandle(
-                    _slice_id=tpu.name,
-                    _zone=tpu.zone,
-                    _project_id=self._project_id,
-                    _labels=tpu.labels,
-                    _created_at=tpu.created_at,
-                    _label_prefix=self._label_prefix,
-                    _accelerator_variant=tpu.accelerator_type,
-                    _gcp_service=self._gcp,
-                    _ssh_config=self._ssh_config,
-                    _service_account=tpu.service_account,
-                    _state=tpu.state,
-                    _is_queued_resource=tpu.labels.get(CAPACITY_TYPE_LABEL) == CAPACITY_TYPE_RESERVED_VALUE,
-                )
+            handle = GcpSliceHandle(
+                _slice_id=tpu.name,
+                _zone=tpu.zone,
+                _project_id=self._project_id,
+                _labels=tpu.labels,
+                _created_at=tpu.created_at,
+                _label_prefix=self._label_prefix,
+                _accelerator_variant=tpu.accelerator_type,
+                _gcp_service=self._gcp,
+                _ssh_config=self._ssh_config,
+                _service_account=tpu.service_account,
+                _is_queued_resource=tpu.labels.get(CAPACITY_TYPE_LABEL) == CAPACITY_TYPE_RESERVED_VALUE,
            )
+            listed.append(ListedSlice(handle=handle, state=_TPU_STATE_MAP.get(tpu.state, CloudSliceState.UNKNOWN)))
 
-        # Discover queued resources (reserved TPUs) not yet visible as TPU VMs.
-        # These are in QUEUED/PROVISIONING/WAITING_FOR_RESOURCES and need handles
-        # so the controller doesn't orphan them on restart.
-        tpu_names = {h.slice_id for h in handles}
+        # Discover queued resources (reserved TPUs) not already represented by a
+        # TPU VM. We surface every state — including FAILED/SUSPENDED/DELETING —
+        # so the boot reconciler can reclaim dead reservations instead of
+        # orphaning them in GCP.
+        tpu_names = {item.handle.slice_id for item in listed}
         qr_infos = self._gcp.queued_resource_list(zones=[], labels=managed_labels)
         for qr in qr_infos:
             if qr.name in tpu_names:
                 continue
-            if qr.state in ("FAILED", "SUSPENDED", "DELETING"):
-                continue
-            if qr.labels.get(manual_label) == "true":
+            if qr.labels and qr.labels.get(manual_label) == "true":
                 continue
-            handles.append(
-                GcpSliceHandle(
-                    _slice_id=qr.name,
-                    _zone=qr.zone,
-                    _project_id=self._project_id,
-                    _labels=qr.labels
-                    or {CAPACITY_TYPE_LABEL: CAPACITY_TYPE_RESERVED_VALUE, self._iris_labels.iris_managed: "true"},
-                    _created_at=Timestamp.now(),
-                    _label_prefix=self._label_prefix,
-                    _accelerator_variant="",
-                    _gcp_service=self._gcp,
-                    _ssh_config=self._ssh_config,
-                    _is_queued_resource=True,
-                )
+            handle = GcpSliceHandle(
+                _slice_id=qr.name,
+                _zone=qr.zone,
+                _project_id=self._project_id,
+                _labels=qr.labels
+                or {CAPACITY_TYPE_LABEL: CAPACITY_TYPE_RESERVED_VALUE, self._iris_labels.iris_managed: "true"},
+                _created_at=Timestamp.now(),
+                _label_prefix=self._label_prefix,
+                _accelerator_variant="",
+                _gcp_service=self._gcp,
+                _ssh_config=self._ssh_config,
+                _is_queued_resource=True,
            )
+            listed.append(ListedSlice(handle=handle, state=_QR_STATE_MAP.get(qr.state, CloudSliceState.UNKNOWN)))
 
+        # Surface every managed VM regardless of cloud state. Stopped/terminated
+        # instances are exactly what the boot reconciler needs to reclaim; the
+        # active-only filter belongs in list_slices(), used for live discovery.
         for vm in vm_infos:
-            if vm.status not in _ACTIVE_VM_SLICE_STATES:
-                continue
             slice_id = vm.labels.get(self._iris_labels.iris_slice_id, "")
             if not slice_id:
                 continue
             if vm.labels.get(manual_label) == "true":
                 continue
-            handles.append(
-                GcpVmSliceHandle(
-                    _slice_id=slice_id,
-                    _vm_name=vm.name,
-                    _zone=vm.zone,
-                    _project_id=self._project_id,
-                    _gcp_service=self._gcp,
-                    _labels=vm.labels,
-                    _created_at=vm.created_at,
-                    _label_prefix=self._label_prefix,
-                    _ssh_config=self._ssh_config,
-                    _service_account=vm.service_account,
-                )
+            handle = GcpVmSliceHandle(
+                _slice_id=slice_id,
+                _vm_name=vm.name,
+                _zone=vm.zone,
+                _project_id=self._project_id,
+                _gcp_service=self._gcp,
+                _labels=vm.labels,
+                _created_at=vm.created_at,
+                _label_prefix=self._label_prefix,
+                _ssh_config=self._ssh_config,
+                _service_account=vm.service_account,
            )
+            listed.append(ListedSlice(handle=handle, state=_VM_STATE_MAP.get(vm.status, CloudSliceState.UNKNOWN)))
 
-        logger.info("list_all_slices: found %d managed slices", len(handles))
-        return handles
+        logger.info("list_all_slices: found %d managed slices", len(listed))
+        return listed
 
     def list_vms(
         self,

lib/iris/src/iris/cluster/providers/manual/provider.py

Lines changed: 13 additions & 5 deletions
@@ -27,6 +27,7 @@
     CloudWorkerState,
     InfraError,
     Labels,
+    ListedSlice,
     SliceStatus,
     WorkerStatus,
     default_stop_all,
@@ -334,16 +335,23 @@ def list_slices(
             results = [s for s in results if all(s.labels.get(k) == v for k, v in labels.items())]
         return results
 
-    def list_all_slices(self) -> list[ManualSliceHandle]:
-        """List autoscaler-managed slices.
+    def list_all_slices(self) -> list[ListedSlice]:
+        """List autoscaler-managed slices paired with cloud state.
 
         Excludes slices tagged iris_manual=true (operator-created via
-        `iris cluster create-slice`), which the autoscaler and
-        `iris cluster stop` must not see or terminate.
+        `iris cluster create-slice`). Manual slices have no real cloud
+        lifecycle; non-terminated ones report READY.
         """
         all_managed = self.list_slices(zones=[], labels={self._iris_labels.iris_managed: "true"})
         manual_label = self._iris_labels.iris_manual
-        return [s for s in all_managed if s.labels.get(manual_label) != "true"]
+        return [
+            ListedSlice(
+                handle=s,
+                state=CloudSliceState.DELETING if s._terminated else CloudSliceState.READY,
+            )
+            for s in all_managed
+            if s.labels.get(manual_label) != "true"
+        ]
 
     def list_vms(
         self,

lib/iris/src/iris/cluster/providers/protocols.py

Lines changed: 3 additions & 3 deletions
@@ -16,7 +16,7 @@
 from contextlib import AbstractContextManager
 from typing import Protocol
 
-from iris.cluster.providers.types import SliceHandle, StandaloneWorkerHandle
+from iris.cluster.providers.types import ListedSlice, SliceHandle, StandaloneWorkerHandle
 from iris.rpc import config_pb2
 
 
@@ -123,8 +123,8 @@ def list_slices(
         """List existing slices, filtered by zone and optionally by labels."""
         ...
 
-    def list_all_slices(self) -> list[SliceHandle]:
-        """List all slices managed by this cluster across all zones."""
+    def list_all_slices(self) -> list[ListedSlice]:
+        """List every iris-managed slice across all zones, paired with its cloud state."""
         ...
 
     def list_vms(
