|
27 | 27 | ) |
28 | 28 | from iris.cluster.providers.gcp.handles import ( |
29 | 29 | _ACTIVE_VM_SLICE_STATES, |
| 30 | + _QR_STATE_MAP, |
| 31 | + _TPU_STATE_MAP, |
| 32 | + _VM_STATE_MAP, |
30 | 33 | CloudSliceState, |
31 | 34 | GcpSliceHandle, |
32 | 35 | GcpStandaloneWorkerHandle, |
|
46 | 49 | from iris.cluster.providers.types import ( |
47 | 50 | InfraError, |
48 | 51 | Labels, |
| 52 | + ListedSlice, |
49 | 53 | SliceHandle, |
50 | 54 | generate_slice_suffix, |
51 | 55 | ) |
@@ -628,7 +632,6 @@ def list_slices( |
628 | 632 | _gcp_service=self._gcp, |
629 | 633 | _ssh_config=self._ssh_config, |
630 | 634 | _service_account=tpu.service_account, |
631 | | - _state=tpu.state, |
632 | 635 | ) |
633 | 636 | ) |
634 | 637 |
|
@@ -657,102 +660,99 @@ def list_slices( |
657 | 660 |
|
658 | 661 | return handles |
659 | 662 |
|
def list_all_slices(self) -> list[ListedSlice]:
    """List every autoscaler-managed slice for this cluster, regardless of cloud state.

    Uses project-wide queries (empty zones = all zones) via GcpService,
    filtered by iris-{prefix}-managed=true. Slices tagged
    iris-{prefix}-manual=true are excluded — those are operator-created
    and never autoscaler-owned.

    Returns:
        One ListedSlice per TPU VM, per queued resource that has no TPU VM
        of the same name, and per VM carrying a slice-id label. Each entry
        pairs the handle with its cloud state; states missing from the
        corresponding state map are reported as CloudSliceState.UNKNOWN.
        In LOCAL mode every returned slice is reported as READY.
    """
    managed_labels = {self._iris_labels.iris_managed: "true"}
    manual_label = self._iris_labels.iris_manual

    if self._gcp.mode == ServiceMode.LOCAL:
        local_handles = self._gcp.get_local_slices(managed_labels)
        return [
            ListedSlice(handle=h, state=CloudSliceState.READY)
            for h in local_handles
            if h.labels.get(manual_label) != "true"
        ]

    tpu_infos = self._gcp.tpu_list(zones=[], labels=managed_labels)
    vm_infos = self._gcp.vm_list(zones=[], labels=managed_labels)

    listed: list[ListedSlice] = []

    for tpu in tpu_infos:
        if tpu.labels.get(manual_label) == "true":
            continue
        handle = GcpSliceHandle(
            _slice_id=tpu.name,
            _zone=tpu.zone,
            _project_id=self._project_id,
            _labels=tpu.labels,
            _created_at=tpu.created_at,
            _label_prefix=self._label_prefix,
            _accelerator_variant=tpu.accelerator_type,
            _gcp_service=self._gcp,
            _ssh_config=self._ssh_config,
            _service_account=tpu.service_account,
            _is_queued_resource=tpu.labels.get(CAPACITY_TYPE_LABEL) == CAPACITY_TYPE_RESERVED_VALUE,
        )
        listed.append(ListedSlice(handle=handle, state=_TPU_STATE_MAP.get(tpu.state, CloudSliceState.UNKNOWN)))

    # Discover queued resources (reserved TPUs) not already represented by a
    # TPU VM. We surface every state — including FAILED/SUSPENDED/DELETING —
    # so the boot reconciler can reclaim dead reservations instead of
    # orphaning them in GCP.
    #
    # Deduplicate against *every* TPU returned above, including the manual
    # ones skipped from `listed`: a queued resource backing an
    # operator-created TPU may come back without labels, and must not be
    # re-surfaced as autoscaler-owned via the fallback labels below.
    tpu_names = {tpu.name for tpu in tpu_infos}
    qr_infos = self._gcp.queued_resource_list(zones=[], labels=managed_labels)
    for qr in qr_infos:
        if qr.name in tpu_names:
            continue
        if qr.labels and qr.labels.get(manual_label) == "true":
            continue
        handle = GcpSliceHandle(
            _slice_id=qr.name,
            _zone=qr.zone,
            _project_id=self._project_id,
            # Fall back to synthesized labels when the queued resource
            # reports none.
            _labels=qr.labels
            or {CAPACITY_TYPE_LABEL: CAPACITY_TYPE_RESERVED_VALUE, self._iris_labels.iris_managed: "true"},
            # The handle is stamped with the listing time rather than a
            # creation time taken from the queued resource.
            _created_at=Timestamp.now(),
            _label_prefix=self._label_prefix,
            _accelerator_variant="",
            _gcp_service=self._gcp,
            _ssh_config=self._ssh_config,
            _is_queued_resource=True,
        )
        listed.append(ListedSlice(handle=handle, state=_QR_STATE_MAP.get(qr.state, CloudSliceState.UNKNOWN)))

    # Surface every managed VM regardless of cloud state. Stopped/terminated
    # instances are exactly what the boot reconciler needs to reclaim; the
    # active-only filter belongs in list_slices(), used for live discovery.
    for vm in vm_infos:
        slice_id = vm.labels.get(self._iris_labels.iris_slice_id, "")
        if not slice_id:
            # VMs without a slice-id label cannot be mapped to a slice.
            continue
        if vm.labels.get(manual_label) == "true":
            continue
        handle = GcpVmSliceHandle(
            _slice_id=slice_id,
            _vm_name=vm.name,
            _zone=vm.zone,
            _project_id=self._project_id,
            _gcp_service=self._gcp,
            _labels=vm.labels,
            _created_at=vm.created_at,
            _label_prefix=self._label_prefix,
            _ssh_config=self._ssh_config,
            _service_account=vm.service_account,
        )
        listed.append(ListedSlice(handle=handle, state=_VM_STATE_MAP.get(vm.status, CloudSliceState.UNKNOWN)))

    logger.info("list_all_slices: found %d managed slices", len(listed))
    return listed
756 | 756 |
|
757 | 757 | def list_vms( |
758 | 758 | self, |
|
0 commit comments