@@ -440,49 +440,79 @@ def resolve_image(self, image: str, zone: str | None = None) -> str:
440440 """
441441 ...
442442
443- def discover_controller (self , controller_config : config_pb2 . ControllerVmConfig ) -> str :
444- """Discover controller address from platform-specific mechanism.
443+ def debug_report (self ) -> None :
444+ """Log diagnostic info about the controller after a failure.
445445
446- Returns 'host:port' string. Each platform resolves this differently:
447- - GCP: queries VMs by controller label
448- - Manual: uses static host from config
449- - Local: returns configured address or localhost
446+ Override to inspect platform-specific state (e.g. pod termination
447+ reason, previous container logs). Default is a no-op.
450448 """
451449 ...
452450
453- def start_controller (self , config : config_pb2 . IrisClusterConfig ) -> str :
454- """Start or discover existing controller. Returns address (host:port).
451+ def shutdown (self ) -> None :
452+ """Release platform-owned resources (threads, connections, caches).
455453
456- Each remote platform implements its own controller lifecycle:
457- - GCP: creates GCE VM, SSHes in, bootstraps container
458- - Manual: SSHes to configured host, bootstraps container
459- - CoreWeave: kubectl apply ConfigMap + NodePool + Deployment + Service
454+ Distinct from terminate() on handles -- shutdown() doesn't destroy
455+ cloud resources. It cleans up the Platform object itself.
460456
461- Local mode uses LocalCluster directly and does not go through Platform.
457+ In LOCAL mode this stops worker threads managed by ThreadContainer.
458+ For GCP/Manual this is typically a no-op.
462459 """
463460 ...
464461
465- def restart_controller (self , config : config_pb2 . IrisClusterConfig ) -> str :
466- """Restart controller in-place without destroying underlying compute.
462+ def resolve_image (self , image : str , zone : str | None = None ) -> str :
463+ """Resolve a container image reference for this platform's registry.
467464
468- Re-runs the bootstrap script on the existing VM/pod to pull the latest
469- image and restart the container. Falls back to stop+start semantics on
470- platforms where in-place restart isn't meaningful (e.g. CoreWeave).
465+ On GCP, rewrites ``ghcr.io/`` images to the Artifact Registry remote
466+ repo for the given zone's continent. Other platforms return the image
467+ unchanged.
468+
469+ Args:
470+ image: Container image tag (e.g. ``ghcr.io/org/img:v1``).
471+ zone: Cloud zone used to select the regional mirror. Required on
472+ GCP when the image starts with ``ghcr.io/``.
473+
474+ Returns:
475+ Resolved image tag ready for ``docker pull``.
471476 """
472477 ...
473478
474- def stop_controller (self , config : config_pb2 .IrisClusterConfig ) -> None :
475- """Stop the controller.
476479
477- Each remote platform tears down its own controller resources:
478- - GCP: terminates GCE VM
479- - Manual: terminates bootstrap on host
480- - CoreWeave: kubectl delete Deployment + Service + NodePool
480+ # ============================================================================
481+ # Controller Lifecycle Protocol
482+ # ============================================================================
483+
484+
485+ class ControllerLifecycle (Protocol ):
486+ """Controller lifecycle operations, separated from infrastructure provisioning.
487+
488+ Handles discovery, start, restart, stop, and bulk teardown of the controller
489+ process. Each platform provides its own implementation:
490+ - GCP/Manual: VM-based lifecycle via vm_lifecycle.py
491+ - CoreWeave: K8s Deployment/Service lifecycle via kubectl
492+ """
493+
494+ def discover_controller (self , controller_config : config_pb2 .ControllerVmConfig ) -> str :
495+ """Discover controller address from platform-specific mechanism.
481496
482- Local mode uses LocalCluster.close() directly and does not go through Platform.
497+ Returns 'host:port' string. Each platform resolves this differently:
498+ - GCP: queries VMs by controller label
499+ - Manual: uses static host from config
500+ - CoreWeave: returns K8s Service DNS name
483501 """
484502 ...
485503
504+ def start_controller (self , config : config_pb2 .IrisClusterConfig ) -> str :
505+ """Start or discover existing controller. Returns address (host:port)."""
506+ ...
507+
508+ def restart_controller (self , config : config_pb2 .IrisClusterConfig ) -> str :
509+ """Restart controller in-place without destroying underlying compute."""
510+ ...
511+
512+ def stop_controller (self , config : config_pb2 .IrisClusterConfig ) -> None :
513+ """Stop the controller and clean up its resources."""
514+ ...
515+
486516 def stop_all (
487517 self ,
488518 config : config_pb2 .IrisClusterConfig ,
@@ -493,11 +523,6 @@ def stop_all(
493523
494524 When dry_run=True, discovers resources but does not terminate them.
495525 Returns list of resource names that were (or would be) terminated.
496-
497- Each platform implements its own teardown strategy:
498- - GCP/Manual: list_all_slices + terminate each + stop_controller (parallel)
499- - CoreWeave: kubectl delete NodePools + controller resources
500- - Local: terminate slices + stop controller
501526 """
502527 ...
503528
0 commit comments