@@ -18,6 +18,7 @@ package setup
1818
1919import (
2020 "context"
21+ "errors"
2122 "fmt"
2223 "path/filepath"
2324 "strings"
@@ -26,6 +27,7 @@ import (
2627
2728 "github.com/ai-dynamo/grove/operator/e2e/utils"
2829 "github.com/docker/docker/api/types/container"
30+ "github.com/docker/docker/api/types/filters"
2931 dockerclient "github.com/docker/docker/client"
3032 "github.com/k3d-io/k3d/v5/pkg/client"
3133 "github.com/k3d-io/k3d/v5/pkg/config"
@@ -94,6 +96,64 @@ func DefaultClusterConfig() ClusterConfig {
9496 }
9597}
9698
99+ // ensureClusterDoesNotExist removes any stale k3d cluster with the same name from previous runs.
100+ func ensureClusterDoesNotExist (ctx context.Context , clusterName string , logger * utils.Logger ) error {
101+ cluster := & k3d.Cluster {Name : clusterName }
102+
103+ existingCluster , err := client .ClusterGet (ctx , runtimes .Docker , cluster )
104+ if err != nil {
105+ if errors .Is (err , client .ClusterGetNoNodesFoundError ) {
106+ return nil
107+ }
108+ return fmt .Errorf ("failed to inspect existing k3d cluster %s: %w" , clusterName , err )
109+ }
110+
111+ logger .Warnf ("🧹 Removing stale k3d cluster '%s' before setup" , clusterName )
112+ if err := client .ClusterDelete (ctx , runtimes .Docker , existingCluster , k3d.ClusterDeleteOpts {}); err != nil {
113+ return fmt .Errorf ("failed to delete existing k3d cluster %s: %w" , clusterName , err )
114+ }
115+
116+ return nil
117+ }
118+
119+ // ensureRegistryDoesNotExist removes any stale k3d registry container from previous runs.
120+ func ensureRegistryDoesNotExist (ctx context.Context , clusterName string , logger * utils.Logger ) error {
121+ registryContainerName := fmt .Sprintf ("k3d-%s-registry" , clusterName )
122+
123+ dockerClient , err := dockerclient .NewClientWithOpts (dockerclient .FromEnv , dockerclient .WithAPIVersionNegotiation ())
124+ if err != nil {
125+ return fmt .Errorf ("failed to create Docker client: %w" , err )
126+ }
127+ defer dockerClient .Close ()
128+
129+ filterArgs := filters .NewArgs ()
130+ filterArgs .Add ("name" , registryContainerName )
131+
132+ containers , err := dockerClient .ContainerList (ctx , container.ListOptions {All : true , Filters : filterArgs })
133+ if err != nil {
134+ return fmt .Errorf ("failed to list Docker containers: %w" , err )
135+ }
136+
137+ if len (containers ) == 0 {
138+ return nil
139+ }
140+
141+ for _ , c := range containers {
142+ displayName := registryContainerName
143+ if len (c .Names ) > 0 {
144+ displayName = strings .TrimPrefix (c .Names [0 ], "/" )
145+ }
146+
147+ logger .Warnf ("🧹 Removing stale k3d registry container %s (%s) before cluster setup" , displayName , c .ID [:12 ])
148+
149+ if err := dockerClient .ContainerRemove (ctx , c .ID , container.RemoveOptions {Force : true , RemoveVolumes : true }); err != nil {
150+ return fmt .Errorf ("failed to remove existing registry container %s: %w" , displayName , err )
151+ }
152+ }
153+
154+ return nil
155+ }
156+
97157// SetupCompleteK3DCluster creates a complete k3d cluster with Grove, Kai Scheduler, and NVIDIA GPU Operator
98158func SetupCompleteK3DCluster (ctx context.Context , cfg ClusterConfig , skaffoldYAMLPath string , logger * utils.Logger ) (* rest.Config , func (), error ) {
99159 restConfig , cleanup , err := SetupK3DCluster (ctx , cfg , logger )
@@ -292,6 +352,16 @@ func SetupK3DCluster(ctx context.Context, cfg ClusterConfig, logger *utils.Logge
292352 return nil , nil , fmt .Errorf ("failed to transform config: %w" , err )
293353 }
294354
355+ if err := ensureClusterDoesNotExist (ctx , k3dConfig .Name , logger ); err != nil {
356+ return nil , nil , err
357+ }
358+
359+ if cfg .EnableRegistry {
360+ if err := ensureRegistryDoesNotExist (ctx , cfg .Name , logger ); err != nil {
361+ return nil , nil , err
362+ }
363+ }
364+
295365 // this is the cleanup function, we always return it now so the caller can decide to use it or not
296366 cleanup := func () {
297367 logger .Debug ("🗑️ Deleting cluster..." )
0 commit comments