Skip to content

Commit 22a9b7e

Browse files
authored
check for existing cluster and delete if it already exists (#239)
Signed-off-by: Geoff Flarity <gflarity@nvidia.com>
1 parent a201efb commit 22a9b7e

File tree

1 file changed

+70
-0
lines changed

1 file changed

+70
-0
lines changed

operator/e2e/setup/k8s_clusters.go

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package setup
1818

1919
import (
2020
"context"
21+
"errors"
2122
"fmt"
2223
"path/filepath"
2324
"strings"
@@ -26,6 +27,7 @@ import (
2627

2728
"github.com/ai-dynamo/grove/operator/e2e/utils"
2829
"github.com/docker/docker/api/types/container"
30+
"github.com/docker/docker/api/types/filters"
2931
dockerclient "github.com/docker/docker/client"
3032
"github.com/k3d-io/k3d/v5/pkg/client"
3133
"github.com/k3d-io/k3d/v5/pkg/config"
@@ -94,6 +96,64 @@ func DefaultClusterConfig() ClusterConfig {
9496
}
9597
}
9698

99+
// ensureClusterDoesNotExist removes any stale k3d cluster with the same name from previous runs.
100+
func ensureClusterDoesNotExist(ctx context.Context, clusterName string, logger *utils.Logger) error {
101+
cluster := &k3d.Cluster{Name: clusterName}
102+
103+
existingCluster, err := client.ClusterGet(ctx, runtimes.Docker, cluster)
104+
if err != nil {
105+
if errors.Is(err, client.ClusterGetNoNodesFoundError) {
106+
return nil
107+
}
108+
return fmt.Errorf("failed to inspect existing k3d cluster %s: %w", clusterName, err)
109+
}
110+
111+
logger.Warnf("🧹 Removing stale k3d cluster '%s' before setup", clusterName)
112+
if err := client.ClusterDelete(ctx, runtimes.Docker, existingCluster, k3d.ClusterDeleteOpts{}); err != nil {
113+
return fmt.Errorf("failed to delete existing k3d cluster %s: %w", clusterName, err)
114+
}
115+
116+
return nil
117+
}
118+
119+
// ensureRegistryDoesNotExist removes any stale k3d registry container from previous runs.
120+
func ensureRegistryDoesNotExist(ctx context.Context, clusterName string, logger *utils.Logger) error {
121+
registryContainerName := fmt.Sprintf("k3d-%s-registry", clusterName)
122+
123+
dockerClient, err := dockerclient.NewClientWithOpts(dockerclient.FromEnv, dockerclient.WithAPIVersionNegotiation())
124+
if err != nil {
125+
return fmt.Errorf("failed to create Docker client: %w", err)
126+
}
127+
defer dockerClient.Close()
128+
129+
filterArgs := filters.NewArgs()
130+
filterArgs.Add("name", registryContainerName)
131+
132+
containers, err := dockerClient.ContainerList(ctx, container.ListOptions{All: true, Filters: filterArgs})
133+
if err != nil {
134+
return fmt.Errorf("failed to list Docker containers: %w", err)
135+
}
136+
137+
if len(containers) == 0 {
138+
return nil
139+
}
140+
141+
for _, c := range containers {
142+
displayName := registryContainerName
143+
if len(c.Names) > 0 {
144+
displayName = strings.TrimPrefix(c.Names[0], "/")
145+
}
146+
147+
logger.Warnf("🧹 Removing stale k3d registry container %s (%s) before cluster setup", displayName, c.ID[:12])
148+
149+
if err := dockerClient.ContainerRemove(ctx, c.ID, container.RemoveOptions{Force: true, RemoveVolumes: true}); err != nil {
150+
return fmt.Errorf("failed to remove existing registry container %s: %w", displayName, err)
151+
}
152+
}
153+
154+
return nil
155+
}
156+
97157
// SetupCompleteK3DCluster creates a complete k3d cluster with Grove, Kai Scheduler, and NVIDIA GPU Operator
98158
func SetupCompleteK3DCluster(ctx context.Context, cfg ClusterConfig, skaffoldYAMLPath string, logger *utils.Logger) (*rest.Config, func(), error) {
99159
restConfig, cleanup, err := SetupK3DCluster(ctx, cfg, logger)
@@ -292,6 +352,16 @@ func SetupK3DCluster(ctx context.Context, cfg ClusterConfig, logger *utils.Logge
292352
return nil, nil, fmt.Errorf("failed to transform config: %w", err)
293353
}
294354

355+
if err := ensureClusterDoesNotExist(ctx, k3dConfig.Name, logger); err != nil {
356+
return nil, nil, err
357+
}
358+
359+
if cfg.EnableRegistry {
360+
if err := ensureRegistryDoesNotExist(ctx, cfg.Name, logger); err != nil {
361+
return nil, nil, err
362+
}
363+
}
364+
295365
// this is the cleanup function, we always return it now so the caller can decide to use it or not
296366
cleanup := func() {
297367
logger.Debug("🗑️ Deleting cluster...")

0 commit comments

Comments
 (0)