Skip to content

Commit 5428cdd

Browse files
authored
add command for creating k3d debug cluster identical to one used in E2E (#361)
* add command for creating k3d debug cluster identical to one used in E2E * clean up cluster delete * add minimal readme
1 parent 69ac2be commit 5428cdd

File tree

6 files changed

+472
-55
lines changed

6 files changed

+472
-55
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# setup-debug-cluster
2+
3+
Creates a local K3D cluster identical to the one used in E2E tests, with Grove operator and Kai scheduler pre-installed.
4+
5+
## Usage
6+
7+
```bash
8+
# From this directory
9+
go run .
10+
11+
# Or build and run
12+
go build
13+
./setup-debug-cluster
14+
```
15+
16+
## Options
17+
18+
```
19+
--name Cluster name (default: "grove-e2e-cluster")
20+
--worker-nodes Number of worker nodes (default: 30)
21+
--verbose, -v Enable verbose logging
22+
--quiet, -q Suppress non-error output
23+
--help Show all options
24+
```
25+
26+
## Teardown
27+
28+
Press `Ctrl+C` if running interactively, or:
29+
30+
```bash
31+
k3d cluster delete grove-e2e-cluster
32+
```
Lines changed: 344 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,344 @@
1+
// /*
2+
// Copyright 2025 The Grove Authors.
3+
//
4+
// Licensed under the Apache License, Version 2.0 (the "License");
5+
// you may not use this file except in compliance with the License.
6+
// You may obtain a copy of the License at
7+
//
8+
// http://www.apache.org/licenses/LICENSE-2.0
9+
//
10+
// Unless required by applicable law or agreed to in writing, software
11+
// distributed under the License is distributed on an "AS IS" BASIS,
12+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
// See the License for the specific language governing permissions and
14+
// limitations under the License.
15+
// */
16+
17+
package main
18+
19+
import (
20+
"context"
21+
"fmt"
22+
"os"
23+
"os/exec"
24+
"os/signal"
25+
"path/filepath"
26+
"strings"
27+
"syscall"
28+
29+
"github.com/ai-dynamo/grove/operator/e2e/setup"
30+
"github.com/ai-dynamo/grove/operator/e2e/utils"
31+
32+
"github.com/alecthomas/kong"
33+
"golang.org/x/term"
34+
"k8s.io/client-go/tools/clientcmd"
35+
clientcmdapi "k8s.io/client-go/tools/clientcmd/api"
36+
)
37+
38+
// CLI defines the command-line interface using Kong struct tags.
39+
// All cluster configuration options default to the values from setup.DefaultE2EClusterConfig()
40+
// to ensure consistency with e2e tests.
41+
type CLI struct {
42+
// Cluster configuration overrides (defaults come from setup.DefaultE2EClusterConfig())
43+
Name *string `name:"name" help:"Name of the K3D cluster"`
44+
ControlPlaneNodes *int `name:"control-plane-nodes" help:"Number of control plane nodes"`
45+
WorkerNodes *int `name:"worker-nodes" help:"Number of worker nodes"`
46+
K3sImage *string `name:"k3s-image" help:"K3s Docker image to use"`
47+
APIPort *string `name:"api-port" help:"Port on host to expose Kubernetes API"`
48+
LBPort *string `name:"lb-port" help:"Load balancer port mapping (host:container)"`
49+
WorkerMemory *string `name:"worker-memory" help:"Memory allocation for worker nodes"`
50+
EnableRegistry *bool `name:"enable-registry" help:"Enable built-in Docker registry" negatable:""`
51+
RegistryPort *string `name:"registry-port" help:"Port for the Docker registry"`
52+
53+
// Deployment options
54+
SkaffoldPath string `name:"skaffold-path" help:"Path to skaffold.yaml (defaults to repo root)" type:"path"`
55+
56+
// Test images
57+
TestImages []string `name:"test-images" help:"Test images to pre-load into registry" default:"nginx:alpine-slim"`
58+
59+
// Logging
60+
Verbose bool `name:"verbose" short:"v" help:"Enable verbose logging"`
61+
Quiet bool `name:"quiet" short:"q" help:"Suppress non-error output"`
62+
}
63+
64+
func main() {
65+
var cli CLI
66+
ctx := kong.Parse(&cli,
67+
kong.Name("setup-debug-cluster"),
68+
kong.Description("Create a K3D cluster with Grove operator for local development and debugging.\n\n"+
69+
"This command handles all setup steps including:\n"+
70+
" - Creating the K3D cluster\n"+
71+
" - Setting up a Docker registry\n"+
72+
" - Pre-pulling and caching images\n"+
73+
" - Deploying Grove operator via Skaffold\n"+
74+
" - Installing Kai scheduler via Helm"),
75+
kong.UsageOnError(),
76+
)
77+
78+
if err := run(&cli); err != nil {
79+
ctx.FatalIfErrorf(err)
80+
}
81+
}
82+
83+
// run executes the main logic for setting up the debug cluster.
84+
func run(cli *CLI) error {
85+
// Set up logging
86+
logger := utils.NewTestLogger(getLogLevel(cli))
87+
88+
// Start with the default cluster configuration
89+
// This includes all node labels and taints required for Grove e2e testing
90+
cfg := setup.DefaultClusterConfig()
91+
92+
// Apply CLI overrides if provided
93+
if cli.Name != nil {
94+
cfg.Name = *cli.Name
95+
}
96+
if cli.ControlPlaneNodes != nil {
97+
cfg.ControlPlaneNodes = *cli.ControlPlaneNodes
98+
}
99+
if cli.WorkerNodes != nil {
100+
cfg.WorkerNodes = *cli.WorkerNodes
101+
}
102+
if cli.K3sImage != nil {
103+
cfg.Image = *cli.K3sImage
104+
}
105+
if cli.APIPort != nil {
106+
cfg.HostPort = *cli.APIPort
107+
}
108+
if cli.LBPort != nil {
109+
cfg.LoadBalancerPort = *cli.LBPort
110+
}
111+
if cli.WorkerMemory != nil {
112+
cfg.WorkerMemory = *cli.WorkerMemory
113+
}
114+
if cli.EnableRegistry != nil {
115+
cfg.EnableRegistry = *cli.EnableRegistry
116+
}
117+
if cli.RegistryPort != nil {
118+
cfg.RegistryPort = *cli.RegistryPort
119+
}
120+
121+
// Determine skaffold path
122+
skaffoldPath := cli.SkaffoldPath
123+
if skaffoldPath == "" {
124+
// Find skaffold.yaml relative to git repo root
125+
skaffoldPath = findSkaffoldYAML()
126+
if skaffoldPath == "" {
127+
return fmt.Errorf("could not find skaffold.yaml. Are you running from within the grove repo? You can also specify the path with --skaffold-path")
128+
}
129+
logger.Debugf("Using skaffold.yaml from: %s", skaffoldPath)
130+
} else {
131+
// Validate specified skaffold path exists
132+
if _, err := os.Stat(skaffoldPath); err != nil {
133+
return fmt.Errorf("skaffold file not found at %s: %w", skaffoldPath, err)
134+
}
135+
}
136+
137+
// Create context that cancels on SIGINT/SIGTERM
138+
runCtx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
139+
defer cancel()
140+
141+
// Print configuration
142+
if !cli.Quiet {
143+
printConfiguration(&cfg, logger)
144+
}
145+
146+
// Set up the cluster
147+
logger.Info("🚀 Setting up K3D cluster with Grove operator...")
148+
149+
_, cleanup, err := setup.SetupCompleteK3DCluster(runCtx, cfg, skaffoldPath, logger)
150+
if err != nil {
151+
logger.Errorf("Failed to setup K3D cluster: %v", err)
152+
if cleanup != nil {
153+
logger.Info("Running cleanup...")
154+
cleanup()
155+
}
156+
return fmt.Errorf("failed to setup K3D cluster: %w", err)
157+
}
158+
159+
// Setup test images in registry (matching what e2e tests do)
160+
logger.Infof("📦 Pre-loading %d test image(s) to registry...", len(cli.TestImages))
161+
if err := setup.SetupRegistryTestImages(cfg.RegistryPort, cli.TestImages); err != nil {
162+
logger.Warnf("⚠️ Failed to pre-load test images (you can push them manually): %v", err)
163+
// Don't fail - user can push images manually if needed
164+
} else {
165+
logger.Info("✅ Test images successfully pre-loaded to registry")
166+
}
167+
168+
// Write kubeconfig to KUBECONFIG env var or default location
169+
kubeconfigPath, err := writeKubeconfig(runCtx, cfg.Name, logger)
170+
if err != nil {
171+
logger.Errorf("Failed to write kubeconfig: %v", err)
172+
logger.Info("Running cleanup...")
173+
cleanup()
174+
return fmt.Errorf("failed to write kubeconfig: %w", err)
175+
}
176+
177+
// Success message
178+
logger.Info("✅ K3D cluster successfully created!")
179+
logger.Infof("Cluster name: %s", cfg.Name)
180+
logger.Infof("API server: https://localhost:%s", cfg.HostPort)
181+
if cfg.EnableRegistry {
182+
logger.Infof("Docker registry: localhost:%s", cfg.RegistryPort)
183+
}
184+
logger.Infof("Kubeconfig written to: %s", kubeconfigPath)
185+
186+
// Print kubectl config instructions
187+
fmt.Println("\nTo use this cluster:")
188+
if kubeconfigPath != clientcmd.RecommendedHomeFile {
189+
fmt.Printf(" export KUBECONFIG=%s\n", kubeconfigPath)
190+
}
191+
fmt.Printf(" kubectl cluster-info\n\n")
192+
193+
// Print teardown instructions
194+
fmt.Println("To tear down the cluster:")
195+
fmt.Printf(" k3d cluster delete %s\n\n", cfg.Name)
196+
197+
// If running interactively, wait for signal
198+
if term.IsTerminal(int(os.Stdin.Fd())) {
199+
fmt.Println("Press Ctrl+C to tear down the cluster...")
200+
<-runCtx.Done()
201+
202+
logger.Info("Tearing down cluster...")
203+
cleanup()
204+
logger.Info("✅ Cluster teardown complete")
205+
} else {
206+
logger.Info("Cluster is ready. Run 'k3d cluster delete " + cfg.Name + "' to tear it down.")
207+
}
208+
209+
return nil
210+
}
211+
212+
// getLogLevel returns the log level based on CLI flags.
213+
func getLogLevel(cli *CLI) utils.LogLevel {
214+
if cli.Quiet {
215+
return utils.ErrorLevel
216+
}
217+
if cli.Verbose {
218+
return utils.DebugLevel
219+
}
220+
return utils.InfoLevel
221+
}
222+
223+
// printConfiguration logs the cluster configuration.
224+
func printConfiguration(cfg *setup.ClusterConfig, logger *utils.Logger) {
225+
logger.Info("Cluster Configuration:")
226+
logger.Infof(" Name: %s", cfg.Name)
227+
logger.Infof(" Control Plane Nodes: %d", cfg.ControlPlaneNodes)
228+
logger.Infof(" Worker Nodes: %d", cfg.WorkerNodes)
229+
logger.Infof(" K3s Image: %s", cfg.Image)
230+
logger.Infof(" API Port: %s", cfg.HostPort)
231+
logger.Infof(" Load Balancer Port: %s", cfg.LoadBalancerPort)
232+
logger.Infof(" Worker Memory: %s", cfg.WorkerMemory)
233+
if cfg.EnableRegistry {
234+
logger.Infof(" Registry: Enabled (port %s)", cfg.RegistryPort)
235+
} else {
236+
logger.Info(" Registry: Disabled")
237+
}
238+
logger.Info("")
239+
}
240+
241+
// findSkaffoldYAML finds skaffold.yaml by locating the git repo root.
242+
func findSkaffoldYAML() string {
243+
// Use git to find the repo root - works from anywhere in the repo
244+
cmd := exec.Command("git", "rev-parse", "--show-toplevel")
245+
output, err := cmd.Output()
246+
if err != nil {
247+
return ""
248+
}
249+
250+
repoRoot := strings.TrimSpace(string(output))
251+
skaffoldPath := filepath.Join(repoRoot, "operator", "skaffold.yaml")
252+
253+
if _, err := os.Stat(skaffoldPath); err == nil {
254+
return skaffoldPath
255+
}
256+
257+
return ""
258+
}
259+
260+
// getKubeconfigPath returns KUBECONFIG env var or ~/.kube/config.
261+
func getKubeconfigPath() string {
262+
if kubeconfigEnv := os.Getenv("KUBECONFIG"); kubeconfigEnv != "" {
263+
return kubeconfigEnv
264+
}
265+
return clientcmd.RecommendedHomeFile
266+
}
267+
268+
// writeKubeconfig writes the cluster kubeconfig, merging with existing config if present.
269+
func writeKubeconfig(ctx context.Context, clusterName string, logger *utils.Logger) (string, error) {
270+
logger.Debug("📄 Fetching kubeconfig from k3d cluster...")
271+
272+
// Get kubeconfig from k3d
273+
kubeconfig, err := setup.GetKubeconfig(ctx, clusterName)
274+
if err != nil {
275+
return "", err
276+
}
277+
278+
// Determine target path
279+
targetPath := getKubeconfigPath()
280+
if targetPath == "" {
281+
return "", fmt.Errorf("could not determine kubeconfig path")
282+
}
283+
284+
// Ensure directory exists
285+
dir := filepath.Dir(targetPath)
286+
if err := os.MkdirAll(dir, 0755); err != nil {
287+
return "", fmt.Errorf("failed to create kubeconfig directory: %w", err)
288+
}
289+
290+
// Check if kubeconfig file already exists
291+
var existingConfig *clientcmdapi.Config
292+
if _, err := os.Stat(targetPath); err == nil {
293+
// File exists, load it
294+
logger.Debugf("Loading existing kubeconfig from %s", targetPath)
295+
existingConfig, err = clientcmd.LoadFromFile(targetPath)
296+
if err != nil {
297+
logger.Warnf("Failed to load existing kubeconfig, will overwrite: %v", err)
298+
existingConfig = nil
299+
}
300+
}
301+
302+
// Merge or use new config
303+
var finalConfig *clientcmdapi.Config
304+
if existingConfig != nil {
305+
// Merge the new cluster config into existing
306+
logger.Debug("Merging new cluster config with existing kubeconfig")
307+
finalConfig = mergeKubeconfigs(existingConfig, kubeconfig, clusterName)
308+
} else {
309+
finalConfig = kubeconfig
310+
}
311+
312+
// Write the kubeconfig
313+
if err := clientcmd.WriteToFile(*finalConfig, targetPath); err != nil {
314+
return "", fmt.Errorf("failed to write kubeconfig to %s: %w", targetPath, err)
315+
}
316+
317+
logger.Debugf("✓ Kubeconfig written to %s", targetPath)
318+
return targetPath, nil
319+
}
320+
321+
// mergeKubeconfigs merges a new kubeconfig into an existing one.
322+
func mergeKubeconfigs(existing, new *clientcmdapi.Config, _ string) *clientcmdapi.Config {
323+
merged := existing.DeepCopy()
324+
325+
// Add/update cluster
326+
for name, cluster := range new.Clusters {
327+
merged.Clusters[name] = cluster
328+
}
329+
330+
// Add/update auth info
331+
for name, authInfo := range new.AuthInfos {
332+
merged.AuthInfos[name] = authInfo
333+
}
334+
335+
// Add/update context
336+
for name, context := range new.Contexts {
337+
merged.Contexts[name] = context
338+
// Set the new cluster as current context
339+
merged.CurrentContext = name
340+
}
341+
342+
return merged
343+
}
344+

0 commit comments

Comments
 (0)