|
| 1 | +// /* |
| 2 | +// Copyright 2025 The Grove Authors. |
| 3 | +// |
| 4 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | +// you may not use this file except in compliance with the License. |
| 6 | +// You may obtain a copy of the License at |
| 7 | +// |
| 8 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +// |
| 10 | +// Unless required by applicable law or agreed to in writing, software |
| 11 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | +// See the License for the specific language governing permissions and |
| 14 | +// limitations under the License. |
| 15 | +// */ |
| 16 | + |
| 17 | +package main |
| 18 | + |
| 19 | +import ( |
| 20 | + "context" |
| 21 | + "fmt" |
| 22 | + "os" |
| 23 | + "os/exec" |
| 24 | + "os/signal" |
| 25 | + "path/filepath" |
| 26 | + "strings" |
| 27 | + "syscall" |
| 28 | + |
| 29 | + "github.com/ai-dynamo/grove/operator/e2e/setup" |
| 30 | + "github.com/ai-dynamo/grove/operator/e2e/utils" |
| 31 | + |
| 32 | + "github.com/alecthomas/kong" |
| 33 | + "golang.org/x/term" |
| 34 | + "k8s.io/client-go/tools/clientcmd" |
| 35 | + clientcmdapi "k8s.io/client-go/tools/clientcmd/api" |
| 36 | +) |
| 37 | + |
| 38 | +// CLI defines the command-line interface using Kong struct tags. |
| 39 | +// All cluster configuration options default to the values from setup.DefaultE2EClusterConfig() |
| 40 | +// to ensure consistency with e2e tests. |
| 41 | +type CLI struct { |
| 42 | + // Cluster configuration overrides (defaults come from setup.DefaultE2EClusterConfig()) |
| 43 | + Name *string `name:"name" help:"Name of the K3D cluster"` |
| 44 | + ControlPlaneNodes *int `name:"control-plane-nodes" help:"Number of control plane nodes"` |
| 45 | + WorkerNodes *int `name:"worker-nodes" help:"Number of worker nodes"` |
| 46 | + K3sImage *string `name:"k3s-image" help:"K3s Docker image to use"` |
| 47 | + APIPort *string `name:"api-port" help:"Port on host to expose Kubernetes API"` |
| 48 | + LBPort *string `name:"lb-port" help:"Load balancer port mapping (host:container)"` |
| 49 | + WorkerMemory *string `name:"worker-memory" help:"Memory allocation for worker nodes"` |
| 50 | + EnableRegistry *bool `name:"enable-registry" help:"Enable built-in Docker registry" negatable:""` |
| 51 | + RegistryPort *string `name:"registry-port" help:"Port for the Docker registry"` |
| 52 | + |
| 53 | + // Deployment options |
| 54 | + SkaffoldPath string `name:"skaffold-path" help:"Path to skaffold.yaml (defaults to repo root)" type:"path"` |
| 55 | + |
| 56 | + // Test images |
| 57 | + TestImages []string `name:"test-images" help:"Test images to pre-load into registry" default:"nginx:alpine-slim"` |
| 58 | + |
| 59 | + // Logging |
| 60 | + Verbose bool `name:"verbose" short:"v" help:"Enable verbose logging"` |
| 61 | + Quiet bool `name:"quiet" short:"q" help:"Suppress non-error output"` |
| 62 | +} |
| 63 | + |
| 64 | +func main() { |
| 65 | + var cli CLI |
| 66 | + ctx := kong.Parse(&cli, |
| 67 | + kong.Name("setup-debug-cluster"), |
| 68 | + kong.Description("Create a K3D cluster with Grove operator for local development and debugging.\n\n"+ |
| 69 | + "This command handles all setup steps including:\n"+ |
| 70 | + " - Creating the K3D cluster\n"+ |
| 71 | + " - Setting up a Docker registry\n"+ |
| 72 | + " - Pre-pulling and caching images\n"+ |
| 73 | + " - Deploying Grove operator via Skaffold\n"+ |
| 74 | + " - Installing Kai scheduler via Helm"), |
| 75 | + kong.UsageOnError(), |
| 76 | + ) |
| 77 | + |
| 78 | + if err := run(&cli); err != nil { |
| 79 | + ctx.FatalIfErrorf(err) |
| 80 | + } |
| 81 | +} |
| 82 | + |
| 83 | +// run executes the main logic for setting up the debug cluster. |
| 84 | +func run(cli *CLI) error { |
| 85 | + // Set up logging |
| 86 | + logger := utils.NewTestLogger(getLogLevel(cli)) |
| 87 | + |
| 88 | + // Start with the default cluster configuration |
| 89 | + // This includes all node labels and taints required for Grove e2e testing |
| 90 | + cfg := setup.DefaultClusterConfig() |
| 91 | + |
| 92 | + // Apply CLI overrides if provided |
| 93 | + if cli.Name != nil { |
| 94 | + cfg.Name = *cli.Name |
| 95 | + } |
| 96 | + if cli.ControlPlaneNodes != nil { |
| 97 | + cfg.ControlPlaneNodes = *cli.ControlPlaneNodes |
| 98 | + } |
| 99 | + if cli.WorkerNodes != nil { |
| 100 | + cfg.WorkerNodes = *cli.WorkerNodes |
| 101 | + } |
| 102 | + if cli.K3sImage != nil { |
| 103 | + cfg.Image = *cli.K3sImage |
| 104 | + } |
| 105 | + if cli.APIPort != nil { |
| 106 | + cfg.HostPort = *cli.APIPort |
| 107 | + } |
| 108 | + if cli.LBPort != nil { |
| 109 | + cfg.LoadBalancerPort = *cli.LBPort |
| 110 | + } |
| 111 | + if cli.WorkerMemory != nil { |
| 112 | + cfg.WorkerMemory = *cli.WorkerMemory |
| 113 | + } |
| 114 | + if cli.EnableRegistry != nil { |
| 115 | + cfg.EnableRegistry = *cli.EnableRegistry |
| 116 | + } |
| 117 | + if cli.RegistryPort != nil { |
| 118 | + cfg.RegistryPort = *cli.RegistryPort |
| 119 | + } |
| 120 | + |
| 121 | + // Determine skaffold path |
| 122 | + skaffoldPath := cli.SkaffoldPath |
| 123 | + if skaffoldPath == "" { |
| 124 | + // Find skaffold.yaml relative to git repo root |
| 125 | + skaffoldPath = findSkaffoldYAML() |
| 126 | + if skaffoldPath == "" { |
| 127 | + return fmt.Errorf("could not find skaffold.yaml. Are you running from within the grove repo? You can also specify the path with --skaffold-path") |
| 128 | + } |
| 129 | + logger.Debugf("Using skaffold.yaml from: %s", skaffoldPath) |
| 130 | + } else { |
| 131 | + // Validate specified skaffold path exists |
| 132 | + if _, err := os.Stat(skaffoldPath); err != nil { |
| 133 | + return fmt.Errorf("skaffold file not found at %s: %w", skaffoldPath, err) |
| 134 | + } |
| 135 | + } |
| 136 | + |
| 137 | + // Create context that cancels on SIGINT/SIGTERM |
| 138 | + runCtx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) |
| 139 | + defer cancel() |
| 140 | + |
| 141 | + // Print configuration |
| 142 | + if !cli.Quiet { |
| 143 | + printConfiguration(&cfg, logger) |
| 144 | + } |
| 145 | + |
| 146 | + // Set up the cluster |
| 147 | + logger.Info("🚀 Setting up K3D cluster with Grove operator...") |
| 148 | + |
| 149 | + _, cleanup, err := setup.SetupCompleteK3DCluster(runCtx, cfg, skaffoldPath, logger) |
| 150 | + if err != nil { |
| 151 | + logger.Errorf("Failed to setup K3D cluster: %v", err) |
| 152 | + if cleanup != nil { |
| 153 | + logger.Info("Running cleanup...") |
| 154 | + cleanup() |
| 155 | + } |
| 156 | + return fmt.Errorf("failed to setup K3D cluster: %w", err) |
| 157 | + } |
| 158 | + |
| 159 | + // Setup test images in registry (matching what e2e tests do) |
| 160 | + logger.Infof("📦 Pre-loading %d test image(s) to registry...", len(cli.TestImages)) |
| 161 | + if err := setup.SetupRegistryTestImages(cfg.RegistryPort, cli.TestImages); err != nil { |
| 162 | + logger.Warnf("⚠️ Failed to pre-load test images (you can push them manually): %v", err) |
| 163 | + // Don't fail - user can push images manually if needed |
| 164 | + } else { |
| 165 | + logger.Info("✅ Test images successfully pre-loaded to registry") |
| 166 | + } |
| 167 | + |
| 168 | + // Write kubeconfig to KUBECONFIG env var or default location |
| 169 | + kubeconfigPath, err := writeKubeconfig(runCtx, cfg.Name, logger) |
| 170 | + if err != nil { |
| 171 | + logger.Errorf("Failed to write kubeconfig: %v", err) |
| 172 | + logger.Info("Running cleanup...") |
| 173 | + cleanup() |
| 174 | + return fmt.Errorf("failed to write kubeconfig: %w", err) |
| 175 | + } |
| 176 | + |
| 177 | + // Success message |
| 178 | + logger.Info("✅ K3D cluster successfully created!") |
| 179 | + logger.Infof("Cluster name: %s", cfg.Name) |
| 180 | + logger.Infof("API server: https://localhost:%s", cfg.HostPort) |
| 181 | + if cfg.EnableRegistry { |
| 182 | + logger.Infof("Docker registry: localhost:%s", cfg.RegistryPort) |
| 183 | + } |
| 184 | + logger.Infof("Kubeconfig written to: %s", kubeconfigPath) |
| 185 | + |
| 186 | + // Print kubectl config instructions |
| 187 | + fmt.Println("\nTo use this cluster:") |
| 188 | + if kubeconfigPath != clientcmd.RecommendedHomeFile { |
| 189 | + fmt.Printf(" export KUBECONFIG=%s\n", kubeconfigPath) |
| 190 | + } |
| 191 | + fmt.Printf(" kubectl cluster-info\n\n") |
| 192 | + |
| 193 | + // Print teardown instructions |
| 194 | + fmt.Println("To tear down the cluster:") |
| 195 | + fmt.Printf(" k3d cluster delete %s\n\n", cfg.Name) |
| 196 | + |
| 197 | + // If running interactively, wait for signal |
| 198 | + if term.IsTerminal(int(os.Stdin.Fd())) { |
| 199 | + fmt.Println("Press Ctrl+C to tear down the cluster...") |
| 200 | + <-runCtx.Done() |
| 201 | + |
| 202 | + logger.Info("Tearing down cluster...") |
| 203 | + cleanup() |
| 204 | + logger.Info("✅ Cluster teardown complete") |
| 205 | + } else { |
| 206 | + logger.Info("Cluster is ready. Run 'k3d cluster delete " + cfg.Name + "' to tear it down.") |
| 207 | + } |
| 208 | + |
| 209 | + return nil |
| 210 | +} |
| 211 | + |
| 212 | +// getLogLevel returns the log level based on CLI flags. |
| 213 | +func getLogLevel(cli *CLI) utils.LogLevel { |
| 214 | + if cli.Quiet { |
| 215 | + return utils.ErrorLevel |
| 216 | + } |
| 217 | + if cli.Verbose { |
| 218 | + return utils.DebugLevel |
| 219 | + } |
| 220 | + return utils.InfoLevel |
| 221 | +} |
| 222 | + |
| 223 | +// printConfiguration logs the cluster configuration. |
| 224 | +func printConfiguration(cfg *setup.ClusterConfig, logger *utils.Logger) { |
| 225 | + logger.Info("Cluster Configuration:") |
| 226 | + logger.Infof(" Name: %s", cfg.Name) |
| 227 | + logger.Infof(" Control Plane Nodes: %d", cfg.ControlPlaneNodes) |
| 228 | + logger.Infof(" Worker Nodes: %d", cfg.WorkerNodes) |
| 229 | + logger.Infof(" K3s Image: %s", cfg.Image) |
| 230 | + logger.Infof(" API Port: %s", cfg.HostPort) |
| 231 | + logger.Infof(" Load Balancer Port: %s", cfg.LoadBalancerPort) |
| 232 | + logger.Infof(" Worker Memory: %s", cfg.WorkerMemory) |
| 233 | + if cfg.EnableRegistry { |
| 234 | + logger.Infof(" Registry: Enabled (port %s)", cfg.RegistryPort) |
| 235 | + } else { |
| 236 | + logger.Info(" Registry: Disabled") |
| 237 | + } |
| 238 | + logger.Info("") |
| 239 | +} |
| 240 | + |
| 241 | +// findSkaffoldYAML finds skaffold.yaml by locating the git repo root. |
| 242 | +func findSkaffoldYAML() string { |
| 243 | + // Use git to find the repo root - works from anywhere in the repo |
| 244 | + cmd := exec.Command("git", "rev-parse", "--show-toplevel") |
| 245 | + output, err := cmd.Output() |
| 246 | + if err != nil { |
| 247 | + return "" |
| 248 | + } |
| 249 | + |
| 250 | + repoRoot := strings.TrimSpace(string(output)) |
| 251 | + skaffoldPath := filepath.Join(repoRoot, "operator", "skaffold.yaml") |
| 252 | + |
| 253 | + if _, err := os.Stat(skaffoldPath); err == nil { |
| 254 | + return skaffoldPath |
| 255 | + } |
| 256 | + |
| 257 | + return "" |
| 258 | +} |
| 259 | + |
| 260 | +// getKubeconfigPath returns KUBECONFIG env var or ~/.kube/config. |
| 261 | +func getKubeconfigPath() string { |
| 262 | + if kubeconfigEnv := os.Getenv("KUBECONFIG"); kubeconfigEnv != "" { |
| 263 | + return kubeconfigEnv |
| 264 | + } |
| 265 | + return clientcmd.RecommendedHomeFile |
| 266 | +} |
| 267 | + |
| 268 | +// writeKubeconfig writes the cluster kubeconfig, merging with existing config if present. |
| 269 | +func writeKubeconfig(ctx context.Context, clusterName string, logger *utils.Logger) (string, error) { |
| 270 | + logger.Debug("📄 Fetching kubeconfig from k3d cluster...") |
| 271 | + |
| 272 | + // Get kubeconfig from k3d |
| 273 | + kubeconfig, err := setup.GetKubeconfig(ctx, clusterName) |
| 274 | + if err != nil { |
| 275 | + return "", err |
| 276 | + } |
| 277 | + |
| 278 | + // Determine target path |
| 279 | + targetPath := getKubeconfigPath() |
| 280 | + if targetPath == "" { |
| 281 | + return "", fmt.Errorf("could not determine kubeconfig path") |
| 282 | + } |
| 283 | + |
| 284 | + // Ensure directory exists |
| 285 | + dir := filepath.Dir(targetPath) |
| 286 | + if err := os.MkdirAll(dir, 0755); err != nil { |
| 287 | + return "", fmt.Errorf("failed to create kubeconfig directory: %w", err) |
| 288 | + } |
| 289 | + |
| 290 | + // Check if kubeconfig file already exists |
| 291 | + var existingConfig *clientcmdapi.Config |
| 292 | + if _, err := os.Stat(targetPath); err == nil { |
| 293 | + // File exists, load it |
| 294 | + logger.Debugf("Loading existing kubeconfig from %s", targetPath) |
| 295 | + existingConfig, err = clientcmd.LoadFromFile(targetPath) |
| 296 | + if err != nil { |
| 297 | + logger.Warnf("Failed to load existing kubeconfig, will overwrite: %v", err) |
| 298 | + existingConfig = nil |
| 299 | + } |
| 300 | + } |
| 301 | + |
| 302 | + // Merge or use new config |
| 303 | + var finalConfig *clientcmdapi.Config |
| 304 | + if existingConfig != nil { |
| 305 | + // Merge the new cluster config into existing |
| 306 | + logger.Debug("Merging new cluster config with existing kubeconfig") |
| 307 | + finalConfig = mergeKubeconfigs(existingConfig, kubeconfig, clusterName) |
| 308 | + } else { |
| 309 | + finalConfig = kubeconfig |
| 310 | + } |
| 311 | + |
| 312 | + // Write the kubeconfig |
| 313 | + if err := clientcmd.WriteToFile(*finalConfig, targetPath); err != nil { |
| 314 | + return "", fmt.Errorf("failed to write kubeconfig to %s: %w", targetPath, err) |
| 315 | + } |
| 316 | + |
| 317 | + logger.Debugf("✓ Kubeconfig written to %s", targetPath) |
| 318 | + return targetPath, nil |
| 319 | +} |
| 320 | + |
| 321 | +// mergeKubeconfigs merges a new kubeconfig into an existing one. |
| 322 | +func mergeKubeconfigs(existing, new *clientcmdapi.Config, _ string) *clientcmdapi.Config { |
| 323 | + merged := existing.DeepCopy() |
| 324 | + |
| 325 | + // Add/update cluster |
| 326 | + for name, cluster := range new.Clusters { |
| 327 | + merged.Clusters[name] = cluster |
| 328 | + } |
| 329 | + |
| 330 | + // Add/update auth info |
| 331 | + for name, authInfo := range new.AuthInfos { |
| 332 | + merged.AuthInfos[name] = authInfo |
| 333 | + } |
| 334 | + |
| 335 | + // Add/update context |
| 336 | + for name, context := range new.Contexts { |
| 337 | + merged.Contexts[name] = context |
| 338 | + // Set the new cluster as current context |
| 339 | + merged.CurrentContext = name |
| 340 | + } |
| 341 | + |
| 342 | + return merged |
| 343 | +} |
| 344 | + |
0 commit comments