diff --git a/operator/e2e/setup/k8s_clusters.go b/operator/e2e/setup/k8s_clusters.go index 49c117a9..2158dbd7 100644 --- a/operator/e2e/setup/k8s_clusters.go +++ b/operator/e2e/setup/k8s_clusters.go @@ -516,12 +516,57 @@ configs: } } - // Create cluster + // Create cluster with retry logic and timeout + // k3d cluster creation can fail intermittently when starting many nodes (30+) due to + // the "thundering herd" effect - all agents trying to register with the server simultaneously. + // Additionally, ClusterRun can hang indefinitely if a node gets stuck during startup. + // We use a timeout to detect hangs and retry, which usually succeeds. + const maxClusterCreateRetries = 3 + const clusterCreateRetryDelay = 5 * time.Second + const clusterCreateTimeout = 5 * time.Minute + logger.Debugf("🚀 Creating cluster '%s' with %d server(s) and %d worker node(s)...", k3dConfig.Name, cfg.ControlPlaneNodes, cfg.WorkerNodes) - if err := k3dclient.ClusterRun(ctx, runtimes.Docker, k3dConfig); err != nil { - return nil, cleanup, fmt.Errorf("failed to create cluster: %w", err) + var createErr error + for attempt := 1; attempt <= maxClusterCreateRetries; attempt++ { + if attempt > 1 { + logger.Warnf("🔄 Retrying cluster creation (attempt %d/%d)...", attempt, maxClusterCreateRetries) + + // Clean up failed cluster before retry - use original context for cleanup + _ = k3dclient.ClusterDelete(ctx, runtimes.Docker, &k3dConfig.Cluster, k3d.ClusterDeleteOpts{}) + + // Clean up registry if enabled (it may have been partially created) + if cfg.EnableRegistry { + _ = ensureRegistryDoesNotExist(ctx, cfg.Name, logger) + } + + time.Sleep(clusterCreateRetryDelay) + } + + // Create a timeout context for this attempt + // ClusterRun can hang indefinitely if node startup gets stuck + attemptCtx, cancel := context.WithTimeout(ctx, clusterCreateTimeout) + createErr = k3dclient.ClusterRun(attemptCtx, runtimes.Docker, k3dConfig) + cancel() // Always cancel to release resources + + if createErr == nil { + if attempt > 1 { + logger.Infof("✅ Cluster creation succeeded on attempt %d/%d", attempt, maxClusterCreateRetries) + } + break + } + + // Check if it was a timeout + if attemptCtx.Err() == context.DeadlineExceeded { + logger.Errorf("❌ Cluster creation timed out after %v (attempt %d/%d)", clusterCreateTimeout, attempt, maxClusterCreateRetries) + } else { + logger.Errorf("❌ Cluster creation failed (attempt %d/%d): %v", attempt, maxClusterCreateRetries, createErr) + } + } + + if createErr != nil { + return nil, cleanup, fmt.Errorf("failed to create cluster after %d attempts: %w", maxClusterCreateRetries, createErr) } // Get kubeconfig