Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 48 additions & 3 deletions operator/e2e/setup/k8s_clusters.go
Original file line number Diff line number Diff line change
Expand Up @@ -516,12 +516,57 @@ configs:
}
}

// Create cluster
// Create cluster with retry logic and timeout
// k3d cluster creation can fail intermittently when starting many nodes (30+) due to
// the "thundering herd" effect - all agents trying to register with the server simultaneously.
// Additionally, ClusterRun can hang indefinitely if a node gets stuck during startup.
// We use a timeout to detect hangs and retry, which usually succeeds.
const maxClusterCreateRetries = 3
const clusterCreateRetryDelay = 5 * time.Second
const clusterCreateTimeout = 5 * time.Minute

logger.Debugf("🚀 Creating cluster '%s' with %d server(s) and %d worker node(s)...",
k3dConfig.Name, cfg.ControlPlaneNodes, cfg.WorkerNodes)

if err := k3dclient.ClusterRun(ctx, runtimes.Docker, k3dConfig); err != nil {
return nil, cleanup, fmt.Errorf("failed to create cluster: %w", err)
var createErr error
for attempt := 1; attempt <= maxClusterCreateRetries; attempt++ {
if attempt > 1 {
logger.Warnf("🔄 Retrying cluster creation (attempt %d/%d)...", attempt, maxClusterCreateRetries)

// Clean up failed cluster before retry - use original context for cleanup
_ = k3dclient.ClusterDelete(ctx, runtimes.Docker, &k3dConfig.Cluster, k3d.ClusterDeleteOpts{})

// Clean up registry if enabled (it may have been partially created)
if cfg.EnableRegistry {
_ = ensureRegistryDoesNotExist(ctx, cfg.Name, logger)
}

time.Sleep(clusterCreateRetryDelay)
}

// Create a timeout context for this attempt
// ClusterRun can hang indefinitely if node startup gets stuck
attemptCtx, cancel := context.WithTimeout(ctx, clusterCreateTimeout)
createErr = k3dclient.ClusterRun(attemptCtx, runtimes.Docker, k3dConfig)
cancel() // Always cancel to release resources

if createErr == nil {
if attempt > 1 {
logger.Infof("✅ Cluster creation succeeded on attempt %d/%d", attempt, maxClusterCreateRetries)
}
break
}

// Check if it was a timeout
if attemptCtx.Err() == context.DeadlineExceeded {
logger.Errorf("❌ Cluster creation timed out after %v (attempt %d/%d)", clusterCreateTimeout, attempt, maxClusterCreateRetries)
} else {
logger.Errorf("❌ Cluster creation failed (attempt %d/%d): %v", attempt, maxClusterCreateRetries, createErr)
}
}

if createErr != nil {
return nil, cleanup, fmt.Errorf("failed to create cluster after %d attempts: %w", maxClusterCreateRetries, createErr)
}

// Get kubeconfig
Expand Down
Loading