Skip to content

Commit b6e5a75

Browse files
committed
retry cluster creation when it fails
1 parent a83a55e commit b6e5a75

File tree

1 file changed

+36
-3
lines changed

1 file changed

+36
-3
lines changed

operator/e2e/setup/k8s_clusters.go

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -454,12 +454,45 @@ configs:
454454
}
455455
}
456456

457-
// Create cluster
457+
// Create cluster with retry logic
458+
// k3d cluster creation can fail intermittently when starting many nodes (30+) due to
459+
// the "thundering herd" effect: all agents try to register with the server simultaneously.
460+
// Retrying usually succeeds as the failure is transient.
461+
const maxClusterCreateRetries = 3
462+
const clusterCreateRetryDelay = 5 * time.Second
463+
458464
logger.Debugf("🚀 Creating cluster '%s' with %d server(s) and %d worker node(s)...",
459465
k3dConfig.Name, cfg.ControlPlaneNodes, cfg.WorkerNodes)
460466

461-
if err := k3dclient.ClusterRun(ctx, runtimes.Docker, k3dConfig); err != nil {
462-
return nil, cleanup, fmt.Errorf("failed to create cluster: %w", err)
467+
var createErr error
468+
for attempt := 1; attempt <= maxClusterCreateRetries; attempt++ {
469+
if attempt > 1 {
470+
logger.Warnf("🔄 Retrying cluster creation (attempt %d/%d)...", attempt, maxClusterCreateRetries)
471+
472+
// Clean up failed cluster before retry
473+
_ = k3dclient.ClusterDelete(ctx, runtimes.Docker, &k3dConfig.Cluster, k3d.ClusterDeleteOpts{})
474+
475+
// Clean up registry if enabled (it may have been partially created)
476+
if cfg.EnableRegistry {
477+
_ = ensureRegistryDoesNotExist(ctx, cfg.Name, logger)
478+
}
479+
480+
time.Sleep(clusterCreateRetryDelay)
481+
}
482+
483+
createErr = k3dclient.ClusterRun(ctx, runtimes.Docker, k3dConfig)
484+
if createErr == nil {
485+
if attempt > 1 {
486+
logger.Infof("✅ Cluster creation succeeded on attempt %d/%d", attempt, maxClusterCreateRetries)
487+
}
488+
break
489+
}
490+
491+
logger.Errorf("❌ Cluster creation failed (attempt %d/%d): %v", attempt, maxClusterCreateRetries, createErr)
492+
}
493+
494+
if createErr != nil {
495+
return nil, cleanup, fmt.Errorf("failed to create cluster after %d attempts: %w", maxClusterCreateRetries, createErr)
463496
}
464497

465498
// Get kubeconfig

0 commit comments

Comments
 (0)