@@ -454,12 +454,45 @@ configs:
454454 }
455455 }
456456
457- // Create cluster
457+ // Create cluster with retry logic
458+ // k3d cluster creation can fail intermittently when starting many nodes (30+) due to
459+ // the "thundering herd" effect - all agents trying to register with the server simultaneously.
460+ // Retrying usually succeeds as the failure is transient.
461+ const maxClusterCreateRetries = 3
462+ const clusterCreateRetryDelay = 5 * time .Second
463+
458464 logger .Debugf ("🚀 Creating cluster '%s' with %d server(s) and %d worker node(s)..." ,
459465 k3dConfig .Name , cfg .ControlPlaneNodes , cfg .WorkerNodes )
460466
461- if err := k3dclient .ClusterRun (ctx , runtimes .Docker , k3dConfig ); err != nil {
462- return nil , cleanup , fmt .Errorf ("failed to create cluster: %w" , err )
467+ var createErr error
468+ for attempt := 1 ; attempt <= maxClusterCreateRetries ; attempt ++ {
469+ if attempt > 1 {
470+ logger .Warnf ("🔄 Retrying cluster creation (attempt %d/%d)..." , attempt , maxClusterCreateRetries )
471+
472+ // Clean up failed cluster before retry
473+ _ = k3dclient .ClusterDelete (ctx , runtimes .Docker , & k3dConfig .Cluster , k3d.ClusterDeleteOpts {})
474+
475+ // Clean up registry if enabled (it may have been partially created)
476+ if cfg .EnableRegistry {
477+ _ = ensureRegistryDoesNotExist (ctx , cfg .Name , logger )
478+ }
479+
480+ time .Sleep (clusterCreateRetryDelay )
481+ }
482+
483+ createErr = k3dclient .ClusterRun (ctx , runtimes .Docker , k3dConfig )
484+ if createErr == nil {
485+ if attempt > 1 {
486+ logger .Infof ("✅ Cluster creation succeeded on attempt %d/%d" , attempt , maxClusterCreateRetries )
487+ }
488+ break
489+ }
490+
491+ logger .Errorf ("❌ Cluster creation failed (attempt %d/%d): %v" , attempt , maxClusterCreateRetries , createErr )
492+ }
493+
494+ if createErr != nil {
495+ return nil , cleanup , fmt .Errorf ("failed to create cluster after %d attempts: %w" , maxClusterCreateRetries , createErr )
463496 }
464497
465498 // Get kubeconfig
0 commit comments