@@ -454,12 +454,14 @@ configs:
454454 }
455455 }
456456
457- // Create cluster with retry logic
457+ // Create cluster with retry logic and timeout
458458 // k3d cluster creation can fail intermittently when starting many nodes (30+) due to
459459 // the "thundering herd" effect - all agents trying to register with the server simultaneously.
460- // Retrying usually succeeds as the failure is transient.
460+ // Additionally, ClusterRun can hang indefinitely if a node gets stuck during startup.
461+ // Each attempt is bounded by a timeout so a hang is detected and retried; the retry usually succeeds.
461462 const maxClusterCreateRetries = 3
462463 const clusterCreateRetryDelay = 5 * time .Second
464+ const clusterCreateTimeout = 5 * time .Minute
463465
464466 logger .Debugf ("🚀 Creating cluster '%s' with %d server(s) and %d worker node(s)..." ,
465467 k3dConfig .Name , cfg .ControlPlaneNodes , cfg .WorkerNodes )
@@ -469,7 +471,7 @@ configs:
469471 if attempt > 1 {
470472 logger .Warnf ("🔄 Retrying cluster creation (attempt %d/%d)..." , attempt , maxClusterCreateRetries )
471473
472- // Clean up failed cluster before retry
474+ // Clean up failed cluster before retry - use the parent ctx, because the previous attempt's timeout context has already been cancelled
473475 _ = k3dclient .ClusterDelete (ctx , runtimes .Docker , & k3dConfig .Cluster , k3d.ClusterDeleteOpts {})
474476
475477 // Clean up registry if enabled (it may have been partially created)
@@ -480,15 +482,25 @@ configs:
480482 time .Sleep (clusterCreateRetryDelay )
481483 }
482484
483- createErr = k3dclient .ClusterRun (ctx , runtimes .Docker , k3dConfig )
485+ // Create a timeout context for this attempt
486+ // ClusterRun can hang indefinitely if node startup gets stuck
487+ attemptCtx , cancel := context .WithTimeout (ctx , clusterCreateTimeout )
488+ createErr = k3dclient .ClusterRun (attemptCtx , runtimes .Docker , k3dConfig )
489+ cancel () // Always cancel to release resources
490+
484491 if createErr == nil {
485492 if attempt > 1 {
486493 logger .Infof ("✅ Cluster creation succeeded on attempt %d/%d" , attempt , maxClusterCreateRetries )
487494 }
488495 break
489496 }
490497
491- logger .Errorf ("❌ Cluster creation failed (attempt %d/%d): %v" , attempt , maxClusterCreateRetries , createErr )
498+ // Check if it was a timeout: Err() keeps reporting DeadlineExceeded even though cancel() has already been called
499+ if attemptCtx .Err () == context .DeadlineExceeded {
500+ logger .Errorf ("❌ Cluster creation timed out after %v (attempt %d/%d)" , clusterCreateTimeout , attempt , maxClusterCreateRetries )
501+ } else {
502+ logger .Errorf ("❌ Cluster creation failed (attempt %d/%d): %v" , attempt , maxClusterCreateRetries , createErr )
503+ }
492504 }
493505
494506 if createErr != nil {
0 commit comments