Skip to content

Commit 0c95b7f

Browse files
committed
we also want to time out after 5 minutes to avoid hanging for the full timeout
1 parent bc7308f commit 0c95b7f

File tree

1 file changed

+17
-5
lines changed

1 file changed

+17
-5
lines changed

operator/e2e/setup/k8s_clusters.go

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -454,12 +454,14 @@ configs:
454454
}
455455
}
456456

457-
// Create cluster with retry logic
457+
// Create cluster with retry logic and timeout
458458
// k3d cluster creation can fail intermittently when starting many nodes (30+) due to
459459
// the "thundering herd" effect - all agents trying to register with the server simultaneously.
460-
// Retrying usually succeeds as the failure is transient.
460+
// Additionally, ClusterRun can hang indefinitely if a node gets stuck during startup.
461+
// We use a timeout to detect hangs and retry, which usually succeeds.
461462
const maxClusterCreateRetries = 3
462463
const clusterCreateRetryDelay = 5 * time.Second
464+
const clusterCreateTimeout = 5 * time.Minute
463465

464466
logger.Debugf("🚀 Creating cluster '%s' with %d server(s) and %d worker node(s)...",
465467
k3dConfig.Name, cfg.ControlPlaneNodes, cfg.WorkerNodes)
@@ -469,7 +471,7 @@ configs:
469471
if attempt > 1 {
470472
logger.Warnf("🔄 Retrying cluster creation (attempt %d/%d)...", attempt, maxClusterCreateRetries)
471473

472-
// Clean up failed cluster before retry
474+
// Clean up failed cluster before retry - use original context for cleanup
473475
_ = k3dclient.ClusterDelete(ctx, runtimes.Docker, &k3dConfig.Cluster, k3d.ClusterDeleteOpts{})
474476

475477
// Clean up registry if enabled (it may have been partially created)
@@ -480,15 +482,25 @@ configs:
480482
time.Sleep(clusterCreateRetryDelay)
481483
}
482484

483-
createErr = k3dclient.ClusterRun(ctx, runtimes.Docker, k3dConfig)
485+
// Create a timeout context for this attempt
486+
// ClusterRun can hang indefinitely if node startup gets stuck
487+
attemptCtx, cancel := context.WithTimeout(ctx, clusterCreateTimeout)
488+
createErr = k3dclient.ClusterRun(attemptCtx, runtimes.Docker, k3dConfig)
489+
cancel() // Always cancel to release resources
490+
484491
if createErr == nil {
485492
if attempt > 1 {
486493
logger.Infof("✅ Cluster creation succeeded on attempt %d/%d", attempt, maxClusterCreateRetries)
487494
}
488495
break
489496
}
490497

491-
logger.Errorf("❌ Cluster creation failed (attempt %d/%d): %v", attempt, maxClusterCreateRetries, createErr)
498+
// Check if it was a timeout
499+
if attemptCtx.Err() == context.DeadlineExceeded {
500+
logger.Errorf("❌ Cluster creation timed out after %v (attempt %d/%d)", clusterCreateTimeout, attempt, maxClusterCreateRetries)
501+
} else {
502+
logger.Errorf("❌ Cluster creation failed (attempt %d/%d): %v", attempt, maxClusterCreateRetries, createErr)
503+
}
492504
}
493505

494506
if createErr != nil {

0 commit comments

Comments
 (0)