Skip to content

Commit a83a55e

Browse files
authored
handle cleanup failures better (#326)
1 parent 24ae6c1 commit a83a55e

File tree

2 files changed

+31
-2
lines changed

2 files changed

+31
-2
lines changed

operator/e2e/setup/shared_cluster.go

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ const (
4343
// cleanupTimeout is the maximum time to wait for all resources and pods to be deleted during cleanup.
4444
// This needs to be long enough to allow for cascade deletion propagation through
4545
// PodCliqueSet -> PodCliqueScalingGroup -> PodClique -> Pod
46-
cleanupTimeout = 60 * time.Second
46+
cleanupTimeout = 2 * time.Minute
4747

4848
// cleanupPollInterval is the interval between checks during cleanup polling
4949
cleanupPollInterval = 1 * time.Second
@@ -85,6 +85,8 @@ type SharedClusterManager struct {
8585
isSetup bool
8686
workerNodes []string
8787
registryPort string
88+
cleanupFailed bool // Set to true if CleanupWorkloads fails, causing subsequent tests to fail
89+
cleanupError string // The error message from the failed cleanup
8890
}
8991

9092
var (
@@ -205,7 +207,12 @@ func (scm *SharedClusterManager) Setup(ctx context.Context, testImages []string)
205207

206208
// PrepareForTest prepares the cluster for a specific test by cordoning the appropriate nodes.
207209
// It ensures exactly `requiredWorkerNodes` nodes are schedulable by cordoning excess nodes.
210+
// Returns an error if a previous cleanup operation failed, preventing potentially corrupted test state.
208211
func (scm *SharedClusterManager) PrepareForTest(ctx context.Context, requiredWorkerNodes int) error {
212+
if scm.cleanupFailed {
213+
return fmt.Errorf("cannot prepare cluster: a previous test cleanup failed - cluster may have orphaned resources. Original error: %s", scm.cleanupError)
214+
}
215+
209216
if !scm.isSetup {
210217
return fmt.Errorf("shared cluster not setup")
211218
}
@@ -463,6 +470,23 @@ func (scm *SharedClusterManager) IsSetup() bool {
463470
return scm.isSetup
464471
}
465472

473+
// MarkCleanupFailed marks that a cleanup operation has failed.
474+
// This causes all subsequent tests to fail immediately when they try to prepare the cluster.
475+
func (scm *SharedClusterManager) MarkCleanupFailed(err error) {
476+
scm.cleanupFailed = true
477+
scm.cleanupError = err.Error()
478+
}
479+
480+
// HasCleanupFailed returns true if a previous cleanup operation failed.
481+
func (scm *SharedClusterManager) HasCleanupFailed() bool {
482+
return scm.cleanupFailed
483+
}
484+
485+
// GetCleanupError returns the error message from the failed cleanup, or an empty string if no cleanup has failed.
486+
func (scm *SharedClusterManager) GetCleanupError() string {
487+
return scm.cleanupError
488+
}
489+
466490
// Teardown cleans up the shared cluster
467491
func (scm *SharedClusterManager) Teardown() {
468492
if scm.cleanup != nil {

operator/e2e/tests/setup.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,12 @@ func prepareTestCluster(ctx context.Context, t *testing.T, requiredWorkerNodes i
181181
logger.Error("=== CLEANUP FAILURE - COLLECTING DIAGNOSTICS ===")
182182
logger.Error("================================================================================")
183183
CollectAllDiagnostics(diagnosticsTc)
184-
t.Fatalf("Failed to cleanup workloads: %v", err)
184+
185+
// Mark cleanup as failed - this will cause all subsequent tests to fail immediately
186+
// when they try to prepare the cluster, preventing potentially corrupted test state
187+
sharedCluster.MarkCleanupFailed(err)
188+
189+
t.Fatalf("Failed to cleanup workloads: %v. All subsequent tests will fail.", err)
185190
}
186191
}
187192

0 commit comments

Comments
 (0)