@@ -43,7 +43,7 @@ const (
4343 // cleanupTimeout is the maximum time to wait for all resources and pods to be deleted during cleanup.
4444 // This needs to be long enough to allow for cascade deletion propagation through
4545 // PodCliqueSet -> PodCliqueScalingGroup -> PodClique -> Pod
46- cleanupTimeout = 60 * time .Second
46+ cleanupTimeout = 2 * time .Minute
4747
4848 // cleanupPollInterval is the interval between checks during cleanup polling
4949 cleanupPollInterval = 1 * time .Second
@@ -85,6 +85,8 @@ type SharedClusterManager struct {
8585 isSetup bool
8686 workerNodes []string
8787 registryPort string
88+ cleanupFailed bool // Set to true if CleanupWorkloads fails, causing subsequent tests to fail
89+ cleanupError string // The error message from the failed cleanup
8890}
8991
9092var (
@@ -205,7 +207,12 @@ func (scm *SharedClusterManager) Setup(ctx context.Context, testImages []string)
205207
206208// PrepareForTest prepares the cluster for a specific test by cordoning the appropriate nodes.
207209// It ensures exactly `requiredWorkerNodes` nodes are schedulable by cordoning excess nodes.
210+ // Returns an error if a previous cleanup operation failed, preventing potentially corrupted test state.
208211func (scm * SharedClusterManager ) PrepareForTest (ctx context.Context , requiredWorkerNodes int ) error {
212+ if scm .cleanupFailed {
213+ return fmt .Errorf ("cannot prepare cluster: a previous test cleanup failed - cluster may have orphaned resources. Original error: %s" , scm .cleanupError )
214+ }
215+
209216 if ! scm .isSetup {
210217 return fmt .Errorf ("shared cluster not setup" )
211218 }
@@ -463,6 +470,23 @@ func (scm *SharedClusterManager) IsSetup() bool {
463470 return scm .isSetup
464471}
465472
473+ // MarkCleanupFailed marks that a cleanup operation has failed.
474+ // This causes all subsequent tests to fail immediately when they try to prepare the cluster.
475+ func (scm * SharedClusterManager ) MarkCleanupFailed (err error ) {
476+ scm .cleanupFailed = true
477+ scm .cleanupError = err .Error ()
478+ }
479+
480+ // HasCleanupFailed returns true if a previous cleanup operation failed.
481+ func (scm * SharedClusterManager ) HasCleanupFailed () bool {
482+ return scm .cleanupFailed
483+ }
484+
485+ // GetCleanupError returns the error message from the failed cleanup, or empty string if no failure.
486+ func (scm * SharedClusterManager ) GetCleanupError () string {
487+ return scm .cleanupError
488+ }
489+
466490// Teardown cleans up the shared cluster
467491func (scm * SharedClusterManager ) Teardown () {
468492 if scm .cleanup != nil {
0 commit comments