Temporarily run only RU19 to reproduce the race condition and debug it

gflarity · gflarity · commit c6e040c82a84 · 2026-01-20T17:52:09.000-05:00
diff --git a/.github/workflows/e2e-test.yaml b/.github/workflows/e2e-test.yaml
@@ -39,14 +39,17 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - test_name: gang_scheduling
-            test_pattern: "^Test_GS"
-          - test_name: rolling_updates
-            test_pattern: "^Test_RU"
-          - test_name: startup_ordering
-            test_pattern: "^Test_SO"
-          - test_name: Topology_Aware_Scheduling
-            test_pattern: "^Test_TAS"
+          # TEMPORARY: run only RU19 to reproduce an intermittent failure; the regular matrix entries are commented out below.
+          # - test_name: gang_scheduling
+          #   test_pattern: "^Test_GS"
+          # - test_name: rolling_updates
+          #   test_pattern: "^Test_RU"
+          # - test_name: startup_ordering
+          #   test_pattern: "^Test_SO"
+          # - test_name: Topology_Aware_Scheduling
+          #   test_pattern: "^Test_TAS"
+          - test_name: RU19_repro
+            test_pattern: "^Test_RU19"
     name: E2E - ${{ matrix.test_name }}
     steps:
       # print runner specs so we have a record incase of failures
diff --git a/operator/e2e/tests/debug_utils.go b/operator/e2e/tests/debug_utils.go
@@ -35,10 +35,6 @@ const (
 	// logBufferSize is the size of the buffer for reading logs from the operator (debugging purposes)
 	logBufferSize = 64 * 1024 // 64KB
 
-	// operatorLogLines is the number of log lines to capture from the operator.
-	// Set to 2000 to ensure we capture logs from before the failure occurred,
-	// not just the steady-state logs after the failure.
-	operatorLogLines = 2000
 	// eventLookbackDuration is how far back to look for events
 	eventLookbackDuration = 10 * time.Minute
 )
@@ -88,10 +84,10 @@ func CollectAllDiagnostics(tc TestContext) {
 }
 
 // dumpOperatorLogs captures and prints operator logs at INFO level.
-// Captures the last operatorLogLines lines from all containers in the operator pod.
+// Captures all logs from all containers in the operator pod.
 func dumpOperatorLogs(tc TestContext) {
 	logger.Info("================================================================================")
-	logger.Infof("=== OPERATOR LOGS (last %d lines) ===", operatorLogLines)
+	logger.Info("=== OPERATOR LOGS (all) ===")
 	logger.Info("================================================================================")
 
 	// List pods in the operator namespace
@@ -141,10 +137,8 @@ func dumpOperatorLogs(tc TestContext) {
 		for _, container := range pod.Spec.Containers {
 			logger.Infof("--- Container: %s Logs ---", container.Name)
 
-			tailLines := int64(operatorLogLines)
 			req := tc.Clientset.CoreV1().Pods(setup.OperatorNamespace).GetLogs(pod.Name, &corev1.PodLogOptions{
 				Container: container.Name,
-				TailLines: &tailLines,
 			})
 
 			logStream, err := req.Stream(tc.Ctx)
diff --git a/operator/e2e/tests/rolling_updates_test.go b/operator/e2e/tests/rolling_updates_test.go
@@ -19,6 +19,7 @@
 package tests
 
 import (
+	"fmt"
 	"testing"
 	"time"
 )
@@ -743,7 +744,25 @@ func Test_RU18_RollingUpdateWithPodCliqueScaleOutDuringUpdate(t *testing.T) {
 // 4. Change the specification of pc-a, pc-b and pc-c
 // 5. Verify the scaled pods are created with the correct specifications
 // 6. Verify they should not be updated again before the rolling update ends
+//
+// TEMPORARY: This test runs up to 10 iterations, stopping at the first failure, to reproduce an intermittent failure.
 func Test_RU19_RollingUpdateWithPodCliqueScaleOutBeforeUpdate(t *testing.T) {
+	const maxIterations = 10
+	for i := 1; i <= maxIterations; i++ {
+		logger.Infof("========== RU19 ITERATION %d/%d ==========", i, maxIterations)
+		success := t.Run(fmt.Sprintf("iteration_%d", i), func(t *testing.T) {
+			runRU19Test(t)
+		})
+		if !success {
+			t.Fatalf("RU19 failed on iteration %d/%d", i, maxIterations)
+		}
+		logger.Infof("========== RU19 ITERATION %d/%d PASSED ==========", i, maxIterations)
+	}
+	logger.Infof("All %d iterations of RU19 passed!", maxIterations)
+}
+
+// runRU19Test contains the actual RU19 test logic; the Test_RU19 wrapper calls it once per iteration.
+func runRU19Test(t *testing.T) {
 	logger.Info("1. Initialize a 24-node Grove cluster")
 	logger.Info("2. Deploy workload WL1 with 2 replicas, and verify 20 newly created pods")
 	tc, cleanup, tracker := setupRollingUpdateTest(t, RollingUpdateTestConfig{
@@ -784,7 +803,7 @@ func Test_RU19_RollingUpdateWithPodCliqueScaleOutBeforeUpdate(t *testing.T) {
 	}
 	tracker.Stop()
 
-	logger.Info("🎉 Rolling Update with PodClique scale-out before update test (RU-19) completed successfully!")
+	logger.Info("Rolling Update with PodClique scale-out before update test (RU-19) completed successfully!")
 }
 
 // Test_RU20_RollingUpdateWithPodCliqueScaleInDuringUpdate tests rolling update with scale-in on standalone PCLQ being updated