Skip to content

Commit c6e040c

Browse files
committed
temporarily run only RU19 to trigger race condition and debug
1 parent 6b23c22 commit c6e040c

File tree

3 files changed

+33
-17
lines changed

3 files changed

+33
-17
lines changed

.github/workflows/e2e-test.yaml

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,17 @@ jobs:
3939
fail-fast: false
4040
matrix:
4141
include:
42-
- test_name: gang_scheduling
43-
test_pattern: "^Test_GS"
44-
- test_name: rolling_updates
45-
test_pattern: "^Test_RU"
46-
- test_name: startup_ordering
47-
test_pattern: "^Test_SO"
48-
- test_name: Topology_Aware_Scheduling
49-
test_pattern: "^Test_TAS"
42+
# TEMPORARY: Only running RU19 to reproduce intermittent failure
43+
# - test_name: gang_scheduling
44+
# test_pattern: "^Test_GS"
45+
# - test_name: rolling_updates
46+
# test_pattern: "^Test_RU"
47+
# - test_name: startup_ordering
48+
# test_pattern: "^Test_SO"
49+
# - test_name: Topology_Aware_Scheduling
50+
# test_pattern: "^Test_TAS"
51+
- test_name: RU19_repro
52+
test_pattern: "^Test_RU19"
5053
name: E2E - ${{ matrix.test_name }}
5154
steps:
5255
# print runner specs so we have a record in case of failures

operator/e2e/tests/debug_utils.go

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,6 @@ const (
3535
// logBufferSize is the size of the buffer for reading logs from the operator (debugging purposes)
3636
logBufferSize = 64 * 1024 // 64KB
3737

38-
// operatorLogLines is the number of log lines to capture from the operator.
39-
// Set to 2000 to ensure we capture logs from before the failure occurred,
40-
// not just the steady-state logs after the failure.
41-
operatorLogLines = 2000
4238
// eventLookbackDuration is how far back to look for events
4339
eventLookbackDuration = 10 * time.Minute
4440
)
@@ -88,10 +84,10 @@ func CollectAllDiagnostics(tc TestContext) {
8884
}
8985

9086
// dumpOperatorLogs captures and prints operator logs at INFO level.
91-
// Captures the last operatorLogLines lines from all containers in the operator pod.
87+
// Captures all logs from all containers in the operator pod.
9288
func dumpOperatorLogs(tc TestContext) {
9389
logger.Info("================================================================================")
94-
logger.Infof("=== OPERATOR LOGS (last %d lines) ===", operatorLogLines)
90+
logger.Info("=== OPERATOR LOGS (all) ===")
9591
logger.Info("================================================================================")
9692

9793
// List pods in the operator namespace
@@ -141,10 +137,8 @@ func dumpOperatorLogs(tc TestContext) {
141137
for _, container := range pod.Spec.Containers {
142138
logger.Infof("--- Container: %s Logs ---", container.Name)
143139

144-
tailLines := int64(operatorLogLines)
145140
req := tc.Clientset.CoreV1().Pods(setup.OperatorNamespace).GetLogs(pod.Name, &corev1.PodLogOptions{
146141
Container: container.Name,
147-
TailLines: &tailLines,
148142
})
149143

150144
logStream, err := req.Stream(tc.Ctx)

operator/e2e/tests/rolling_updates_test.go

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package tests
2020

2121
import (
22+
"fmt"
2223
"testing"
2324
"time"
2425
)
@@ -743,7 +744,25 @@ func Test_RU18_RollingUpdateWithPodCliqueScaleOutDuringUpdate(t *testing.T) {
743744
// 4. Change the specification of pc-a, pc-b and pc-c
744745
// 5. Verify the scaled pods are created with the correct specifications
745746
// 6. Verify they should not be updated again before the rolling update ends
747+
//
748+
// TEMPORARY: This test runs up to 10 times, stopping at the first failing iteration, to reproduce an intermittent failure.
746749
func Test_RU19_RollingUpdateWithPodCliqueScaleOutBeforeUpdate(t *testing.T) {
750+
const maxIterations = 10
751+
for i := 1; i <= maxIterations; i++ {
752+
logger.Infof("========== RU19 ITERATION %d/%d ==========", i, maxIterations)
753+
success := t.Run(fmt.Sprintf("iteration_%d", i), func(t *testing.T) {
754+
runRU19Test(t)
755+
})
756+
if !success {
757+
t.Fatalf("RU19 failed on iteration %d/%d", i, maxIterations)
758+
}
759+
logger.Infof("========== RU19 ITERATION %d/%d PASSED ==========", i, maxIterations)
760+
}
761+
logger.Infof("All %d iterations of RU19 passed!", maxIterations)
762+
}
763+
764+
// runRU19Test contains the actual RU19 test logic
765+
func runRU19Test(t *testing.T) {
747766
logger.Info("1. Initialize a 24-node Grove cluster")
748767
logger.Info("2. Deploy workload WL1 with 2 replicas, and verify 20 newly created pods")
749768
tc, cleanup, tracker := setupRollingUpdateTest(t, RollingUpdateTestConfig{
@@ -784,7 +803,7 @@ func Test_RU19_RollingUpdateWithPodCliqueScaleOutBeforeUpdate(t *testing.T) {
784803
}
785804
tracker.Stop()
786805

787-
logger.Info("🎉 Rolling Update with PodClique scale-out before update test (RU-19) completed successfully!")
806+
logger.Info("Rolling Update with PodClique scale-out before update test (RU-19) completed successfully!")
788807
}
789808

790809
// Test_RU20_RollingUpdateWithPodCliqueScaleInDuringUpdate tests rolling update with scale-in on standalone PCLQ being updated

0 commit comments

Comments
 (0)