
Commit 9e1109c

oilbeater and claude authored
fix(e2e): wait for RAFT cluster convergence in HA db corruption test (#6348)
After OVN database corruption recovery, the RAFT cluster may not immediately show all servers in `cluster/status` output even though pods are ready and db files are healthy. This causes intermittent failures in `getDbSidsFromClusterStatus()` when it asserts the server count right after recovery.

Replace the immediate assertion with a WaitUntil poll (up to 30s) that waits for all pods to report the expected number of servers before proceeding with validation.

Signed-off-by: Mengxin Liu <liumengxinfly@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d655c90 commit 9e1109c

File tree

1 file changed: +21 −6 lines changed


test/e2e/ha/ha_test.go

Lines changed: 21 additions & 6 deletions
@@ -131,18 +131,33 @@ func getDbSidsFromClusterStatus(f *framework.Framework, deploy *appsv1.Deploymen
 	framework.ExpectNoError(err)
 	framework.ExpectHaveLen(pods.Items, int(*deploy.Spec.Replicas))
 
+	expectedCount := len(pods.Items)
 	dbServers := make(map[string]map[string]string)
 	for _, db := range [...]string{"nb", "sb"} {
-		ginkgo.By("Getting ovn" + db + " db server ids on all ovn-central pods")
+		ginkgo.By("Waiting for ovn" + db + " db cluster to show all servers on every ovn-central pod")
 		for pod := range slices.Values(pods.Items) {
-			stdout, stderr, err := framework.ExecShellInPod(context.Background(), f, pod.Namespace, pod.Name, cmdClusterStatus(db))
-			framework.ExpectNoError(err, fmt.Sprintf("failed to get ovn%s db status in pod %s: stdout = %q, stderr = %q", db, pod.Name, stdout, stderr))
-			status := parseClusterStatus(stdout)
-			framework.ExpectHaveLen(status.Servers, len(pods.Items), "unexpected number of servers in ovn%s db status in pod %s: stdout = %q, stderr = %q", db, pod.Name, stdout, stderr)
+			var lastStdout, lastStderr string
+			framework.WaitUntil(2*time.Second, 30*time.Second, func(_ context.Context) (bool, error) {
+				stdout, stderr, err := framework.ExecShellInPod(context.Background(), f, pod.Namespace, pod.Name, cmdClusterStatus(db))
+				if err != nil {
+					return false, nil
+				}
+				lastStdout, lastStderr = stdout, stderr
+				var count int
+				for line := range strings.SplitSeq(stdout, "\n") {
+					if slices.Contains(strings.Fields(line), "at") {
+						count++
+					}
+				}
+				return count == expectedCount, nil
+			}, fmt.Sprintf("ovn%s db on pod %s to show %d servers", db, pod.Name, expectedCount))
+
+			status := parseClusterStatus(lastStdout)
+			framework.ExpectHaveLen(status.Servers, expectedCount, "unexpected number of servers in ovn%s db status in pod %s: stdout = %q, stderr = %q", db, pod.Name, lastStdout, lastStderr)
 			if len(dbServers[db]) == 0 {
 				dbServers[db] = maps.Clone(status.Servers)
 			} else {
-				framework.ExpectEqual(status.Servers, dbServers[db], "inconsistent servers in ovn%s db status in pod %s: stdout = %q, stderr = %q", db, pod.Name, stdout, stderr)
+				framework.ExpectEqual(status.Servers, dbServers[db], "inconsistent servers in ovn%s db status in pod %s: stdout = %q, stderr = %q", db, pod.Name, lastStdout, lastStderr)
 			}
 		}
 	}
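For context, here is a minimal standalone sketch of the server-counting heuristic the new poll relies on: a `cluster/status` line is treated as one server entry when its whitespace-separated fields contain the literal token "at". The sample output, SIDs, and addresses below are invented for illustration only; the real test runs `cmdClusterStatus(db)` inside each ovn-central pod and retries via `framework.WaitUntil` until the count matches the pod count.

package main

import (
	"fmt"
	"slices"
	"strings"
)

// countServers mirrors the heuristic in the patch: count every line whose
// fields include the token "at" (as in "sid (sid at tcp:host:port)").
func countServers(status string) int {
	count := 0
	// The patch iterates with strings.SplitSeq (Go 1.24+); strings.Split is
	// equivalent here.
	for _, line := range strings.Split(status, "\n") {
		if slices.Contains(strings.Fields(line), "at") {
			count++
		}
	}
	return count
}

func main() {
	// Hypothetical, abbreviated cluster/status output for a 3-node cluster.
	sample := `Servers:
    a1b2 (a1b2 at tcp:10.0.0.1:6643) (self)
    c3d4 (c3d4 at tcp:10.0.0.2:6643)
    e5f6 (e5f6 at tcp:10.0.0.3:6643)`

	fmt.Println(countServers(sample)) // prints 3
}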
