
Commit 3f45d76

oilbeater and claude committed
fix(e2e): retry check-cluster after HA db corruption recovery
After scaling back up from db corruption, deployment readiness only indicates that pod health checks passed, not that RAFT log catch-up is complete. Replace the immediate ovsdb-tool check-cluster call with a WaitUntil retry loop (2s interval, 30s timeout) to tolerate transient RAFT log inconsistency during recovery.

Signed-off-by: Mengxin Liu <liumengxinfly@gmail.com>
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 6240894 commit 3f45d76

File tree

1 file changed: +5 −3 lines changed

test/e2e/ha/ha_test.go

Lines changed: 5 additions & 3 deletions
@@ -287,9 +287,11 @@ func corruptAndRecover(f *framework.Framework, deploy *appsv1.Deployment, dbFile
 		newNodes.Clear()
 		for pod := range slices.Values(pods.Items) {
 			newNodes.Insert(pod.Spec.NodeName)
-			ginkgo.By("Checking whether db file " + dbFile + " on node " + pod.Spec.NodeName + " is healthy")
-			stdout, stderr, err := framework.ExecShellInPod(context.Background(), f, pod.Namespace, pod.Name, checkCmd)
-			framework.ExpectNoError(err, fmt.Sprintf("failed to check db file %q: stdout = %q, stderr = %q", dbFile, stdout, stderr))
+			ginkgo.By("Waiting for db file " + dbFile + " on node " + pod.Spec.NodeName + " to be healthy")
+			framework.WaitUntil(2*time.Second, 30*time.Second, func(_ context.Context) (bool, error) {
+				_, _, err := framework.ExecShellInPod(context.Background(), f, pod.Namespace, pod.Name, checkCmd)
+				return err == nil, nil
+			}, fmt.Sprintf("db file %s on node %s to be healthy", dbFile, pod.Spec.NodeName))
 		}
 		framework.ExpectEqual(newNodes, nodes, "the set of nodes hosting ovn-central pods should be the same as before")
