Skip to content

Commit b54aa21

Browse files
committed
fix(ha): detect and recover from split-brain leader with incomplete cluster membership (#6363)
Signed-off-by: Mengxin Liu <liumengxinfly@gmail.com>
1 parent 9a16433 commit b54aa21

File tree

1 file changed

+53
-0
lines changed

1 file changed

+53
-0
lines changed

pkg/ovn_leader_checker/ovn.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,51 @@ func checkNorthdEpAlive(cfg *Configuration, namespace, service string, expectedA
324324
return false
325325
}
326326

327+
// checkDBClusterIntegrity verifies that a leader node has the expected number
328+
// of cluster members. After a split-brain recovery with an incomplete snapshot,
329+
// a leader may have fewer members than expected (e.g., some servers missing from
330+
// its cluster configuration). This causes the missing servers to be permanently
331+
// excluded from the cluster.
332+
//
333+
// When detected, the corrupted db file is removed so that on restart,
334+
// ovn_db_pre_start can rebuild it from the raft header file and rejoin the
335+
// cluster with a clean state.
336+
func checkDBClusterIntegrity(db string, expectedMembers int) {
337+
if expectedMembers <= 1 {
338+
return
339+
}
340+
341+
dbName := ovnnb.DatabaseName
342+
if db == "sb" {
343+
dbName = ovnsb.DatabaseName
344+
}
345+
346+
output, err := ovs.OvnDatabaseControl(db, "cluster/status", dbName)
347+
if err != nil {
348+
klog.Warningf("failed to get %s cluster status: %v", db, err)
349+
return
350+
}
351+
352+
serverCount := 0
353+
for line := range strings.SplitSeq(output, "\n") {
354+
if slices.Contains(strings.Fields(line), "at") {
355+
serverCount++
356+
}
357+
}
358+
359+
if serverCount > 0 && serverCount < expectedMembers {
360+
dbFile := fmt.Sprintf("/etc/ovn/ovn%s_db.db", db)
361+
klog.Errorf("ovn-%s leader has only %d cluster members, expected %d; "+
362+
"cluster may have incomplete membership from a split-brain recovery; "+
363+
"removing db file %s to force clean rejoin on restart",
364+
db, serverCount, expectedMembers, dbFile)
365+
if err := os.Remove(dbFile); err != nil && !os.IsNotExist(err) {
366+
klog.Errorf("failed to remove db file %s: %v", dbFile, err)
367+
}
368+
klog.Fatalf("exiting to trigger re-election with clean state")
369+
}
370+
}
371+
327372
func compactOvnDatabase(db string) {
328373
args := []string{
329374
"-t",
@@ -444,6 +489,14 @@ func doOvnLeaderCheck(cfg *Configuration, podName, podNamespace string) {
444489
}
445490
}
446491

492+
expectedMembers := len(cfg.remoteAddresses) + 1
493+
if nbLeader {
494+
checkDBClusterIntegrity("nb", expectedMembers)
495+
}
496+
if sbLeader {
497+
checkDBClusterIntegrity("sb", expectedMembers)
498+
}
499+
447500
if cfg.EnableCompact {
448501
compactOvnDatabase("nb")
449502
compactOvnDatabase("sb")

0 commit comments

Comments
 (0)