@@ -324,6 +324,51 @@ func checkNorthdEpAlive(cfg *Configuration, namespace, service string, expectedA
324324 return false
325325}
326326
327+ // checkDBClusterIntegrity verifies that a leader node has the expected number
328+ // of cluster members. After a split-brain recovery with an incomplete snapshot,
329+ // a leader may have fewer members than expected (e.g., some servers missing from
330+ // its cluster configuration). This causes the missing servers to be permanently
331+ // excluded from the cluster.
332+ //
333+ // When detected, the corrupted db file is removed so that on restart,
334+ // ovn_db_pre_start can rebuild it from the raft header file and rejoin the
335+ // cluster with a clean state.
336+ func checkDBClusterIntegrity (db string , expectedMembers int ) {
337+ if expectedMembers <= 1 {
338+ return
339+ }
340+
341+ dbName := ovnnb .DatabaseName
342+ if db == "sb" {
343+ dbName = ovnsb .DatabaseName
344+ }
345+
346+ output , err := ovs .OvnDatabaseControl (db , "cluster/status" , dbName )
347+ if err != nil {
348+ klog .Warningf ("failed to get %s cluster status: %v" , db , err )
349+ return
350+ }
351+
352+ serverCount := 0
353+ for line := range strings .SplitSeq (output , "\n " ) {
354+ if slices .Contains (strings .Fields (line ), "at" ) {
355+ serverCount ++
356+ }
357+ }
358+
359+ if serverCount > 0 && serverCount < expectedMembers {
360+ dbFile := fmt .Sprintf ("/etc/ovn/ovn%s_db.db" , db )
361+ klog .Errorf ("ovn-%s leader has only %d cluster members, expected %d; " +
362+ "cluster may have incomplete membership from a split-brain recovery; " +
363+ "removing db file %s to force clean rejoin on restart" ,
364+ db , serverCount , expectedMembers , dbFile )
365+ if err := os .Remove (dbFile ); err != nil && ! os .IsNotExist (err ) {
366+ klog .Errorf ("failed to remove db file %s: %v" , dbFile , err )
367+ }
368+ klog .Fatalf ("exiting to trigger re-election with clean state" )
369+ }
370+ }
371+
327372func compactOvnDatabase (db string ) {
328373 args := []string {
329374 "-t" ,
@@ -444,6 +489,14 @@ func doOvnLeaderCheck(cfg *Configuration, podName, podNamespace string) {
444489 }
445490 }
446491
492+ expectedMembers := len (cfg .remoteAddresses ) + 1
493+ if nbLeader {
494+ checkDBClusterIntegrity ("nb" , expectedMembers )
495+ }
496+ if sbLeader {
497+ checkDBClusterIntegrity ("sb" , expectedMembers )
498+ }
499+
447500 if cfg .EnableCompact {
448501 compactOvnDatabase ("nb" )
449502 compactOvnDatabase ("sb" )
0 commit comments