Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 2 additions & 8 deletions .github/workflows/build-x86-image.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2964,21 +2964,15 @@ jobs:
make kube-ovn-security-e2e
make kube-ovn-ha-e2e


- name: Check kube ovn pod restarts
id: check-restarts
if: ${{ success() || (failure() && (steps.install.conclusion == 'failure' || steps.e2e.conclusion == 'failure')) }}
run: make check-kube-ovn-pod-restarts

- name: kubectl ko log
if: failure() && (steps.install.conclusion == 'failure' || steps.e2e.conclusion == 'failure' || steps.check-restarts.conclusion == 'failure')
if: failure() && (steps.install.conclusion == 'failure' || steps.e2e.conclusion == 'failure')
run: |
make kubectl-ko-log
mv kubectl-ko-log.tar.gz kube-ovn-ha-e2e-${{ matrix.ssl }}-${{ matrix.bind-local }}-${{ matrix.ip-family }}-ko-log.tar.gz

- name: upload kubectl ko log
uses: actions/upload-artifact@v7
if: failure() && (steps.install.conclusion == 'failure' || steps.e2e.conclusion == 'failure' || steps.check-restarts.conclusion == 'failure')
if: failure() && (steps.install.conclusion == 'failure' || steps.e2e.conclusion == 'failure')
with:
name: kube-ovn-ha-e2e-${{ matrix.ssl }}-${{ matrix.bind-local }}-${{ matrix.ip-family }}-ko-log
path: kube-ovn-ha-e2e-${{ matrix.ssl }}-${{ matrix.bind-local }}-${{ matrix.ip-family }}-ko-log.tar.gz
Expand Down
53 changes: 53 additions & 0 deletions pkg/ovn_leader_checker/ovn.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,51 @@ func checkNorthdEpAlive(cfg *Configuration, namespace, service string) bool {
return false
}

// checkDBClusterIntegrity verifies that a leader node has the expected number
// of cluster members. After a split-brain recovery with an incomplete snapshot,
// a leader may have fewer members than expected (e.g., some servers missing from
// its cluster configuration). This causes the missing servers to be permanently
// excluded from the cluster.
//
// When detected, the corrupted db file is removed so that on restart,
// ovn_db_pre_start can rebuild it from the raft header file and rejoin the
// cluster with a clean state.
func checkDBClusterIntegrity(db string, expectedMembers int) {
if expectedMembers <= 1 {
return
}

dbName := ovnnb.DatabaseName
if db == "sb" {
dbName = ovnsb.DatabaseName
}

output, err := ovs.OvnDatabaseControl(db, "cluster/status", dbName)
if err != nil {
klog.Warningf("failed to get %s cluster status: %v", db, err)
return
}

serverCount := 0
for line := range strings.SplitSeq(output, "\n") {
if slices.Contains(strings.Fields(line), "at") {
serverCount++
}
}
Comment on lines +333 to +338
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The current method of parsing the cluster/status output by splitting each line into fields and checking for the presence of "at" is a bit fragile. It might lead to incorrect server counts if the word "at" appears in other parts of the output for any reason in future OVN versions. A more robust approach would be to locate the "Servers:" section and count the indented lines that follow.

Additionally, the use of strings.SplitSeq with a range expression is a feature of Go 1.22. Using strings.Split would be more compatible with older Go versions and fits well with the more robust parsing logic.

    serverCount := 0
    inServersSection := false
    for _, line := range strings.Split(output, "\n") {
        if !inServersSection {
            if line == "Servers:" {
                inServersSection = true
            }
            continue
        }

        // The server list is indented. An unindented or empty line
        // signifies the end of the list.
        if len(line) == 0 || (line[0] != ' ' && line[0] != '\t') {
            break
        }
        serverCount++
    }


if serverCount > 0 && serverCount < expectedMembers {
dbFile := fmt.Sprintf("/etc/ovn/ovn%s_db.db", db)
klog.Errorf("ovn-%s leader has only %d cluster members, expected %d; "+
"cluster may have incomplete membership from a split-brain recovery; "+
"removing db file %s to force clean rejoin on restart",
db, serverCount, expectedMembers, dbFile)
if err := os.Remove(dbFile); err != nil && !os.IsNotExist(err) {
klog.Errorf("failed to remove db file %s: %v", dbFile, err)
}
klog.Fatalf("exiting to trigger re-election with clean state")
}
}

func compactOvnDatabase(db string) {
output, err := ovs.OvnDatabaseControl(db, "ovsdb-server/compact")
if err != nil {
Expand Down Expand Up @@ -411,6 +456,14 @@ func doOvnLeaderCheck(cfg *Configuration, podName, podNamespace string) {
}
}

expectedMembers := len(cfg.remoteAddresses) + 1
if nbLeader {
checkDBClusterIntegrity("nb", expectedMembers)
}
if sbLeader {
checkDBClusterIntegrity("sb", expectedMembers)
}

if cfg.EnableCompact {
compactOvnDatabase("nb")
compactOvnDatabase("sb")
Expand Down
Loading