diff --git a/cmd/compute-domain-controller/cdstatus.go b/cmd/compute-domain-controller/cdstatus.go index 8d373769d..3c239ec13 100644 --- a/cmd/compute-domain-controller/cdstatus.go +++ b/cmd/compute-domain-controller/cdstatus.go @@ -351,6 +351,22 @@ func (m *ComputeDomainStatusManager) getNonStaleFabricNodes(existingNodes []*nva return result } +func validateCDStatusNodes(cd *nvapi.ComputeDomain) { + type cliqueIndexKey struct { + cliqueID string + index int + } + seen := make(map[cliqueIndexKey]string) + for _, node := range cd.Status.Nodes { + key := cliqueIndexKey{cliqueID: node.CliqueID, index: node.Index} + if prevNodeName, exists := seen[key]; exists { + klog.Fatalf("ComputeDomain %s/%s status invariant violated: node index %d in clique %q is used by both node %q and node %q", + cd.Namespace, cd.Name, node.Index, node.CliqueID, prevNodeName, node.Name) + } + seen[key] = node.Name + } +} + // nodesEqual checks if two slices of ComputeDomainNode are equal. func (m *ComputeDomainStatusManager) nodesEqual(a, b []*nvapi.ComputeDomainNode) bool { aMap := make(map[string]nvapi.ComputeDomainNode) diff --git a/cmd/compute-domain-controller/computedomain.go b/cmd/compute-domain-controller/computedomain.go index d338478bd..2b488d359 100644 --- a/cmd/compute-domain-controller/computedomain.go +++ b/cmd/compute-domain-controller/computedomain.go @@ -207,6 +207,9 @@ func (m *ComputeDomainManager) UpdateStatus(ctx context.Context, cd *nvapi.Compu // Recalculate global status based on current state cd.Status.Status = m.calculateGlobalStatus(cd) + // Validate status invariants: crash loudly if a node index is used more than once. + validateCDStatusNodes(cd) + updatedCD, err := m.config.clientsets.Nvidia.ResourceV1beta1().ComputeDomains(cd.Namespace).UpdateStatus(ctx, cd, metav1.UpdateOptions{}) if err != nil { return nil, err