@@ -33,12 +33,15 @@ import (
3333 "github.com/nvidia/ovn-kubernetes-components/internal/constants"
3434 "github.com/nvidia/ovn-kubernetes-components/internal/utils/ovsclient"
3535
36+ corev1 "k8s.io/api/core/v1"
3637 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
38+ k8stypes "k8s.io/apimachinery/pkg/types"
3739 "k8s.io/client-go/kubernetes"
3840 "k8s.io/klog/v2"
3941 "k8s.io/utils/clock"
4042 kexec "k8s.io/utils/exec"
4143 "k8s.io/utils/ptr"
44+ "sigs.k8s.io/controller-runtime/pkg/client"
4245)
4346
4447type Mode string
@@ -90,16 +93,19 @@ const (
9093 hostBootstrapKubeconfigPath = "/host-kubernetes/kubelet.conf"
9194 // hostNodeNameFilePath is the path used to publish mapped host node name for other containers in the pod.
9295 hostNodeNameFilePath = "/var/run/ovn-kubernetes/host-node-name"
96+ // hostNodeChassisIDAnnotationKey is the host-cluster node annotation used by OVN to track the chassis identity.
97+ hostNodeChassisIDAnnotationKey = "k8s.ovn.org/node-chassis-id"
9398)
9499
95100type DPUCNIProvisioner struct {
96- ctx context.Context
97- clock clock.Clock
98- ensureConfigurationTicker clock.Ticker
99- ovsClient ovsclient.OVSClient
100- networkHelper networkhelper.NetworkHelper
101- exec kexec.Interface
102- kubernetesClient kubernetes.Interface
101+ ctx context.Context
102+ clock clock.Clock
103+ ensureConfigurationTicker clock.Ticker
104+ ovsClient ovsclient.OVSClient
105+ networkHelper networkhelper.NetworkHelper
106+ exec kexec.Interface
107+ dpuClusterKubernetesClient kubernetes.Interface
108+ hostKubernetesClient client.Client
103109
104110 // FileSystemRoot controls the file system root. It's used for enabling easier testing of the package. Defaults to
105111 // empty.
@@ -159,29 +165,33 @@ func New(ctx context.Context,
159165 ovnMTU int ,
160166) * DPUCNIProvisioner {
161167 return & DPUCNIProvisioner {
162- ctx : ctx ,
163- clock : clock ,
164- ensureConfigurationTicker : clock .NewTicker (30 * time .Second ),
165- ovsClient : ovsClient ,
166- networkHelper : networkHelper ,
167- exec : exec ,
168- kubernetesClient : kubernetesClient ,
169- FileSystemRoot : "" ,
170- K8sAPIServer : "" ,
171- BootstrapKubeconfigPath : hostBootstrapKubeconfigPath ,
172- HostNodeNameFilePath : hostNodeNameFilePath ,
173- vtepIPNet : vtepIPNet ,
174- gateway : gateway ,
175- vtepCIDR : vtepCIDR ,
176- hostCIDR : hostCIDR ,
177- pfIP : pfIP ,
178- dpuHostName : dpuHostName ,
179- mode : mode ,
180- gatewayDiscoveryNetwork : gatewayDiscoveryNetwork ,
181- ovnMTU : ovnMTU ,
168+ ctx : ctx ,
169+ clock : clock ,
170+ ensureConfigurationTicker : clock .NewTicker (30 * time .Second ),
171+ ovsClient : ovsClient ,
172+ networkHelper : networkHelper ,
173+ exec : exec ,
174+ dpuClusterKubernetesClient : kubernetesClient ,
175+ FileSystemRoot : "" ,
176+ K8sAPIServer : "" ,
177+ BootstrapKubeconfigPath : hostBootstrapKubeconfigPath ,
178+ HostNodeNameFilePath : hostNodeNameFilePath ,
179+ vtepIPNet : vtepIPNet ,
180+ gateway : gateway ,
181+ vtepCIDR : vtepCIDR ,
182+ hostCIDR : hostCIDR ,
183+ pfIP : pfIP ,
184+ dpuHostName : dpuHostName ,
185+ mode : mode ,
186+ gatewayDiscoveryNetwork : gatewayDiscoveryNetwork ,
187+ ovnMTU : ovnMTU ,
182188 }
183189}
184190
// SetHostKubernetesClient stores c as the client used to talk to the host cluster. New does not populate this
// field, and reconcileHostNodeChassisID uses it to read and patch the host cluster Node, so it must be set before
// the provisioner is run.
191+ func (p * DPUCNIProvisioner ) SetHostKubernetesClient (c client.Client ) {
192+ p .hostKubernetesClient = c
193+ }
194+
185195// RunOnce runs the provisioning flow once and exits
186196func (p * DPUCNIProvisioner ) RunOnce () error {
187197 if err := p .configure (); err != nil {
@@ -231,6 +241,9 @@ func (p *DPUCNIProvisioner) configure() error {
231241 if err := p .writeHostIdentityBootstrapArtifacts (hostName ); err != nil {
232242 return fmt .Errorf ("error while writing host identity bootstrap artifacts: %w" , err )
233243 }
244+ if err := p .reconcileHostNodeChassisID (hostName ); err != nil {
245+ return fmt .Errorf ("error while reconciling host node chassis annotation: %w" , err )
246+ }
234247
235248 if p .mode == ExternalIPAM {
236249 klog .Info ("Configuring br-ovn" )
@@ -257,9 +270,46 @@ func (p *DPUCNIProvisioner) configure() error {
257270 return nil
258271}
259272
273+ // reconcileHostNodeChassisID removes a stale host-cluster node chassis annotation when it differs from the local OVS
274+ // system-id. This allows ovnkube-node to re-register after DPU reprovisioning.
275+ func (p * DPUCNIProvisioner ) reconcileHostNodeChassisID (hostName string ) error {
276+ systemID , err := p .ovsClient .GetSystemID ()
277+ if err != nil {
278+ return fmt .Errorf ("error while reading local OVS system-id: %w" , err )
279+ }
280+ if systemID == "" {
281+ return fmt .Errorf ("OVS system-id is empty for DPU node %s (host node %s)" , p .dpuHostName , hostName )
282+ }
283+
284+ node := & corev1.Node {}
285+ if err := p .hostKubernetesClient .Get (p .ctx , k8stypes.NamespacedName {Name : hostName }, node ); err != nil {
286+ return fmt .Errorf ("error while getting host cluster node %s: %w" , hostName , err )
287+ }
288+
289+ current := strings .TrimSpace (node .Annotations [hostNodeChassisIDAnnotationKey ])
290+ switch {
291+ case current == "" :
292+ klog .Infof ("Host cluster node %s has no %s annotation; no cleanup needed" , hostName , hostNodeChassisIDAnnotationKey )
293+ return nil
294+ case current == systemID :
295+ klog .Infof ("Host cluster node %s already has matching %s=%s" , hostName , hostNodeChassisIDAnnotationKey , systemID )
296+ return nil
297+ }
298+
299+ klog .Infof ("Removing stale %s=%s from host cluster node %s to allow reprovisioned DPU system-id %s to register" , hostNodeChassisIDAnnotationKey , current , hostName , systemID )
300+ base := node .DeepCopy ()
301+ delete (node .Annotations , hostNodeChassisIDAnnotationKey )
302+ if err := p .hostKubernetesClient .Patch (p .ctx , node , client .MergeFromWithOptions (base , client.MergeFromWithOptimisticLock {})); err != nil {
303+ return fmt .Errorf ("error while removing stale %s annotation from host node %s: %w" , hostNodeChassisIDAnnotationKey , hostName , err )
304+ }
305+ klog .Infof ("Removed stale %s=%s from host cluster node %s" , hostNodeChassisIDAnnotationKey , current , hostName )
306+
307+ return nil
308+ }
309+
260310// findAndSetKubernetesHostNameInOVS discovers and sets the Kubernetes Host Name in OVS
261311func (p * DPUCNIProvisioner ) findAndSetKubernetesHostNameInOVS () (string , error ) {
262- nodeClient := p .kubernetesClient .CoreV1 ().Nodes ()
312+ nodeClient := p .dpuClusterKubernetesClient .CoreV1 ().Nodes ()
263313 n , err := nodeClient .Get (p .ctx , p .dpuHostName , metav1.GetOptions {})
264314 if err != nil {
265315 return "" , fmt .Errorf ("error while getting Kubernetes Node: %w" , err )
0 commit comments