@@ -172,6 +172,29 @@ func (dr *DrainReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
172172 return reconcile.Result {RequeueAfter : 5 * time .Second }, nil
173173 }
174174
175+ // check the device plugin exited and enable it again
176+ // only if we have something in the node state spec
177+ if len (nodeNetworkState .Spec .Interfaces ) > 0 {
178+ completed , err = dr .enableSriovDevicePlugin (ctx , node )
179+ if err != nil {
180+ reqLogger .Error (err , "failed to enable SriovDevicePlugin" )
181+ dr .recorder .Event (nodeNetworkState ,
182+ corev1 .EventTypeWarning ,
183+ "DrainController" ,
184+ "failed to enable SriovDevicePlugin" )
185+ return ctrl.Result {}, err
186+ }
187+
188+ if ! completed {
189+ reqLogger .Info ("sriov device plugin enable was not completed" )
190+ dr .recorder .Event (nodeNetworkState ,
191+ corev1 .EventTypeWarning ,
192+ "DrainController" ,
193+ "sriov device plugin enable was not completed" )
194+ return reconcile.Result {RequeueAfter : 5 * time .Second }, nil
195+ }
196+ }
197+
175198 // move the node state back to idle
176199 err = utils .AnnotateObject (ctx , nodeNetworkState , constants .NodeStateDrainAnnotationCurrent , constants .DrainIdle , dr .Client )
177200 if err != nil {
@@ -209,7 +232,7 @@ func (dr *DrainReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
209232 }
210233 }
211234
212- // class the drain function that will also call drain to other platform providers like openshift
235+ // call the drain function that will also call drain to other platform providers like openshift
213236 drained , err := dr .drainer .DrainNode (ctx , node , nodeDrainAnnotation == constants .RebootRequired )
214237 if err != nil {
215238 reqLogger .Error (err , "error trying to drain the node" )
@@ -230,6 +253,17 @@ func (dr *DrainReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
230253 return reconcile.Result {RequeueAfter : 5 * time .Second }, nil
231254 }
232255
256+ reqLogger .Info ("remove Device plugin from node" )
257+ err = utils .LabelNode (ctx , node .Name , constants .SriovDevicePluginEnabledLabel , constants .SriovDevicePluginEnabledLabelDisabled , dr .Client )
258+ if err != nil {
259+ log .Log .Error (err , "failed to label node for device plugin label" ,
260+ "labelKey" ,
261+ constants .SriovDevicePluginEnabledLabel ,
262+ "labelValue" ,
263+ constants .SriovDevicePluginEnabledLabelDisabled )
264+ return reconcile.Result {}, err
265+ }
266+
233267 // if we manage to drain we label the node state with drain completed and finish
234268 err = utils .AnnotateObject (ctx , nodeNetworkState , constants .NodeStateDrainAnnotationCurrent , constants .DrainComplete , dr .Client )
235269 if err != nil {
@@ -243,6 +277,60 @@ func (dr *DrainReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
243277 "DrainController" ,
244278 "node drain completed" )
245279 return ctrl.Result {}, nil
280+ } else if nodeDrainAnnotation == constants .DevicePluginResetRequired {
281+ // nothing to do here we need to wait for the node to move back to idle
282+ if nodeStateDrainAnnotationCurrent == constants .DrainComplete {
283+ reqLogger .Info ("node requested a drain and nodeState is on drain completed nothing todo" )
284+ return ctrl.Result {}, nil
285+ }
286+
287+ // if we are on idle state we move it to drain
288+ if nodeStateDrainAnnotationCurrent == constants .DrainIdle {
289+ err = utils .AnnotateObject (ctx , nodeNetworkState , constants .NodeStateDrainAnnotationCurrent , constants .Draining , dr .Client )
290+ if err != nil {
291+ reqLogger .Error (err , "failed to annotate node with annotation" , "annotation" , constants .Draining )
292+ return ctrl.Result {}, err
293+ }
294+ return ctrl.Result {}, nil
295+ }
296+
297+ // This covers the case where we only need to reset the device plugin
298+ // for that we are going to cordon the node, so we don't get new pods allocated
299+ // to the node while we remove the device plugin
300+ err = dr .drainer .RunCordonOrUncordon (ctx , node , true )
301+ if err != nil {
302+ log .Log .Error (err , "failed to cordon on node" )
303+ return reconcile.Result {}, err
304+ }
305+
306+ // we switch the sriov label to disabled and mark the drain as completed
307+ // no need to wait for the device plugin to exit here as we cordon the node,
308+ // and we want the config-daemon to start the configuration in parallel with the kube-controller removing the pod
309+ // we check the device plugin was removed when the config-daemon moves its desired state to idle
310+ reqLogger .Info ("disable Device plugin from node" )
311+ err = utils .LabelNode (ctx , node .Name , constants .SriovDevicePluginEnabledLabel , constants .SriovDevicePluginEnabledLabelDisabled , dr .Client )
312+ if err != nil {
313+ log .Log .Error (err , "failed to label node for device plugin label" ,
314+ "labelKey" ,
315+ constants .SriovDevicePluginEnabledLabel ,
316+ "labelValue" ,
317+ constants .SriovDevicePluginEnabledLabelDisabled )
318+ return reconcile.Result {}, err
319+ }
320+
321+ // if we manage to cordon we label the node state with drain completed and finish
322+ err = utils .AnnotateObject (ctx , nodeNetworkState , constants .NodeStateDrainAnnotationCurrent , constants .DrainComplete , dr .Client )
323+ if err != nil {
324+ reqLogger .Error (err , "failed to annotate node with annotation" , "annotation" , constants .DrainComplete )
325+ return ctrl.Result {}, err
326+ }
327+
328+ reqLogger .Info ("node cordoned successfully and device plugin removed" )
329+ dr .recorder .Event (nodeNetworkState ,
330+ corev1 .EventTypeWarning ,
331+ "DrainController" ,
332+ "node cordoned and device plugin removed completed" )
333+ return ctrl.Result {}, nil
246334 }
247335
248336 reqLogger .Error (nil , "unexpected node drain annotation" )
@@ -436,6 +524,65 @@ func (dr *DrainReconcile) findNodePoolConfig(ctx context.Context, node *corev1.N
436524 }
437525}
438526
527+ // enableSriovDevicePlugin change the device plugin label on the requested node to enable
528+ // if there is a pod still running we will return false
529+ func (dr * DrainReconcile ) enableSriovDevicePlugin (ctx context.Context , node * corev1.Node ) (bool , error ) {
530+ logger := log .FromContext (ctx )
531+ logger .Info ("enableSriovDevicePlugin():" )
532+
533+ // check if the device plugin is terminating only if the node annotation for device plugin is disabled
534+ if node .Annotations [constants .SriovDevicePluginEnabledLabel ] == constants .SriovDevicePluginEnabledLabelDisabled {
535+ pods , err := dr .getDevicePluginPodsOnNode (node .Name )
536+ if err != nil {
537+ logger .Error (err , "failed to list device plugin pods running on node" )
538+ return false , err
539+ }
540+
541+ if len (pods .Items ) != 0 {
542+ log .Log .V (2 ).Info ("device plugin pod still terminating on node" )
543+ return false , nil
544+ }
545+ }
546+
547+ logger .Info ("enable Device plugin from node" )
548+ err := utils .LabelNode (ctx , node .Name , constants .SriovDevicePluginEnabledLabel , constants .SriovDevicePluginEnabledLabelEnabled , dr .Client )
549+ if err != nil {
550+ log .Log .Error (err , "failed to label node for device plugin label" ,
551+ "labelKey" ,
552+ constants .SriovDevicePluginEnabledLabel ,
553+ "labelValue" ,
554+ constants .SriovDevicePluginEnabledLabelEnabled )
555+ return false , err
556+ }
557+
558+ // check if the device plugin pod is running on the node
559+ pods , err := dr .getDevicePluginPodsOnNode (node .Name )
560+ if err != nil {
561+ logger .Error (err , "failed to list device plugin pods running on node" )
562+ return false , err
563+ }
564+
565+ if len (pods .Items ) == 1 && pods .Items [0 ].Status .Phase == corev1 .PodRunning {
566+ logger .Info ("Device plugin pod running on node" )
567+ return true , nil
568+ }
569+
570+ logger .V (2 ).Info ("Device plugin pod still not running on node" )
571+ return false , nil
572+ }
573+
574+ func (dr * DrainReconcile ) getDevicePluginPodsOnNode (nodeName string ) (* corev1.PodList , error ) {
575+ pods := & corev1.PodList {}
576+ err := dr .List (context .Background (), pods , & client.ListOptions {
577+ Raw : & metav1.ListOptions {
578+ LabelSelector : "app=sriov-device-plugin" ,
579+ FieldSelector : "spec.nodeName=" + nodeName ,
580+ ResourceVersion : "0" },
581+ })
582+
583+ return pods , err
584+ }
585+
439586// SetupWithManager sets up the controller with the Manager.
440587func (dr * DrainReconcile ) SetupWithManager (mgr ctrl.Manager ) error {
441588 createUpdateEnqueue := handler.Funcs {
0 commit comments