@@ -139,7 +139,23 @@ func (dr *DrainReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
139139 // node request to be on idle and the current state is idle
140140 // we don't do anything
141141 if nodeStateDrainAnnotationCurrent == constants .DrainIdle {
142- reqLogger .Info ("node and nodeState are on idle nothing todo" )
142+ // in case we have policy there is nothing else to do
143+ if len (nodeNetworkState .Spec .Interfaces ) > 0 {
144+ reqLogger .Info ("node and nodeState are on idle nothing todo" )
145+ } else {
146+ // if we don't have any policy
147+ // let's be sure the device plugin label doesn't exist on the node
148+ reqLogger .Info ("remove Device plugin from node nodeState spec is empty" )
149+ err = utils .LabelNode (ctx , node .Name , constants .SriovDevicePluginEnabledLabel , constants .SriovDevicePluginEnabledLabelDisabled , dr .Client )
150+ if err != nil {
151+ log .Log .Error (err , "failed to label node for device plugin label" ,
152+ "labelKey" ,
153+ constants .SriovDevicePluginEnabledLabel ,
154+ "labelValue" ,
155+ constants .SriovDevicePluginEnabledLabelDisabled )
156+ return reconcile.Result {}, err
157+ }
158+ }
143159 return reconcile.Result {}, nil
144160 }
145161
@@ -172,6 +188,29 @@ func (dr *DrainReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
172188 return reconcile.Result {RequeueAfter : 5 * time .Second }, nil
173189 }
174190
191+ // check the device plugin exited and enable it again
192+ // only if we have something in the node state spec
193+ if len (nodeNetworkState .Spec .Interfaces ) > 0 {
194+ completed , err = dr .enableSriovDevicePlugin (ctx , node )
195+ if err != nil {
196+ reqLogger .Error (err , "failed to enable SriovDevicePlugin" )
197+ dr .recorder .Event (nodeNetworkState ,
198+ corev1 .EventTypeWarning ,
199+ "DrainController" ,
200+ "failed to enable SriovDevicePlugin" )
201+ return ctrl.Result {}, err
202+ }
203+
204+ if ! completed {
205+ reqLogger .Info ("sriov device plugin enable was not completed" )
206+ dr .recorder .Event (nodeNetworkState ,
207+ corev1 .EventTypeWarning ,
208+ "DrainController" ,
209+ "sriov device plugin enable was not completed" )
210+ return reconcile.Result {RequeueAfter : 5 * time .Second }, nil
211+ }
212+ }
213+
175214 // move the node state back to idle
176215 err = utils .AnnotateObject (ctx , nodeNetworkState , constants .NodeStateDrainAnnotationCurrent , constants .DrainIdle , dr .Client )
177216 if err != nil {
@@ -209,7 +248,7 @@ func (dr *DrainReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
209248 }
210249 }
211250
212- // class the drain function that will also call drain to other platform providers like openshift
251+ // call the drain function that will also call drain to other platform providers like openshift
213252 drained , err := dr .drainer .DrainNode (ctx , node , nodeDrainAnnotation == constants .RebootRequired )
214253 if err != nil {
215254 reqLogger .Error (err , "error trying to drain the node" )
@@ -230,6 +269,17 @@ func (dr *DrainReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
230269 return reconcile.Result {RequeueAfter : 5 * time .Second }, nil
231270 }
232271
272+ reqLogger .Info ("remove Device plugin from node" )
273+ err = utils .LabelNode (ctx , node .Name , constants .SriovDevicePluginEnabledLabel , constants .SriovDevicePluginEnabledLabelDisabled , dr .Client )
274+ if err != nil {
275+ log .Log .Error (err , "failed to label node for device plugin label" ,
276+ "labelKey" ,
277+ constants .SriovDevicePluginEnabledLabel ,
278+ "labelValue" ,
279+ constants .SriovDevicePluginEnabledLabelDisabled )
280+ return reconcile.Result {}, err
281+ }
282+
233283 // if we manage to drain we label the node state with drain completed and finish
234284 err = utils .AnnotateObject (ctx , nodeNetworkState , constants .NodeStateDrainAnnotationCurrent , constants .DrainComplete , dr .Client )
235285 if err != nil {
@@ -243,6 +293,60 @@ func (dr *DrainReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
243293 "DrainController" ,
244294 "node drain completed" )
245295 return ctrl.Result {}, nil
296+ } else if nodeDrainAnnotation == constants .DevicePluginResetRequired {
297+ // nothing to do here we need to wait for the node to move back to idle
298+ if nodeStateDrainAnnotationCurrent == constants .DrainComplete {
299+ reqLogger .Info ("node requested a drain and nodeState is on drain completed nothing todo" )
300+ return ctrl.Result {}, nil
301+ }
302+
303+ // if we are on idle state we move it to drain
304+ if nodeStateDrainAnnotationCurrent == constants .DrainIdle {
305+ err = utils .AnnotateObject (ctx , nodeNetworkState , constants .NodeStateDrainAnnotationCurrent , constants .Draining , dr .Client )
306+ if err != nil {
307+ reqLogger .Error (err , "failed to annotate node with annotation" , "annotation" , constants .Draining )
308+ return ctrl.Result {}, err
309+ }
310+ return ctrl.Result {}, nil
311+ }
312+
313+ // This cover a case where we only need to reset the device plugin
314+ // for that we are going to cordon the node, so we don't get new pods allocated
315+ // to the node in the time we remove the device plugin
316+ err = dr .drainer .RunCordonOrUncordon (ctx , node , true )
317+ if err != nil {
318+ log .Log .Error (err , "failed to cordon on node" )
319+ return reconcile.Result {}, err
320+ }
321+
322+ // we switch the sriov label to disable and mark the drain as completed
323+ // no need to wait for the device plugin to exit here as we cordon the node,
324+ // and we want the config-daemon to start the configuration in parallel with the kube-controller removing the pod
325+ // we check the device plugin was removed when the config-daemon moves its desired state to idle
326+ reqLogger .Info ("disable Device plugin from node" )
327+ err = utils .LabelNode (ctx , node .Name , constants .SriovDevicePluginEnabledLabel , constants .SriovDevicePluginEnabledLabelDisabled , dr .Client )
328+ if err != nil {
329+ log .Log .Error (err , "failed to label node for device plugin label" ,
330+ "labelKey" ,
331+ constants .SriovDevicePluginEnabledLabel ,
332+ "labelValue" ,
333+ constants .SriovDevicePluginEnabledLabelDisabled )
334+ return reconcile.Result {}, err
335+ }
336+
337+ // if we manage to cordon we label the node state with drain completed and finish
338+ err = utils .AnnotateObject (ctx , nodeNetworkState , constants .NodeStateDrainAnnotationCurrent , constants .DrainComplete , dr .Client )
339+ if err != nil {
340+ reqLogger .Error (err , "failed to annotate node with annotation" , "annotation" , constants .DrainComplete )
341+ return ctrl.Result {}, err
342+ }
343+
344+ reqLogger .Info ("node cordoned successfully and device plugin removed" )
345+ dr .recorder .Event (nodeNetworkState ,
346+ corev1 .EventTypeWarning ,
347+ "DrainController" ,
348+ "node cordoned and device plugin removed completed" )
349+ return ctrl.Result {}, nil
246350 }
247351
248352 reqLogger .Error (nil , "unexpected node drain annotation" )
@@ -436,6 +540,65 @@ func (dr *DrainReconcile) findNodePoolConfig(ctx context.Context, node *corev1.N
436540 }
437541}
438542
543+ // enableSriovDevicePlugin change the device plugin label on the requested node to enable
544+ // if there is a pod still running we will return false
545+ func (dr * DrainReconcile ) enableSriovDevicePlugin (ctx context.Context , node * corev1.Node ) (bool , error ) {
546+ logger := log .FromContext (ctx )
547+ logger .Info ("enableSriovDevicePlugin():" )
548+
549+ // check if the device plugin is terminating only if the node annotation for device plugin is disabled
550+ if node .Annotations [constants .SriovDevicePluginEnabledLabel ] == constants .SriovDevicePluginEnabledLabelDisabled {
551+ pods , err := dr .getDevicePluginPodsOnNode (node .Name )
552+ if err != nil {
553+ logger .Error (err , "failed to list device plugin pods running on node" )
554+ return false , err
555+ }
556+
557+ if len (pods .Items ) != 0 {
558+ log .Log .V (2 ).Info ("device plugin pod still terminating on node" )
559+ return false , nil
560+ }
561+ }
562+
563+ logger .Info ("enable Device plugin from node" )
564+ err := utils .LabelNode (ctx , node .Name , constants .SriovDevicePluginEnabledLabel , constants .SriovDevicePluginEnabledLabelEnabled , dr .Client )
565+ if err != nil {
566+ log .Log .Error (err , "failed to label node for device plugin label" ,
567+ "labelKey" ,
568+ constants .SriovDevicePluginEnabledLabel ,
569+ "labelValue" ,
570+ constants .SriovDevicePluginEnabledLabelEnabled )
571+ return false , err
572+ }
573+
574+ // check if the device plugin pod is running on the node
575+ pods , err := dr .getDevicePluginPodsOnNode (node .Name )
576+ if err != nil {
577+ logger .Error (err , "failed to list device plugin pods running on node" )
578+ return false , err
579+ }
580+
581+ if len (pods .Items ) == 1 && pods .Items [0 ].Status .Phase == corev1 .PodRunning {
582+ logger .Info ("Device plugin pod running on node" )
583+ return true , nil
584+ }
585+
586+ logger .V (2 ).Info ("Device plugin pod still not running on node" )
587+ return false , nil
588+ }
589+
590+ func (dr * DrainReconcile ) getDevicePluginPodsOnNode (nodeName string ) (* corev1.PodList , error ) {
591+ pods := & corev1.PodList {}
592+ err := dr .List (context .Background (), pods , & client.ListOptions {
593+ Raw : & metav1.ListOptions {
594+ LabelSelector : "app=sriov-device-plugin" ,
595+ FieldSelector : "spec.nodeName=" + nodeName ,
596+ ResourceVersion : "0" },
597+ })
598+
599+ return pods , err
600+ }
601+
439602// SetupWithManager sets up the controller with the Manager.
440603func (dr * DrainReconcile ) SetupWithManager (mgr ctrl.Manager ) error {
441604 createUpdateEnqueue := handler.Funcs {
0 commit comments