@@ -20,12 +20,10 @@ import (
2020 "context"
2121 "fmt"
2222 "sync"
23- "time"
2423
2524 corev1 "k8s.io/api/core/v1"
2625 "k8s.io/apimachinery/pkg/api/errors"
2726 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
28- "k8s.io/apimachinery/pkg/labels"
2927 "k8s.io/apimachinery/pkg/runtime"
3028 "k8s.io/apimachinery/pkg/types"
3129 "k8s.io/apimachinery/pkg/util/intstr"
@@ -136,11 +134,11 @@ func (dr *DrainReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
136134 if nodeDrainAnnotation == constants .DrainIdle {
137135 // this cover the case the node is on idle
138136
139- // node request to be on idle and the currect state is idle
140- // we don't do anything
137+ // node request to be on idle and the current state is idle
138+ // in this case we check if vf configuration exist in the nodeState
139+ // if not we remove the device plugin node selector label from the node
141140 if nodeStateDrainAnnotationCurrent == constants .DrainIdle {
142- reqLogger .Info ("node and nodeState are on idle nothing todo" )
143- return reconcile.Result {}, nil
141+ return dr .handleNodeIdleNodeStateIdle (ctx , & reqLogger , node , nodeNetworkState )
144142 }
145143
146144 // we have two options here:
@@ -151,98 +149,19 @@ func (dr *DrainReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
151149 // doesn't need to drain anymore, so we can stop the drain
152150 if nodeStateDrainAnnotationCurrent == constants .DrainComplete ||
153151 nodeStateDrainAnnotationCurrent == constants .Draining {
154- completed , err := dr .drainer .CompleteDrainNode (ctx , node )
155- if err != nil {
156- reqLogger .Error (err , "failed to complete drain on node" )
157- dr .recorder .Event (nodeNetworkState ,
158- corev1 .EventTypeWarning ,
159- "DrainController" ,
160- "failed to drain node" )
161- return ctrl.Result {}, err
162- }
163-
164- // if we didn't manage to complete the un drain of the node we retry
165- if ! completed {
166- reqLogger .Info ("complete drain was not completed re queueing the request" )
167- dr .recorder .Event (nodeNetworkState ,
168- corev1 .EventTypeWarning ,
169- "DrainController" ,
170- "node complete drain was not completed" )
171- // TODO: make this time configurable
172- return reconcile.Result {RequeueAfter : 5 * time .Second }, nil
173- }
174-
175- // move the node state back to idle
176- err = utils .AnnotateObject (ctx , nodeNetworkState , constants .NodeStateDrainAnnotationCurrent , constants .DrainIdle , dr .Client )
177- if err != nil {
178- reqLogger .Error (err , "failed to annotate node with annotation" , "annotation" , constants .DrainIdle )
179- return ctrl.Result {}, err
180- }
181-
182- reqLogger .Info ("completed the un drain for node" )
183- dr .recorder .Event (nodeNetworkState ,
184- corev1 .EventTypeWarning ,
185- "DrainController" ,
186- "node un drain completed" )
187- return ctrl.Result {}, nil
188- }
189- } else if nodeDrainAnnotation == constants .DrainRequired || nodeDrainAnnotation == constants .RebootRequired {
190- // this cover the case a node request to drain or reboot
191-
192- // nothing to do here we need to wait for the node to move back to idle
193- if nodeStateDrainAnnotationCurrent == constants .DrainComplete {
194- reqLogger .Info ("node requested a drain and nodeState is on drain completed nothing todo" )
195- return ctrl.Result {}, nil
196- }
197-
198- // we need to start the drain, but first we need to check that we can drain the node
199- if nodeStateDrainAnnotationCurrent == constants .DrainIdle {
200- result , err := dr .tryDrainNode (ctx , node )
201- if err != nil {
202- reqLogger .Error (err , "failed to check if we can drain the node" )
203- return ctrl.Result {}, err
204- }
205-
206- // in case we need to wait because we just to the max number of draining nodes
207- if result != nil {
208- return * result , nil
209- }
210- }
211-
212- // class the drain function that will also call drain to other platform providers like openshift
213- drained , err := dr .drainer .DrainNode (ctx , node , nodeDrainAnnotation == constants .RebootRequired )
214- if err != nil {
215- reqLogger .Error (err , "error trying to drain the node" )
216- dr .recorder .Event (nodeNetworkState ,
217- corev1 .EventTypeWarning ,
218- "DrainController" ,
219- "failed to drain node" )
220- return reconcile.Result {}, err
221- }
222-
223- // if we didn't manage to complete the drain of the node we retry
224- if ! drained {
225- reqLogger .Info ("the nodes was not drained re queueing the request" )
226- dr .recorder .Event (nodeNetworkState ,
227- corev1 .EventTypeWarning ,
228- "DrainController" ,
229- "node drain operation was not completed" )
230- return reconcile.Result {RequeueAfter : 5 * time .Second }, nil
152+ return dr .handleNodeIdleNodeStateDrainingOrCompleted (ctx , & reqLogger , node , nodeNetworkState )
231153 }
154+ }
232155
233- // if we manage to drain we label the node state with drain completed and finish
234- err = utils .AnnotateObject (ctx , nodeNetworkState , constants .NodeStateDrainAnnotationCurrent , constants .DrainComplete , dr .Client )
235- if err != nil {
236- reqLogger .Error (err , "failed to annotate node with annotation" , "annotation" , constants .DrainComplete )
237- return ctrl.Result {}, err
238- }
156+ // this cover the case a node request to drain or reboot
157+ if nodeDrainAnnotation == constants .DrainRequired ||
158+ nodeDrainAnnotation == constants .RebootRequired {
159+ return dr .handleNodeDrainOrReboot (ctx , & reqLogger , node , nodeNetworkState , nodeDrainAnnotation , nodeStateDrainAnnotationCurrent )
160+ }
239161
240- reqLogger .Info ("node drained successfully" )
241- dr .recorder .Event (nodeNetworkState ,
242- corev1 .EventTypeWarning ,
243- "DrainController" ,
244- "node drain completed" )
245- return ctrl.Result {}, nil
162+ // this cover the case a node request to only reset the device plugin
163+ if nodeDrainAnnotation == constants .DevicePluginResetRequired {
164+ return dr .handleNodeDPReset (ctx , & reqLogger , node , nodeNetworkState , nodeStateDrainAnnotationCurrent )
246165 }
247166
248167 reqLogger .Error (nil , "unexpected node drain annotation" )
@@ -273,169 +192,6 @@ func (dr *DrainReconcile) ensureAnnotationExists(ctx context.Context, object cli
273192 return value , nil
274193}
275194
276- func (dr * DrainReconcile ) tryDrainNode (ctx context.Context , node * corev1.Node ) (* reconcile.Result , error ) {
277- // configure logs
278- reqLogger := log .FromContext (ctx )
279- reqLogger .Info ("checkForNodeDrain():" )
280-
281- //critical section we need to check if we can start the draining
282- dr .drainCheckMutex .Lock ()
283- defer dr .drainCheckMutex .Unlock ()
284-
285- // find the relevant node pool
286- nodePool , nodeList , err := dr .findNodePoolConfig (ctx , node )
287- if err != nil {
288- reqLogger .Error (err , "failed to find the pool for the requested node" )
289- return nil , err
290- }
291-
292- // check how many nodes we can drain in parallel for the specific pool
293- maxUnv , err := nodePool .MaxUnavailable (len (nodeList ))
294- if err != nil {
295- reqLogger .Error (err , "failed to calculate max unavailable" )
296- return nil , err
297- }
298-
299- current := 0
300- snns := & sriovnetworkv1.SriovNetworkNodeState {}
301-
302- var currentSnns * sriovnetworkv1.SriovNetworkNodeState
303- for _ , nodeObj := range nodeList {
304- err = dr .Get (ctx , client.ObjectKey {Name : nodeObj .GetName (), Namespace : vars .Namespace }, snns )
305- if err != nil {
306- if errors .IsNotFound (err ) {
307- reqLogger .V (2 ).Info ("node doesn't have a sriovNetworkNodePolicy" )
308- continue
309- }
310- return nil , err
311- }
312-
313- if snns .GetName () == node .GetName () {
314- currentSnns = snns .DeepCopy ()
315- }
316-
317- if utils .ObjectHasAnnotation (snns , constants .NodeStateDrainAnnotationCurrent , constants .Draining ) ||
318- utils .ObjectHasAnnotation (snns , constants .NodeStateDrainAnnotationCurrent , constants .DrainComplete ) {
319- current ++
320- }
321- }
322- reqLogger .Info ("Max node allowed to be draining at the same time" , "MaxParallelNodeConfiguration" , maxUnv )
323- reqLogger .Info ("Count of draining" , "drainingNodes" , current )
324-
325- // if maxUnv is zero this means we drain all the nodes in parallel without a limit
326- if maxUnv == - 1 {
327- reqLogger .Info ("draining all the nodes in parallel" )
328- } else if current >= maxUnv {
329- // the node requested to be drained, but we are at the limit so we re-enqueue the request
330- reqLogger .Info ("MaxParallelNodeConfiguration limit reached for draining nodes re-enqueue the request" )
331- // TODO: make this time configurable
332- return & reconcile.Result {RequeueAfter : 5 * time .Second }, nil
333- }
334-
335- if currentSnns == nil {
336- return nil , fmt .Errorf ("failed to find sriov network node state for requested node" )
337- }
338-
339- err = utils .AnnotateObject (ctx , currentSnns , constants .NodeStateDrainAnnotationCurrent , constants .Draining , dr .Client )
340- if err != nil {
341- reqLogger .Error (err , "failed to annotate node with annotation" , "annotation" , constants .Draining )
342- return nil , err
343- }
344-
345- return nil , nil
346- }
347-
348- func (dr * DrainReconcile ) findNodePoolConfig (ctx context.Context , node * corev1.Node ) (* sriovnetworkv1.SriovNetworkPoolConfig , []corev1.Node , error ) {
349- logger := log .FromContext (ctx )
350- logger .Info ("findNodePoolConfig():" )
351- // get all the sriov network pool configs
352- npcl := & sriovnetworkv1.SriovNetworkPoolConfigList {}
353- err := dr .List (ctx , npcl )
354- if err != nil {
355- logger .Error (err , "failed to list sriovNetworkPoolConfig" )
356- return nil , nil , err
357- }
358-
359- selectedNpcl := []* sriovnetworkv1.SriovNetworkPoolConfig {}
360- nodesInPools := map [string ]interface {}{}
361-
362- for _ , npc := range npcl .Items {
363- // we skip hw offload objects
364- if npc .Spec .OvsHardwareOffloadConfig .Name != "" {
365- continue
366- }
367-
368- if npc .Spec .NodeSelector == nil {
369- npc .Spec .NodeSelector = & metav1.LabelSelector {}
370- }
371-
372- selector , err := metav1 .LabelSelectorAsSelector (npc .Spec .NodeSelector )
373- if err != nil {
374- logger .Error (err , "failed to create label selector from nodeSelector" , "nodeSelector" , npc .Spec .NodeSelector )
375- return nil , nil , err
376- }
377-
378- if selector .Matches (labels .Set (node .Labels )) {
379- selectedNpcl = append (selectedNpcl , npc .DeepCopy ())
380- }
381-
382- nodeList := & corev1.NodeList {}
383- err = dr .List (ctx , nodeList , & client.ListOptions {LabelSelector : selector })
384- if err != nil {
385- logger .Error (err , "failed to list all the nodes matching the pool with label selector from nodeSelector" ,
386- "machineConfigPoolName" , npc ,
387- "nodeSelector" , npc .Spec .NodeSelector )
388- return nil , nil , err
389- }
390-
391- for _ , nodeName := range nodeList .Items {
392- nodesInPools [nodeName .Name ] = nil
393- }
394- }
395-
396- if len (selectedNpcl ) > 1 {
397- // don't allow the node to be part of multiple pools
398- err = fmt .Errorf ("node is part of more then one pool" )
399- logger .Error (err , "multiple pools founded for a specific node" , "numberOfPools" , len (selectedNpcl ), "pools" , selectedNpcl )
400- return nil , nil , err
401- } else if len (selectedNpcl ) == 1 {
402- // found one pool for our node
403- logger .V (2 ).Info ("found sriovNetworkPool" , "pool" , * selectedNpcl [0 ])
404- selector , err := metav1 .LabelSelectorAsSelector (selectedNpcl [0 ].Spec .NodeSelector )
405- if err != nil {
406- logger .Error (err , "failed to create label selector from nodeSelector" , "nodeSelector" , selectedNpcl [0 ].Spec .NodeSelector )
407- return nil , nil , err
408- }
409-
410- // list all the nodes that are also part of this pool and return them
411- nodeList := & corev1.NodeList {}
412- err = dr .List (ctx , nodeList , & client.ListOptions {LabelSelector : selector })
413- if err != nil {
414- logger .Error (err , "failed to list nodes using with label selector" , "labelSelector" , selector )
415- return nil , nil , err
416- }
417-
418- return selectedNpcl [0 ], nodeList .Items , nil
419- } else {
420- // in this case we get all the nodes and remove the ones that already part of any pool
421- logger .V (1 ).Info ("node doesn't belong to any pool, using default drain configuration with MaxUnavailable of one" , "pool" , * defaultNpcl )
422- nodeList := & corev1.NodeList {}
423- err = dr .List (ctx , nodeList )
424- if err != nil {
425- logger .Error (err , "failed to list all the nodes" )
426- return nil , nil , err
427- }
428-
429- defaultNodeLists := []corev1.Node {}
430- for _ , nodeObj := range nodeList .Items {
431- if _ , exist := nodesInPools [nodeObj .Name ]; ! exist {
432- defaultNodeLists = append (defaultNodeLists , nodeObj )
433- }
434- }
435- return defaultNpcl , defaultNodeLists , nil
436- }
437- }
438-
439195// SetupWithManager sets up the controller with the Manager.
440196func (dr * DrainReconcile ) SetupWithManager (mgr ctrl.Manager ) error {
441197 createUpdateEnqueue := handler.Funcs {
0 commit comments