Skip to content

Commit 3da07dc

Browse files
perf: Perform quick checks in node health first (#2264)
1 parent 1ca9183 commit 3da07dc

File tree

1 file changed

+13
-17
lines changed

1 file changed

+13
-17
lines changed

pkg/controllers/node/health/controller.go

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,19 @@ func (c *Controller) Reconcile(ctx context.Context, node *corev1.Node) (reconcil
8282
}
8383
ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("NodeClaim", klog.KObj(nodeClaim)))
8484

85-
// If a nodeclaim does has a nodepool label, validate the nodeclaims inside the nodepool are healthy (i.e bellow the allowed threshold)
85+
unhealthyNodeCondition, policyTerminationDuration := c.findUnhealthyConditions(node)
86+
if unhealthyNodeCondition == nil {
87+
return reconcile.Result{}, nil
88+
}
89+
90+
// If the Node is unhealthy, but has not reached its full toleration disruption
91+
// requeue at the termination time of the unhealthy node
92+
terminationTime := unhealthyNodeCondition.LastTransitionTime.Add(policyTerminationDuration)
93+
if c.clock.Now().Before(terminationTime) {
94+
return reconcile.Result{RequeueAfter: terminationTime.Sub(c.clock.Now())}, nil
95+
}
96+
97+
// If a nodeclaim does have a nodepool label, validate the nodeclaims inside the nodepool are healthy (i.e. below the allowed threshold)
8698
// In the case of standalone nodeclaim, validate the nodes inside the cluster are healthy before proceeding
8799
// to repair the nodes
88100
nodePoolName, found := nodeClaim.Labels[v1.NodePoolLabelKey]
@@ -104,24 +116,10 @@ func (c *Controller) Reconcile(ctx context.Context, node *corev1.Node) (reconcil
104116
return reconcile.Result{}, nil
105117
}
106118
}
107-
108-
unhealthyNodeCondition, policyTerminationDuration := c.findUnhealthyConditions(node)
109-
if unhealthyNodeCondition == nil {
110-
return reconcile.Result{}, nil
111-
}
112-
113-
// If the Node is unhealthy, but has not reached it's full toleration disruption
114-
// requeue at the termination time of the unhealthy node
115-
terminationTime := unhealthyNodeCondition.LastTransitionTime.Add(policyTerminationDuration)
116-
if c.clock.Now().Before(terminationTime) {
117-
return reconcile.Result{RequeueAfter: terminationTime.Sub(c.clock.Now())}, nil
118-
}
119-
120119
// For nodes that are unhealthy past the tolerationDisruption window, we can forcefully terminate the node
121120
if err := c.annotateTerminationGracePeriod(ctx, nodeClaim); err != nil {
122121
return reconcile.Result{}, client.IgnoreNotFound(err)
123122
}
124-
125123
return c.deleteNodeClaim(ctx, nodeClaim, node, unhealthyNodeCondition)
126124
}
127125

@@ -176,7 +174,6 @@ func (c *Controller) annotateTerminationGracePeriod(ctx context.Context, nodeCla
176174
return nil
177175
}
178176
}
179-
180177
stored := nodeClaim.DeepCopy()
181178
terminationTime := c.clock.Now().Format(time.RFC3339)
182179
nodeClaim.ObjectMeta.Annotations = lo.Assign(nodeClaim.ObjectMeta.Annotations, map[string]string{v1.NodeClaimTerminationTimestampAnnotationKey: terminationTime})
@@ -187,7 +184,6 @@ func (c *Controller) annotateTerminationGracePeriod(ctx context.Context, nodeCla
187184
}
188185
log.FromContext(ctx).WithValues(v1.NodeClaimTerminationTimestampAnnotationKey, terminationTime).Info("annotated nodeclaim")
189186
}
190-
191187
return nil
192188
}
193189

0 commit comments

Comments
 (0)