Skip to content

Commit 246a485

Browse files
authored
fix: allow non-churn empty nodes to be disrupted (#2206)
1 parent 79c4a7e commit 246a485

File tree

4 files changed

+37
-3
lines changed

4 files changed

+37
-3
lines changed

pkg/controllers/disruption/consolidation_test.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2302,7 +2302,7 @@ var _ = Describe("Consolidation", func() {
23022302
// and delete the old one
23032303
ExpectNotFound(ctx, env.Client, nodeClaims[1], nodes[1])
23042304
})
2305-
It("does not delete nodes when there is pod churn", func() {
2305+
It("does not delete nodes with pod churn, deletes nodes without pod churn", func() {
23062306
// create our RS so we can link a pod to it
23072307
ExpectApplied(ctx, env.Client, nodePool)
23082308
for i := range 2 {
@@ -2329,7 +2329,10 @@ var _ = Describe("Consolidation", func() {
23292329
wg.Wait()
23302330
Expect(err).To(Succeed())
23312331
Expect(results).To(Equal(pscheduling.Results{}))
2332-
Expect(cmd).To(Equal(disruption.Command{}))
2332+
Expect(cmd.Candidates()).To(HaveLen(1))
2333+
// the test validator manually binds a pod to nodes[0], causing it to no longer be eligible
2334+
Expect(cmd.Candidates()[0].StateNode.Node.Name).To(Equal(nodes[1].Name))
2335+
Expect(cmd.Decision()).To(Equal(disruption.DeleteDecision))
23332336

23342337
Expect(emptyConsolidation.IsConsolidated()).To(BeFalse())
23352338

pkg/controllers/disruption/types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,10 @@ func (c Command) Decision() Decision {
141141
}
142142
}
143143

144+
func (c Command) Candidates() []*Candidate {
145+
return c.candidates
146+
}
147+
144148
func (c Command) LogValues() []any {
145149
podCount := lo.Reduce(c.candidates, func(_ int, cd *Candidate, _ int) int { return len(cd.reschedulablePods) }, 0)
146150

pkg/controllers/disruption/validation.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"fmt"
2323
"time"
2424

25+
"github.com/samber/lo"
2526
"k8s.io/utils/clock"
2627
"sigs.k8s.io/controller-runtime/pkg/client"
2728

@@ -149,6 +150,32 @@ func (c *ConsolidationValidator) isValid(ctx context.Context, cmd Command, valid
149150
return nil
150151
}
151152

153+
func (e *EmptinessValidator) validateCandidates(ctx context.Context, candidates ...*Candidate) ([]*Candidate, error) {
154+
validatedCandidates, err := GetCandidates(ctx, e.cluster, e.kubeClient, e.recorder, e.clock, e.cloudProvider, e.filter, GracefulDisruptionClass, e.queue)
155+
if err != nil {
156+
return nil, fmt.Errorf("constructing validation candidates, %w", err)
157+
}
158+
validatedCandidates = mapCandidates(candidates, validatedCandidates)
159+
if len(validatedCandidates) == 0 {
160+
return nil, NewValidationError(fmt.Errorf("%d candidates are no longer valid", len(candidates)))
161+
}
162+
disruptionBudgetMapping, err := BuildDisruptionBudgetMapping(ctx, e.cluster, e.clock, e.kubeClient, e.cloudProvider, e.recorder, e.reason)
163+
if err != nil {
164+
return nil, fmt.Errorf("building disruption budgets, %w", err)
165+
}
166+
167+
if valid := lo.Filter(validatedCandidates, func(cn *Candidate, _ int) bool {
168+
if e.cluster.IsNodeNominated(cn.ProviderID()) || disruptionBudgetMapping[cn.NodePool.Name] == 0 {
169+
return false
170+
}
171+
disruptionBudgetMapping[cn.NodePool.Name]--
172+
return true
173+
}); len(valid) > 0 {
174+
return valid, nil
175+
}
176+
return nil, NewValidationError(fmt.Errorf("a candidate failed validation because it was nominated for a pod or would violate disruption budgets"))
177+
}
178+
152179
// ValidateCandidates gets the current representation of the provided candidates and ensures that they are all still valid.
153180
// For a candidate to still be valid, the following conditions must be met:
154181
//

pkg/controllers/nodeclaim/disruption/drift.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ func instanceTypeNotFound(its []*cloudprovider.InstanceType, nodeClaim *v1.NodeC
135135
}
136136

137137
// Eligible fields for drift are described in the docs
138-
// https://karpenter.sh/docs/concepts/deprovisioning/#drift
138+
// https://karpenter.sh/docs/concepts/disruption/#drift
139139
func areStaticFieldsDrifted(nodePool *v1.NodePool, nodeClaim *v1.NodeClaim) cloudprovider.DriftReason {
140140
nodePoolHash, foundNodePoolHash := nodePool.Annotations[v1.NodePoolHashAnnotationKey]
141141
nodePoolHashVersion, foundNodePoolHashVersion := nodePool.Annotations[v1.NodePoolHashVersionAnnotationKey]

0 commit comments

Comments
 (0)