Skip to content

Commit a34f011

Browse files
committed
fix(podevents): check the lastPodEventTime have been set, dedupe timeout or is max
Signed-off-by: flavono123 <[email protected]>
1 parent dca7a2c commit a34f011

File tree

4 files changed

+103
-52
lines changed

4 files changed

+103
-52
lines changed

pkg/controllers/nodeclaim/disruption/consolidation.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ func (c *Consolidation) Reconcile(ctx context.Context, nodePool *v1.NodePool, no
6060
timeToCheck := lo.Ternary(!nodeClaim.Status.LastPodEventTime.IsZero(), nodeClaim.Status.LastPodEventTime.Time, initialized.LastTransitionTime.Time)
6161

6262
// Consider a node consolidatable by looking at the lastPodEvent status field on the nodeclaim.
63-
// This time is now based on the PodScheduled condition's lastTransitionTime or pod is being removed(terminal or terminating)
63+
// This time is based on the PodScheduled condition's lastTransitionTime or pod is being removed(terminal or terminating)
6464
if c.clock.Since(timeToCheck) < lo.FromPtr(nodePool.Spec.Disruption.ConsolidateAfter.Duration) {
6565
if hasConsolidatableCondition {
6666
_ = nodeClaim.StatusConditions().Clear(v1.ConditionTypeConsolidatable)

pkg/controllers/nodeclaim/hydration/suite_test.go

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,10 @@ import (
3535
. "sigs.k8s.io/karpenter/pkg/utils/testing"
3636
)
3737

38-
var (
39-
ctx context.Context
40-
hydrationController *hydration.Controller
41-
env *test.Environment
42-
cloudProvider *fake.CloudProvider
43-
)
38+
var ctx context.Context
39+
var hydrationController *hydration.Controller
40+
var env *test.Environment
41+
var cloudProvider *fake.CloudProvider
4442

4543
func TestAPIs(t *testing.T) {
4644
ctx = TestContextWithLogger(t)

pkg/controllers/nodeclaim/podevents/controller.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,11 @@ import (
4040
podutils "sigs.k8s.io/karpenter/pkg/utils/pod"
4141
)
4242

43+
// dedupeTimeout is 10 seconds to reduce the number of writes to the APIServer, since pod scheduling and deletion events are very frequent.
44+
// The smaller this value is, the more writes Karpenter will make in a busy cluster. This timeout is intentionally smaller than the consolidation
45+
// 15 second validation period, so that we can ensure that we invalidate consolidation commands that are decided while we're de-duping pod events.
46+
const dedupeTimeout = 10 * time.Second
47+
4348
// Podevents is a nodeclaim controller that updates the lastPodEvent status based on PodScheduled condition
4449
type Controller struct {
4550
clock clock.Clock
@@ -96,9 +101,16 @@ func (c *Controller) Reconcile(ctx context.Context, pod *corev1.Pod) (reconcile.
96101
return reconcile.Result{}, nil
97102
}
98103

99-
// otherwise, set the pod event time to now
100-
stored := nc.DeepCopy()
104+
// If we've set the lastPodEvent before
105+
// and the stored lastPodEventTime is after(max) eventTime
106+
// and it hasn't been before the timeout, don't do anything
107+
if !nc.Status.LastPodEventTime.Time.IsZero() &&
108+
nc.Status.LastPodEventTime.Time.After(eventTime) &&
109+
c.clock.Since(nc.Status.LastPodEventTime.Time) < dedupeTimeout {
110+
return reconcile.Result{}, nil
111+
}
101112

113+
stored := nc.DeepCopy()
102114
nc.Status.LastPodEventTime.Time = eventTime
103115
if !equality.Semantic.DeepEqual(stored, nc) {
104116
if err = c.kubeClient.Status().Patch(ctx, nc, client.MergeFrom(stored)); err != nil {

pkg/controllers/nodeclaim/podevents/suite_test.go

Lines changed: 84 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,11 @@ import (
3838
. "sigs.k8s.io/karpenter/pkg/utils/testing"
3939
)
4040

41-
var (
42-
ctx context.Context
43-
podEventsController *podevents.Controller
44-
env *test.Environment
45-
fakeClock *clock.FakeClock
46-
cp *fake.CloudProvider
47-
)
41+
var ctx context.Context
42+
var podEventsController *podevents.Controller
43+
var env *test.Environment
44+
var fakeClock *clock.FakeClock
45+
var cp *fake.CloudProvider
4846

4947
func TestAPIs(t *testing.T) {
5048
ctx = TestContextWithLogger(t)
@@ -77,7 +75,6 @@ var _ = AfterEach(func() {
7775
cp.Reset()
7876
ExpectCleanedUp(ctx, env.Client)
7977
})
80-
8178
var _ = Describe("PodEvents", func() {
8279
var nodePool *v1.NodePool
8380
var nodeClaim *v1.NodeClaim
@@ -101,45 +98,109 @@ var _ = Describe("PodEvents", func() {
10198
NodeName: node.Name,
10299
})
103100
})
104-
It("should set the nodeclaim lastPodEvent based on PodScheduled condition", func() {
105-
scheduledTime := fakeClock.Now().Truncate(time.Second)
101+
It("should set the nodeclaim lastPodEvent", func() {
102+
timeToCheck := fakeClock.Now().Truncate(time.Second)
106103
pod.Status.Conditions = []corev1.PodCondition{{
107104
Type: corev1.PodScheduled,
108105
Status: corev1.ConditionTrue,
109-
LastTransitionTime: metav1.Time{Time: scheduledTime},
106+
LastTransitionTime: metav1.Time{Time: timeToCheck},
110107
}}
108+
ExpectApplied(ctx, env.Client, nodePool, nodeClaim, node, pod)
109+
ExpectObjectReconciled(ctx, env.Client, podEventsController, pod)
110+
111+
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
112+
Expect(nodeClaim.Status.LastPodEventTime.Time).To(BeEquivalentTo(timeToCheck))
113+
})
114+
It("should not set the nodeclaim lastPodEvent when the node does not exist", func() {
115+
ExpectApplied(ctx, env.Client, nodePool, nodeClaim, pod)
116+
ExpectObjectReconciled(ctx, env.Client, podEventsController, pod)
117+
118+
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
119+
Expect(nodeClaim.Status.LastPodEventTime.Time).To(BeZero())
120+
})
121+
It("should not set the nodeclaim lastPodEvent when pod has no PodScheduled condition", func() {
111122
ExpectApplied(ctx, env.Client, nodePool, node, nodeClaim, pod)
112123
ExpectObjectReconciled(ctx, env.Client, podEventsController, pod)
113124

114125
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
115-
Expect(nodeClaim.Status.LastPodEventTime.Time).To(BeEquivalentTo(scheduledTime))
126+
Expect(nodeClaim.Status.LastPodEventTime.Time.IsZero()).To(BeTrue())
127+
})
128+
It("should not set the nodeclaim lastPodEvent when PodScheduled condition is not True", func() {
129+
pod.Status.Conditions = []corev1.PodCondition{{
130+
Type: corev1.PodScheduled,
131+
Status: corev1.ConditionFalse, // PodScheduled false
132+
LastTransitionTime: metav1.Time{Time: fakeClock.Now()},
133+
}}
134+
ExpectApplied(ctx, env.Client, nodePool, node, nodeClaim, pod)
135+
ExpectObjectReconciled(ctx, env.Client, podEventsController, pod)
116136

117-
// Update the PodScheduled condition's lastTransitionTime
118-
newScheduledTime := fakeClock.Now().Add(time.Minute).Truncate(time.Second)
137+
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
138+
Expect(nodeClaim.Status.LastPodEventTime.Time.IsZero()).To(BeTrue())
139+
})
140+
It("should succeed when the nodeclaim lastPodEvent when the nodeclaim does not exist", func() {
141+
ExpectApplied(ctx, env.Client, nodePool, node, pod)
142+
ExpectObjectReconciled(ctx, env.Client, podEventsController, pod)
143+
})
144+
It("should set the nodeclaim lastPodEvent when it's been set before", func() {
145+
nodeClaim.Status.LastPodEventTime.Time = fakeClock.Now().Add(-5 * time.Minute)
146+
timeToCheck := fakeClock.Now().Truncate(time.Second)
119147
pod.Status.Conditions = []corev1.PodCondition{{
120148
Type: corev1.PodScheduled,
121149
Status: corev1.ConditionTrue,
122-
LastTransitionTime: metav1.Time{Time: newScheduledTime},
150+
LastTransitionTime: metav1.Time{Time: timeToCheck},
123151
}}
124-
ExpectApplied(ctx, env.Client, pod)
152+
ExpectApplied(ctx, env.Client, nodePool, node, nodeClaim, pod)
125153
ExpectObjectReconciled(ctx, env.Client, podEventsController, pod)
126154

127155
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
128-
Expect(nodeClaim.Status.LastPodEventTime.Time).To(BeEquivalentTo(newScheduledTime))
156+
Expect(nodeClaim.Status.LastPodEventTime.Time).To(BeEquivalentTo(timeToCheck))
129157
})
130-
It("should not set the nodeclaim lastPodEvent when the node does not exist", func() {
131-
ExpectApplied(ctx, env.Client, nodePool, nodeClaim, pod)
158+
It("should only set the nodeclaim lastPodEventTime when the event time is after than", func() {
159+
timeToCheck := fakeClock.Now().Truncate(time.Second)
160+
nodeClaim.Status.LastPodEventTime.Time = timeToCheck
161+
scheduledTimeBeforeLastPodEventTime := timeToCheck.Add(-5 * time.Minute)
162+
pod.Status.Conditions = []corev1.PodCondition{{
163+
Type: corev1.PodScheduled,
164+
Status: corev1.ConditionTrue,
165+
LastTransitionTime: metav1.Time{Time: scheduledTimeBeforeLastPodEventTime},
166+
}}
167+
ExpectApplied(ctx, env.Client, nodePool, node, nodeClaim, pod)
132168
ExpectObjectReconciled(ctx, env.Client, podEventsController, pod)
133169

134170
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
135-
Expect(nodeClaim.Status.LastPodEventTime.Time).To(BeZero())
171+
Expect(nodeClaim.Status.LastPodEventTime.Time).To(BeEquivalentTo(timeToCheck))
136172
})
137-
It("should succeed when the nodeclaim lastPodEvent when the nodeclaim does not exist", func() {
138-
ExpectApplied(ctx, env.Client, nodePool, node, pod)
173+
It("should only set the nodeclaim lastPodEvent within the dedupe timeframe", func() {
174+
timeToCheck := fakeClock.Now().Truncate(time.Second)
175+
pod.Status.Conditions = []corev1.PodCondition{{
176+
Type: corev1.PodScheduled,
177+
Status: corev1.ConditionTrue,
178+
LastTransitionTime: metav1.Time{Time: timeToCheck},
179+
}}
180+
ExpectApplied(ctx, env.Client, nodePool, node, nodeClaim, pod)
181+
ExpectObjectReconciled(ctx, env.Client, podEventsController, pod)
182+
183+
// Expect that the lastPodEventTime is set
184+
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
185+
Expect(nodeClaim.Status.LastPodEventTime.Time).To(BeEquivalentTo(timeToCheck))
186+
187+
// step through half of the dedupe timeout, and re-reconcile,
188+
// expecting the status to not change
189+
fakeClock.Step(5 * time.Second)
139190
ExpectObjectReconciled(ctx, env.Client, podEventsController, pod)
191+
192+
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
193+
Expect(nodeClaim.Status.LastPodEventTime.Time).To(BeEquivalentTo(timeToCheck))
194+
195+
// step through rest of the dedupe timeout, and re-reconcile,
196+
// expecting the status to not change since the podscheduled condition has not changed or terminal or terminating
197+
fakeClock.Step(5 * time.Second)
198+
ExpectObjectReconciled(ctx, env.Client, podEventsController, pod)
199+
200+
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
201+
Expect(nodeClaim.Status.LastPodEventTime.Time).To(BeEquivalentTo(timeToCheck))
140202
})
141203
It("should set the nodeclaim lastPodEvent when pod becomes terminal", func() {
142-
// First set up a regular pod
143204
scheduledTime := fakeClock.Now().Truncate(time.Second)
144205
pod.Status.Conditions = []corev1.PodCondition{{
145206
Type: corev1.PodScheduled,
@@ -172,26 +233,6 @@ var _ = Describe("PodEvents", func() {
172233
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
173234
Expect(nodeClaim.Status.LastPodEventTime.Time).To(BeEquivalentTo(timeToCheck))
174235
})
175-
It("should not update lastPodEvent when pod has no PodScheduled condition", func() {
176-
// Pod with no conditions
177-
ExpectApplied(ctx, env.Client, nodePool, node, nodeClaim, pod)
178-
ExpectObjectReconciled(ctx, env.Client, podEventsController, pod)
179-
180-
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
181-
Expect(nodeClaim.Status.LastPodEventTime.Time.IsZero()).To(BeTrue())
182-
})
183-
It("should not update lastPodEvent when PodScheduled condition is not True", func() {
184-
pod.Status.Conditions = []corev1.PodCondition{{
185-
Type: corev1.PodScheduled,
186-
Status: corev1.ConditionFalse,
187-
LastTransitionTime: metav1.Time{Time: fakeClock.Now()},
188-
}}
189-
ExpectApplied(ctx, env.Client, nodePool, node, nodeClaim, pod)
190-
ExpectObjectReconciled(ctx, env.Client, podEventsController, pod)
191-
192-
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
193-
Expect(nodeClaim.Status.LastPodEventTime.Time.IsZero()).To(BeTrue())
194-
})
195236
It("should set the nodeclaim lastPodEvent when pod is already in a terminal state", func() {
196237
// Setup time
197238
timeToCheck := fakeClock.Now().Truncate(time.Second)

0 commit comments

Comments
 (0)