Skip to content

Commit fcb3de0

Browse files
committed
chore: Update pod metrics when pod is terminal
1 parent 1ca9183 commit fcb3de0

File tree

2 files changed

+107
-20
lines changed

2 files changed

+107
-20
lines changed

pkg/controllers/metrics/pod/controller.go

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import (
3939
"sigs.k8s.io/karpenter/pkg/controllers/state"
4040
"sigs.k8s.io/karpenter/pkg/metrics"
4141
"sigs.k8s.io/karpenter/pkg/operator/injection"
42+
podutils "sigs.k8s.io/karpenter/pkg/utils/pod"
4243
)
4344

4445
const (
@@ -304,28 +305,11 @@ func (c *Controller) recordPodStartupMetric(pod *corev1.Pod, schedulableTime tim
304305
c.pendingPods.Insert(key)
305306
return
306307
}
307-
cond, ok := lo.Find(pod.Status.Conditions, func(c corev1.PodCondition) bool {
308-
return c.Type == corev1.PodReady
308+
cond, ready := lo.Find(pod.Status.Conditions, func(c corev1.PodCondition) bool {
309+
return c.Type == corev1.PodReady && c.Status == corev1.ConditionTrue
309310
})
310311
if c.pendingPods.Has(key) {
311-
if !ok || cond.Status != corev1.ConditionTrue {
312-
PodUnstartedTimeSeconds.Set(time.Since(pod.CreationTimestamp.Time).Seconds(), map[string]string{
313-
podName: pod.Name,
314-
podNamespace: pod.Namespace,
315-
})
316-
if !schedulableTime.IsZero() {
317-
PodProvisioningUnstartedTimeSeconds.Set(time.Since(schedulableTime).Seconds(), map[string]string{
318-
podName: pod.Name,
319-
podNamespace: pod.Namespace,
320-
})
321-
} else {
322-
// Idempotently delete the unstarted_time_seconds metric if the schedulable time is zero
323-
PodProvisioningUnstartedTimeSeconds.Delete(map[string]string{
324-
podName: pod.Name,
325-
podNamespace: pod.Namespace,
326-
})
327-
}
328-
} else {
312+
if ready || podutils.IsTerminal(pod) {
329313
// Delete the unstarted metric since the pod is now started
330314
PodUnstartedTimeSeconds.Delete(map[string]string{
331315
podName: pod.Name,
@@ -342,6 +326,23 @@ func (c *Controller) recordPodStartupMetric(pod *corev1.Pod, schedulableTime tim
342326
c.pendingPods.Delete(key)
343327
// Clear cluster state's representation of these pods as we don't need to keep track of them anymore
344328
c.cluster.ClearPodSchedulingMappings(client.ObjectKeyFromObject(pod))
329+
} else {
330+
PodUnstartedTimeSeconds.Set(time.Since(pod.CreationTimestamp.Time).Seconds(), map[string]string{
331+
podName: pod.Name,
332+
podNamespace: pod.Namespace,
333+
})
334+
if !schedulableTime.IsZero() {
335+
PodProvisioningUnstartedTimeSeconds.Set(time.Since(schedulableTime).Seconds(), map[string]string{
336+
podName: pod.Name,
337+
podNamespace: pod.Namespace,
338+
})
339+
} else {
340+
// Idempotently delete the unstarted_time_seconds metric if the schedulable time is zero
341+
PodProvisioningUnstartedTimeSeconds.Delete(map[string]string{
342+
podName: pod.Name,
343+
podNamespace: pod.Namespace,
344+
})
345+
}
345346
}
346347
}
347348
}

pkg/controllers/metrics/pod/suite_test.go

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,92 @@ var _ = Describe("Pod Metrics", func() {
248248
_, found = FindMetricWithLabelValues("karpenter_pods_provisioning_startup_duration_seconds", nil)
249249
Expect(found).To(BeTrue())
250250
})
251+
It("should update the pod startup and unstarted time metrics when the pod has succeeded", func() {
252+
p := test.Pod()
253+
p.Status.Phase = corev1.PodPending
254+
255+
fakeClock.Step(1 * time.Hour)
256+
cluster.MarkPodSchedulingDecisions(ctx, map[*corev1.Pod]error{}, map[string][]*corev1.Pod{"n1": {p}}, map[string][]*corev1.Pod{"nc1": {p}})
257+
ExpectApplied(ctx, env.Client, p)
258+
ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) //This will add pod to pending pods and unscheduled pods set
259+
_, found := FindMetricWithLabelValues("karpenter_pods_unstarted_time_seconds", map[string]string{
260+
"name": p.GetName(),
261+
"namespace": p.GetNamespace(),
262+
})
263+
Expect(found).To(BeTrue())
264+
_, found = FindMetricWithLabelValues("karpenter_pods_provisioning_unstarted_time_seconds", map[string]string{
265+
"name": p.GetName(),
266+
"namespace": p.GetNamespace(),
267+
})
268+
Expect(found).To(BeTrue())
269+
270+
// Pod has now succeeded but readiness condition is set to false because the pod is now completed
271+
p.Status.Phase = corev1.PodSucceeded
272+
p.Status.Conditions = []corev1.PodCondition{
273+
{Type: corev1.PodReady, Status: corev1.ConditionFalse, LastTransitionTime: metav1.Now()},
274+
{Type: corev1.PodScheduled, Status: corev1.ConditionTrue, LastTransitionTime: metav1.Now()},
275+
}
276+
ExpectApplied(ctx, env.Client, p)
277+
ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) //This will check if the pod was scheduled and completed
278+
_, found = FindMetricWithLabelValues("karpenter_pods_unstarted_time_seconds", map[string]string{
279+
"name": p.GetName(),
280+
"namespace": p.GetNamespace(),
281+
})
282+
Expect(found).To(BeFalse())
283+
_, found = FindMetricWithLabelValues("karpenter_pods_provisioning_unstarted_time_seconds", map[string]string{
284+
"name": p.GetName(),
285+
"namespace": p.GetNamespace(),
286+
})
287+
Expect(found).To(BeFalse())
288+
289+
_, found = FindMetricWithLabelValues("karpenter_pods_startup_duration_seconds", nil)
290+
Expect(found).To(BeTrue())
291+
_, found = FindMetricWithLabelValues("karpenter_pods_provisioning_startup_duration_seconds", nil)
292+
Expect(found).To(BeTrue())
293+
})
294+
It("should update the pod startup and unstarted time metrics when the pod has failed", func() {
295+
p := test.Pod()
296+
p.Status.Phase = corev1.PodPending
297+
298+
fakeClock.Step(1 * time.Hour)
299+
cluster.MarkPodSchedulingDecisions(ctx, map[*corev1.Pod]error{}, map[string][]*corev1.Pod{"n1": {p}}, map[string][]*corev1.Pod{"nc1": {p}})
300+
ExpectApplied(ctx, env.Client, p)
301+
ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) //This will add pod to pending pods and unscheduled pods set
302+
_, found := FindMetricWithLabelValues("karpenter_pods_unstarted_time_seconds", map[string]string{
303+
"name": p.GetName(),
304+
"namespace": p.GetNamespace(),
305+
})
306+
Expect(found).To(BeTrue())
307+
_, found = FindMetricWithLabelValues("karpenter_pods_provisioning_unstarted_time_seconds", map[string]string{
308+
"name": p.GetName(),
309+
"namespace": p.GetNamespace(),
310+
})
311+
Expect(found).To(BeTrue())
312+
313+
// Pod has now failed and readiness condition is set to false
314+
p.Status.Phase = corev1.PodFailed
315+
p.Status.Conditions = []corev1.PodCondition{
316+
{Type: corev1.PodReady, Status: corev1.ConditionFalse, LastTransitionTime: metav1.Now()},
317+
{Type: corev1.PodScheduled, Status: corev1.ConditionTrue, LastTransitionTime: metav1.Now()},
318+
}
319+
ExpectApplied(ctx, env.Client, p)
320+
ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) //This will check if the pod was scheduled and completed
321+
_, found = FindMetricWithLabelValues("karpenter_pods_unstarted_time_seconds", map[string]string{
322+
"name": p.GetName(),
323+
"namespace": p.GetNamespace(),
324+
})
325+
Expect(found).To(BeFalse())
326+
_, found = FindMetricWithLabelValues("karpenter_pods_provisioning_unstarted_time_seconds", map[string]string{
327+
"name": p.GetName(),
328+
"namespace": p.GetNamespace(),
329+
})
330+
Expect(found).To(BeFalse())
331+
332+
_, found = FindMetricWithLabelValues("karpenter_pods_startup_duration_seconds", nil)
333+
Expect(found).To(BeTrue())
334+
_, found = FindMetricWithLabelValues("karpenter_pods_provisioning_startup_duration_seconds", nil)
335+
Expect(found).To(BeTrue())
336+
})
251337
It("should create and delete provisioning undecided metrics based on scheduling simulatinos", func() {
252338
p := test.Pod()
253339
p.Status.Phase = corev1.PodPending

0 commit comments

Comments
 (0)