diff --git a/pkg/controllers/metrics/pod/controller.go b/pkg/controllers/metrics/pod/controller.go index 5d28f3130c..262cce74da 100644 --- a/pkg/controllers/metrics/pod/controller.go +++ b/pkg/controllers/metrics/pod/controller.go @@ -39,6 +39,7 @@ import ( "sigs.k8s.io/karpenter/pkg/controllers/state" "sigs.k8s.io/karpenter/pkg/metrics" "sigs.k8s.io/karpenter/pkg/operator/injection" + podutils "sigs.k8s.io/karpenter/pkg/utils/pod" ) const ( @@ -304,28 +305,11 @@ func (c *Controller) recordPodStartupMetric(pod *corev1.Pod, schedulableTime tim c.pendingPods.Insert(key) return } - cond, ok := lo.Find(pod.Status.Conditions, func(c corev1.PodCondition) bool { - return c.Type == corev1.PodReady + cond, ready := lo.Find(pod.Status.Conditions, func(c corev1.PodCondition) bool { + return c.Type == corev1.PodReady && c.Status == corev1.ConditionTrue }) if c.pendingPods.Has(key) { - if !ok || cond.Status != corev1.ConditionTrue { - PodUnstartedTimeSeconds.Set(time.Since(pod.CreationTimestamp.Time).Seconds(), map[string]string{ - podName: pod.Name, - podNamespace: pod.Namespace, - }) - if !schedulableTime.IsZero() { - PodProvisioningUnstartedTimeSeconds.Set(time.Since(schedulableTime).Seconds(), map[string]string{ - podName: pod.Name, - podNamespace: pod.Namespace, - }) - } else { - // Idempotently delete the unstarted_time_seconds metric if the schedulable time is zero - PodProvisioningUnstartedTimeSeconds.Delete(map[string]string{ - podName: pod.Name, - podNamespace: pod.Namespace, - }) - } - } else { + if ready || podutils.IsTerminal(pod) { // Delete the unstarted metric since the pod is now started PodUnstartedTimeSeconds.Delete(map[string]string{ podName: pod.Name, @@ -342,6 +326,23 @@ func (c *Controller) recordPodStartupMetric(pod *corev1.Pod, schedulableTime tim c.pendingPods.Delete(key) // Clear cluster state's representation of these pods as we don't need to keep track of them anymore c.cluster.ClearPodSchedulingMappings(client.ObjectKeyFromObject(pod)) + } else { + PodUnstartedTimeSeconds.Set(time.Since(pod.CreationTimestamp.Time).Seconds(), map[string]string{ + podName: pod.Name, + podNamespace: pod.Namespace, + }) + if !schedulableTime.IsZero() { + PodProvisioningUnstartedTimeSeconds.Set(time.Since(schedulableTime).Seconds(), map[string]string{ + podName: pod.Name, + podNamespace: pod.Namespace, + }) + } else { + // Idempotently delete the unstarted_time_seconds metric if the schedulable time is zero + PodProvisioningUnstartedTimeSeconds.Delete(map[string]string{ + podName: pod.Name, + podNamespace: pod.Namespace, + }) + } } } } diff --git a/pkg/controllers/metrics/pod/suite_test.go b/pkg/controllers/metrics/pod/suite_test.go index c2596c21a2..c6abb4e979 100644 --- a/pkg/controllers/metrics/pod/suite_test.go +++ b/pkg/controllers/metrics/pod/suite_test.go @@ -248,6 +248,92 @@ var _ = Describe("Pod Metrics", func() { _, found = FindMetricWithLabelValues("karpenter_pods_provisioning_startup_duration_seconds", nil) Expect(found).To(BeTrue()) }) + It("should update the pod startup and unstarted time metrics when the pod has succeeded", func() { + p := test.Pod() + p.Status.Phase = corev1.PodPending + + fakeClock.Step(1 * time.Hour) + cluster.MarkPodSchedulingDecisions(ctx, map[*corev1.Pod]error{}, map[string][]*corev1.Pod{"n1": {p}}, map[string][]*corev1.Pod{"nc1": {p}}) + ExpectApplied(ctx, env.Client, p) + ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) //This will add pod to pending pods and unscheduled pods set + _, found := FindMetricWithLabelValues("karpenter_pods_unstarted_time_seconds", map[string]string{ + "name": p.GetName(), + "namespace": p.GetNamespace(), + }) + Expect(found).To(BeTrue()) + _, found = FindMetricWithLabelValues("karpenter_pods_provisioning_unstarted_time_seconds", map[string]string{ + "name": p.GetName(), + "namespace": p.GetNamespace(), + }) + Expect(found).To(BeTrue()) + + // Pod has now succeeded but readiness condition is set to false because the pod is now completed + p.Status.Phase = corev1.PodSucceeded + p.Status.Conditions = []corev1.PodCondition{ + {Type: corev1.PodReady, Status: corev1.ConditionFalse, LastTransitionTime: metav1.Now()}, + {Type: corev1.PodScheduled, Status: corev1.ConditionTrue, LastTransitionTime: metav1.Now()}, + } + ExpectApplied(ctx, env.Client, p) + ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) //This will check if the pod was scheduled and completed + _, found = FindMetricWithLabelValues("karpenter_pods_unstarted_time_seconds", map[string]string{ + "name": p.GetName(), + "namespace": p.GetNamespace(), + }) + Expect(found).To(BeFalse()) + _, found = FindMetricWithLabelValues("karpenter_pods_provisioning_unstarted_time_seconds", map[string]string{ + "name": p.GetName(), + "namespace": p.GetNamespace(), + }) + Expect(found).To(BeFalse()) + + _, found = FindMetricWithLabelValues("karpenter_pods_startup_duration_seconds", nil) + Expect(found).To(BeTrue()) + _, found = FindMetricWithLabelValues("karpenter_pods_provisioning_startup_duration_seconds", nil) + Expect(found).To(BeTrue()) + }) + It("should update the pod startup and unstarted time metrics when the pod has failed", func() { + p := test.Pod() + p.Status.Phase = corev1.PodPending + + fakeClock.Step(1 * time.Hour) + cluster.MarkPodSchedulingDecisions(ctx, map[*corev1.Pod]error{}, map[string][]*corev1.Pod{"n1": {p}}, map[string][]*corev1.Pod{"nc1": {p}}) + ExpectApplied(ctx, env.Client, p) + ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) //This will add pod to pending pods and unscheduled pods set + _, found := FindMetricWithLabelValues("karpenter_pods_unstarted_time_seconds", map[string]string{ + "name": p.GetName(), + "namespace": p.GetNamespace(), + }) + Expect(found).To(BeTrue()) + _, found = FindMetricWithLabelValues("karpenter_pods_provisioning_unstarted_time_seconds", map[string]string{ + "name": p.GetName(), + "namespace": p.GetNamespace(), + }) + Expect(found).To(BeTrue()) + + // Pod has now failed and readiness condition is set to false + p.Status.Phase = corev1.PodFailed + p.Status.Conditions = []corev1.PodCondition{ + {Type: corev1.PodReady, Status: corev1.ConditionFalse, LastTransitionTime: metav1.Now()}, + {Type: corev1.PodScheduled, Status: corev1.ConditionTrue, LastTransitionTime: metav1.Now()}, + } + ExpectApplied(ctx, env.Client, p) + ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) //This will check if the pod was scheduled and completed + _, found = FindMetricWithLabelValues("karpenter_pods_unstarted_time_seconds", map[string]string{ + "name": p.GetName(), + "namespace": p.GetNamespace(), + }) + Expect(found).To(BeFalse()) + _, found = FindMetricWithLabelValues("karpenter_pods_provisioning_unstarted_time_seconds", map[string]string{ + "name": p.GetName(), + "namespace": p.GetNamespace(), + }) + Expect(found).To(BeFalse()) + + _, found = FindMetricWithLabelValues("karpenter_pods_startup_duration_seconds", nil) + Expect(found).To(BeTrue()) + _, found = FindMetricWithLabelValues("karpenter_pods_provisioning_startup_duration_seconds", nil) + Expect(found).To(BeTrue()) + }) It("should create and delete provisioning undecided metrics based on scheduling simulatinos", func() { p := test.Pod() p.Status.Phase = corev1.PodPending