Skip to content

Commit b68454a

Browse files
committed
Add pod_completed_total
1 parent eafab75 commit b68454a

File tree

2 files changed

+40
-12
lines changed

2 files changed

+40
-12
lines changed

audit-policy.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ rules:
1111
resources:
1212
- pods
1313
- pods/binding
14+
- pods/status
1415
- group: batch
1516
resources:
1617
- jobs

exporter/metrics.go

+39-12
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@ var (
2323
Help: "Total number of pods deleted",
2424
}, []string{"cluster", "namespace", "user", "phase"})
2525

26+
podCompletedTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
27+
Name: "pod_completed_total",
28+
Help: "Total number of pods transitioned to completed status",
29+
}, []string{"cluster", "namespace", "user", "phase"})
30+
2631
podSchedulingLatency = prometheus.NewHistogramVec(prometheus.HistogramOpts{
2732
Name: "pod_scheduling_latency_seconds",
2833
Help: "Duration from pod creation to scheduled on node in seconds",
@@ -42,6 +47,7 @@ func init() {
4247
podSchedulingLatency,
4348
podDeletedTotal,
4449
batchJobCompleteLatency,
50+
podCompletedTotal,
4551
)
4652
}
4753

@@ -76,14 +82,7 @@ func (p *Exporter) updateMetrics(clusterLabel string, event auditv1.Event) {
7682
target := buildTarget(event.ObjectRef)
7783
createTime, exists := p.podCreationTimes[target]
7884
if !exists {
79-
// Kueue's audit events may create pod/binding events before pod creation events
80-
user := extractUserAgent(event.UserAgent)
81-
podSchedulingLatency.WithLabelValues(
82-
clusterLabel,
83-
ns,
84-
user,
85-
).Observe(0)
86-
p.podCreationTimes[target] = nil
85+
slog.Warn("Pod not found", "target", target)
8786
return
8887
}
8988

@@ -118,16 +117,17 @@ func (p *Exporter) updateMetrics(clusterLabel string, event auditv1.Event) {
118117
} else {
119118
p.podCreationTimes[target] = nil
120119
}
121-
} else if event.Verb == "delete" {
122-
delete(p.podCreationTimes, buildTarget(event.ObjectRef))
123-
124-
if event.ResponseObject != nil {
120+
} else if event.Verb == "delete" && event.ResponseObject != nil {
121+
target := buildTarget(event.ObjectRef)
122+
_, ok := p.podCreationTimes[target]
123+
if ok {
125124
var pod Pod
126125
if err := json.Unmarshal(event.ResponseObject.Raw, &pod); err != nil {
127126
slog.Error("failed to unmarshal pod during delete", "err", err)
128127
return
129128
}
130129

130+
delete(p.podCreationTimes, target)
131131
user := extractUserAgent(event.UserAgent)
132132
podDeletedTotal.WithLabelValues(
133133
clusterLabel,
@@ -136,6 +136,31 @@ func (p *Exporter) updateMetrics(clusterLabel string, event auditv1.Event) {
136136
pod.Status.Phase,
137137
).Inc()
138138
}
139+
} else if (event.Verb == "update" || event.Verb == "patch") &&
140+
event.ObjectRef.Subresource == "status" &&
141+
event.ResponseObject != nil {
142+
143+
target := buildTarget(event.ObjectRef)
144+
t, ok := p.podCreationTimes[target]
145+
if ok && t == nil {
146+
var pod Pod
147+
if err := json.Unmarshal(event.ResponseObject.Raw, &pod); err != nil {
148+
slog.Error("failed to unmarshal new pod during update", "err", err)
149+
return
150+
}
151+
152+
phase := pod.Status.Phase
153+
if podCompletedPhases[phase] {
154+
delete(p.podCreationTimes, target)
155+
user := extractUserAgent(event.UserAgent)
156+
podCompletedTotal.WithLabelValues(
157+
clusterLabel,
158+
ns,
159+
user,
160+
phase,
161+
).Inc()
162+
}
163+
}
139164
}
140165
}
141166

@@ -180,3 +205,5 @@ func (p *Exporter) updateMetrics(clusterLabel string, event auditv1.Event) {
180205
}
181206
}
182207
}
208+
209+
// podCompletedPhases is the lookup set of pod phases that count as completed.
// Indexing it with any phase not listed here yields false.
var podCompletedPhases = map[string]bool{
	"Failed":    true,
	"Succeeded": true,
}

0 commit comments

Comments
 (0)