23
23
Help : "Total number of pods deleted" ,
24
24
}, []string {"cluster" , "namespace" , "user" , "phase" })
25
25
26
+ podCompletedTotal = prometheus .NewCounterVec (prometheus.CounterOpts {
27
+ Name : "pod_completed_total" ,
28
+ Help : "Total number of pods transitioned to completed status" ,
29
+ }, []string {"cluster" , "namespace" , "user" , "phase" })
30
+
26
31
podSchedulingLatency = prometheus .NewHistogramVec (prometheus.HistogramOpts {
27
32
Name : "pod_scheduling_latency_seconds" ,
28
33
Help : "Duration from pod creation to scheduled on node in seconds" ,
@@ -42,6 +47,7 @@ func init() {
42
47
podSchedulingLatency ,
43
48
podDeletedTotal ,
44
49
batchJobCompleteLatency ,
50
+ podCompletedTotal ,
45
51
)
46
52
}
47
53
@@ -76,14 +82,7 @@ func (p *Exporter) updateMetrics(clusterLabel string, event auditv1.Event) {
76
82
target := buildTarget (event .ObjectRef )
77
83
createTime , exists := p .podCreationTimes [target ]
78
84
if ! exists {
79
- // Kueue's audit events may create pod/binding events before pod creation events
80
- user := extractUserAgent (event .UserAgent )
81
- podSchedulingLatency .WithLabelValues (
82
- clusterLabel ,
83
- ns ,
84
- user ,
85
- ).Observe (0 )
86
- p .podCreationTimes [target ] = nil
85
+ slog .Warn ("Pod not found" , "target" , target )
87
86
return
88
87
}
89
88
@@ -118,16 +117,17 @@ func (p *Exporter) updateMetrics(clusterLabel string, event auditv1.Event) {
118
117
} else {
119
118
p .podCreationTimes [target ] = nil
120
119
}
121
- } else if event .Verb == "delete" {
122
- delete ( p . podCreationTimes , buildTarget (event .ObjectRef ) )
123
-
124
- if event . ResponseObject != nil {
120
+ } else if event .Verb == "delete" && event . ResponseObject != nil {
121
+ target := buildTarget (event .ObjectRef )
122
+ _ , ok := p . podCreationTimes [ target ]
123
+ if ok {
125
124
var pod Pod
126
125
if err := json .Unmarshal (event .ResponseObject .Raw , & pod ); err != nil {
127
126
slog .Error ("failed to unmarshal pod during delete" , "err" , err )
128
127
return
129
128
}
130
129
130
+ delete (p .podCreationTimes , target )
131
131
user := extractUserAgent (event .UserAgent )
132
132
podDeletedTotal .WithLabelValues (
133
133
clusterLabel ,
@@ -136,6 +136,31 @@ func (p *Exporter) updateMetrics(clusterLabel string, event auditv1.Event) {
136
136
pod .Status .Phase ,
137
137
).Inc ()
138
138
}
139
+ } else if (event .Verb == "update" || event .Verb == "patch" ) &&
140
+ event .ObjectRef .Subresource == "status" &&
141
+ event .ResponseObject != nil {
142
+
143
+ target := buildTarget (event .ObjectRef )
144
+ t , ok := p .podCreationTimes [target ]
145
+ if ok && t == nil {
146
+ var pod Pod
147
+ if err := json .Unmarshal (event .ResponseObject .Raw , & pod ); err != nil {
148
+ slog .Error ("failed to unmarshal new pod during update" , "err" , err )
149
+ return
150
+ }
151
+
152
+ phase := pod .Status .Phase
153
+ if podCompletedPhases [phase ] {
154
+ delete (p .podCreationTimes , target )
155
+ user := extractUserAgent (event .UserAgent )
156
+ podCompletedTotal .WithLabelValues (
157
+ clusterLabel ,
158
+ ns ,
159
+ user ,
160
+ phase ,
161
+ ).Inc ()
162
+ }
163
+ }
139
164
}
140
165
}
141
166
@@ -180,3 +205,5 @@ func (p *Exporter) updateMetrics(clusterLabel string, event auditv1.Event) {
180
205
}
181
206
}
182
207
}
208
+
209
+ var podCompletedPhases = map [string ]bool {"Succeeded" : true , "Failed" : true }
0 commit comments