Skip to content

Commit 0b48f3c

Browse files
authored
refactor metrics for container gpu allocation (#1169)
Signed-off-by: Jifei Wang <[email protected]>
1 parent 4fffa54 commit 0b48f3c

File tree

1 file changed

+26
-6
lines changed

1 file changed

+26
-6
lines changed

cmd/scheduler/metrics.go

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -171,16 +171,26 @@ func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
171171

172172
ctrvGPUDeviceAllocatedDesc := prometheus.NewDesc(
173173
"vGPUPodsDeviceAllocated",
174-
"vGPU Allocated from pods",
175-
[]string{"podnamespace", "nodename", "podname", "containeridx", "deviceuuid", "deviceusedcore"}, nil,
174+
"vGPU Allocated from pods (This metric will be deprecated in v2.8.0, use vGPUMemoryAllocated and vGPUCoreAllocated instead.)",
175+
[]string{"deprecated_version", "podnamespace", "nodename", "podname", "containeridx", "deviceuuid", "deviceusedcore"}, nil,
176176
)
177177
ctrvGPUdeviceAllocatedMemoryPercentageDesc := prometheus.NewDesc(
178178
"vGPUMemoryPercentage",
179-
"vGPU memory percentage allocated from a container",
180-
[]string{"podnamespace", "nodename", "podname", "containeridx", "deviceuuid"}, nil,
179+
"vGPU memory percentage allocated from a container (This metric will be deprecated in v2.8.0, use vGPUMemoryAllocated instead.)",
180+
[]string{"deprecated_version", "podnamespace", "nodename", "podname", "containeridx", "deviceuuid"}, nil,
181181
)
182182
ctrvGPUdeviceAllocateCorePercentageDesc := prometheus.NewDesc(
183183
"vGPUCorePercentage",
184+
"vGPU core allocated from a container (This metric will be deprecated in v2.8.0, use vGPUCoreAllocated instead.)",
185+
[]string{"deprecated_version", "podnamespace", "nodename", "podname", "containeridx", "deviceuuid"}, nil,
186+
)
187+
ctrvGPUdeviceAllocatedMemoryDesc := prometheus.NewDesc(
188+
"vGPUMemoryAllocated",
189+
"vGPU memory allocated from a container",
190+
[]string{"podnamespace", "nodename", "podname", "containeridx", "deviceuuid"}, nil,
191+
)
192+
ctrvGPUdeviceAllocatedCoreDesc := prometheus.NewDesc(
193+
"vGPUCoreAllocated",
184194
"vGPU core allocated from a container",
185195
[]string{"podnamespace", "nodename", "podname", "containeridx", "deviceuuid"}, nil,
186196
)
@@ -206,7 +216,17 @@ func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
206216
ctrvGPUDeviceAllocatedDesc,
207217
prometheus.GaugeValue,
208218
float64(ctrdevval.Usedmem)*float64(1024)*float64(1024),
219+
"v2.8.0", val.Namespace, val.NodeID, val.Name, fmt.Sprint(ctridx), ctrdevval.UUID, fmt.Sprint(ctrdevval.Usedcores))
220+
ch <- prometheus.MustNewConstMetric(
221+
ctrvGPUdeviceAllocatedMemoryDesc,
222+
prometheus.GaugeValue,
223+
float64(ctrdevval.Usedmem)*float64(1024)*float64(1024),
209224
val.Namespace, val.NodeID, val.Name, fmt.Sprint(ctridx), ctrdevval.UUID)
225+
ch <- prometheus.MustNewConstMetric(
226+
ctrvGPUdeviceAllocatedCoreDesc,
227+
prometheus.GaugeValue,
228+
float64(ctrdevval.Usedcores),
229+
val.Namespace, val.NodeID, val.Name, fmt.Sprint(ctridx), ctrdevval.UUID)
210230
var totaldev int32
211231
found := false
212232
for _, ni := range *nu {
@@ -232,13 +252,13 @@ func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
232252
ctrvGPUdeviceAllocatedMemoryPercentageDesc,
233253
prometheus.GaugeValue,
234254
float64(ctrdevval.Usedmem)/float64(totaldev),
235-
val.Namespace, val.NodeID, val.Name, fmt.Sprint(ctridx), ctrdevval.UUID)
255+
"v2.8.0", val.Namespace, val.NodeID, val.Name, fmt.Sprint(ctridx), ctrdevval.UUID)
236256
}
237257
ch <- prometheus.MustNewConstMetric(
238258
ctrvGPUdeviceAllocateCorePercentageDesc,
239259
prometheus.GaugeValue,
240260
float64(ctrdevval.Usedcores),
241-
val.Namespace, val.NodeID, val.Name, fmt.Sprint(ctridx), ctrdevval.UUID)
261+
"v2.8.0", val.Namespace, val.NodeID, val.Name, fmt.Sprint(ctridx), ctrdevval.UUID)
242262
}
243263
}
244264
}

0 commit comments

Comments
 (0)