forked from kubeflow/community-distribution
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgpu-availability-allocation-dashboard.yaml
More file actions
42 lines (42 loc) · 1.43 KB
/
Copy pathgpu-availability-allocation-dashboard.yaml
File metadata and controls
42 lines (42 loc) · 1.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# GrafanaDashboard custom resource defining the "GPU Availability & Allocation"
# dashboard. The dashboard model itself is the JSON document under `spec.json`.
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
  name: gpu-availability-allocation-dashboard
  namespace: kubeflow-monitoring-system
  labels:
    app.kubernetes.io/part-of: kubeflow-observability
spec:
  # Targets Grafana instances carrying the label `dashboards: grafana`.
  instanceSelector:
    matchLabels:
      dashboards: "grafana"
  # Notes on the embedded dashboard JSON (JSON itself cannot hold comments):
  #
  # * The panel's datasource uid is `${datasource}` so that it follows the
  #   `datasource` template variable declared under `templating` instead of
  #   being pinned to a datasource whose uid happens to be "prometheus".
  #
  # * "Pending GPU workloads" query:
  #   - `kube_pod_status_phase{phase="Pending"} == 1` keeps only pods that are
  #     actually Pending; kube-state-metrics exports a 0-valued series for
  #     every phase a pod is NOT in, and count() would otherwise count those.
  #   - `max by (namespace, pod) (...)` collapses the per-container request
  #     series to one series per pod, so pods with several GPU containers do
  #     not break the vector match with duplicate right-hand-side series.
  #   - `resource=~"nvidia[._]com[/_]gpu"` matches both the raw resource name
  #     (`nvidia.com/gpu`) and the sanitized label value (`nvidia_com_gpu`).
  #   - `or vector(0)` makes the stat show 0 rather than "No data" when no
  #     GPU pod is pending.
  json: |
    {
      "__requires": [
        { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "9.0.0" },
        { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }
      ],
      "templating": {
        "list": [
          {
            "name": "datasource",
            "type": "datasource",
            "query": "prometheus",
            "label": "Datasource",
            "hide": 0
          }
        ]
      },
      "title": "GPU Availability & Allocation",
      "description": "Requires kube-state-metrics for GPU allocation and availability metrics.",
      "panels": [
        {
          "title": "Pending GPU workloads",
          "description": "Requires kube-state-metrics being scraped by the Prometheus datasource.",
          "type": "stat",
          "datasource": { "type": "prometheus", "uid": "${datasource}" },
          "targets": [
            { "expr": "count((kube_pod_status_phase{phase=\"Pending\"} == 1) * on(namespace, pod) group_left() max by (namespace, pod) (kube_pod_container_resource_requests{resource=~\"nvidia[._]com[/_]gpu\"})) or vector(0)", "legendFormat": "Pending NVIDIA GPU Pods" }
          ]
        }
      ]
    }