Skip to content

Commit 3d1e1cc

Browse files
authored
add Grafana dashboard for GPU occupancy (#116)
Signed-off-by: Dmitry Shmulevich <dshmulevich@nvidia.com>
1 parent d2b7826 commit 3d1e1cc

File tree

1 file changed

+254
-0
lines changed

1 file changed

+254
-0
lines changed
Lines changed: 254 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,254 @@
{
  "__inputs": [
    {
      "name": "DS_PROMETHEUS",
      "label": "Prometheus",
      "description": "",
      "type": "datasource",
      "pluginId": "prometheus",
      "pluginName": "Prometheus"
    }
  ],
  "__elements": {},
  "__requires": [
    {
      "type": "panel",
      "id": "gauge",
      "name": "Gauge",
      "version": ""
    },
    {
      "type": "grafana",
      "id": "grafana",
      "name": "Grafana",
      "version": "11.0.0"
    },
    {
      "type": "datasource",
      "id": "prometheus",
      "name": "Prometheus",
      "version": "1.0.0"
    },
    {
      "type": "panel",
      "id": "timeseries",
      "name": "Time series",
      "version": ""
    }
  ],
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": {
          "type": "grafana",
          "uid": "-- Grafana --"
        },
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 0,
  "id": null,
  "links": [],
  "panels": [
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "description": "Mean across all matching series of node_resource_occupancy for the nvidia.com/gpu resource, with each series first averaged over the selected dashboard time range.",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "thresholds"
          },
          "mappings": [],
          "max": 100,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "red",
                "value": null
              },
              {
                "color": "orange",
                "value": 80
              },
              {
                "color": "yellow",
                "value": 90
              },
              {
                "color": "green",
                "value": 95
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 9,
        "w": 4,
        "x": 0,
        "y": 0
      },
      "id": 2,
      "options": {
        "minVizHeight": 75,
        "minVizWidth": 75,
        "orientation": "auto",
        "reduceOptions": {
          "calcs": [
            "lastNotNull"
          ],
          "fields": "",
          "values": false
        },
        "showThresholdLabels": false,
        "showThresholdMarkers": true,
        "sizing": "auto"
      },
      "pluginVersion": "11.0.0",
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "expr": "avg(avg_over_time(node_resource_occupancy{resource=\"nvidia.com/gpu\"}[$__range]))",
          "instant": true,
          "legendFormat": "__auto",
          "range": false,
          "refId": "A"
        }
      ],
      "title": "Avg. cluster GPU occupancy (%)",
      "type": "gauge"
    },
    {
      "datasource": {
        "type": "prometheus",
        "uid": "${DS_PROMETHEUS}"
      },
      "description": "Mean across all matching series of node_resource_occupancy for the nvidia.com/gpu resource at each point in time.",
      "fieldConfig": {
        "defaults": {
          "color": {
            "mode": "palette-classic"
          },
          "custom": {
            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
            "axisPlacement": "auto",
            "barAlignment": 0,
            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
            "hideFrom": {
              "legend": false,
              "tooltip": false,
              "viz": false
            },
            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
            "scaleDistribution": {
              "type": "linear"
            },
            "showPoints": "auto",
            "spanNulls": false,
            "stacking": {
              "group": "A",
              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
          "mappings": [],
          "max": 110,
          "min": 0,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green",
                "value": null
              },
              {
                "color": "red",
                "value": 80
              }
            ]
          }
        },
        "overrides": []
      },
      "gridPos": {
        "h": 9,
        "w": 14,
        "x": 4,
        "y": 0
      },
      "id": 1,
      "options": {
        "legend": {
          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
          "showLegend": true
        },
        "tooltip": {
          "maxHeight": 600,
          "mode": "single",
          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "expr": "avg(node_resource_occupancy{resource=\"nvidia.com/gpu\"})",
          "instant": false,
          "interval": "10s",
          "legendFormat": "__auto",
          "range": true,
          "refId": "A"
        }
      ],
      "title": "Avg. node GPU occupancy (%)",
      "type": "timeseries"
    }
  ],
  "refresh": "",
  "schemaVersion": 39,
  "tags": [],
  "templating": {
    "list": []
  },
  "time": {
    "from": "now-15m",
    "to": "now"
  },
  "timeRangeUpdatedDuringEditOrView": false,
  "timepicker": {},
  "timezone": "browser",
  "title": "GPU occupancy",
  "uid": "ddt1v72mdqxa8a",
  "version": 18,
  "weekStart": ""
}

0 commit comments

Comments
 (0)