Skip to content

Commit 7d725aa

Browse files
Enhance Grafana dashboard with new Prometheus datasource configurations and additional metrics for improved monitoring of OCI GPU Scanner health checks.
1 parent 7a9b682 commit 7d725aa

File tree

1 file changed

+71
-4
lines changed

1 file changed

+71
-4
lines changed

grafana_dashboards/oke-workloads-to-gpuscanner-nodename-dashboard.json

Lines changed: 71 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1964,6 +1964,22 @@
19641964
"value": "Node"
19651965
}
19661966
]
1967+
},
1968+
{
1969+
"matcher": {
1970+
"id": "byName",
1971+
"options": "Time Last Run"
1972+
},
1973+
"properties": [
1974+
{
1975+
"id": "unit",
1976+
"value": "dateTimeAsIso"
1977+
},
1978+
{
1979+
"id": "custom.align",
1980+
"value": "auto"
1981+
}
1982+
]
19671983
}
19681984
]
19691985
},
@@ -2157,6 +2173,21 @@
21572173
"legendFormat": "__auto",
21582174
"range": false,
21592175
"refId": "K"
2176+
},
2177+
{
2178+
"datasource": {
2179+
"type": "prometheus",
2180+
"uid": "{{DATASOURCE_PROMETHEUS}}"
2181+
},
2182+
"editorMode": "code",
2183+
"exemplar": false,
2184+
"expr": "max by (instance) (push_time_seconds{job=\"oci_lens_healthchecks\"}) * 1000",
2185+
"format": "table",
2186+
"instant": true,
2187+
"legendFormat": "__auto",
2188+
"range": false,
2189+
"refId": "L",
2190+
"hide": false
21602191
}
21612192
],
21622193
"title": "OCI GPU Scanner - ACTIVE health check summary",
@@ -2391,12 +2422,48 @@
23912422
"test 6": true,
23922423
"test 7": true,
23932424
"test 8": true,
2394-
"test 9": true
2425+
"test 9": true,
2426+
"Time 1": true,
2427+
"Value #L": false,
2428+
"dtype 11": true,
2429+
"dtype 12": true,
2430+
"dtype 13": true,
2431+
"dtype 14": true,
2432+
"dtype 15": true,
2433+
"dtype 16": true,
2434+
"dtype 17": true,
2435+
"dtype 18": true,
2436+
"dtype 19": true,
2437+
"dtype 20": true,
2438+
"dtype": true,
2439+
"tensor_parallel_size 11": true,
2440+
"tensor_parallel_size 12": true,
2441+
"tensor_parallel_size 13": true,
2442+
"tensor_parallel_size 14": true
23952443
},
23962444
"includeByName": {},
2397-
"indexByName": {},
2445+
"indexByName": {
2446+
"instance": 0,
2447+
"Value #L": 1,
2448+
"hostGPU 1": 2,
2449+
"hostIP 1": 3,
2450+
"hostRegion 1": 4,
2451+
"tensor_parallel_size 1": 5,
2452+
"Value #A": 6,
2453+
"Value #B": 7,
2454+
"Value #C": 8,
2455+
"Value #D": 9,
2456+
"Value #E": 10,
2457+
"Value #F": 11,
2458+
"Value #G": 12,
2459+
"Value #H": 13,
2460+
"Value #I": 14,
2461+
"Value #J": 15,
2462+
"Value #K": 16
2463+
},
23982464
"renameByName": {
2399-
"Time 1": "Time Last Run",
2465+
"Time 1": "",
2466+
"tensor_parallel_size 1": "Tensor Parallel Size",
24002467
"Value #A": "Compute Throughput",
24012468
"Value #B": "Memory Bandwidth",
24022469
"Value #C": "Error Detection",
@@ -2408,7 +2475,7 @@
24082475
"Value #I": "GPU Temperature Check",
24092476
"Value #J": "GPU Power Check",
24102477
"Value #K": "GPU Utilization Check",
2411-
"Value #L": "",
2478+
"Value #L": "Time Last Run",
24122479
"__name__ 1": "",
24132480
"gpu 1": "GPU",
24142481
"hostGPU 1": "GPU Shape",

0 commit comments

Comments
 (0)