Skip to content

Commit a5b648d

Browse files
committed
Add GCE recommended alerts for GPU VMs
1 parent e0b122f commit a5b648d

File tree

5 files changed

+146
-0
lines changed

5 files changed

+146
-0
lines changed
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
{
2+
"displayName": "VM Instance - High GPU Memory Utilization (${INSTANCE_NAME})",
3+
"documentation": {
4+
"content": "This alert fires when the GPU memory utilization on the VM instance ${INSTANCE_NAME} rises above 90% for 5 minutes or more.",
5+
"mimeType": "text/markdown"
6+
},
7+
"userLabels": {},
8+
"conditions": [
9+
{
10+
"displayName": "VM Instance - High GPU memory utilization (${INSTANCE_NAME})",
11+
"conditionMonitoringQueryLanguage": {
12+
"duration": "0s",
13+
"trigger": {
14+
"count": 1
15+
},
16+
"query": "{ fetch gce_instance\n | metric 'agent.googleapis.com/gpu/memory/bytes_used'\n | filter (metadata.system_labels.name == '${INSTANCE_NAME}')\n | filter metric.memory_state == 'used'\n | group_by 5m, [value_bytes_used_mean: mean(value.bytes_used)]\n | every 5m\n | group_by [metric.gpu_number, metric.model, metric.uuid, resource.instance_id, resource.project_id, resource.zone, metadata.system_labels.name], [value_bytes_used_mean_aggregate: aggregate(value_bytes_used_mean)]\n; fetch gce_instance\n | metric 'agent.googleapis.com/gpu/memory/bytes_used' \n | filter (metadata.system_labels.name == '${INSTANCE_NAME}')\n | group_by 5m, [value_bytes_used_mean: mean(value.bytes_used)]\n | every 5m\n | group_by [metric.gpu_number, metric.model, metric.uuid, resource.instance_id, resource.project_id, resource.zone, metadata.system_labels.name], [value_bytes_used_mean_aggregate: aggregate(value_bytes_used_mean)] }\n| ratio\n| mul (100)\n| cast_units ('%')\n| every 5m\n| condition val() > 0.9 '10^2.%'"
17+
}
18+
}
19+
],
20+
"alertStrategy": {
21+
"autoClose": "604800s"
22+
},
23+
"combiner": "OR",
24+
"enabled": true
25+
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
{
2+
"displayName": "VM Instance - High GPU Memory Utilization",
3+
"documentation": {
4+
"content": "This alert fires when the GPU memory utilization on any VM instance in the project rises above 90% for 5 minutes or more.",
5+
"mimeType": "text/markdown"
6+
},
7+
"userLabels": {},
8+
"conditions": [
9+
{
10+
"displayName": "VM Instance - High GPU memory utilization",
11+
"conditionMonitoringQueryLanguage": {
12+
"duration": "0s",
13+
"trigger": {
14+
"count": 1
15+
},
16+
"query": "{ fetch gce_instance\n | metric 'agent.googleapis.com/gpu/memory/bytes_used'\n | filter metric.memory_state == 'used'\n | group_by 5m, [value_bytes_used_mean: mean(value.bytes_used)]\n | every 5m\n | group_by [metric.gpu_number, metric.model, metric.uuid, resource.instance_id, resource.project_id, resource.zone, metadata.system_labels.name], [value_bytes_used_mean_aggregate: aggregate(value_bytes_used_mean)]\n; fetch gce_instance\n | metric 'agent.googleapis.com/gpu/memory/bytes_used' \n | group_by 5m, [value_bytes_used_mean: mean(value.bytes_used)]\n | every 5m\n | group_by [metric.gpu_number, metric.model, metric.uuid, resource.instance_id, resource.project_id, resource.zone, metadata.system_labels.name], [value_bytes_used_mean_aggregate: aggregate(value_bytes_used_mean)] }\n| ratio\n| mul (100)\n| cast_units ('%')\n| every 5m\n| condition val() > 0.9 '10^2.%'"
17+
}
18+
}
19+
],
20+
"alertStrategy": {
21+
"autoClose": "604800s"
22+
},
23+
"combiner": "OR",
24+
"enabled": true
25+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"displayName": "VM Instance - High GPU Utilization (${INSTANCE_NAME})",
3+
"documentation": {
4+
"content": "This alert fires when the GPU utilization on the VM instance (${INSTANCE_NAME}) rises above 90% for 5 minutes or more.",
5+
"mimeType": "text/markdown"
6+
},
7+
"userLabels": {},
8+
"conditions": [
9+
{
10+
"displayName": "VM Instance - High GPU utilization (${INSTANCE_NAME})",
11+
"conditionThreshold": {
12+
"filter": "resource.type = \"gce_instance\" AND metric.type = \"agent.googleapis.com/gpu/utilization\" AND metadata.system_labels.name = \"${INSTANCE_NAME}\"",
13+
"aggregations": [
14+
{
15+
"alignmentPeriod": "300s",
16+
"crossSeriesReducer": "REDUCE_NONE",
17+
"perSeriesAligner": "ALIGN_MEAN"
18+
}
19+
],
20+
"comparison": "COMPARISON_GT",
21+
"duration": "0s",
22+
"trigger": {
23+
"count": 1
24+
},
25+
"thresholdValue": 90
26+
}
27+
}
28+
],
29+
"alertStrategy": {
30+
"autoClose": "604800s"
31+
},
32+
"combiner": "OR",
33+
"enabled": true
34+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
{
2+
"displayName": "VM Instance - High GPU Utilization",
3+
"documentation": {
4+
"content": "This alert fires when the GPU utilization on any VM instance in the project rises above 90% for 5 minutes or more.",
5+
"mimeType": "text/markdown"
6+
},
7+
"userLabels": {},
8+
"conditions": [
9+
{
10+
"displayName": "VM Instance - High GPU utilization",
11+
"conditionThreshold": {
12+
"filter": "resource.type = \"gce_instance\" AND metric.type = \"agent.googleapis.com/gpu/utilization\"",
13+
"aggregations": [
14+
{
15+
"alignmentPeriod": "300s",
16+
"crossSeriesReducer": "REDUCE_NONE",
17+
"perSeriesAligner": "ALIGN_MEAN"
18+
}
19+
],
20+
"comparison": "COMPARISON_GT",
21+
"duration": "0s",
22+
"trigger": {
23+
"count": 1
24+
},
25+
"thresholdValue": 90
26+
}
27+
}
28+
],
29+
"alertStrategy": {
30+
"autoClose": "604800s"
31+
},
32+
"combiner": "OR",
33+
"enabled": true
34+
}

alerts/google-gce/metadata.yaml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,3 +69,31 @@ alert_policy_templates:
6969
related_integrations:
7070
- id: gce
7171
platform: GCP
72+
-
73+
id: gpu-utilization-too-high
74+
description: "Monitors GPU utilization across all GCE VMs in the current project and will notify you if the GPU utilization on any VM instance rises above 90% for 5 minutes or more. This requires the Ops Agent to be installed on VMs to collect the GPU utilization metric."
75+
version: 1
76+
related_integrations:
77+
- id: gce
78+
platform: GCP
79+
-
80+
id: gpu-utilization-too-high-within-vm
81+
description: "Monitors GPU utilization in the specified GCE VM and will notify you if the GPU utilization rises above 90% for 5 minutes or more. This requires the Ops Agent to be installed on the VM to collect the GPU utilization metric."
82+
version: 1
83+
related_integrations:
84+
- id: gce
85+
platform: GCP
86+
-
87+
id: gpu-memory-utilization-too-high
88+
description: "Monitors GPU memory utilization across all GCE VMs in the current project and will notify you if the GPU memory utilization on any VM instance rises above 90% for 5 minutes or more. This requires the Ops Agent to be installed on VMs to collect the GPU memory utilization metric."
89+
version: 1
90+
related_integrations:
91+
- id: gce
92+
platform: GCP
93+
-
94+
id: gpu-memory-utilization-too-high-within-vm
95+
description: "Monitors GPU memory utilization in the specified GCE VM and will notify you if the GPU memory utilization rises above 90% for 5 minutes or more. This requires the Ops Agent to be installed on the VM to collect the GPU memory utilization metric."
96+
version: 1
97+
related_integrations:
98+
- id: gce
99+
platform: GCP

0 commit comments

Comments
 (0)