Add GCE recommended alerts for GPU VMs

LujieDuan · LujieDuan · commit a5b648d8aa24 · 2024-06-26T18:22:21.000Z
diff --git a/alerts/google-gce/gpu-memory-utilization-too-high-within-vm.v1.json b/alerts/google-gce/gpu-memory-utilization-too-high-within-vm.v1.json
@@ -0,0 +1,25 @@
+{
+  "displayName": "VM Instance - High GPU Memory Utilization (${INSTANCE_NAME})",
+  "documentation": {
+    "content": "This alert fires when the GPU memory utilization on the VM instance ${INSTANCE_NAME} rises above 90% for 5 minutes or more.",
+    "mimeType": "text/markdown"
+  },
+  "userLabels": {},
+  "conditions": [
+    {
+      "displayName": "VM Instance - High GPU memory utilization (${INSTANCE_NAME})",
+      "conditionMonitoringQueryLanguage": {
+        "duration": "0s",
+        "trigger": {
+          "count": 1
+        },
+        "query": "{ fetch gce_instance\n  | metric 'agent.googleapis.com/gpu/memory/bytes_used'\n  | filter (metadata.system_labels.name == '${INSTANCE_NAME}')\n  | filter metric.memory_state == 'used'\n  | group_by 5m, [value_bytes_used_mean: mean(value.bytes_used)]\n  | every 5m\n  | group_by [metric.gpu_number, metric.model, metric.uuid, resource.instance_id, resource.project_id, resource.zone, metadata.system_labels.name], [value_bytes_used_mean_aggregate: aggregate(value_bytes_used_mean)]\n; fetch gce_instance\n  | metric 'agent.googleapis.com/gpu/memory/bytes_used' \n  | filter (metadata.system_labels.name == '${INSTANCE_NAME}')\n  | group_by 5m, [value_bytes_used_mean: mean(value.bytes_used)]\n  | every 5m\n  | group_by [metric.gpu_number, metric.model, metric.uuid, resource.instance_id, resource.project_id, resource.zone, metadata.system_labels.name], [value_bytes_used_mean_aggregate: aggregate(value_bytes_used_mean)] }\n| ratio\n| mul (100)\n| cast_units ('%')\n| every 5m\n| condition val() > 0.9 '10^2.%'"
+      }
+    }
+  ],
+  "alertStrategy": {
+    "autoClose": "604800s"
+  },
+  "combiner": "OR",
+  "enabled": true
+}
diff --git a/alerts/google-gce/gpu-memory-utilization-too-high.v1.json b/alerts/google-gce/gpu-memory-utilization-too-high.v1.json
@@ -0,0 +1,25 @@
+{
+  "displayName": "VM Instance - High GPU Memory Utilization",
+  "documentation": {
+    "content": "This alert fires when the GPU memory utilization on any VM instance in the project rises above 90% for 5 minutes or more.",
+    "mimeType": "text/markdown"
+  },
+  "userLabels": {},
+  "conditions": [
+    {
+      "displayName": "VM Instance - High GPU memory utilization",
+      "conditionMonitoringQueryLanguage": {
+        "duration": "0s",
+        "trigger": {
+          "count": 1
+        },
+        "query": "{ fetch gce_instance\n  | metric 'agent.googleapis.com/gpu/memory/bytes_used'\n  | filter metric.memory_state == 'used'\n  | group_by 5m, [value_bytes_used_mean: mean(value.bytes_used)]\n  | every 5m\n  | group_by [metric.gpu_number, metric.model, metric.uuid, resource.instance_id, resource.project_id, resource.zone, metadata.system_labels.name], [value_bytes_used_mean_aggregate: aggregate(value_bytes_used_mean)]\n; fetch gce_instance\n  | metric 'agent.googleapis.com/gpu/memory/bytes_used' \n  | group_by 5m, [value_bytes_used_mean: mean(value.bytes_used)]\n  | every 5m\n  | group_by [metric.gpu_number, metric.model, metric.uuid, resource.instance_id, resource.project_id, resource.zone, metadata.system_labels.name], [value_bytes_used_mean_aggregate: aggregate(value_bytes_used_mean)] }\n| ratio\n| mul (100)\n| cast_units ('%')\n| every 5m\n| condition val() > 0.9 '10^2.%'"
+      }
+    }
+  ],
+  "alertStrategy": {
+    "autoClose": "604800s"
+  },
+  "combiner": "OR",
+  "enabled": true
+}
diff --git a/alerts/google-gce/gpu-utilization-too-high-within-vm.v1.json b/alerts/google-gce/gpu-utilization-too-high-within-vm.v1.json
@@ -0,0 +1,34 @@
+{
+  "displayName": "VM Instance - High GPU Utilization (${INSTANCE_NAME})",
+  "documentation": {
+    "content": "This alert fires when the GPU utilization on the VM instance (${INSTANCE_NAME}) rises above 90% for 5 minutes or more.",
+    "mimeType": "text/markdown"
+  },
+  "userLabels": {},
+  "conditions": [
+    {
+      "displayName": "VM Instance - High GPU utilization (${INSTANCE_NAME})",
+      "conditionThreshold": {
+        "filter": "resource.type = \"gce_instance\" AND metric.type = \"agent.googleapis.com/gpu/utilization\" AND metadata.system_labels.name = \"${INSTANCE_NAME}\"",
+        "aggregations": [
+          {
+            "alignmentPeriod": "300s",
+            "crossSeriesReducer": "REDUCE_NONE",
+            "perSeriesAligner": "ALIGN_MEAN"
+          }
+        ],
+        "comparison": "COMPARISON_GT",
+        "duration": "0s",
+        "trigger": {
+          "count": 1
+        },
+        "thresholdValue": 90
+      }
+    }
+  ],
+  "alertStrategy": {
+    "autoClose": "604800s"
+  },
+  "combiner": "OR",
+  "enabled": true
+}
diff --git a/alerts/google-gce/gpu-utilization-too-high.v1.json b/alerts/google-gce/gpu-utilization-too-high.v1.json
@@ -0,0 +1,34 @@
+{
+  "displayName": "VM Instance - High GPU Utilization",
+  "documentation": {
+    "content": "This alert fires when the GPU utilization on any VM instance in the project rises above 90% for 5 minutes or more.",
+    "mimeType": "text/markdown"
+  },
+  "userLabels": {},
+  "conditions": [
+    {
+      "displayName": "VM Instance - High GPU utilization",
+      "conditionThreshold": {
+        "filter": "resource.type = \"gce_instance\" AND metric.type = \"agent.googleapis.com/gpu/utilization\"",
+        "aggregations": [
+          {
+            "alignmentPeriod": "300s",
+            "crossSeriesReducer": "REDUCE_NONE",
+            "perSeriesAligner": "ALIGN_MEAN"
+          }
+        ],
+        "comparison": "COMPARISON_GT",
+        "duration": "0s",
+        "trigger": {
+          "count": 1
+        },
+        "thresholdValue": 90
+      }
+    }
+  ],
+  "alertStrategy": {
+    "autoClose": "604800s"
+  },
+  "combiner": "OR",
+  "enabled": true
+}
diff --git a/alerts/google-gce/metadata.yaml b/alerts/google-gce/metadata.yaml
@@ -69,3 +69,31 @@ alert_policy_templates:
   related_integrations:
     - id: gce
       platform: GCP
+-
+  id: gpu-utilization-too-high
+  description: "Monitors GPU utilization across all GCE VMs in the current project and will notify you if the GPU utilization on any VM instance rises above 90% for 5 minutes or more. This requires the Ops Agent to be installed on VMs to collect the GPU utilization metric."
+  version: 1
+  related_integrations:
+    - id: gce
+      platform: GCP
+-
+  id: gpu-utilization-too-high-within-vm
+  description: "Monitors GPU utilization in the specified GCE VM and will notify you if the GPU utilization rises above 90% for 5 minutes or more. This requires the Ops Agent to be installed on the VM to collect the GPU utilization metric."
+  version: 1
+  related_integrations:
+    - id: gce
+      platform: GCP
+-
+  id: gpu-memory-utilization-too-high
+  description: "Monitors GPU memory utilization across all GCE VMs in the current project and will notify you if the GPU memory utilization on any VM instance rises above 90% for 5 minutes or more. This requires the Ops Agent to be installed on VMs to collect the GPU memory utilization metric."
+  version: 1
+  related_integrations:
+    - id: gce
+      platform: GCP
+-
+  id: gpu-memory-utilization-too-high-within-vm
+  description: "Monitors GPU memory utilization in the specified GCE VM and will notify you if the GPU memory utilization rises above 90% for 5 minutes or more. This requires the Ops Agent to be installed on the VM to collect the GPU memory utilization metric."
+  version: 1
+  related_integrations:
+    - id: gce
+      platform: GCP