-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprometheus_alerts.yaml
88 lines (88 loc) · 3.7 KB
/
prometheus_alerts.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"groups":
# Alerting rules for Karpenter. Every selector matches job=~"karpenter".
- name: karpenter
  rules:
  # Fires when the cloud-provider error counter increased within the last
  # 5 minutes for any (namespace, job, provider, controller, method)
  # combination, and keeps doing so for 5 minutes.
  - alert: KarpenterCloudProviderErrors
    expr: |
      sum(
        increase(
          karpenter_cloudprovider_errors_total{
            job=~"karpenter"
          }[5m]
        )
      ) by (namespace, job, provider, controller, method) > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Karpenter has Cloud Provider Errors.'
      description: >-
        The Karpenter provider {{ $labels.provider }} with the controller
        {{ $labels.controller }} has errors with the method
        {{ $labels.method }}.
      dashboard_url: 'https://grafana.com/d/kubernetes-autoscaling-mixin-kperf-jkwq/kubernetes-autoscaling-karpenter-performance'

  # Fires when the mean node claim termination duration per nodepool stays
  # above 1200 seconds (20 minutes) for 15 minutes.
  #
  # The mean is computed as rate(_sum) / rate(_count) over a sliding window
  # rather than by dividing the raw series: assuming the usual
  # histogram/summary convention, _sum and _count only ever grow, so their
  # raw ratio is the average since the process started and barely reacts to
  # recent slowdowns (and jumps on counter resets).
  #
  # The 30m window is deliberately longer than the `for` period so that
  # infrequent, slow terminations remain inside the window long enough for
  # the alert to leave the pending state. When no termination completed in
  # the window the division yields NaN, which the comparison filters out.
  - alert: KarpenterNodeClaimsTerminationDurationHigh
    expr: |
      sum(
        rate(
          karpenter_nodeclaims_termination_duration_seconds_sum{
            job=~"karpenter"
          }[30m]
        )
      ) by (namespace, job, nodepool)
      /
      sum(
        rate(
          karpenter_nodeclaims_termination_duration_seconds_count{
            job=~"karpenter"
          }[30m]
        )
      ) by (namespace, job, nodepool) > 1200
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: 'Karpenter Node Claims Termination Duration is High.'
      description: >-
        The average node claim termination duration in Karpenter has exceeded
        20 minutes for more than 15 minutes in nodepool
        {{ $labels.nodepool }}. This may indicate cloud provider issues or
        improper instance termination handling.
      dashboard_url: 'https://grafana.com/d/kubernetes-autoscaling-mixin-kact-jkwq/kubernetes-autoscaling-karpenter-activity'

  # Fires when usage / limit for a (nodepool, resource_type) pair stays above
  # 75 percent for 15 minutes. The ratio is multiplied by 100 before the
  # comparison, so the threshold is expressed in percent.
  - alert: KarpenterNodepoolNearCapacity
    expr: |
      sum (
        karpenter_nodepools_usage{job=~"karpenter"}
      ) by (namespace, job, nodepool, resource_type)
      /
      sum (
        karpenter_nodepools_limit{job=~"karpenter"}
      ) by (namespace, job, nodepool, resource_type)
      * 100 > 75
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: 'Karpenter Nodepool near capacity.'
      description: >-
        The resource {{ $labels.resource_type }} in the Karpenter node pool
        {{ $labels.nodepool }} is nearing its limit. Consider scaling or
        adding resources.
      dashboard_url: 'https://grafana.com/d/kubernetes-autoscaling-mixin-kover-jkwq/kubernetes-autoscaling-karpenter-overview'
# Alerting rules for the Kubernetes Cluster Autoscaler. Every selector
# matches job=~"cluster-autoscaler".
- name: cluster-autoscaler
  rules:
  # Fires when current node count / maximum node count, per (namespace, job),
  # stays above 75 percent for 15 minutes. The ratio is multiplied by 100
  # before the comparison, so the threshold is expressed in percent.
  - alert: ClusterAutoscalerNodeCountNearCapacity
    expr: |
      sum (
        cluster_autoscaler_nodes_count{job=~"cluster-autoscaler"}
      ) by (namespace, job)
      /
      sum (
        cluster_autoscaler_max_nodes_count{job=~"cluster-autoscaler"}
      ) by (namespace, job)
      * 100 > 75
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: 'Cluster Autoscaler Node Count near Capacity.'
      description: >-
        The node count for the cluster autoscaler job {{ $labels.job }} is
        reaching max limit. Consider scaling node groups.
      dashboard_url: 'https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler'

  # Fires when the summed unschedulable pod count, per (namespace, job),
  # stays above zero for 15 minutes.
  - alert: ClusterAutoscalerUnschedulablePods
    expr: |
      sum (
        cluster_autoscaler_unschedulable_pods_count{job=~"cluster-autoscaler"}
      ) by (namespace, job)
      > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: 'Pods Pending Scheduling - Cluster Node Group Scaling Required'
      description: >-
        The cluster currently has unschedulable pods, indicating resource
        shortages. Consider adding more nodes or increasing node group
        capacity.
      dashboard_url: 'https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler'