Skip to content

Commit 472038a

Browse files
committed
Add HAControlPlaneDown alert
Signed-off-by: João Vilaça <[email protected]>
1 parent 903ac1a commit 472038a

File tree

2 files changed

+48
-0
lines changed

2 files changed

+48
-0
lines changed

hack/prom-rule-ci/observability-prom-rules-tests.yaml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,38 @@ tests:
3535
operator_health_impact: "none"
3636
kubernetes_operator_part_of: "kubevirt"
3737
kubernetes_operator_component: "cnv-observability"
38+
39+
40+
- interval: 1m
41+
input_series:
42+
# Control plane node - n1.cnv.redhat.com
43+
- series: 'kube_node_role{node="n1.cnv.redhat.com",role="control-plane"}'
44+
values: "1+0x30"
45+
# Non-control-plane (worker) node - n2.cnv.redhat.com
46+
- series: 'kube_node_role{node="n2.cnv.redhat.com",role="worker"}'
47+
values: "0+0x30"
48+
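# n1.cnv.redhat.com is ready for the first 10 minutes and then becomes not ready, so the 5m "for" window has not started at the 8m evaluation and has already elapsed by the 18m evaluation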
49+
- series: 'kube_node_status_condition{condition="Ready",status="true",node="n1.cnv.redhat.com"}'
50+
values: "1+0x10 0+0x10"
51+
# n2.cnv.redhat.com is never ready, but it does not have the control-plane role, so it must not trigger the alert
52+
- series: 'kube_node_status_condition{condition="Ready",status="true",node="n2.cnv.redhat.com"}'
53+
values: "0+0x20"
54+
55+
alert_rule_test:
56+
- eval_time: 8m
57+
alertname: HAControlPlaneDown
58+
exp_alerts: [ ]
59+
60+
- eval_time: 18m
61+
alertname: HAControlPlaneDown
62+
exp_alerts:
63+
- exp_annotations:
64+
summary: "Control plane node n1.cnv.redhat.com is not ready"
65+
description: "Control plane node n1.cnv.redhat.com has been not ready for more than 5 minutes."
66+
runbook_url: "https://kubevirt.io/monitoring/runbooks/HAControlPlaneDown"
67+
exp_labels:
68+
node: "n1.cnv.redhat.com"
69+
severity: "critical"
70+
operator_health_impact: "none"
71+
kubernetes_operator_part_of: "kubevirt"
72+
kubernetes_operator_component: "cnv-observability"

pkg/monitoring/observability/rules/alerts/cluster_alerts.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,5 +21,18 @@ func clusterAlerts() []promv1.Rule {
2121
"operator_health_impact": "none",
2222
},
2323
},
24+
{
25+
Alert: "HAControlPlaneDown",
26+
Expr: intstr.FromString("kube_node_role{role=\"control-plane\"} * on(node) kube_node_status_condition{condition=\"Ready\",status=\"true\"} == 0"),
27+
For: ptr.To(promv1.Duration("5m")),
28+
Annotations: map[string]string{
29+
"summary": "Control plane node {{ $labels.node }} is not ready",
30+
"description": "Control plane node {{ $labels.node }} has been not ready for more than 5 minutes.",
31+
},
32+
Labels: map[string]string{
33+
"severity": "critical",
34+
"operator_health_impact": "none",
35+
},
36+
},
2437
}
2538
}

0 commit comments

Comments
 (0)