Skip to content

Commit 9262d8f

Browse files
committed
Add NodeNetworkInterfaceDown alert
Signed-off-by: João Vilaça <[email protected]>
1 parent 472038a commit 9262d8f

File tree

2 files changed: 62 additions (+62) and 0 deletions (-0).

hack/prom-rule-ci/observability-prom-rules-tests.yaml

Lines changed: 49 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -70,3 +70,52 @@ tests:
7070
operator_health_impact: "none"
7171
kubernetes_operator_part_of: "kubevirt"
7272
kubernetes_operator_component: "cnv-observability"
73+
74+
- interval: 1m
75+
input_series:
76+
- series: 'node_network_up{instance="n1.cnv.redhat.com", device="eno0"}'
77+
values: "1+0x10 0+0x10"
78+
- series: 'node_network_up{instance="n2.cnv.redhat.com", device="eno0"}'
79+
values: "1+0x10 0+0x10"
80+
- series: 'node_network_up{instance="n2.cnv.redhat.com", device="eno1"}'
81+
values: "0+0x20"
82+
83+
alert_rule_test:
84+
- eval_time: 8m
85+
alertname: NodeNetworkInterfaceDown
86+
exp_alerts:
87+
- exp_annotations:
88+
summary: "Network interfaces are down"
89+
description: "1 network devices have been down on instance n2.cnv.redhat.com for more than 5 minutes."
90+
runbook_url: "https://kubevirt.io/monitoring/runbooks/NodeNetworkInterfaceDown"
91+
exp_labels:
92+
instance: "n2.cnv.redhat.com"
93+
severity: "warning"
94+
operator_health_impact: "none"
95+
kubernetes_operator_part_of: "kubevirt"
96+
kubernetes_operator_component: "cnv-observability"
97+
98+
- eval_time: 18m
99+
alertname: NodeNetworkInterfaceDown
100+
exp_alerts:
101+
- exp_annotations:
102+
summary: "Network interfaces are down"
103+
description: "2 network devices have been down on instance n2.cnv.redhat.com for more than 5 minutes."
104+
runbook_url: "https://kubevirt.io/monitoring/runbooks/NodeNetworkInterfaceDown"
105+
exp_labels:
106+
instance: "n2.cnv.redhat.com"
107+
severity: "warning"
108+
operator_health_impact: "none"
109+
kubernetes_operator_part_of: "kubevirt"
110+
kubernetes_operator_component: "cnv-observability"
111+
112+
- exp_annotations:
113+
summary: "Network interfaces are down"
114+
description: "1 network devices have been down on instance n1.cnv.redhat.com for more than 5 minutes."
115+
runbook_url: "https://kubevirt.io/monitoring/runbooks/NodeNetworkInterfaceDown"
116+
exp_labels:
117+
instance: "n1.cnv.redhat.com"
118+
severity: "warning"
119+
operator_health_impact: "none"
120+
kubernetes_operator_part_of: "kubevirt"
121+
kubernetes_operator_component: "cnv-observability"

pkg/monitoring/observability/rules/alerts/cluster_alerts.go

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,5 +34,18 @@ func clusterAlerts() []promv1.Rule {
3434
"operator_health_impact": "none",
3535
},
3636
},
37+
{
38+
Alert: "NodeNetworkInterfaceDown",
39+
Expr: intstr.FromString("count by (instance) (node_network_up{device!~\"veth.+|tunbr\"} == 0) > 0"),
40+
For: ptr.To(promv1.Duration("5m")),
41+
Annotations: map[string]string{
42+
"summary": "Network interfaces are down",
43+
"description": "{{ $value }} network devices have been down on instance {{ $labels.instance }} for more than 5 minutes.",
44+
},
45+
Labels: map[string]string{
46+
"severity": "warning",
47+
"operator_health_impact": "none",
48+
},
49+
},
3750
}
3851
}

0 commit comments

Comments
 (0)