Skip to content

Commit f37ff27

Browse files
authored
fix(alerts): KubeletDown alert missing cluster label (#1179)
1 parent b3d555f commit f37ff27

File tree

2 files changed

+95
-6
lines changed

2 files changed

+95
-6
lines changed

alerts/kubelet.libsonnet

Lines changed: 20 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -280,9 +280,26 @@ local utils = import '../lib/utils.libsonnet';
280280
summary: 'Kubelet has failed to renew its server certificate.',
281281
},
282282
},
283-
(import '../lib/absent_alert.libsonnet') {
284-
componentName:: 'Kubelet',
285-
selector:: $._config.kubeletSelector,
283+
{
284+
// Use kube-state-metrics as an anchor to detect kubelet down while
285+
// preserving the cluster label. This fires when kube-state-metrics
286+
// reports nodes exist but no kubelet target has up == 1 for that cluster.
287+
alert: 'KubeletDown',
288+
expr: |||
289+
count by (%(clusterLabel)s) (kube_node_info{%(kubeStateMetricsSelector)s})
290+
unless on (%(clusterLabel)s)
291+
count by (%(clusterLabel)s) (up{%(kubeletSelector)s} == 1)
292+
||| % $._config,
293+
'for': '15m',
294+
labels: {
295+
severity: 'critical',
296+
},
297+
annotations: {
298+
description: 'Kubelet has disappeared from Prometheus target discovery%s.' % [
299+
utils.ifShowMultiCluster($._config, ' on cluster {{ $labels.%(clusterLabel)s }}' % $._config),
300+
],
301+
summary: 'Target disappeared from Prometheus target discovery.',
302+
},
286303
},
287304
],
288305
},

tests/absent_alert-test.yaml

Lines changed: 75 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -22,10 +22,59 @@ tests:
2222
runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown"
2323

2424
- interval: 1m
25-
name: KubeletDown fires when kubelet target is absent
25+
name: KubeletDown fires when kubelet target is absent but nodes exist
2626
input_series:
27+
# kube_node_info from kube-state-metrics - stays present to indicate nodes exist
28+
- series: 'kube_node_info{job="kube-state-metrics", cluster="test-cluster", node="node1"}'
29+
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
30+
# kubelet up metric - goes absent after 5 minutes
31+
- series: 'up{job="kubelet", cluster="test-cluster", instance="node1"}'
32+
values: '1 1 1 1 1 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _'
33+
alert_rule_test:
34+
- eval_time: 10m
35+
alertname: KubeletDown
36+
- eval_time: 25m
37+
alertname: KubeletDown
38+
exp_alerts:
39+
- exp_labels:
40+
severity: "critical"
41+
cluster: "test-cluster"
42+
exp_annotations:
43+
description: "Kubelet has disappeared from Prometheus target discovery."
44+
summary: "Target disappeared from Prometheus target discovery."
45+
runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown"
46+
47+
- interval: 1m
48+
name: KubeletDown does not fire when kubelet is up
49+
input_series:
50+
- series: 'kube_node_info{job="kube-state-metrics", cluster="test-cluster", node="node1"}'
51+
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
52+
- series: 'up{job="kubelet", cluster="test-cluster", instance="node1"}'
53+
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
54+
alert_rule_test:
55+
- eval_time: 25m
56+
alertname: KubeletDown
57+
exp_alerts: []
58+
59+
- interval: 1m
60+
name: KubeletDown does not fire when no nodes exist (no kube_node_info)
61+
input_series:
62+
# No kube_node_info series means the alert has no anchor for this cluster, so it must not fire even when kubelet goes absent
63+
- series: 'up{job="kubelet", cluster="test-cluster", instance="node1"}'
64+
values: '1 1 1 1 1 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _'
65+
alert_rule_test:
66+
- eval_time: 25m
67+
alertname: KubeletDown
68+
exp_alerts: []
69+
70+
- interval: 1m
71+
name: KubeletDown fires without cluster label (single-cluster setup)
72+
input_series:
73+
# Metrics without cluster label - common in single-cluster setups without external_labels
74+
- series: 'kube_node_info{job="kube-state-metrics", node="node1"}'
75+
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
2776
- series: 'up{job="kubelet", instance="node1"}'
28-
values: '1 1 1 1 1 0 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _'
77+
values: '1 1 1 1 1 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _'
2978
alert_rule_test:
3079
- eval_time: 10m
3180
alertname: KubeletDown
@@ -34,12 +83,35 @@ tests:
3483
exp_alerts:
3584
- exp_labels:
3685
severity: "critical"
37-
job: "kubelet"
3886
exp_annotations:
3987
description: "Kubelet has disappeared from Prometheus target discovery."
4088
summary: "Target disappeared from Prometheus target discovery."
4189
runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown"
4290

91+
- interval: 1m
92+
name: KubeletDown does not fire when kubelet is up (single-cluster setup)
93+
input_series:
94+
# Metrics without cluster label - healthy single-cluster setup
95+
- series: 'kube_node_info{job="kube-state-metrics", node="node1"}'
96+
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
97+
- series: 'up{job="kubelet", instance="node1"}'
98+
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
99+
alert_rule_test:
100+
- eval_time: 25m
101+
alertname: KubeletDown
102+
exp_alerts: []
103+
104+
- interval: 1m
105+
name: KubeletDown does not fire when no nodes exist (single-cluster setup)
106+
input_series:
107+
# No kube_node_info and no cluster label - without the kube_node_info anchor series the alert must not fire even when kubelet goes absent
108+
- series: 'up{job="kubelet", instance="node1"}'
109+
values: '1 1 1 1 1 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _'
110+
alert_rule_test:
111+
- eval_time: 25m
112+
alertname: KubeletDown
113+
exp_alerts: []
114+
43115
- interval: 1m
44116
name: KubeSchedulerDown fires when kube-scheduler target is absent
45117
input_series:

0 commit comments

Comments
 (0)