Skip to content

Commit b41eb2a

Browse files
authored
Merge branch 'master' into fix/scalar-quotas
2 parents d4c778d + cb72d73 commit b41eb2a

8 files changed

+75
-19
lines changed

Diff for: alerts/apps_alerts.libsonnet

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
local utils = import '../lib/utils.libsonnet';
2+
13
{
24
_config+:: {
35
kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics',
@@ -10,7 +12,8 @@
1012
groups+: [
1113
{
1214
name: 'kubernetes-apps',
13-
rules: [
15+
rules: [utils.wrap_rule_for_labels(rule, $._config) for rule in self.rules_],
16+
rules_:: [
1417
{
1518
expr: |||
1619
max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", %(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}[5m]) >= 1

Diff for: alerts/kube_apiserver.libsonnet

+9-8
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@ local utils = import '../lib/utils.libsonnet';
44
_config+:: {
55
kubeApiserverSelector: error 'must provide selector for kube-apiserver',
66

7-
kubeAPILatencyWarningSeconds: 1,
8-
97
certExpirationWarningSeconds: 7 * 24 * 3600,
108
certExpirationCriticalSeconds: 1 * 24 * 3600,
119
},
@@ -18,13 +16,16 @@ local utils = import '../lib/utils.libsonnet';
1816
{
1917
alert: 'KubeAPIErrorBudgetBurn',
2018
expr: |||
21-
sum(apiserver_request:burnrate%s) > (%.2f * %.5f)
22-
and
23-
sum(apiserver_request:burnrate%s) > (%.2f * %.5f)
19+
sum by(%s) (apiserver_request:burnrate%s) > (%.2f * %.5f)
20+
and on(%s)
21+
sum by(%s) (apiserver_request:burnrate%s) > (%.2f * %.5f)
2422
||| % [
23+
$._config.clusterLabel,
2524
w.long,
2625
w.factor,
2726
(1 - $._config.SLOs.apiserver.target),
27+
$._config.clusterLabel,
28+
$._config.clusterLabel,
2829
w.short,
2930
w.factor,
3031
(1 - $._config.SLOs.apiserver.target),
@@ -49,7 +50,7 @@ local utils = import '../lib/utils.libsonnet';
4950
{
5051
alert: 'KubeClientCertificateExpiration',
5152
expr: |||
52-
apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationWarningSeconds)s
53+
apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0 and on(%(clusterLabel)s, job) histogram_quantile(0.01, sum by (%(clusterLabel)s, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationWarningSeconds)s
5354
||| % $._config,
5455
'for': '5m',
5556
labels: {
@@ -63,7 +64,7 @@ local utils = import '../lib/utils.libsonnet';
6364
{
6465
alert: 'KubeClientCertificateExpiration',
6566
expr: |||
66-
apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationCriticalSeconds)s
67+
apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0 and on(%(clusterLabel)s, job) histogram_quantile(0.01, sum by (%(clusterLabel)s, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationCriticalSeconds)s
6768
||| % $._config,
6869
'for': '5m',
6970
labels: {
@@ -108,7 +109,7 @@ local utils = import '../lib/utils.libsonnet';
108109
{
109110
alert: 'KubeAPITerminatedRequests',
110111
expr: |||
111-
sum(rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) / ( sum(rate(apiserver_request_total{%(kubeApiserverSelector)s}[10m])) + sum(rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) ) > 0.20
112+
sum by(%(clusterLabel)s) (rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) / ( sum by(%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s}[10m])) + sum by(%(clusterLabel)s) (rate(apiserver_request_terminations_total{%(kubeApiserverSelector)s}[10m])) ) > 0.20
112113
||| % $._config,
113114
labels: {
114115
severity: 'warning',

Diff for: config.libsonnet

+10
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,16 @@
3232
windowsExporterSelector: 'job="kubernetes-windows-exporter"',
3333
containerfsSelector: 'container!=""',
3434

35+
// List of labels to join for different types of metrics
36+
// Only works if your environment has the kube_%s_labels metrics (e.g. kube_pod_labels) available.
37+
common_join_labels: [],
38+
pods_join_labels: $._config.common_join_labels,
39+
statefulsets_join_labels: $._config.common_join_labels,
40+
deployments_join_labels: $._config.common_join_labels,
41+
daemonsets_join_labels: $._config.common_join_labels,
42+
horizontalpodautoscalers_join_labels: $._config.common_join_labels,
43+
jobs_join_labels: $._config.common_join_labels,
44+
3545
// Grafana dashboard IDs are necessary for stable links for dashboards
3646
grafanaDashboardIDs: {
3747
'apiserver.json': std.md5('apiserver.json'),

Diff for: dashboards/resources/multi-cluster.libsonnet

+2-2
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ local var = g.dashboard.variable;
126126
+ g.panel.table.queryOptions.withTransformations([
127127
g.panel.table.queryOptions.transformation.withId('joinByField')
128128
+ g.panel.table.queryOptions.transformation.withOptions({
129-
byField: 'cluster',
129+
byField: std.format('%s', $._config.clusterLabel),
130130
mode: 'outer',
131131
}),
132132

@@ -225,7 +225,7 @@ local var = g.dashboard.variable;
225225
+ g.panel.table.queryOptions.withTransformations([
226226
g.panel.table.queryOptions.transformation.withId('joinByField')
227227
+ g.panel.table.queryOptions.transformation.withOptions({
228-
byField: 'cluster',
228+
byField: std.format('%s', $._config.clusterLabel),
229229
mode: 'outer',
230230
}),
231231

Diff for: dashboards/resources/node.libsonnet

+2-2
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ local var = g.dashboard.variable;
8181
+ tsPanel.queryOptions.withTargets([
8282
prometheus.new(
8383
'${datasource}',
84-
'sum(kube_node_status_capacity{%(clusterLabel)s="$cluster", node=~"$node", resource="cpu"})' % $._config,
84+
'sum(kube_node_status_capacity{%(clusterLabel)s="$cluster", %(kubeStateMetricsSelector)s, node=~"$node", resource="cpu"})' % $._config,
8585
)
8686
+ prometheus.withLegendFormat('max capacity'),
8787

@@ -180,7 +180,7 @@ local var = g.dashboard.variable;
180180
+ tsPanel.queryOptions.withTargets([
181181
prometheus.new(
182182
'${datasource}',
183-
'sum(kube_node_status_capacity{%(clusterLabel)s="$cluster", node=~"$node", resource="memory"})' % $._config,
183+
'sum(kube_node_status_capacity{%(clusterLabel)s="$cluster", %(kubeStateMetricsSelector)s, node=~"$node", resource="memory"})' % $._config,
184184
)
185185
+ prometheus.withLegendFormat('max capacity'),
186186

Diff for: dashboards/resources/workload-namespace.libsonnet

+4-4
Original file line numberDiff line numberDiff line change
@@ -118,10 +118,10 @@ local var = g.dashboard.variable;
118118
local memRequestsQuery = std.strReplace(cpuRequestsQuery, 'cpu', 'memory');
119119
local memLimitsQuery = std.strReplace(cpuLimitsQuery, 'cpu', 'memory');
120120

121-
local cpuQuotaRequestsQuery = 'scalar(max(kube_resourcequota{%(clusterLabel)s="$cluster", namespace="$namespace", type="hard",resource="requests.cpu"}))' % $._config;
122-
local cpuQuotaLimitsQuery = std.strReplace(cpuQuotaRequestsQuery, 'requests.cpu', 'limits.cpu');
123-
local memoryQuotaRequestsQuery = std.strReplace(cpuQuotaRequestsQuery, 'requests.cpu', 'requests.memory');
124-
local memoryQuotaLimitsQuery = std.strReplace(cpuQuotaRequestsQuery, 'requests.cpu', 'limits.memory');
121+
local cpuQuotaRequestsQuery = 'scalar(max(kube_resourcequota{%(clusterLabel)s="$cluster", namespace="$namespace", type="hard",resource=~"requests.cpu|cpu"}))' % $._config;
122+
local cpuQuotaLimitsQuery = std.strReplace(cpuQuotaRequestsQuery, 'requests.cpu|cpu', 'limits.cpu');
123+
local memoryQuotaRequestsQuery = std.strReplace(cpuQuotaRequestsQuery, 'requests.cpu|cpu', 'requests.memory|memory');
124+
local memoryQuotaLimitsQuery = std.strReplace(cpuQuotaRequestsQuery, 'requests.cpu|cpu', 'limits.memory');
125125

126126
local networkColumns = [
127127
|||

Diff for: lib/utils.libsonnet

+41
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,45 @@
1515
if s > 60 * 60 * 24
1616
then '%.1f days' % (s / 60 / 60 / 24)
1717
else '%.1f hours' % (s / 60 / 60),
18+
19+
// Handle adding `group_left` to join labels into a rule by wrapping the rule in `(<expr>) * on(xxx) group_left(xxx) kube_xxx_labels`
20+
// If kind of rule is not defined try to detect rule type by alert name
21+
wrap_rule_for_labels(rule, config):
22+
// Detect the kind of rule from the alert name unless a hidden `kind` field is passed in the rule
23+
local kind =
24+
if 'kind' in rule then rule.kind
25+
// Handle Alerts
26+
else if std.objectHas(rule, 'alert') then
27+
if std.startsWith(rule.alert, 'KubePod') then 'pod'
28+
else if std.startsWith(rule.alert, 'KubeContainer') then 'pod'
29+
else if std.startsWith(rule.alert, 'KubeStateful') then 'statefulset'
30+
else if std.startsWith(rule.alert, 'KubeDeploy') then 'deployment'
31+
else if std.startsWith(rule.alert, 'KubeDaemon') then 'daemonset'
32+
else if std.startsWith(rule.alert, 'KubeHpa') then 'horizontalpodautoscaler'
33+
else if std.startsWith(rule.alert, 'KubeJob') then 'job'
34+
else 'none'
35+
else 'none';
36+
37+
local labels = {
38+
join_labels: config['%ss_join_labels' % kind],
39+
// since the label 'job' is reserved, the resource with kind Job uses the label 'job_name' instead
40+
on_labels: ['%s' % (if kind == 'job' then 'job_name' else kind), '%s' % config.namespaceLabel, '%s' % config.clusterLabel],
41+
metric: 'kube_%s_labels' % kind,
42+
};
43+
44+
// Failed to identify kind - return raw rule
45+
if kind == 'none' then rule
46+
// No join labels passed in the config - return raw rule
47+
else if std.length(labels.join_labels) == 0 then rule
48+
// Wrap expr with join group left
49+
else
50+
rule {
51+
local expr = super.expr,
52+
expr: '(%(expr)s) * on (%(on)s) group_left(%(join)s) %(metric)s' % {
53+
expr: expr,
54+
on: std.join(',', labels.on_labels),
55+
join: std.join(',', labels.join_labels),
56+
metric: labels.metric,
57+
},
58+
},
1859
}

Diff for: tests.yaml

+3-2
Original file line numberDiff line numberDiff line change
@@ -1125,9 +1125,9 @@ tests:
11251125

11261126
- interval: 1m
11271127
input_series:
1128-
- series: 'apiserver_request_terminations_total{job="kube-apiserver",apiserver="kube-apiserver"}'
1128+
- series: 'apiserver_request_terminations_total{cluster="kubernetes",job="kube-apiserver",apiserver="kube-apiserver"}'
11291129
values: '1+1x10'
1130-
- series: 'apiserver_request_total{job="kube-apiserver",apiserver="kube-apiserver"}'
1130+
- series: 'apiserver_request_total{cluster="kubernetes",job="kube-apiserver",apiserver="kube-apiserver"}'
11311131
values: '1+2x10'
11321132
alert_rule_test:
11331133
- eval_time: 5m # alert hasn't fired
@@ -1137,6 +1137,7 @@ tests:
11371137
exp_alerts:
11381138
- exp_labels:
11391139
severity: warning
1140+
cluster: "kubernetes"
11401141
exp_annotations:
11411142
summary: "The kubernetes apiserver has terminated 33.33% of its incoming requests."
11421143
description: "The kubernetes apiserver has terminated 33.33% of its incoming requests."

0 commit comments

Comments
 (0)