Skip to content

Ensure burn rate factors are accurate, and add exhaustion label #775

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 14 additions & 10 deletions kubernetes/controllers/servicelevelobjective_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,28 +155,28 @@ func Test_makePrometheusRule(t *testing.T) {
Alert: "ErrorBudgetBurn",
Expr: intstr.FromString(`http_requests:burnrate5m{job="app",slo="http"} > (14 * (1-0.995)) and http_requests:burnrate1h{job="app",slo="http"} > (14 * (1-0.995))`),
For: "2m",
Labels: map[string]string{"severity": "critical", "job": "app", "long": "1h", "slo": "http", "short": "5m", "team": "foo"},
Labels: map[string]string{"severity": "critical", "job": "app", "long": "1h", "slo": "http", "short": "5m", "team": "foo", "exhaustion": "2d"},
Annotations: map[string]string{"description": "foo"},
},
{
Alert: "ErrorBudgetBurn",
Expr: intstr.FromString(`http_requests:burnrate30m{job="app",slo="http"} > (7 * (1-0.995)) and http_requests:burnrate6h{job="app",slo="http"} > (7 * (1-0.995))`),
Expr: intstr.FromString(`http_requests:burnrate30m{job="app",slo="http"} > (5.6 * (1-0.995)) and http_requests:burnrate6h{job="app",slo="http"} > (5.6 * (1-0.995))`),
For: "15m",
Labels: map[string]string{"severity": "critical", "job": "app", "long": "6h", "slo": "http", "short": "30m", "team": "foo"},
Labels: map[string]string{"severity": "critical", "job": "app", "long": "6h", "slo": "http", "short": "30m", "team": "foo", "exhaustion": "5d"},
Annotations: map[string]string{"description": "foo"},
},
{
Alert: "ErrorBudgetBurn",
Expr: intstr.FromString(`http_requests:burnrate2h{job="app",slo="http"} > (2 * (1-0.995)) and http_requests:burnrate1d{job="app",slo="http"} > (2 * (1-0.995))`),
Expr: intstr.FromString(`http_requests:burnrate2h{job="app",slo="http"} > (2.8 * (1-0.995)) and http_requests:burnrate1d{job="app",slo="http"} > (2.8 * (1-0.995))`),
For: "1h",
Labels: map[string]string{"severity": "warning", "job": "app", "long": "1d", "slo": "http", "short": "2h", "team": "foo"},
Labels: map[string]string{"severity": "warning", "job": "app", "long": "1d", "slo": "http", "short": "2h", "team": "foo", "exhaustion": "10d"},
Annotations: map[string]string{"description": "foo"},
},
{
Alert: "ErrorBudgetBurn",
Expr: intstr.FromString(`http_requests:burnrate6h{job="app",slo="http"} > (1 * (1-0.995)) and http_requests:burnrate4d{job="app",slo="http"} > (1 * (1-0.995))`),
For: "3h",
Labels: map[string]string{"severity": "warning", "job": "app", "long": "4d", "slo": "http", "short": "6h", "team": "foo"},
Labels: map[string]string{"severity": "warning", "job": "app", "long": "4d", "slo": "http", "short": "6h", "team": "foo", "exhaustion": "4w"},
Annotations: map[string]string{"description": "foo"},
},
},
Expand Down Expand Up @@ -268,6 +268,7 @@ func Test_makeConfigMap(t *testing.T) {
> (14 * (1-0.995))
for: 2m
labels:
exhaustion: 2d
job: app
long: 1h
severity: critical
Expand All @@ -277,10 +278,11 @@ func Test_makeConfigMap(t *testing.T) {
- alert: ErrorBudgetBurn
annotations:
description: foo
expr: http_requests:burnrate30m{job="app",slo="http"} > (7 * (1-0.995)) and http_requests:burnrate6h{job="app",slo="http"}
> (7 * (1-0.995))
expr: http_requests:burnrate30m{job="app",slo="http"} > (5.6 * (1-0.995)) and
http_requests:burnrate6h{job="app",slo="http"} > (5.6 * (1-0.995))
for: 15m
labels:
exhaustion: 5d
job: app
long: 6h
severity: critical
Expand All @@ -290,10 +292,11 @@ func Test_makeConfigMap(t *testing.T) {
- alert: ErrorBudgetBurn
annotations:
description: foo
expr: http_requests:burnrate2h{job="app",slo="http"} > (2 * (1-0.995)) and http_requests:burnrate1d{job="app",slo="http"}
> (2 * (1-0.995))
expr: http_requests:burnrate2h{job="app",slo="http"} > (2.8 * (1-0.995)) and http_requests:burnrate1d{job="app",slo="http"}
> (2.8 * (1-0.995))
for: 1h
labels:
exhaustion: 10d
job: app
long: 1d
severity: warning
Expand All @@ -307,6 +310,7 @@ func Test_makeConfigMap(t *testing.T) {
> (1 * (1-0.995))
for: 3h
labels:
exhaustion: 4w
job: app
long: 4d
severity: warning
Expand Down
8 changes: 4 additions & 4 deletions main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ func TestAlertsMatchingObjectives(t *testing.T) {
Severity: "critical",
State: objectivesv1alpha1.Alert_inactive,
For: durationpb.New(8 * time.Minute),
Factor: 7,
Factor: 5.6,
Short: &objectivesv1alpha1.Burnrate{
Window: durationpb.New(15 * time.Minute),
Current: -1,
Expand All @@ -297,7 +297,7 @@ func TestAlertsMatchingObjectives(t *testing.T) {
Severity: "warning",
State: objectivesv1alpha1.Alert_inactive,
For: durationpb.New(30 * time.Minute),
Factor: 2,
Factor: 2.8,
Short: &objectivesv1alpha1.Burnrate{
Window: durationpb.New(time.Hour),
Current: -1,
Expand Down Expand Up @@ -373,7 +373,7 @@ func TestAlertsMatchingObjectives(t *testing.T) {
Severity: "critical",
State: objectivesv1alpha1.Alert_inactive,
For: durationpb.New(8 * time.Minute),
Factor: 7,
Factor: 5.6,
Short: &objectivesv1alpha1.Burnrate{
Window: durationpb.New(15 * time.Minute),
Current: -1,
Expand All @@ -390,7 +390,7 @@ func TestAlertsMatchingObjectives(t *testing.T) {
Severity: "warning",
State: objectivesv1alpha1.Alert_inactive,
For: durationpb.New(30 * time.Minute),
Factor: 2,
Factor: 2.8,
Short: &objectivesv1alpha1.Burnrate{
Window: durationpb.New(time.Hour),
Current: -1,
Expand Down
79 changes: 47 additions & 32 deletions slo/rules.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,12 @@ func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) {
alertLabels["short"] = model.Duration(w.Short).String()
alertLabels["long"] = model.Duration(w.Long).String()
alertLabels["severity"] = string(w.Severity)
alertLabels["exhaustion"] = model.Duration(w.Exhaustion).String()

r := monitoringv1.Rule{
Alert: o.AlertName(),
// TODO: Use expr replacer
Expr: intstr.FromString(fmt.Sprintf("%s{%s} > (%.f * (1-%s)) and %s{%s} > (%.f * (1-%s))",
Expr: intstr.FromString(fmt.Sprintf("%s{%s} > (%v * (1-%s)) and %s{%s} > (%v * (1-%s))",
o.BurnrateName(w.Short),
alertMatchersString,
w.Factor,
Expand Down Expand Up @@ -208,11 +209,12 @@ func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) {
alertLabels["short"] = model.Duration(w.Short).String()
alertLabels["long"] = model.Duration(w.Long).String()
alertLabels["severity"] = string(w.Severity)
alertLabels["exhaustion"] = model.Duration(w.Exhaustion).String()

r := monitoringv1.Rule{
Alert: o.AlertName(),
// TODO: Use expr replacer
Expr: intstr.FromString(fmt.Sprintf("%s{%s} > (%.f * (1-%s)) and %s{%s} > (%.f * (1-%s))",
Expr: intstr.FromString(fmt.Sprintf("%s{%s} > (%v * (1-%s)) and %s{%s} > (%v * (1-%s))",
o.BurnrateName(w.Short),
alertMatchersString,
w.Factor,
Expand Down Expand Up @@ -293,11 +295,12 @@ func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) {
alertLabels["short"] = model.Duration(w.Short).String()
alertLabels["long"] = model.Duration(w.Long).String()
alertLabels["severity"] = string(w.Severity)
alertLabels["exhaustion"] = model.Duration(w.Exhaustion).String()

r := monitoringv1.Rule{
Alert: o.AlertName(),
// TODO: Use expr replacer
Expr: intstr.FromString(fmt.Sprintf("%s{%s} > (%.f * (1-%s)) and %s{%s} > (%.f * (1-%s))",
Expr: intstr.FromString(fmt.Sprintf("%s{%s} > (%v * (1-%s)) and %s{%s} > (%v * (1-%s))",
o.BurnrateName(w.Short),
alertMatchersString,
w.Factor,
Expand Down Expand Up @@ -378,11 +381,12 @@ func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) {
alertLabels["short"] = model.Duration(w.Short).String()
alertLabels["long"] = model.Duration(w.Long).String()
alertLabels["severity"] = string(w.Severity)
alertLabels["exhaustion"] = model.Duration(w.Exhaustion).String()

r := monitoringv1.Rule{
Alert: o.AlertName(),
// TODO: Use expr replacer
Expr: intstr.FromString(fmt.Sprintf("%s{%s} > (%.f * (1-%s)) and %s{%s} > (%.f * (1-%s))",
Expr: intstr.FromString(fmt.Sprintf("%s{%s} > (%v * (1-%s)) and %s{%s} > (%v * (1-%s))",
o.BurnrateName(w.Short),
alertMatchersString,
w.Factor,
Expand Down Expand Up @@ -1074,44 +1078,55 @@ const (
warning severity = "warning"
)

// Window represents a multi-window alert for a particular burn rate factor.
type Window struct {
Severity severity
For time.Duration
Long time.Duration
Short time.Duration
Factor float64
Severity severity
For time.Duration
Long time.Duration // Long represents the long window or the alerting window for a multi-window alert.
Short time.Duration // Short represents the short window or reset period of a multi-window alert.
Exhaustion time.Duration // Exhaustion specifies the time it takes to burn the whole error budget.
Factor float64
}

// Windows returns multi-window alerts across four burn rates, for a given SLO window.
// long and short rates are calculated based on the ratio for 28 days.
// The generated alerts therefore use the same burn rate factors across different sloWindows,
// which is why the factors are constants.
//
// The burn rate factors work best for 28 days or other windows that are multiples of 7.
// This can still be used with windows like 30d/90d, but might lead to uneven For/Long/Short durations.
// See Test_windows for examples.
func Windows(sloWindow time.Duration) []Window {
// TODO: I'm still not sure if For, Long, Short should really be based on the 28 days ratio...

round := time.Minute // TODO: Change based on sloWindow

// long and short rates are calculated based on the ratio for 28 days.
return []Window{{
Severity: critical,
For: (sloWindow / (28 * 24 * (60 / 2))).Round(round), // 2m for 28d - half short
Long: (sloWindow / (28 * 24)).Round(round), // 1h for 28d
Short: (sloWindow / (28 * 24 * (60 / 5))).Round(round), // 5m for 28d
Factor: 14, // error budget burn: 50% within a day
Severity: critical,
For: (sloWindow / (28 * 24 * (60 / 2))).Round(round), // 2m for 28d - half short
Long: (sloWindow / (28 * 24)).Round(round), // 1h for 28d
Short: (sloWindow / (28 * 24 * (60 / 5))).Round(round), // 5m for 28d
Exhaustion: time.Duration(sloWindow / 14).Round(round), // error budget burn: 50% within a day / 100% within 2d for 28d
Factor: 14, // 50 / ((24/(24 * 28)) * 100)
}, {
Severity: critical,
For: (sloWindow / (28 * 24 * (60 / 15))).Round(round), // 15m for 28d - half short
Long: (sloWindow / (28 * (24 / 6))).Round(round), // 6h for 28d
Short: (sloWindow / (28 * 24 * (60 / 30))).Round(round), // 30m for 28d
Factor: 7, // error budget burn: 20% within a day / 100% within 5 days
Severity: critical,
For: (sloWindow / (28 * 24 * (60 / 15))).Round(round), // 15m for 28d - half short
Long: (sloWindow / (28 * (24 / 6))).Round(round), // 6h for 28d
Short: (sloWindow / (28 * 24 * (60 / 30))).Round(round), // 30m for 28d
Exhaustion: time.Duration(float64(sloWindow) / 5.6).Round(round), // error budget burn: 20% within a day / 100% within 5 days for 28d
Factor: 5.6, // 20 / ((24/(24 * 28)) * 100)
}, {
Severity: warning,
For: (sloWindow / (28 * 24)).Round(round), // 1h for 28d - half short
Long: (sloWindow / 28).Round(round), // 1d for 28d
Short: (sloWindow / (28 * (24 / 2))).Round(round), // 2h for 28d
Factor: 2, // error budget burn: 10% within a day / 100% within 10 days
Severity: warning,
For: (sloWindow / (28 * 24)).Round(round), // 1h for 28d - half short
Long: (sloWindow / 28).Round(round), // 1d for 28d
Short: (sloWindow / (28 * (24 / 2))).Round(round), // 2h for 28d
Exhaustion: time.Duration(float64(sloWindow) / 2.8).Round(round), // error budget burn: 10% within a day / 100% within 10 days for 28d
Factor: 2.8, // 10 / ((24/(24 * 28)) * 100)
}, {
Severity: warning,
For: (sloWindow / (28 * (24 / 3))).Round(round), // 3h for 28d - half short
Long: (sloWindow / 7).Round(round), // 4d for 28d
Short: (sloWindow / (28 * (24 / 6))).Round(round), // 6h for 28d
Factor: 1, // error budget burn: 100% until the end of sloWindow
Severity: warning,
For: (sloWindow / (28 * (24 / 3))).Round(round), // 3h for 28d - half short
Long: (sloWindow / 7).Round(round), // 4d for 28d
Short: (sloWindow / (28 * (24 / 6))).Round(round), // 6h for 28d
Exhaustion: (sloWindow).Round(round), // error budget burn: 100% until the end of sloWindow
Factor: 1, // 100 / (((24 * 28)/(24 * 28)) * 100)
}}
}

Expand Down
Loading