Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Per-policy reconcile duration panel in Grafana dashboard (p99/p50 by namespace and policy)
- ReplicaSet as a supported target workload kind with adapter, RBAC, and Helm clusterrole
- Cross-namespace Secret reference rejection in webhook validation
- `KubeRightsizeHighRevertRate` PrometheusRule alert in Helm chart
- `AttuneHighRevertRate` PrometheusRule alert in Helm chart
- Configurable `burstSensitivity` per resource: controls how much burst detection inflates recommendations (default 0.1, set 0 to disable)
- Canary auto-promotion resets on spec change: editing a policy restarts the observation cycle so new configuration is re-validated
- `attune_burst_factor` Prometheus metric and Grafana dashboard panel showing burst detection multiplier per workload
Expand Down
10 changes: 5 additions & 5 deletions charts/attune/templates/prometheusrule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ spec:
- name: attune
rules:
{{- if .Values.metrics.prometheusRule.rules.reconcileErrors.enabled }}
- alert: KubeRightsizeReconcileErrors
- alert: AttuneReconcileErrors
expr: sum(rate(attune_reconcile_errors_total[5m])) > {{ .Values.metrics.prometheusRule.rules.reconcileErrors.threshold }}
for: {{ .Values.metrics.prometheusRule.rules.reconcileErrors.for }}
labels:
Expand All @@ -26,7 +26,7 @@ spec:
Check the operator logs for details.
{{- end }}
{{- if .Values.metrics.prometheusRule.rules.prometheusUnreachable.enabled }}
- alert: KubeRightsizePrometheusUnreachable
- alert: AttunePrometheusUnreachable
expr: sum(rate(attune_prometheus_query_errors_total[5m])) > 0
for: {{ .Values.metrics.prometheusRule.rules.prometheusUnreachable.for }}
labels:
Expand All @@ -39,7 +39,7 @@ spec:
Verify the Prometheus address and connectivity.
{{- end }}
{{- if .Values.metrics.prometheusRule.rules.degraded.enabled }}
- alert: KubeRightsizeDegraded
- alert: AttuneDegraded
expr: sum by (namespace, workload) (increase(attune_reverts_total[15m])) > 3
for: {{ .Values.metrics.prometheusRule.rules.degraded.for }}
labels:
Expand All @@ -53,7 +53,7 @@ spec:
instability.
{{- end }}
{{- if .Values.metrics.prometheusRule.rules.highRevertRate.enabled }}
- alert: KubeRightsizeHighRevertRate
- alert: AttuneHighRevertRate
expr: >-
(
sum(increase(attune_reverts_total[1h]))
Expand All @@ -72,7 +72,7 @@ spec:
overhead settings and workload behavior.
{{- end }}
{{- if .Values.metrics.prometheusRule.rules.reconcileStale.enabled }}
- alert: KubeRightsizeReconcileStale
- alert: AttuneReconcileStale
expr: changes(attune_reconcile_duration_seconds_count[{{ .Values.metrics.prometheusRule.rules.reconcileStale.staleDuration }}]) == 0
for: {{ .Values.metrics.prometheusRule.rules.reconcileStale.for }}
labels:
Expand Down
8 changes: 4 additions & 4 deletions docs/guides/prometheus-setup.md
Original file line number Diff line number Diff line change
Expand Up @@ -289,10 +289,10 @@ This creates four alerts:

| Alert | Fires when | Default severity |
|-------|-----------|-----------------|
| `KubeRightsizeReconcileErrors` | Reconcile error rate > 0 sustained for 10m | warning |
| `KubeRightsizePrometheusUnreachable` | Prometheus query errors sustained for 10m | warning |
| `KubeRightsizeDegraded` | More than 3 reverts in 15m for a workload | critical |
| `KubeRightsizeReconcileStale` | No reconcile completes within 30m | warning |
| `AttuneReconcileErrors` | Reconcile error rate > 0 sustained for 10m | warning |
| `AttunePrometheusUnreachable` | Prometheus query errors sustained for 10m | warning |
| `AttuneDegraded` | More than 3 reverts in 15m for a workload | critical |
| `AttuneReconcileStale` | No reconcile completes within 30m | warning |

Individual alerts can be disabled or tuned:

Expand Down
28 changes: 14 additions & 14 deletions docs/savings-calculator.md
Original file line number Diff line number Diff line change
Expand Up @@ -451,13 +451,13 @@ function calculate() {
const breakdown = [];

workloads.forEach(w => {
const cpuRightsized = Math.max(w.cpuP95 * cpuMargin, w.cpuP95);
const memRightsized = Math.max(w.memP95 * memMargin, w.memP95);
const cpuTuned = Math.max(w.cpuP95 * cpuMargin, w.cpuP95);
const memTuned = Math.max(w.memP95 * memMargin, w.memP95);

const cpuCurrentCost = (w.cpuReq / 1000) * cpuPrice * hoursPerMonth * w.replicas;
const memCurrentCost = (w.memReq / 1024) * memPrice * hoursPerMonth * w.replicas;
const cpuNewCost = (Math.min(cpuRightsized, w.cpuReq) / 1000) * cpuPrice * hoursPerMonth * w.replicas;
const memNewCost = (Math.min(memRightsized, w.memReq) / 1024) * memPrice * hoursPerMonth * w.replicas;
const cpuNewCost = (Math.min(cpuTuned, w.cpuReq) / 1000) * cpuPrice * hoursPerMonth * w.replicas;
const memNewCost = (Math.min(memTuned, w.memReq) / 1024) * memPrice * hoursPerMonth * w.replicas;

const monthlySaved = (cpuCurrentCost + memCurrentCost) - (cpuNewCost + memNewCost);
totalMonthlySavings += monthlySaved;
Expand All @@ -469,15 +469,15 @@ function calculate() {
totalMemReq += w.memReq * w.replicas;
totalMemP95 += w.memP95 * w.replicas;

const cpuUnderProv = cpuRightsized > w.cpuReq && w.cpuReq > 0;
const memUnderProv = memRightsized > w.memReq && w.memReq > 0;
const cpuUnderProv = cpuTuned > w.cpuReq && w.cpuReq > 0;
const memUnderProv = memTuned > w.memReq && w.memReq > 0;

breakdown.push({
name: w.name,
cpuReq: w.cpuReq,
cpuNew: Math.round(cpuUnderProv ? cpuRightsized : Math.min(cpuRightsized, w.cpuReq)),
cpuNew: Math.round(cpuUnderProv ? cpuTuned : Math.min(cpuTuned, w.cpuReq)),
memReq: w.memReq,
memNew: Math.round(memUnderProv ? memRightsized : Math.min(memRightsized, w.memReq)),
memNew: Math.round(memUnderProv ? memTuned : Math.min(memTuned, w.memReq)),
saved: monthlySaved,
underProv: cpuUnderProv || memUnderProv
});
Expand All @@ -494,18 +494,18 @@ function calculate() {
document.getElementById('overallReduction').textContent = reduction + '%';

const cpuUtilBefore = totalCpuReq > 0 ? Math.round((totalCpuP95 / totalCpuReq) * 100) : 0;
const cpuRightsizedTotal = totalCpuP95 * cpuMargin;
const cpuUtilAfter = cpuRightsizedTotal > 0
? Math.min(100, Math.round((totalCpuP95 / Math.min(cpuRightsizedTotal, totalCpuReq)) * 100))
const cpuTunedTotal = totalCpuP95 * cpuMargin;
const cpuUtilAfter = cpuTunedTotal > 0
? Math.min(100, Math.round((totalCpuP95 / Math.min(cpuTunedTotal, totalCpuReq)) * 100))
: 0;
document.getElementById('cpuUtilBefore').textContent = cpuUtilBefore + '%';
document.getElementById('cpuUtilAfter').textContent = cpuUtilAfter + '%';
document.getElementById('cpuBar').style.width = cpuUtilAfter + '%';

const memUtilBefore = totalMemReq > 0 ? Math.round((totalMemP95 / totalMemReq) * 100) : 0;
const memRightsizedTotal = totalMemP95 * memMargin;
const memUtilAfter = memRightsizedTotal > 0
? Math.min(100, Math.round((totalMemP95 / Math.min(memRightsizedTotal, totalMemReq)) * 100))
const memTunedTotal = totalMemP95 * memMargin;
const memUtilAfter = memTunedTotal > 0
? Math.min(100, Math.round((totalMemP95 / Math.min(memTunedTotal, totalMemReq)) * 100))
: 0;
document.getElementById('memUtilBefore').textContent = memUtilBefore + '%';
document.getElementById('memUtilAfter').textContent = memUtilAfter + '%';
Expand Down
Loading