fix: revert failure metric, NaN/Inf guard, and validator mutation (#277)

SebTardif · web-flow · commit e12c53787329 · 2026-06-02T20:22:52.000Z
* fix: revert failure metric, NaN/Inf guard, and validator mutation

- Add attune_revert_failures_total metric so failed revert attempts
  are visible to monitoring and alerting. Previously, a failed revert
  (the operator's worst failure mode) was invisible to Prometheus.
- Filter NaN/Inf samples in QueryRangeGrouped to prevent non-finite
  values from Prometheus flowing into the recommendation engine.
- Stop validator from mutating the input AttunePolicy when
  UpdateStrategy is nil; use a local variable instead.
- Remove redundant header clone in headerTransport.RoundTrip
  (req.Clone already deep-copies headers).
- Add FROM/TO resource values to the revert log for easier debugging.
- Add Grafana dashboard panel, PrometheusRule alert, docs, and
  troubleshooting section for the new metric.

Signed-off-by: Sebastien Tardif <sebtardif@ncf.ca>

* fix: add revert failures panel to source Grafana dashboard

Signed-off-by: Sebastien Tardif <sebtardif@ncf.ca>

---------

Signed-off-by: Sebastien Tardif <sebtardif@ncf.ca>
diff --git a/charts/attune/README.md b/charts/attune/README.md
@@ -48,10 +48,10 @@ helm install attune oci://ghcr.io/attune-io/charts/attune \
 | logging.format | string | `"json"` | Log format (json, text) |
 | logging.level | string | `"info"` | Log level (debug, info, warn, error) |
 | maxConcurrentReconciles | string | `""` | Maximum number of AttunePolicy reconciles running in parallel. Increase for large clusters with many policies (e.g. 4 for 200+ policies). |
-| metrics | object | `{"enabled":true,"port":8080,"prometheusRule":{"additionalLabels":{},"enabled":false,"rules":{"budgetExhausted":{"enabled":true,"for":"30m","severity":"warning"},"dataQuality":{"enabled":true,"for":"30m","severity":"warning"},"degraded":{"enabled":true,"for":"5m","severity":"critical"},"highRevertRate":{"enabled":true,"for":"15m","severity":"critical","threshold":"0.5"},"prometheusUnreachable":{"enabled":true,"for":"10m","severity":"warning"},"reconcileErrors":{"enabled":true,"for":"10m","severity":"warning","threshold":"0"},"reconcileStale":{"enabled":true,"for":"5m","severity":"warning","staleDuration":"30m"},"requestsClamped":{"enabled":true,"for":"1h","severity":"info"}}},"serviceMonitor":{"additionalLabels":{},"enabled":false,"interval":"30s"}}` | Metrics endpoint |
+| metrics | object | `{"enabled":true,"port":8080,"prometheusRule":{"additionalLabels":{},"enabled":false,"rules":{"budgetExhausted":{"enabled":true,"for":"30m","severity":"warning"},"dataQuality":{"enabled":true,"for":"30m","severity":"warning"},"degraded":{"enabled":true,"for":"5m","severity":"critical"},"highRevertRate":{"enabled":true,"for":"15m","severity":"critical","threshold":"0.5"},"prometheusUnreachable":{"enabled":true,"for":"10m","severity":"warning"},"reconcileErrors":{"enabled":true,"for":"10m","severity":"warning","threshold":"0"},"reconcileStale":{"enabled":true,"for":"5m","severity":"warning","staleDuration":"30m"},"requestsClamped":{"enabled":true,"for":"1h","severity":"info"},"revertFailures":{"enabled":true,"for":"5m","severity":"critical"}}},"serviceMonitor":{"additionalLabels":{},"enabled":false,"interval":"30s"}}` | Metrics endpoint |
 | metrics.prometheusRule.additionalLabels | object | `{}` | Additional labels for the PrometheusRule |
 | metrics.prometheusRule.enabled | bool | `false` | Create a PrometheusRule for out-of-the-box alerting. Requires the Prometheus Operator CRDs (monitoring.coreos.com/v1). |
-| metrics.prometheusRule.rules | object | `{"budgetExhausted":{"enabled":true,"for":"30m","severity":"warning"},"dataQuality":{"enabled":true,"for":"30m","severity":"warning"},"degraded":{"enabled":true,"for":"5m","severity":"critical"},"highRevertRate":{"enabled":true,"for":"15m","severity":"critical","threshold":"0.5"},"prometheusUnreachable":{"enabled":true,"for":"10m","severity":"warning"},"reconcileErrors":{"enabled":true,"for":"10m","severity":"warning","threshold":"0"},"reconcileStale":{"enabled":true,"for":"5m","severity":"warning","staleDuration":"30m"},"requestsClamped":{"enabled":true,"for":"1h","severity":"info"}}` | Override default alert rules. Each key matches a rule name; set enabled: false to disable individual rules. |
+| metrics.prometheusRule.rules | object | `{"budgetExhausted":{"enabled":true,"for":"30m","severity":"warning"},"dataQuality":{"enabled":true,"for":"30m","severity":"warning"},"degraded":{"enabled":true,"for":"5m","severity":"critical"},"highRevertRate":{"enabled":true,"for":"15m","severity":"critical","threshold":"0.5"},"prometheusUnreachable":{"enabled":true,"for":"10m","severity":"warning"},"reconcileErrors":{"enabled":true,"for":"10m","severity":"warning","threshold":"0"},"reconcileStale":{"enabled":true,"for":"5m","severity":"warning","staleDuration":"30m"},"requestsClamped":{"enabled":true,"for":"1h","severity":"info"},"revertFailures":{"enabled":true,"for":"5m","severity":"critical"}}` | Override default alert rules. Each key matches a rule name; set enabled: false to disable individual rules. |
 | metrics.prometheusRule.rules.budgetExhausted.for | string | `"30m"` | How long the condition must persist before firing |
 | metrics.prometheusRule.rules.dataQuality.for | string | `"30m"` | How long the condition must persist before firing |
 | metrics.prometheusRule.rules.highRevertRate.for | string | `"15m"` | How long the condition must persist before firing |
@@ -60,6 +60,7 @@ helm install attune oci://ghcr.io/attune-io/charts/attune \
 | metrics.prometheusRule.rules.reconcileErrors.threshold | string | `"0"` | Error rate threshold (per second, averaged over 5m) |
 | metrics.prometheusRule.rules.reconcileStale.staleDuration | string | `"30m"` | Fire when no reconcile completes within this duration |
 | metrics.prometheusRule.rules.requestsClamped.for | string | `"1h"` | How long the condition must persist before firing |
+| metrics.prometheusRule.rules.revertFailures.for | string | `"5m"` | How long the condition must persist before firing |
 | metrics.serviceMonitor.additionalLabels | object | `{}` | Additional labels for the ServiceMonitor |
 | metrics.serviceMonitor.enabled | bool | `false` | Create a ServiceMonitor for Prometheus Operator |
 | metrics.serviceMonitor.interval | string | `"30s"` | Scrape interval |
diff --git a/charts/attune/files/grafana-dashboard.json b/charts/attune/files/grafana-dashboard.json
@@ -300,6 +300,40 @@
         }
       ]
     },
+    {
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "drawStyle": "line",
+            "fillOpacity": 10
+          },
+          "color": {
+            "mode": "fixed",
+            "fixedColor": "red"
+          }
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 16,
+        "y": 3
+      },
+      "id": 32,
+      "options": {
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "title": "Revert Failures",
+      "type": "timeseries",
+      "targets": [
+        {
+          "expr": "sum by (namespace, workload) (rate(attune_revert_failures_total[$__rate_interval]))",
+          "legendFormat": "{{ namespace }}/{{ workload }}"
+        }
+      ]
+    },
     {
       "fieldConfig": {
         "defaults": {
diff --git a/charts/attune/templates/prometheusrule.yaml b/charts/attune/templates/prometheusrule.yaml
@@ -126,4 +126,18 @@ spec:
               clamping resource requests to container limits for {{ .Values.metrics.prometheusRule.rules.requestsClamped.for }}.
               Consider increasing limits or switching to controlledValues: RequestsAndLimits.
         {{- end }}
+        {{- if .Values.metrics.prometheusRule.rules.revertFailures.enabled }}
+        - alert: AttuneRevertFailures
+          expr: sum by (namespace, workload) (rate(attune_revert_failures_total[5m])) > 0
+          for: {{ .Values.metrics.prometheusRule.rules.revertFailures.for }}
+          labels:
+            severity: {{ .Values.metrics.prometheusRule.rules.revertFailures.severity }}
+          annotations:
+            summary: attune resize revert is failing
+            description: >-
+              Workload {{ "{{ $labels.namespace }}" }}/{{ "{{ $labels.workload }}" }}
+              had a resize revert failure. The pod may be running with
+              problematic resources. Check operator logs for the revert error
+              and verify RBAC for the pods/resize subresource.
+        {{- end }}
 {{- end }}
diff --git a/charts/attune/tests/prometheusrule_test.yaml b/charts/attune/tests/prometheusrule_test.yaml
@@ -20,7 +20,7 @@ tests:
           path: spec.groups[0].name
           value: attune
 
-  - it: should include all eight default alerts
+  - it: should include all nine default alerts
     set:
       metrics:
         enabled: true
@@ -29,7 +29,7 @@ tests:
     asserts:
       - lengthEqual:
           path: spec.groups[0].rules
-          count: 8
+          count: 9
 
   - it: should disable individual rules
     set:
@@ -45,7 +45,7 @@ tests:
     asserts:
       - lengthEqual:
           path: spec.groups[0].rules
-          count: 6
+          count: 7
 
   - it: should apply additional labels
     set:
diff --git a/charts/attune/values.schema.json b/charts/attune/values.schema.json
@@ -253,6 +253,15 @@
                     "for": { "type": "string", "default": "1h" },
                     "severity": { "type": "string", "default": "info" }
                   }
+                },
+                "revertFailures": {
+                  "type": "object",
+                  "additionalProperties": false,
+                  "properties": {
+                    "enabled": { "type": "boolean", "default": true },
+                    "for": { "type": "string", "default": "5m" },
+                    "severity": { "type": "string", "default": "critical" }
+                  }
                 }
               }
             }
diff --git a/charts/attune/values.yaml b/charts/attune/values.yaml
@@ -136,6 +136,11 @@ metrics:
         # -- How long the condition must persist before firing
         for: 1h
         severity: info
+      revertFailures:
+        enabled: true
+        # -- How long the condition must persist before firing
+        for: 5m
+        severity: critical
 
 grafanaDashboard:
   # -- Create a ConfigMap with the Grafana dashboard (auto-discovered by Grafana sidecar)
diff --git a/deploy/grafana/dashboard.json b/deploy/grafana/dashboard.json
@@ -338,6 +338,40 @@
         }
       ]
     },
+    {
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "drawStyle": "line",
+            "fillOpacity": 10
+          },
+          "color": {
+            "mode": "fixed",
+            "fixedColor": "red"
+          }
+        }
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 8,
+        "x": 16,
+        "y": 3
+      },
+      "id": 32,
+      "options": {
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "title": "Revert Failures",
+      "type": "timeseries",
+      "targets": [
+        {
+          "expr": "sum by (namespace, workload) (rate(attune_revert_failures_total[$__rate_interval]))",
+          "legendFormat": "{{ namespace }}/{{ workload }}"
+        }
+      ]
+    },
     {
       "datasource": {
         "type": "prometheus",
diff --git a/docs/guides/troubleshooting.md b/docs/guides/troubleshooting.md
@@ -401,6 +401,34 @@ Common causes:
 - **restart**: the application crashes at the new resource level. Check application logs.
 - **notready**: readiness probe fails post-resize. Verify probe configuration.
 
+### Revert failures
+
+**Symptom**: Entries in `.status.resizeHistory` show `result: Failed`, or
+`attune_revert_failures_total` is incrementing.
+
+**Cause**: The operator detected a safety issue (OOMKill, throttle, etc.)
+and tried to revert the pod to its original resources, but the `/resize`
+subresource call failed. The pod remains at the post-resize resource level.
+
+**Fix**: Check operator logs for the revert error:
+
+```bash
+kubectl logs -l app.kubernetes.io/name=attune --tail=100 | grep "Failed to revert"
+```
+
+Common causes:
+
+- **Conflict**: another controller (HPA, VPA) is modifying the same pod.
+  Use `attune_revert_failures_total` to track frequency.
+- **Pod evicted**: the pod was evicted between the safety check and revert.
+- **RBAC**: the operator ServiceAccount lacks `update` on the `pods/resize`
+  subresource.
+
+```promql
+# Alert when reverts are failing
+sum by (namespace, workload) (rate(attune_revert_failures_total[5m])) > 0
+```
+
 ### Resizes not happening during expected window
 
 **Symptom**: Operator logs "Outside resize window, skipping resize" even
diff --git a/docs/reference/metrics.md b/docs/reference/metrics.md
@@ -24,6 +24,24 @@ Total number of resize reverts triggered by the safety monitor.
 | `workload` | Workload name |
 | `reason` | `oomkill`, `throttle`, `restart`, `notready`, `re-fetch-failed`, or `annotation-persist-failed` |
 
+### attune_revert_failures_total
+
+Total number of failed resize revert attempts. A non-zero value means the
+operator tried to restore a pod's original resources but the `/resize`
+subresource call failed, leaving the pod running with post-resize resources
+that may be causing issues.
+
+| Label | Description |
+|-------|-------------|
+| `namespace` | Workload namespace |
+| `workload` | Workload name |
+| `reason` | Same reason labels as `attune_reverts_total` |
+
+```promql
+# Alert when reverts are failing
+sum by (namespace, workload) (rate(attune_revert_failures_total[5m])) > 0
+```
+
 ### attune_prometheus_query_errors_total
 
 Total number of failed Prometheus queries.
diff --git a/internal/controller/resize.go b/internal/controller/resize.go
@@ -550,6 +550,7 @@ func (r *AttunePolicyReconciler) resizeContainer(
 		revertFailed := false
 		if revertErr := monitor.RevertPod(ctx, revertRecord); revertErr != nil {
 			logger.Error(revertErr, "Failed to revert pod after "+reason, "pod", pod.Name)
+			operatormetrics.RevertFailuresTotal.WithLabelValues(pod.Namespace, workloadName, reason).Inc()
 			revertFailed = true
 		}
 		if !revertFailed {
diff --git a/internal/metrics/collector.go b/internal/metrics/collector.go
@@ -160,7 +160,6 @@ func (t *headerTransport) RoundTrip(req *http.Request) (*http.Response, error) {
 		return t.base.RoundTrip(req)
 	}
 	clone := req.Clone(req.Context())
-	clone.Header = req.Header.Clone()
 	for k, v := range t.headers {
 		clone.Header.Set(k, v)
 	}
@@ -295,9 +294,13 @@ func (c *PrometheusCollector) QueryRangeGrouped(ctx context.Context, query strin
 	for _, series := range matrix {
 		container := string(series.Metric[model.LabelName("container")])
 		for _, sp := range series.Values {
+			v := float64(sp.Value)
+			if math.IsNaN(v) || math.IsInf(v, 0) {
+				continue
+			}
 			grouped[container] = append(grouped[container], Sample{
 				Timestamp: sp.Timestamp.Time(),
-				Value:     float64(sp.Value),
+				Value:     v,
 			})
 		}
 	}
diff --git a/internal/metrics/collector_test.go b/internal/metrics/collector_test.go
@@ -179,6 +179,46 @@ func TestQueryRangeGrouped_Success(t *testing.T) {
 	assert.InDelta(t, 0.05, grouped["sidecar"][0].Value, 0.001)
 }
 
+func TestQueryRangeGrouped_NaNInfFiltered(t *testing.T) {
+	response := `{
+		"status": "success",
+		"data": {
+			"resultType": "matrix",
+			"result": [
+				{
+					"metric": {"__name__": "cpu_usage", "container": "app"},
+					"values": [
+						[1700000000, "0.25"],
+						[1700000060, "NaN"],
+						[1700000120, "Inf"],
+						[1700000180, "-Inf"],
+						[1700000240, "0.75"]
+					]
+				}
+			]
+		}
+	}`
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(http.StatusOK)
+		_, _ = w.Write([]byte(response))
+	}))
+	defer server.Close()
+
+	collector, err := NewPrometheusCollector(server.URL, logr.Discard(), http.DefaultTransport)
+	require.NoError(t, err)
+
+	start := time.Unix(1700000000, 0)
+	end := time.Unix(1700000300, 0)
+	step := 60 * time.Second
+
+	grouped, err := collector.QueryRangeGrouped(context.Background(), "cpu_usage", start, end, step)
+	require.NoError(t, err)
+	require.Len(t, grouped["app"], 2, "NaN, +Inf, and -Inf samples should be filtered out")
+	assert.InDelta(t, 0.25, grouped["app"][0].Value, 0.001)
+	assert.InDelta(t, 0.75, grouped["app"][1].Value, 0.001)
+}
+
 func TestQuery_Success(t *testing.T) {
 	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		w.Header().Set("Content-Type", "application/json")
diff --git a/internal/operatormetrics/metrics.go b/internal/operatormetrics/metrics.go
@@ -229,6 +229,14 @@ var (
 		},
 		[]string{"namespace", "policy", "container", "metric_type"},
 	)
+
+	RevertFailuresTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "attune_revert_failures_total",
+			Help: "Total number of failed resize revert attempts",
+		},
+		[]string{"namespace", "workload", "reason"},
+	)
 )
 
 // WebhookTimer tracks webhook operation duration and result.
@@ -283,5 +291,6 @@ func init() {
 		StaleRecommendationsTotal,
 		RequestClampedTotal,
 		NanInfSamplesTotal,
+		RevertFailuresTotal,
 	)
 }
diff --git a/internal/safety/monitor.go b/internal/safety/monitor.go
@@ -375,8 +375,20 @@ func (m *Monitor) RevertPod(ctx context.Context, record ResizeRecord) error {
 			return nil
 		}
 
-		m.logger.Info("reverting pod resize", "pod", record.PodName,
-			"namespace", record.Namespace, "container", record.Container)
+		logFields := []any{
+			"pod", record.PodName,
+			"namespace", record.Namespace,
+			"container", record.Container,
+			"toCPU", record.OriginalResources.Requests.Cpu().String(),
+			"toMemory", record.OriginalResources.Requests.Memory().String(),
+		}
+		if len(record.NewResources.Requests) > 0 {
+			logFields = append(logFields,
+				"fromCPU", record.NewResources.Requests.Cpu().String(),
+				"fromMemory", record.NewResources.Requests.Memory().String(),
+			)
+		}
+		m.logger.Info("reverting pod resize", logFields...)
 
 		_, err = m.client.CoreV1().Pods(record.Namespace).UpdateResize(ctx, record.PodName, updated, metav1.UpdateOptions{})
 		if err != nil {
diff --git a/internal/webhook/validation.go b/internal/webhook/validation.go

Original file line number	Diff line number	Diff line change
`@@ -253,6 +253,15 @@`
`253`	`253`	`"for": { "type": "string", "default": "1h" },`
`254`	`254`	`"severity": { "type": "string", "default": "info" }`
`255`	`255`	`}`
	`256`	`+ },`
	`257`	`+ "revertFailures": {`
	`258`	`+ "type": "object",`
	`259`	`+ "additionalProperties": false,`
	`260`	`+ "properties": {`
	`261`	`+ "enabled": { "type": "boolean", "default": true },`
	`262`	`+ "for": { "type": "string", "default": "5m" },`
	`263`	`+ "severity": { "type": "string", "default": "critical" }`
	`264`	`+ }`
`256`	`265`	`}`
`257`	`266`	`}`
`258`	`267`	`}`
Original file line number	Diff line number	Diff line change
`@@ -550,6 +550,7 @@ func (r *AttunePolicyReconciler) resizeContainer(`
`550`	`550`	`revertFailed := false`
`551`	`551`	`if revertErr := monitor.RevertPod(ctx, revertRecord); revertErr != nil {`
`552`	`552`	`logger.Error(revertErr, "Failed to revert pod after "+reason, "pod", pod.Name)`
	`553`	`+ operatormetrics.RevertFailuresTotal.WithLabelValues(pod.Namespace, workloadName, reason).Inc()`
`553`	`554`	`revertFailed = true`
`554`	`555`	`}`
`555`	`556`	`if !revertFailed {`