Skip to content

Commit 8ab69a3

Browse files
authored
feat: add Prometheus metrics for request clamping and NaN/Inf data quality (#177)
Add two new Prometheus counter metrics: - attune_request_clamped_total: incremented when a recommended request exceeds the container limit and is capped. Labels: namespace, policy, container, resource. - attune_nan_inf_samples_total: incremented when all samples for a container metric type are non-finite (NaN or Inf), indicating a data quality issue with the metrics backend. Labels: namespace, policy, container, metric_type. These metrics make clamping events and data quality issues visible in Grafana dashboards without requiring debug-level logging. Changes: - internal/operatormetrics/metrics.go: register both counter vectors - internal/controller/resize.go: buildResizeTarget returns clamped resource list; caller increments RequestClampedTotal - internal/controller/prometheus.go: detect all-NaN/Inf profiles in computeRecommendations and increment NanInfSamplesTotal - 4 new tests covering both metrics and partial clamping Closes #174 Signed-off-by: Sebastien Tardif <sebtardif@ncf.ca>
1 parent 26ac873 commit 8ab69a3

4 files changed

Lines changed: 116 additions & 3 deletions

File tree

internal/controller/attunepolicy_controller_test.go

Lines changed: 78 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7030,11 +7030,12 @@ func TestBuildResizeTarget_OmitsLimitsWhenZero(t *testing.T) {
70307030
MemoryRequest: resource.MustParse("128Mi"),
70317031
},
70327032
}
7033-
target, _ := buildResizeTarget(rec)
7033+
target, clamped := buildResizeTarget(rec)
70347034
assert.Equal(t, int64(100), target.Requests.Cpu().MilliValue())
70357035
wantMem := resource.MustParse("128Mi")
70367036
assert.Equal(t, wantMem.Value(), target.Requests.Memory().Value())
70377037
assert.Nil(t, target.Limits, "Limits should be nil when recommendation limits are zero")
7038+
assert.Empty(t, clamped, "nothing should be clamped when no limits present")
70387039
}
70397040

70407041
func TestBuildResizeTarget_IncludesLimitsWhenNonZero(t *testing.T) {
@@ -7047,11 +7048,12 @@ func TestBuildResizeTarget_IncludesLimitsWhenNonZero(t *testing.T) {
70477048
MemoryLimit: resource.MustParse("256Mi"),
70487049
},
70497050
}
7050-
target, _ := buildResizeTarget(rec)
7051+
target, clamped := buildResizeTarget(rec)
70517052
require.NotNil(t, target.Limits)
70527053
assert.Equal(t, int64(200), target.Limits.Cpu().MilliValue())
70537054
wantMemLim := resource.MustParse("256Mi")
70547055
assert.Equal(t, wantMemLim.Value(), target.Limits.Memory().Value())
7056+
assert.Empty(t, clamped, "nothing should be clamped when requests are below limits")
70557057
}
70567058

70577059
func TestBuildResizeTarget_PartialLimits(t *testing.T) {
@@ -7063,11 +7065,12 @@ func TestBuildResizeTarget_PartialLimits(t *testing.T) {
70637065
MemoryRequest: resource.MustParse("128Mi"),
70647066
},
70657067
}
7066-
target, _ := buildResizeTarget(rec)
7068+
target, clamped := buildResizeTarget(rec)
70677069
require.NotNil(t, target.Limits, "Limits should be non-nil when any limit is non-zero")
70687070
assert.Equal(t, int64(200), target.Limits.Cpu().MilliValue())
70697071
_, hasMemLimit := target.Limits[corev1.ResourceMemory]
70707072
assert.False(t, hasMemLimit, "Memory limit should not be set when zero in recommendation")
7073+
assert.Empty(t, clamped, "nothing should be clamped when requests are below limits")
70717074
}
70727075

70737076
func TestBuildResizeTarget_ClampsRequestsToLimits(t *testing.T) {
@@ -7127,6 +7130,78 @@ func TestBuildResizeTarget_PartialClamping(t *testing.T) {
71277130
"only CPU should be reported as clamped")
71287131
}
71297132

7133+
func TestComputeRecommendations_NanInfSamplesMetric(t *testing.T) {
7134+
policy := newTestPolicy("test-policy", "default")
7135+
deploy := newTestDeployment("api-server", "default", nil)
7136+
reconciler := newReconcilerWithClient()
7137+
7138+
// Return NaN/Inf samples so BuildProfile yields 0 data points.
7139+
mc := &mockCollector{
7140+
queryRangeGroupedFunc: func(_ context.Context, query string, _, _ time.Time, _ time.Duration) (map[string][]rsmetrics.Sample, error) {
7141+
return map[string][]rsmetrics.Sample{
7142+
"main": {
7143+
{Timestamp: time.Now().Add(-1 * time.Hour), Value: math.NaN()},
7144+
{Timestamp: time.Now().Add(-2 * time.Hour), Value: math.Inf(1)},
7145+
{Timestamp: time.Now().Add(-3 * time.Hour), Value: math.Inf(-1)},
7146+
},
7147+
}, nil
7148+
},
7149+
}
7150+
7151+
before := promtestutil.ToFloat64(operatormetrics.NanInfSamplesTotal.WithLabelValues("default", "test-policy", "main", "cpu"))
7152+
rec, _, _, _, err := reconciler.computeRecommendations(context.Background(), policy, deploy, mc, nil, nil, nil, nil)
7153+
assert.NoError(t, err)
7154+
assert.Nil(t, rec, "should produce no recommendation when all data is NaN/Inf")
7155+
after := promtestutil.ToFloat64(operatormetrics.NanInfSamplesTotal.WithLabelValues("default", "test-policy", "main", "cpu"))
7156+
assert.Equal(t, before+1, after, "NanInfSamplesTotal should increment for CPU")
7157+
}
7158+
7159+
func TestExecuteResizes_RequestClampedMetric(t *testing.T) {
7160+
pod := newResizePod("api-server", "600m", "1Gi", "500m", "512Mi")
7161+
deploy := newTestDeployment("api-server", "default", map[string]string{"app": "api-server"})
7162+
policy := newTestPolicy("test-policy", "default")
7163+
policy.Spec.UpdateStrategy.Type = attunev1alpha1.UpdateTypeAuto
7164+
clientset := kubefake.NewSimpleClientset(pod)
7165+
7166+
reconciler := newReconcilerWithClient(pod, deploy)
7167+
reconciler.Clientset = clientset
7168+
7169+
recommendations := []attunev1alpha1.WorkloadRecommendation{
7170+
{
7171+
Workload: "api-server",
7172+
Kind: "Deployment",
7173+
Containers: []attunev1alpha1.ContainerRecommendation{
7174+
{
7175+
Name: "main",
7176+
Recommended: attunev1alpha1.ResourceValues{
7177+
CPURequest: resource.MustParse("800m"), // Will be clamped to 500m limit
7178+
MemoryRequest: resource.MustParse("2Gi"), // Will be clamped to 512Mi limit
7179+
CPULimit: resource.MustParse("500m"),
7180+
MemoryLimit: resource.MustParse("512Mi"),
7181+
},
7182+
Current: attunev1alpha1.ResourceValues{
7183+
CPURequest: resource.MustParse("600m"),
7184+
MemoryRequest: resource.MustParse("1Gi"),
7185+
CPULimit: resource.MustParse("500m"),
7186+
MemoryLimit: resource.MustParse("512Mi"),
7187+
},
7188+
},
7189+
},
7190+
},
7191+
}
7192+
7193+
beforeCPU := promtestutil.ToFloat64(operatormetrics.RequestClampedTotal.WithLabelValues("default", "test-policy", "main", "cpu"))
7194+
beforeMem := promtestutil.ToFloat64(operatormetrics.RequestClampedTotal.WithLabelValues("default", "test-policy", "main", "memory"))
7195+
7196+
reconciler.executeResizes(context.Background(), policy, []client.Object{deploy},
7197+
recommendations, podMap("api-server", pod), nil, nil)
7198+
7199+
afterCPU := promtestutil.ToFloat64(operatormetrics.RequestClampedTotal.WithLabelValues("default", "test-policy", "main", "cpu"))
7200+
afterMem := promtestutil.ToFloat64(operatormetrics.RequestClampedTotal.WithLabelValues("default", "test-policy", "main", "memory"))
7201+
assert.Equal(t, beforeCPU+1, afterCPU, "RequestClampedTotal should increment for CPU")
7202+
assert.Equal(t, beforeMem+1, afterMem, "RequestClampedTotal should increment for memory")
7203+
}
7204+
71307205
func TestTryEvictionFallback_EvictsWhenMultipleReplicas(t *testing.T) {
71317206
pod1 := newTestPod("api-server-abc-1", "default", map[string]string{"app": "api-server"})
71327207
pod2 := newTestPod("api-server-abc-2", "default", map[string]string{"app": "api-server"})

internal/controller/prometheus.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,21 @@ func (r *AttunePolicyReconciler) computeRecommendations(
293293
cpuProfile := rsmetrics.BuildProfile(cpuSamples)
294294
memProfile := rsmetrics.BuildProfile(memSamples)
295295

296+
// Detect when all samples were non-finite (NaN/Inf). This means
297+
// Prometheus returned data but every value was unusable.
298+
if len(cpuSamples) > 0 && cpuProfile.DataPoints == 0 {
299+
operatormetrics.NanInfSamplesTotal.WithLabelValues(
300+
policy.Namespace, policy.Name, containerName, "cpu").Inc()
301+
logger.V(1).Info("All CPU samples are NaN/Inf, data quality issue",
302+
"container", containerName, "rawSamples", len(cpuSamples))
303+
}
304+
if len(memSamples) > 0 && memProfile.DataPoints == 0 {
305+
operatormetrics.NanInfSamplesTotal.WithLabelValues(
306+
policy.Namespace, policy.Name, containerName, "memory").Inc()
307+
logger.V(1).Info("All memory samples are NaN/Inf, data quality issue",
308+
"container", containerName, "rawSamples", len(memSamples))
309+
}
310+
296311
// Track maximum data points across all containers.
297312
if pts := cpuProfile.DataPoints; pts > maxDataPoints {
298313
maxDataPoints = pts

internal/controller/resize.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,10 @@ func (r *AttunePolicyReconciler) executeResizes(
258258
logger.V(1).Info("Requests clamped to limits",
259259
"pod", pod.Name, "container", containerRec.Name,
260260
"clampedResources", clamped)
261+
for _, res := range clamped {
262+
operatormetrics.RequestClampedTotal.WithLabelValues(
263+
policy.Namespace, policy.Name, containerRec.Name, res).Inc()
264+
}
261265
}
262266
cpuIncrease, memIncrease := budgetIncrease(&pod, containerRec.Name, target)
263267

@@ -670,6 +674,7 @@ func (r *AttunePolicyReconciler) persistResizeAnnotations(
670674
// Limits are included when non-zero: for RequestsOnly they equal the current limits (no-op),
671675
// for RequestsAndLimits they are scaled proportionally. Pods that never had limits produce
672676
// zero-valued limit fields, which are omitted to avoid Kubernetes rejecting the resize.
677+
// Returns the target resources and the list of resource names that were clamped.
673678
func buildResizeTarget(rec attunev1alpha1.ContainerRecommendation) (corev1.ResourceRequirements, []string) {
674679
target := corev1.ResourceRequirements{
675680
Requests: corev1.ResourceList{

internal/operatormetrics/metrics.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,22 @@ var (
213213
},
214214
[]string{"namespace", "policy"},
215215
)
216+
217+
RequestClampedTotal = prometheus.NewCounterVec(
218+
prometheus.CounterOpts{
219+
Name: "attune_request_clamped_total",
220+
Help: "Total times a recommended request was capped at the container's limit value",
221+
},
222+
[]string{"namespace", "policy", "container", "resource"},
223+
)
224+
225+
NanInfSamplesTotal = prometheus.NewCounterVec(
226+
prometheus.CounterOpts{
227+
Name: "attune_nan_inf_samples_total",
228+
Help: "Total times all samples for a container metric were non-finite (NaN or Inf)",
229+
},
230+
[]string{"namespace", "policy", "container", "metric_type"},
231+
)
216232
)
217233

218234
// WebhookTimer tracks webhook operation duration and result.
@@ -265,5 +281,7 @@ func init() {
265281
BurstFactor,
266282
StartupBoostTotal,
267283
StaleRecommendationsTotal,
284+
RequestClampedTotal,
285+
NanInfSamplesTotal,
268286
)
269287
}

0 commit comments

Comments
 (0)