From dfaae140f6fcf1485f931cab736aac7b7857fd68 Mon Sep 17 00:00:00 2001 From: Simone Tiraboschi Date: Tue, 25 Nov 2025 19:14:14 +0100 Subject: [PATCH] feat(lownodeutilization): expose node classification metrics Expose classification metrics to track and observe node utilization and decisions: - descheduler_lownodeutilization_node_utilization_threshold: threshold values (0-1) - descheduler_lownodeutilization_node_utilization_value: actual utilization (0-1) - descheduler_lownodeutilization_node_classification: classification result (0/1/2) - descheduler_lownodeutilization_threshold_mode: static vs deviation-based (0/1) All utilization metrics use 0-1 ratio format for consistency with Prometheus best practices and easy comparison in queries. Signed-off-by: Simone Tiraboschi --- .../nodeutilization/lownodeutilization.go | 40 ++++++++- .../plugins/nodeutilization/metrics.go | 90 +++++++++++++++++++ 2 files changed, 129 insertions(+), 1 deletion(-) create mode 100644 pkg/framework/plugins/nodeutilization/metrics.go diff --git a/pkg/framework/plugins/nodeutilization/lownodeutilization.go b/pkg/framework/plugins/nodeutilization/lownodeutilization.go index 5748e73778..42b065e09a 100644 --- a/pkg/framework/plugins/nodeutilization/lownodeutilization.go +++ b/pkg/framework/plugins/nodeutilization/lownodeutilization.go @@ -116,6 +116,9 @@ func NewLowNodeUtilization( } } + // Register metrics for this plugin + RegisterMetrics() + return &LowNodeUtilization{ logger: logger, handle: handle, @@ -170,6 +173,7 @@ func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fra l.args.Thresholds, l.args.TargetThresholds, ) + LowNodeUtilizationThresholdModeMetric.Set(1) } else { usage, thresholds = assessNodesUsagesAndStaticThresholds( nodesUsageMap, @@ -177,6 +181,7 @@ func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fra l.args.Thresholds, l.args.TargetThresholds, ) + LowNodeUtilizationThresholdModeMetric.Set(0) } // classify nodes in under 
and over utilized. we will later try to move @@ -235,8 +240,41 @@ func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fra } } - // log nodes that are appropriately utilized. + // log nodes that are appropriately utilized and record metrics for all nodes for nodeName := range nodesMap { + classification := 1.0 + if _, isUnder := nodeGroups[0][nodeName]; isUnder { + classification = 0.0 + } else if _, isOver := nodeGroups[1][nodeName]; isOver { + classification = 2.0 + } + LowNodeUtilizationClassificationMetric.WithLabelValues(nodeName).Set(classification) + + // Record utilization values and thresholds for each resource + if nodeUsage, ok := usage[nodeName]; ok { + for resourceName, utilizationPercentage := range nodeUsage { + resourceStr := string(resourceName) + LowNodeUtilizationValueMetric.WithLabelValues(nodeName, resourceStr).Set(float64(utilizationPercentage) / 100.0) + } + } + + if nodeThresholds, ok := thresholds[nodeName]; ok { + // Record the underutilization threshold (first threshold) as "low" + if len(nodeThresholds) > 0 { + for resourceName, thresholdPercentage := range nodeThresholds[0] { + resourceStr := string(resourceName) + LowNodeUtilizationThresholdMetric.WithLabelValues(nodeName, resourceStr, "low").Set(float64(thresholdPercentage) / 100.0) + } + } + // Record the target/overutilization threshold (second threshold) as "high" + if len(nodeThresholds) > 1 { + for resourceName, thresholdPercentage := range nodeThresholds[1] { + resourceStr := string(resourceName) + LowNodeUtilizationThresholdMetric.WithLabelValues(nodeName, resourceStr, "high").Set(float64(thresholdPercentage) / 100.0) + } + } + } + if !classifiedNodes[nodeName] { logger.Info( "Node is appropriately utilized", diff --git a/pkg/framework/plugins/nodeutilization/metrics.go b/pkg/framework/plugins/nodeutilization/metrics.go new file mode 100644 index 0000000000..b1aed1b427 --- /dev/null +++ b/pkg/framework/plugins/nodeutilization/metrics.go @@ -0,0 +1,90 @@ 
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package nodeutilization
+
+import (
+	"sync"
+
+	"k8s.io/component-base/metrics"
+	"k8s.io/component-base/metrics/legacyregistry"
+	deschedulermetrics "sigs.k8s.io/descheduler/metrics"
+)
+
+const lowNodeUtilizationSubsystem = deschedulermetrics.DeschedulerSubsystem + "_lownodeutilization"
+
+var (
+	// LowNodeUtilizationThresholdMetric exposes utilization thresholds (0-1 range), labelled by
+	// node name, resource type (cpu, memory, etc.) and threshold_type ("low" or "high").
+	LowNodeUtilizationThresholdMetric = metrics.NewGaugeVec(
+		&metrics.GaugeOpts{
+			Subsystem:      lowNodeUtilizationSubsystem,
+			Name:           "node_utilization_threshold",
+			Help:           "Threshold values for node utilization (0-1 range)",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{"node", "resource", "threshold_type"},
+	)
+
+	// LowNodeUtilizationValueMetric exposes actual utilization (0-1 range) by node and resource.
+	LowNodeUtilizationValueMetric = metrics.NewGaugeVec(
+		&metrics.GaugeOpts{
+			Subsystem:      lowNodeUtilizationSubsystem,
+			Name:           "node_utilization_value",
+			Help:           "Actual utilization values for nodes (0-1 range)",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{"node", "resource"},
+	)
+
+	// LowNodeUtilizationClassificationMetric exposes the per-node classification result:
+	// 0=underutilized, 1=appropriately utilized, 2=overutilized.
+	LowNodeUtilizationClassificationMetric = metrics.NewGaugeVec(
+		&metrics.GaugeOpts{
+			Subsystem:      lowNodeUtilizationSubsystem,
+			Name:           "node_classification",
+			Help:           "Node classification result: 0=underutilized, 1=appropriately utilized, 2=overutilized",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{"node"},
+	)
+
+	// LowNodeUtilizationThresholdModeMetric exposes the threshold mode: 0=static, 1=deviation-based.
+	LowNodeUtilizationThresholdModeMetric = metrics.NewGauge(&metrics.GaugeOpts{
+		Subsystem:      lowNodeUtilizationSubsystem,
+		Name:           "threshold_mode",
+		Help:           "Threshold mode: 0=static, 1=deviation-based",
+		StabilityLevel: metrics.ALPHA,
+	})
+
+	lowNodeUtilizationMetricsList = []metrics.Registerable{
+		LowNodeUtilizationThresholdMetric,
+		LowNodeUtilizationValueMetric,
+		LowNodeUtilizationClassificationMetric,
+		LowNodeUtilizationThresholdModeMetric,
+	}
+)
+
+var registerLowNodeUtilizationMetrics sync.Once // ensures registration runs only once
+
+// RegisterMetrics registers every LowNodeUtilization metric; only the first call has any effect.
+func RegisterMetrics() {
+	registerLowNodeUtilizationMetrics.Do(func() {
+		for i := range lowNodeUtilizationMetricsList {
+			legacyregistry.MustRegister(lowNodeUtilizationMetricsList[i])
+		}
+	})
+}