Feat: Add saturation and capacity Prometheus metrics (#912)

ev-shindin · ev-shindin · commit b63fd32c861f · 2026-03-25T18:00:44.000+02:00
Add 5 new Prometheus gauge metrics exposing saturation analysis outputs
that drive scaling decisions, giving operators visibility into why
scaling happens:

- wva_saturation_utilization: per-variant utilization ratio (0.0-1.0)
- wva_spare_capacity: per-variant spare capacity (0.0-1.0)
- wva_required_capacity: model-level required capacity (>0 = scale-up)
- wva_kv_cache_tokens_used: KV cache tokens in use per variant
- wva_kv_cache_tokens_total: KV cache token capacity per variant

Metrics are populated in both V1 (percentage-based) and V2
(token-based) engine paths and emitted during applySaturationDecisions.
diff --git a/internal/actuator/actuator.go b/internal/actuator/actuator.go
@@ -7,6 +7,7 @@ import (
 	llmdOptv1alpha1 "github.com/llm-d/llm-d-workload-variant-autoscaler/api/v1alpha1"
 	appsv1 "k8s.io/api/apps/v1"
 
+	"github.com/llm-d/llm-d-workload-variant-autoscaler/internal/interfaces"
 	"github.com/llm-d/llm-d-workload-variant-autoscaler/internal/metrics"
 	"github.com/llm-d/llm-d-workload-variant-autoscaler/internal/utils"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -95,3 +96,18 @@ func (a *Actuator) EmitMetrics(ctx context.Context, VariantAutoscaling *llmdOptv
 		"accelerator", VariantAutoscaling.Status.DesiredOptimizedAlloc.Accelerator)
 	return nil
 }
+
+// EmitSaturationMetrics forwards the decision's variant name, namespace, accelerator name, utilization, spare and required capacity, and KV cache token counts to a.MetricsEmitter.EmitSaturationMetrics and returns its error unchanged.
+func (a *Actuator) EmitSaturationMetrics(ctx context.Context, decision interfaces.VariantDecision) error {
+	return a.MetricsEmitter.EmitSaturationMetrics(
+		ctx,
+		decision.VariantName,
+		decision.Namespace,
+		decision.AcceleratorName,
+		decision.Utilization,
+		decision.SpareCapacity,
+		decision.RequiredCapacity,
+		decision.KvCacheTokensUsed,
+		decision.KvCacheTokensTotal,
+	)
+}
diff --git a/internal/constants/metrics.go b/internal/constants/metrics.go
@@ -108,6 +108,26 @@ const (
 	// WVADesiredRatio is a gauge that tracks the ratio of desired to current replicas.
 	// Labels: variant_name, namespace, accelerator_type
 	WVADesiredRatio = "wva_desired_ratio"
+
+	// WVASaturationUtilization is a gauge that tracks per-variant utilization ratio (0.0-1.0).
+	// Labels: variant_name, namespace, accelerator_type
+	WVASaturationUtilization = "wva_saturation_utilization"
+
+	// WVASpareCapacity is a gauge that tracks per-variant spare capacity (0.0-1.0).
+	// Labels: variant_name, namespace, accelerator_type
+	WVASpareCapacity = "wva_spare_capacity"
+
+	// WVARequiredCapacity is a gauge that tracks model-level required capacity.
+	// >0 means scale-up needed. Labels: variant_name, namespace
+	WVARequiredCapacity = "wva_required_capacity"
+
+	// WVAKvCacheTokensUsed is a gauge that tracks total KV cache tokens currently in use per variant.
+	// Labels: variant_name, namespace
+	WVAKvCacheTokensUsed = "wva_kv_cache_tokens_used"
+
+	// WVAKvCacheTokensTotal is a gauge that tracks total KV cache token capacity per variant.
+	// Labels: variant_name, namespace
+	WVAKvCacheTokensTotal = "wva_kv_cache_tokens_total"
 )
 
 // Metric Label Names
diff --git a/internal/engines/pipeline/cost_aware_optimizer.go b/internal/engines/pipeline/cost_aware_optimizer.go
@@ -292,18 +292,21 @@ func buildDecisionsWithOptimizer(
 		}
 
 		decisions = append(decisions, interfaces.VariantDecision{
-			VariantName:     name,
-			ModelID:         req.ModelID,
-			Namespace:       req.Namespace,
-			AcceleratorName: vc.AcceleratorName,
-			Cost:            vc.Cost,
-			Role:            state.Role,
-			CurrentReplicas: state.CurrentReplicas,
-			TargetReplicas:  target,
-			Action:          action,
-			Reason:          reason,
-			MinReplicas:     state.MinReplicas,
-			MaxReplicas:     state.MaxReplicas,
+			VariantName:      name,
+			ModelID:          req.ModelID,
+			Namespace:        req.Namespace,
+			AcceleratorName:  vc.AcceleratorName,
+			Cost:             vc.Cost,
+			Role:             state.Role,
+			CurrentReplicas:  state.CurrentReplicas,
+			TargetReplicas:   target,
+			Action:           action,
+			Reason:           reason,
+			MinReplicas:      state.MinReplicas,
+			MaxReplicas:      state.MaxReplicas,
+			Utilization:      vc.Utilization,
+			SpareCapacity:    1.0 - vc.Utilization,
+			RequiredCapacity: req.Result.RequiredCapacity,
 		})
 	}
 	return decisions
diff --git a/internal/engines/saturation/engine.go b/internal/engines/saturation/engine.go
@@ -361,6 +361,9 @@ func (e *Engine) optimizeV1(
 			// Convert saturation targets to decisions first, then apply enforcer
 			finalDecisions = e.convertSaturationTargetsToDecisions(ctx, saturationTargets, saturationAnalysis, data.variantStates)
 
+			// Enrich decisions with saturation metrics for observability (V1 path)
+			enrichDecisionsFromReplicaMetrics(finalDecisions, data.replicaMetrics, saturationAnalysis.ShouldScaleUp)
+
 			// Check if any variant has minReplicas > 0 — if so, skip scale-to-zero enforcement
 			if !hasMinReplicasAboveZero(data.variantStates) {
 				// Apply scale-to-zero enforcement on decisions
@@ -435,6 +438,8 @@ func (e *Engine) optimizeV2(
 
 	// Stage 1: Collect ModelScalingRequests for all models
 	var requests []pipeline.ModelScalingRequest
+	// modelReplicaMetrics collects per-model replica metrics for KV token enrichment
+	modelReplicaMetrics := make(map[string][]interfaces.ReplicaMetrics)
 
 	for groupKey, modelVAs := range modelGroups {
 		modelID := modelVAs[0].Spec.ModelID
@@ -475,6 +480,7 @@ func (e *Engine) optimizeV2(
 		}
 
 		requests = append(requests, *req)
+		modelReplicaMetrics[modelID] = data.replicaMetrics
 	}
 
 	if len(requests) == 0 {
@@ -522,6 +528,11 @@ func (e *Engine) optimizeV2(
 		}
 	}
 
+	// Stage 4: Enrich decisions with KV cache token data from replicaMetrics.
+	// Utilization, RequiredCapacity, and SpareCapacity are already set by
+	// buildDecisionsWithOptimizer from AnalyzerResult.
+	enrichDecisionsWithKvTokenData(allDecisions, modelReplicaMetrics)
+
 	return allDecisions
 }
 
@@ -746,6 +757,77 @@ func (e *Engine) convertSaturationTargetsToDecisions(
 	return decisions
 }
 
+// enrichDecisionsFromReplicaMetrics fills saturation observability fields on decisions in place (V1 path); SpareCapacity is not modified.
+// Every decision gets RequiredCapacity (1.0 if shouldScaleUp, else 0.0); a decision whose VariantName has replica
+// metrics also gets the summed TokensInUse / TotalKvCapacityTokens and the mean KvCacheUsage as Utilization.
+func enrichDecisionsFromReplicaMetrics(decisions []interfaces.VariantDecision, replicaMetrics []interfaces.ReplicaMetrics, shouldScaleUp bool) {
+	// Per-variant accumulator, keyed by ReplicaMetrics.VariantName; count is the number of replicas seen.
+	type variantAgg struct {
+		kvUsed     int64
+		kvTotal    int64
+		kvUsageSum float64
+		count      int
+	}
+	agg := make(map[string]*variantAgg)
+	for _, rm := range replicaMetrics {
+		a, ok := agg[rm.VariantName]
+		if !ok {
+			a = &variantAgg{}
+			agg[rm.VariantName] = a
+		}
+		a.kvUsed += rm.TokensInUse
+		a.kvTotal += rm.TotalKvCapacityTokens
+		a.kvUsageSum += rm.KvCacheUsage
+		a.count++
+	}
+
+	requiredCapacity := float64(0)
+	if shouldScaleUp {
+		requiredCapacity = 1.0
+	}
+
+	for i := range decisions {
+		d := &decisions[i]
+		d.RequiredCapacity = requiredCapacity
+		if a, ok := agg[d.VariantName]; ok && a.count > 0 {
+			d.KvCacheTokensUsed = a.kvUsed
+			d.KvCacheTokensTotal = a.kvTotal
+			d.Utilization = a.kvUsageSum / float64(a.count)
+		}
+	}
+}
+
+// enrichDecisionsWithKvTokenData sets KvCacheTokensUsed and KvCacheTokensTotal on decisions in place (V2 path)
+// by summing TokensInUse and TotalKvCapacityTokens over all replica metrics that share a VariantName.
+// Decisions with no matching replica metrics are left unchanged; no other decision field is written.
+func enrichDecisionsWithKvTokenData(decisions []interfaces.VariantDecision, modelReplicaMetrics map[string][]interfaces.ReplicaMetrics) {
+	// Sums are keyed by VariantName only, across every model. NOTE(review): same-named variants in different models/namespaces would be merged — confirm names are unique.
+	type kvAgg struct {
+		kvUsed  int64
+		kvTotal int64
+	}
+	agg := make(map[string]*kvAgg)
+	for _, metrics := range modelReplicaMetrics {
+		for _, rm := range metrics {
+			a, ok := agg[rm.VariantName]
+			if !ok {
+				a = &kvAgg{}
+				agg[rm.VariantName] = a
+			}
+			a.kvUsed += rm.TokensInUse
+			a.kvTotal += rm.TotalKvCapacityTokens
+		}
+	}
+
+	for i := range decisions {
+		d := &decisions[i]
+		if a, ok := agg[d.VariantName]; ok {
+			d.KvCacheTokensUsed = a.kvUsed
+			d.KvCacheTokensTotal = a.kvTotal
+		}
+	}
+}
+
 // hasMinReplicasAboveZero returns true if any variant in the states has MinReplicas > 0.
 func hasMinReplicasAboveZero(states []interfaces.VariantReplicaState) bool {
 	for _, state := range states {
@@ -1089,6 +1171,13 @@ func (e *Engine) applySaturationDecisions(
 			updateVa.Status.Actuation.Applied = true
 		}
 
+		// Emit saturation and capacity metrics for observability
+		if hasDecision {
+			if err := act.EmitSaturationMetrics(ctx, decision); err != nil {
+				logger.Error(err, "Failed to emit saturation metrics", "variant", updateVa.Name)
+			}
+		}
+
 		// Update Shared State and Trigger Reconcile via Channel
 		// This avoids any API server interaction from the Engine.
 
diff --git a/internal/interfaces/saturation_analyzer.go b/internal/interfaces/saturation_analyzer.go
@@ -186,7 +186,22 @@ type VariantDecision struct {
 	// SpareCapacity indicates how much spare capacity this variant has.
 	// 0.0 = fully saturated, 1.0 = completely idle.
 	// Used by allocation algorithms to prioritize saturated variants.
+	// V1: threshold-relative spare KV capacity (AvgSpareKvCapacity).
+	// V2: 1.0 - Utilization (absolute spare).
 	SpareCapacity float64
+	// Utilization is the variant-level utilization ratio (0.0-1.0).
+	// V2: from AnalyzerResult.VariantCapacities[].Utilization.
+	// V1: average KvCacheUsage across this variant's replicas.
+	Utilization float64
+	// KvCacheTokensUsed is the sum of TokensInUse across this variant's replicas.
+	KvCacheTokensUsed int64
+	// KvCacheTokensTotal is the sum of TotalKvCapacityTokens across this variant's replicas.
+	KvCacheTokensTotal int64
+	// RequiredCapacity is the model-level required capacity (>0 means scale-up needed).
+	// Same value for all variants of a model.
+	// V1: binary (1.0 if shouldScaleUp, else 0.0).
+	// V2: continuous token-based demand from AnalyzerResult.
+	RequiredCapacity float64
 	// ScaleTargetRef references the Deployment/StatefulSet for scheduling constraints
 	ScaleTargetRef *autoscalingv2.CrossVersionObjectReference
 
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
@@ -19,6 +19,13 @@ var (
 	currentReplicas     *prometheus.GaugeVec
 	desiredRatio        *prometheus.GaugeVec
 
+	// Saturation and capacity metrics
+	saturationUtilization *prometheus.GaugeVec
+	spareCapacity         *prometheus.GaugeVec
+	requiredCapacity      *prometheus.GaugeVec
+	kvCacheTokensUsed     *prometheus.GaugeVec
+	kvCacheTokensTotal    *prometheus.GaugeVec
+
 	// controllerInstance stores the optional controller instance identifier.
 	// When set, it's added as a label to all emitted metrics.
 	controllerInstance string
@@ -41,10 +48,13 @@ func InitMetrics(registry prometheus.Registerer) error {
 	// Build label sets based on whether controller_instance is configured
 	baseLabels := []string{constants.LabelVariantName, constants.LabelNamespace, constants.LabelAcceleratorType}
 	scalingLabels := []string{constants.LabelVariantName, constants.LabelNamespace, constants.LabelDirection, constants.LabelReason}
+	// modelLabels: variant_name + namespace only (no accelerator_type) for model-level and token metrics
+	modelLabels := []string{constants.LabelVariantName, constants.LabelNamespace}
 
 	if controllerInstance != "" {
 		baseLabels = append(baseLabels, constants.LabelControllerInstance)
 		scalingLabels = append(scalingLabels, constants.LabelControllerInstance)
+		modelLabels = append(modelLabels, constants.LabelControllerInstance)
 	}
 
 	replicaScalingTotal = prometheus.NewCounterVec(
@@ -75,6 +85,41 @@ func InitMetrics(registry prometheus.Registerer) error {
 		},
 		baseLabels,
 	)
+	saturationUtilization = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: constants.WVASaturationUtilization,
+			Help: "Per-variant utilization ratio (0.0-1.0) from saturation analysis",
+		},
+		baseLabels,
+	)
+	spareCapacity = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: constants.WVASpareCapacity,
+			Help: "Per-variant spare capacity (0.0-1.0) from saturation analysis",
+		},
+		baseLabels,
+	)
+	requiredCapacity = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: constants.WVARequiredCapacity,
+			Help: "Model-level required capacity; >0 indicates scale-up needed (V1: binary 0/1, V2: continuous token demand)",
+		},
+		modelLabels,
+	)
+	kvCacheTokensUsed = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: constants.WVAKvCacheTokensUsed,
+			Help: "Total KV cache tokens currently in use across all replicas of a variant",
+		},
+		modelLabels,
+	)
+	kvCacheTokensTotal = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: constants.WVAKvCacheTokensTotal,
+			Help: "Total KV cache token capacity across all replicas of a variant",
+		},
+		modelLabels,
+	)
 
 	// Register metrics with the registry
 	if err := registry.Register(replicaScalingTotal); err != nil {
@@ -89,6 +134,21 @@ func InitMetrics(registry prometheus.Registerer) error {
 	if err := registry.Register(desiredRatio); err != nil {
 		return fmt.Errorf("failed to register desiredRatio metric: %w", err)
 	}
+	if err := registry.Register(saturationUtilization); err != nil {
+		return fmt.Errorf("failed to register saturationUtilization metric: %w", err)
+	}
+	if err := registry.Register(spareCapacity); err != nil {
+		return fmt.Errorf("failed to register spareCapacity metric: %w", err)
+	}
+	if err := registry.Register(requiredCapacity); err != nil {
+		return fmt.Errorf("failed to register requiredCapacity metric: %w", err)
+	}
+	if err := registry.Register(kvCacheTokensUsed); err != nil {
+		return fmt.Errorf("failed to register kvCacheTokensUsed metric: %w", err)
+	}
+	if err := registry.Register(kvCacheTokensTotal); err != nil {
+		return fmt.Errorf("failed to register kvCacheTokensTotal metric: %w", err)
+	}
 
 	return nil
 }
@@ -163,3 +223,39 @@ func (m *MetricsEmitter) EmitReplicaMetrics(ctx context.Context, va *llmdOptv1al
 	desiredRatio.With(baseLabels).Set(float64(desired) / float64(current))
 	return nil
 }
+
+// EmitSaturationMetrics sets the five saturation/KV-cache gauges: utilization and spare use variant/namespace/accelerator labels, the other three variant/namespace only; the controller-instance label is added when controllerInstance is non-empty. Returns an error if any gauge is nil (metrics not initialized); ctx is currently unused.
+func (m *MetricsEmitter) EmitSaturationMetrics(
+	ctx context.Context,
+	variantName, namespace, acceleratorType string,
+	utilization, spare, required float64,
+	kvTokensUsed, kvTokensTotal int64,
+) error {
+	if saturationUtilization == nil || spareCapacity == nil || requiredCapacity == nil ||
+		kvCacheTokensUsed == nil || kvCacheTokensTotal == nil {
+		return fmt.Errorf("saturation metrics not initialized")
+	}
+
+	accelLabels := prometheus.Labels{
+		constants.LabelVariantName:     variantName,
+		constants.LabelNamespace:       namespace,
+		constants.LabelAcceleratorType: acceleratorType,
+	}
+	modelLabels := prometheus.Labels{
+		constants.LabelVariantName: variantName,
+		constants.LabelNamespace:   namespace,
+	}
+
+	if controllerInstance != "" {
+		accelLabels[constants.LabelControllerInstance] = controllerInstance
+		modelLabels[constants.LabelControllerInstance] = controllerInstance
+	}
+
+	saturationUtilization.With(accelLabels).Set(utilization)
+	spareCapacity.With(accelLabels).Set(spare)
+	requiredCapacity.With(modelLabels).Set(required)
+	kvCacheTokensUsed.With(modelLabels).Set(float64(kvTokensUsed))
+	kvCacheTokensTotal.With(modelLabels).Set(float64(kvTokensTotal))
+
+	return nil
+}
diff --git a/internal/metrics/metrics_test.go b/internal/metrics/metrics_test.go
diff --git a/internal/metrics/suite_test.go b/internal/metrics/suite_test.go