Improvements: rename KvCacheTokensTotal, expand help text, use unit label

ev-shindin · ev-shindin · commit 71231a8d9dff · 2026-04-16T17:18:24.000+03:00
- Rename KvCacheTokensTotal -> KvCacheTokensCapacity on VariantDecision,
  Actuator.EmitSaturationMetrics, MetricsEmitter.EmitSaturationMetrics,
  DeleteSaturationMetrics, and the Prometheus metric name itself
  (wva_kv_cache_tokens_total -> wva_kv_cache_tokens_capacity). "Total" was
  confusing — the metric is a gauge of capacity, not a cumulative counter.

- Replace the analyzer_version="v1"/"v2" label on wva_required_capacity
  with a unit="binary"/"continuous" label. The label's purpose is to
  describe the unit of the metric value (a boolean scale-up signal in V1,
  a continuous token demand in V2), not the code path that produced it.
  "binary"/"continuous" remains meaningful after V1 is deprecated, whereas
  "v1"/"v2" becomes vestigial.

  Rename VariantDecision.AnalyzerVersion -> RequiredCapacityUnit.
  Rename constants.LabelAnalyzerVersion -> LabelUnit.
  Rename constants.AnalyzerVersionV1/V2 -> UnitBinary/UnitContinuous.

- Expand help strings on wva_saturation_utilization, wva_spare_capacity,
  wva_kv_cache_tokens_used, and wva_kv_cache_tokens_capacity to specify
  what is being measured (KV-cache) and how V1 vs V2 paths differ.

- Use constants.LabelUnit, UnitBinary, UnitContinuous in the
  wva_required_capacity help string via fmt.Sprintf, for consistency with
  how labels are referenced elsewhere.
diff --git a/internal/actuator/actuator.go b/internal/actuator/actuator.go
@@ -100,12 +100,21 @@ func (a *Actuator) EmitSaturationMetrics(ctx context.Context, decision interface
 		ctx,
 		decision.VariantName,
 		decision.Namespace,
+		decision.ModelID,
 		decision.AcceleratorName,
-		decision.AnalyzerVersion,
+		decision.RequiredCapacityUnit,
 		decision.Utilization,
 		decision.SpareCapacity,
 		decision.RequiredCapacity,
 		decision.KvCacheTokensUsed,
-		decision.KvCacheTokensTotal,
+		decision.KvCacheTokensCapacity,
 	)
 }
+
+// DeleteSaturationMetricsForVariant removes all saturation metric series for a
+// variant. Call this when the current optimization cycle produced no fresh
+// decision for the variant, or when the VA is being deleted — so dashboards
+// don't show stale values.
+func (a *Actuator) DeleteSaturationMetricsForVariant(variantName, namespace string) {
+	a.MetricsEmitter.DeleteSaturationMetricsForVariant(variantName, namespace)
+}
diff --git a/internal/constants/metrics.go b/internal/constants/metrics.go
@@ -110,28 +110,28 @@ const (
 	WVADesiredRatio = "wva_desired_ratio"
 
 	// WVASaturationUtilization is a gauge that tracks per-variant utilization ratio (0.0-1.0).
-	// Labels: variant_name, namespace, accelerator_type
+	// Labels: variant_name, namespace, model_name, accelerator_type
 	WVASaturationUtilization = "wva_saturation_utilization"
 
 	// WVASpareCapacity is a gauge that tracks per-variant spare capacity (0.0-1.0).
-	// Labels: variant_name, namespace, accelerator_type
+	// Labels: variant_name, namespace, model_name, accelerator_type
 	WVASpareCapacity = "wva_spare_capacity"
 
 	// WVARequiredCapacity is a gauge that tracks model-level required capacity.
 	// >0 means scale-up needed.
-	// Units differ by analyzer (use the analyzer_version label to distinguish):
-	//   - V1: binary signal (0.0 = no scale-up, 1.0 = scale-up needed)
-	//   - V2: continuous token-based demand
-	// Labels: variant_name, namespace, analyzer_version
+	// Value semantics differ by analyzer (use the "unit" label to distinguish):
+	//   - unit="binary"     (V1): 0.0 = no scale-up, 1.0 = scale-up needed
+	//   - unit="continuous" (V2): continuous token-based demand
+	// Labels: variant_name, namespace, model_name, unit
 	WVARequiredCapacity = "wva_required_capacity"
 
 	// WVAKvCacheTokensUsed is a gauge that tracks total KV cache tokens currently in use per variant.
-	// Labels: variant_name, namespace
+	// Labels: variant_name, namespace, model_name
 	WVAKvCacheTokensUsed = "wva_kv_cache_tokens_used"
 
-	// WVAKvCacheTokensTotal is a gauge that tracks total KV cache token capacity per variant.
-	// Labels: variant_name, namespace
-	WVAKvCacheTokensTotal = "wva_kv_cache_tokens_total"
+	// WVAKvCacheTokensCapacity is a gauge that tracks total KV cache token capacity per variant.
+	// Labels: variant_name, namespace, model_name
+	WVAKvCacheTokensCapacity = "wva_kv_cache_tokens_capacity"
 )
 
 // Metric Label Names
@@ -144,11 +144,16 @@ const (
 	LabelReason             = "reason"
 	LabelAcceleratorType    = "accelerator_type"
 	LabelControllerInstance = "controller_instance"
-	LabelAnalyzerVersion    = "analyzer_version"
+	// LabelUnit distinguishes the unit of a metric value when a single metric name
+	// carries values with different semantic units. Currently applied to
+	// wva_required_capacity, whose value is either a binary scale-up signal (V1)
+	// or a continuous token-demand value (V2).
+	LabelUnit = "unit"
 )
 
-// Analyzer version label values used in saturation metrics.
+// Values for the LabelUnit Prometheus label, describing how to interpret the
+// metric value ("binary" 0/1 vs. "continuous" absolute quantity).
 const (
-	AnalyzerVersionV1 = "v1"
-	AnalyzerVersionV2 = "v2"
+	UnitBinary     = "binary"
+	UnitContinuous = "continuous"
 )
diff --git a/internal/engines/saturation/engine.go b/internal/engines/saturation/engine.go
@@ -764,17 +764,23 @@ func enrichDecisionsFromReplicaMetrics(decisions []interfaces.VariantDecision, r
 	for i := range decisions {
 		d := &decisions[i]
 		d.RequiredCapacity = requiredCapacity
-		d.AnalyzerVersion = constants.AnalyzerVersionV1
+		d.RequiredCapacityUnit = constants.UnitBinary
 		if a, ok := agg[d.VariantName]; ok && a.count > 0 {
 			d.KvCacheTokensUsed = a.kvUsed
-			d.KvCacheTokensTotal = a.kvTotal
+			d.KvCacheTokensCapacity = a.kvTotal
+			// V1 reasons about saturation per-replica using KvCacheUsage fractions
+			// (rm.KvCacheUsage is 0.0-1.0), not tokens. Report the mean of those
+			// per-replica fractions as the variant-level utilization — this
+			// matches what the V1 analyzer actually evaluates against its
+			// thresholds. V2 uses a different (token-demand / capacity) formula;
+			// see the field doc on VariantDecision.Utilization.
 			d.Utilization = a.kvUsageSum / float64(a.count)
 		}
 	}
 }
 
-// enrichDecisionsWithKvTokenData sets KvCacheTokensUsed, KvCacheTokensTotal, and
-// AnalyzerVersion on decisions from replica metrics aggregated per (model, variant).
+// enrichDecisionsWithKvTokenData sets KvCacheTokensUsed, KvCacheTokensCapacity, and
+// RequiredCapacityUnit on decisions from replica metrics aggregated per (model, variant).
 // Used by V2 path where Utilization and RequiredCapacity are already set from
 // AnalyzerResult.
 //
@@ -805,10 +811,10 @@ func enrichDecisionsWithKvTokenData(decisions []interfaces.VariantDecision, mode
 
 	for i := range decisions {
 		d := &decisions[i]
-		d.AnalyzerVersion = constants.AnalyzerVersionV2
+		d.RequiredCapacityUnit = constants.UnitContinuous
 		if a, ok := agg[variantKey{modelID: d.ModelID, variant: d.VariantName}]; ok {
 			d.KvCacheTokensUsed = a.kvUsed
-			d.KvCacheTokensTotal = a.kvTotal
+			d.KvCacheTokensCapacity = a.kvTotal
 		}
 	}
 }
@@ -1157,14 +1163,20 @@ func (e *Engine) applySaturationDecisions(
 		}
 
 		// Emit saturation and capacity metrics for observability.
-		// Note: stale time series for deleted VAs are not cleaned up automatically here.
-		// The metrics package exposes DeleteSaturationMetrics for callers (e.g., the
-		// VariantAutoscaling reconciler's delete handler / finalizer) to remove series
-		// when a VA is removed.
+		// When this cycle produced no fresh decision for the variant, actively
+		// clear the existing series so dashboards show a gap ("no fresh data")
+		// rather than stale values that would otherwise persist until Prometheus'
+		// 5-minute staleness marker fires. For fully-deleted VAs, additional
+		// cleanup via the reconciler's delete handler / finalizer is still
+		// required (see DeleteSaturationMetricsForVariant).
 		if hasDecision {
 			if err := act.EmitSaturationMetrics(ctx, decision); err != nil {
 				logger.Error(err, "Failed to emit saturation metrics", "variant", updateVa.Name)
 			}
+		} else {
+			act.DeleteSaturationMetricsForVariant(updateVa.Name, updateVa.Namespace)
+			logger.V(logging.DEBUG).Info("Cleared stale saturation metrics (no fresh decision this cycle)",
+				"variant", updateVa.Name, "namespace", updateVa.Namespace)
 		}
 
 		// Update Shared State and Trigger Reconcile via Channel
diff --git a/internal/interfaces/saturation_analyzer.go b/internal/interfaces/saturation_analyzer.go
@@ -195,25 +195,32 @@ type VariantDecision struct {
 	// V1: threshold-relative spare KV capacity (AvgSpareKvCapacity).
 	// V2: 1.0 - Utilization (absolute spare).
 	SpareCapacity float64
-	// Utilization is the variant-level utilization ratio (0.0-1.0).
-	// V2: from AnalyzerResult.VariantCapacities[].Utilization.
-	// V1: average KvCacheUsage across this variant's replicas.
+	// Utilization is the variant-level utilization ratio (0.0-1.0) reported for
+	// observability. The exact formula differs by analyzer because V1 and V2
+	// reason about saturation differently:
+	//   V1: mean of per-replica KvCacheUsage fractions (matches what V1's
+	//       per-replica threshold check operates on).
+	//   V2: TotalDemand / TotalCapacity from AnalyzerResult (token-demand-based).
+	// For uniform-capacity replicas the two are numerically equivalent; for
+	// mixed-capacity replicas V2's value is capacity-weighted.
 	Utilization float64
 	// KvCacheTokensUsed is the sum of TokensInUse across this variant's replicas.
 	KvCacheTokensUsed int64
-	// KvCacheTokensTotal is the sum of TotalKvCapacityTokens across this variant's replicas.
-	KvCacheTokensTotal int64
+	// KvCacheTokensCapacity is the sum of TotalKvCapacityTokens across this variant's replicas.
+	KvCacheTokensCapacity int64
 	// RequiredCapacity is the model-level required capacity (>0 means scale-up needed).
 	// Same value for all variants of a model.
 	// V1: binary (1.0 if shouldScaleUp, else 0.0).
 	// V2: continuous token-based demand from AnalyzerResult.
-	// Use AnalyzerVersion to disambiguate the units when consuming this field
+	// Use RequiredCapacityUnit to disambiguate the units when consuming this field
 	// (or its corresponding Prometheus metric).
 	RequiredCapacity float64
-	// AnalyzerVersion identifies which analyzer produced this decision ("v1" or "v2").
-	// Exposed as a Prometheus label on saturation metrics so dashboards can filter
-	// by analyzer to handle the V1/V2 unit difference in RequiredCapacity.
-	AnalyzerVersion string
+	// RequiredCapacityUnit describes the unit of RequiredCapacity ("binary" or "continuous").
+	// Exposed as the `unit` Prometheus label on wva_required_capacity so dashboards
+	// can filter by semantics rather than by which analyzer produced the value.
+	//   "binary":     V1 path, value is 0.0 or 1.0
+	//   "continuous": V2 path, value is a token-demand magnitude
+	RequiredCapacityUnit string
 	// ScaleTargetRef references the Deployment/StatefulSet for scheduling constraints
 	ScaleTargetRef *autoscalingv2.CrossVersionObjectReference
 
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
diff --git a/internal/metrics/metrics_test.go b/internal/metrics/metrics_test.go