Improvements: analyzer_version label, model-keyed V2 aggregation, delete hook

ev-shindin · ev-shindin · commit 40b8062abdb0 · 2026-04-14T15:21:32.000+03:00
- Add analyzer_version label to wva_required_capacity to disambiguate V1 (binary
  0/1) from V2 (continuous token demand) units. Add AnalyzerVersion field to
  VariantDecision; set "v1" in enrichDecisionsFromReplicaMetrics and "v2" in
  enrichDecisionsWithKvTokenData.
- Add AnalyzerVersionV1/V2 constants and LabelAnalyzerVersion constant.
- Key V2 KV-token aggregation by (modelID, variantName) instead of just
  variantName; variant names can collide across models in the same cycle.
- Add MetricsEmitter.DeleteSaturationMetrics() so the controller delete handler
  can remove stale time series when a VariantAutoscaling is deleted. Wiring it
  into the delete handler / finalizer is left as a TODO.
- Update tests: cover the V1/V2 label distinction, Delete behavior, and the
  analyzer_version label in the controller_instance test.
diff --git a/internal/actuator/actuator.go b/internal/actuator/actuator.go
@@ -104,6 +104,7 @@ func (a *Actuator) EmitSaturationMetrics(ctx context.Context, decision interface
 		decision.VariantName,
 		decision.Namespace,
 		decision.AcceleratorName,
+		decision.AnalyzerVersion,
 		decision.Utilization,
 		decision.SpareCapacity,
 		decision.RequiredCapacity,
diff --git a/internal/constants/metrics.go b/internal/constants/metrics.go
@@ -118,7 +118,11 @@ const (
 	WVASpareCapacity = "wva_spare_capacity"
 
 	// WVARequiredCapacity is a gauge that tracks model-level required capacity.
-	// >0 means scale-up needed. Labels: variant_name, namespace
+	// >0 means scale-up needed.
+	// Units differ by analyzer (use the analyzer_version label to distinguish):
+	//   - V1: binary signal (0.0 = no scale-up, 1.0 = scale-up needed)
+	//   - V2: continuous token-based demand
+	// Labels: variant_name, namespace, analyzer_version
 	WVARequiredCapacity = "wva_required_capacity"
 
 	// WVAKvCacheTokensUsed is a gauge that tracks total KV cache tokens currently in use per variant.
@@ -140,4 +144,11 @@ const (
 	LabelReason             = "reason"
 	LabelAcceleratorType    = "accelerator_type"
 	LabelControllerInstance = "controller_instance"
+	LabelAnalyzerVersion    = "analyzer_version"
+)
+
+// Values of the analyzer_version label (LabelAnalyzerVersion) on wva_required_capacity.
+const (
+	AnalyzerVersionV1 = "v1" // V1 analyzer: required capacity is a binary 0/1 signal
+	AnalyzerVersionV2 = "v2" // V2 analyzer: required capacity is continuous token demand
 )
diff --git a/internal/engines/saturation/engine.go b/internal/engines/saturation/engine.go
@@ -37,6 +37,7 @@ import (
 	"github.com/llm-d/llm-d-workload-variant-autoscaler/internal/collector/registration"
 	"github.com/llm-d/llm-d-workload-variant-autoscaler/internal/collector/source"
 	"github.com/llm-d/llm-d-workload-variant-autoscaler/internal/config"
+	"github.com/llm-d/llm-d-workload-variant-autoscaler/internal/constants"
 	"github.com/llm-d/llm-d-workload-variant-autoscaler/internal/discovery"
 	queueingmodel "github.com/llm-d/llm-d-workload-variant-autoscaler/internal/engines/analyzers/queueingmodel"
 	saturation_v2 "github.com/llm-d/llm-d-workload-variant-autoscaler/internal/engines/analyzers/saturation_v2"
@@ -789,6 +790,7 @@ func enrichDecisionsFromReplicaMetrics(decisions []interfaces.VariantDecision, r
 	for i := range decisions {
 		d := &decisions[i]
 		d.RequiredCapacity = requiredCapacity
+		d.AnalyzerVersion = constants.AnalyzerVersionV1
 		if a, ok := agg[d.VariantName]; ok && a.count > 0 {
 			d.KvCacheTokensUsed = a.kvUsed
 			d.KvCacheTokensTotal = a.kvTotal
@@ -797,22 +799,30 @@ func enrichDecisionsFromReplicaMetrics(decisions []interfaces.VariantDecision, r
 	}
 }
 
-// enrichDecisionsWithKvTokenData sets KvCacheTokensUsed and KvCacheTokensTotal on decisions
-// from replica metrics aggregated per variant. Used by V2 path where Utilization and
-// RequiredCapacity are already set from AnalyzerResult.
+// enrichDecisionsWithKvTokenData sets KvCacheTokensUsed, KvCacheTokensTotal, and
+// AnalyzerVersion on decisions from replica metrics aggregated per (model, variant).
+// Used by V2 path where Utilization and RequiredCapacity are already set from
+// AnalyzerResult.
+//
+// Aggregation is keyed by (modelID, variantName) — not just variantName — because
+// variant names can collide across different models in the same reconcile cycle.
 func enrichDecisionsWithKvTokenData(decisions []interfaces.VariantDecision, modelReplicaMetrics map[string][]interfaces.ReplicaMetrics) {
-	// Build per-variant KV token aggregation across all models
 	type kvAgg struct {
 		kvUsed  int64
 		kvTotal int64
 	}
-	agg := make(map[string]*kvAgg)
-	for _, metrics := range modelReplicaMetrics {
+	type variantKey struct {
+		modelID string
+		variant string
+	}
+	agg := make(map[variantKey]*kvAgg)
+	for modelID, metrics := range modelReplicaMetrics {
 		for _, rm := range metrics {
-			a, ok := agg[rm.VariantName]
+			k := variantKey{modelID: modelID, variant: rm.VariantName}
+			a, ok := agg[k]
 			if !ok {
 				a = &kvAgg{}
-				agg[rm.VariantName] = a
+				agg[k] = a
 			}
 			a.kvUsed += rm.TokensInUse
 			a.kvTotal += rm.TotalKvCapacityTokens
@@ -821,7 +831,8 @@ func enrichDecisionsWithKvTokenData(decisions []interfaces.VariantDecision, mode
 
 	for i := range decisions {
 		d := &decisions[i]
-		if a, ok := agg[d.VariantName]; ok {
+		d.AnalyzerVersion = constants.AnalyzerVersionV2
+		if a, ok := agg[variantKey{modelID: d.ModelID, variant: d.VariantName}]; ok {
 			d.KvCacheTokensUsed = a.kvUsed
 			d.KvCacheTokensTotal = a.kvTotal
 		}
@@ -1171,7 +1182,11 @@ func (e *Engine) applySaturationDecisions(
 			updateVa.Status.Actuation.Applied = true
 		}
 
-		// Emit saturation and capacity metrics for observability
+		// Emit saturation and capacity metrics for observability.
+		// Note: stale time series for deleted VAs are not cleaned up automatically here.
+		// The metrics package exposes DeleteSaturationMetrics for callers (e.g., the
+		// VariantAutoscaling reconciler's delete handler / finalizer) to remove series
+		// when a VA is removed.
 		if hasDecision {
 			if err := act.EmitSaturationMetrics(ctx, decision); err != nil {
 				logger.Error(err, "Failed to emit saturation metrics", "variant", updateVa.Name)
diff --git a/internal/interfaces/saturation_analyzer.go b/internal/interfaces/saturation_analyzer.go
@@ -201,7 +201,13 @@ type VariantDecision struct {
 	// Same value for all variants of a model.
 	// V1: binary (1.0 if shouldScaleUp, else 0.0).
 	// V2: continuous token-based demand from AnalyzerResult.
+	// Use AnalyzerVersion to disambiguate the units when consuming this field
+	// (or its corresponding Prometheus metric).
 	RequiredCapacity float64
+	// AnalyzerVersion identifies which analyzer produced this decision ("v1" or "v2").
+	// Exposed as the analyzer_version label on the wva_required_capacity metric so
+	// dashboards can tell the V1 and V2 units of RequiredCapacity apart.
+	AnalyzerVersion string
 	// ScaleTargetRef references the Deployment/StatefulSet for scheduling constraints
 	ScaleTargetRef *autoscalingv2.CrossVersionObjectReference
 
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
@@ -50,11 +50,15 @@ func InitMetrics(registry prometheus.Registerer) error {
 	scalingLabels := []string{constants.LabelVariantName, constants.LabelNamespace, constants.LabelDirection, constants.LabelReason}
 	// modelLabels: variant_name + namespace only (no accelerator_type) for model-level and token metrics
 	modelLabels := []string{constants.LabelVariantName, constants.LabelNamespace}
+	// requiredCapacityLabels: model labels + analyzer_version to disambiguate V1 (binary)
+	// vs V2 (continuous tokens) units of the wva_required_capacity gauge
+	requiredCapacityLabels := []string{constants.LabelVariantName, constants.LabelNamespace, constants.LabelAnalyzerVersion}
 
 	if controllerInstance != "" {
 		baseLabels = append(baseLabels, constants.LabelControllerInstance)
 		scalingLabels = append(scalingLabels, constants.LabelControllerInstance)
 		modelLabels = append(modelLabels, constants.LabelControllerInstance)
+		requiredCapacityLabels = append(requiredCapacityLabels, constants.LabelControllerInstance)
 	}
 
 	replicaScalingTotal = prometheus.NewCounterVec(
@@ -102,9 +106,9 @@ func InitMetrics(registry prometheus.Registerer) error {
 	requiredCapacity = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
 			Name: constants.WVARequiredCapacity,
-			Help: "Model-level required capacity; >0 indicates scale-up needed (V1: binary 0/1, V2: continuous token demand)",
+			Help: "Model-level required capacity; >0 indicates scale-up needed. Use the analyzer_version label to distinguish units (V1: binary 0/1, V2: continuous token demand).",
 		},
-		modelLabels,
+		requiredCapacityLabels,
 	)
 	kvCacheTokensUsed = prometheus.NewGaugeVec(
 		prometheus.GaugeOpts{
@@ -224,10 +228,12 @@ func (m *MetricsEmitter) EmitReplicaMetrics(ctx context.Context, va *llmdOptv1al
 	return nil
 }
 
-// EmitSaturationMetrics emits saturation analysis and KV cache capacity metrics
+// EmitSaturationMetrics emits saturation analysis and KV cache capacity metrics.
+// analyzerVersion ("v1" or "v2") is used as a label on wva_required_capacity to
+// disambiguate the units of the required value (V1: binary, V2: continuous tokens).
 func (m *MetricsEmitter) EmitSaturationMetrics(
 	ctx context.Context,
-	variantName, namespace, acceleratorType string,
+	variantName, namespace, acceleratorType, analyzerVersion string,
 	utilization, spare, required float64,
 	kvTokensUsed, kvTokensTotal int64,
 ) error {
@@ -245,17 +251,60 @@ func (m *MetricsEmitter) EmitSaturationMetrics(
 		constants.LabelVariantName: variantName,
 		constants.LabelNamespace:   namespace,
 	}
+	requiredLabels := prometheus.Labels{
+		constants.LabelVariantName:     variantName,
+		constants.LabelNamespace:       namespace,
+		constants.LabelAnalyzerVersion: analyzerVersion,
+	}
 
 	if controllerInstance != "" {
 		accelLabels[constants.LabelControllerInstance] = controllerInstance
 		modelLabels[constants.LabelControllerInstance] = controllerInstance
+		requiredLabels[constants.LabelControllerInstance] = controllerInstance
 	}
 
 	saturationUtilization.With(accelLabels).Set(utilization)
 	spareCapacity.With(accelLabels).Set(spare)
-	requiredCapacity.With(modelLabels).Set(required)
+	requiredCapacity.With(requiredLabels).Set(required)
 	kvCacheTokensUsed.With(modelLabels).Set(float64(kvTokensUsed))
 	kvCacheTokensTotal.With(modelLabels).Set(float64(kvTokensTotal))
 
 	return nil
 }
+
+// DeleteSaturationMetrics removes the saturation metric series of one variant. Call it
+// when a VariantAutoscaling is deleted so stale series do not linger in the registry.
+// An empty acceleratorType / analyzerVersion matches every value of that label (for
+// callers that no longer know what was emitted); a non-empty one matches it exactly.
+//
+// TODO: wire this from the controller's VariantAutoscaling delete handler / finalizer;
+// until then, deleted VAs leave their last-emitted values in the registry indefinitely.
+func (m *MetricsEmitter) DeleteSaturationMetrics(variantName, namespace, acceleratorType, analyzerVersion string) {
+	if saturationUtilization == nil {
+		return
+	}
+	modelLabels := prometheus.Labels{
+		constants.LabelVariantName: variantName,
+		constants.LabelNamespace:   namespace,
+	}
+	if controllerInstance != "" {
+		modelLabels[constants.LabelControllerInstance] = controllerInstance
+	}
+	// scoped narrows modelLabels by name=value; an empty value means no narrowing.
+	scoped := func(name, value string) prometheus.Labels {
+		if value == "" {
+			return modelLabels
+		}
+		out := prometheus.Labels{name: value}
+		for k, v := range modelLabels {
+			out[k] = v
+		}
+		return out
+	}
+	accelLabels := scoped(constants.LabelAcceleratorType, acceleratorType)
+	saturationUtilization.DeletePartialMatch(accelLabels)
+	spareCapacity.DeletePartialMatch(accelLabels)
+	requiredCapacity.DeletePartialMatch(scoped(constants.LabelAnalyzerVersion, analyzerVersion))
+	kvCacheTokensUsed.Delete(modelLabels)
+	kvCacheTokensTotal.Delete(modelLabels)
+}
diff --git a/internal/metrics/metrics_test.go b/internal/metrics/metrics_test.go