fix: always set MetricsAvailable condition in VA status (llm-d#567)

clubanderson · mamy-CS · commit 2c3935afde8d · 2026-02-10T12:38:38.000-05:00
* fix: always set MetricsAvailable condition in VA status

  The MetricsAvailable condition was not showing in VA status because the
  code tried to copy it from a local VA object where it was never set.

  Instead of copying a potentially nil condition, we now directly set
  MetricsAvailable based on whether we have metrics data:
  - True if we have an allocation (from metrics collection) or a decision
    (from saturation analysis)
  - False otherwise, indicating pods may not be ready or metrics not yet
    scraped

* fix: use more accurate MetricsAvailable condition message

  Address Copilot review feedback - the message now accurately reflects
  that metrics data is available rather than implying active collection.

* fix: persist MetricsAvailable condition via decision cache

  The previous fix set the condition on a local object that was never
  persisted. The condition must flow through the DecisionCache to the
  controller which actually updates the API server.

  Changes:
  - Add MetricsAvailable fields to VariantDecision struct
  - Store metrics availability in the decision cache
  - Controller reads from cache and sets the condition on VA status

* fix: set MetricsAvailable=False even when no accelerator info

  When pods aren't ready yet, the engine skips full status updates due to
  missing accelerator info. However, we still need to set MetricsAvailable=False
  so users can see the condition in the VA status.

  The engine now populates the cache and triggers reconciliation even in this case.

* debug: add INFO logging for cache operations

* fix: only update DesiredOptimizedAlloc if values are valid

  When the cache entry carries only MetricsAvailable=false (no accelerator or
  replica count), skip the DesiredOptimizedAlloc update, as it would fail CRD
  validation. The MetricsAvailable condition is still applied in all cases.

* refactor: extract MetricsAvailable constants and add explanatory comment

  Address Copilot review feedback:
  - Extract duplicated MetricsReason/MetricsMessage strings as constants
  - Add comment explaining hasAllocation || hasDecision logic

* fix: address additional Copilot review feedback

  - Add comment explaining partial decision for metrics status only
  - Allow numReplicas=0 for scale-to-zero scenarios (only require accelerator)
diff --git a/internal/controller/variantautoscaling_controller.go b/internal/controller/variantautoscaling_controller.go
@@ -187,18 +187,34 @@ func (r *VariantAutoscalingReconciler) Reconcile(ctx context.Context, req ctrl.R
 	// Process Engine Decisions from Shared Cache
 	// This mechanism allows the Engine to trigger updates without touching the API server directly.
 	if decision, ok := common.DecisionCache.Get(va.Name, va.Namespace); ok {
+		logger.Info("Found decision in cache", "va", va.Name, "namespace", va.Namespace, "metricsAvailable", decision.MetricsAvailable)
 		// Only apply if the decision is fresher than the last one applied or if we haven't applied it
 		// Note: We blindly apply for now, assuming the Engine acts as the source of truth for "Desired" state
 		numReplicas, accelerator, lastRunTime := common.DecisionToOptimizedAlloc(decision)
 
-		va.Status.DesiredOptimizedAlloc.NumReplicas = numReplicas
-		va.Status.DesiredOptimizedAlloc.Accelerator = accelerator
-		va.Status.DesiredOptimizedAlloc.LastRunTime = lastRunTime
+		// Only update DesiredOptimizedAlloc if we have a valid accelerator (required by CRD).
+		// Note: numReplicas may legitimately be 0 for scale-to-zero scenarios.
+		if accelerator != "" {
+			va.Status.DesiredOptimizedAlloc.NumReplicas = numReplicas
+			va.Status.DesiredOptimizedAlloc.Accelerator = accelerator
+			va.Status.DesiredOptimizedAlloc.LastRunTime = lastRunTime
+		}
+
+		// Always apply MetricsAvailable condition from cache
+		metricsStatus := metav1.ConditionFalse
+		if decision.MetricsAvailable {
+			metricsStatus = metav1.ConditionTrue
+		}
+		llmdVariantAutoscalingV1alpha1.SetCondition(&va,
+			llmdVariantAutoscalingV1alpha1.TypeMetricsAvailable,
+			metricsStatus,
+			decision.MetricsReason,
+			decision.MetricsMessage)
 
 		// Note: CurrentAlloc is removed from Status.
 		// Internal allocation state is managed by the Engine and Actuator.
 	} else {
-		logger.V(logging.DEBUG).Info("No decision found in cache for VA", "variant", va.Name)
+		logger.Info("No decision found in cache for VA", "va", va.Name, "namespace", va.Namespace)
 	}
 
 	// Update Status if we have changes (Conditions or OptimizedAlloc)
diff --git a/internal/engines/saturation/engine.go b/internal/engines/saturation/engine.go
@@ -45,6 +45,14 @@ import (
 	"github.com/llm-d-incubation/workload-variant-autoscaler/internal/utils"
 )
 
+// Reason and message strings for the MetricsAvailable condition; the engine stores them in VariantDecision cache entries.
+const (
+	MetricsReasonAvailable   = "MetricsAvailable"
+	MetricsReasonUnavailable = "MetricsUnavailable"
+	MetricsMessageAvailable  = "Saturation metrics data is available for scaling decisions"
+	MetricsMessageUnavailable = "No saturation metrics available - pods may not be ready or metrics not yet scraped"
+)
+
 type Engine struct {
 	client   client.Client
 	scheme   *runtime.Scheme
@@ -511,15 +519,8 @@ func (e *Engine) applySaturationDecisions(
 			// Now we just don't update status with it.
 		}
 
-		// Copy MetricsAvailable condition from local analysis (set during metrics collection)
-		// This condition was set on `va` but we fetched a fresh `updateVa` from API server
-		if metricsCondition := llmdVariantAutoscalingV1alpha1.GetCondition(va, llmdVariantAutoscalingV1alpha1.TypeMetricsAvailable); metricsCondition != nil {
-			llmdVariantAutoscalingV1alpha1.SetCondition(&updateVa,
-				llmdVariantAutoscalingV1alpha1.TypeMetricsAvailable,
-				metricsCondition.Status,
-				metricsCondition.Reason,
-				metricsCondition.Message)
-		}
+		// Check if we have metrics data for this VA (used for cache below)
+		_, hasAllocation := currentAllocations[vaName]
 
 		// Determine target replicas and accelerator
 		var targetReplicas int
@@ -548,9 +549,25 @@ func (e *Engine) applySaturationDecisions(
 		}
 
 		// If we still don't have an accelerator name (e.g. new VA, no decision, no current alloc), we can't update status sensibly
+		// But we still need to set MetricsAvailable condition via the cache
 		if acceleratorName == "" {
-			logger.Info("Skipping status update for VA without accelerator info",
-				"variant", vaName)
+			logger.Info("Skipping status update for VA without accelerator info, but setting MetricsAvailable=False",
+				"variant", vaName, "cacheKey.name", va.Name, "cacheKey.namespace", va.Namespace)
+			// Still set the cache entry so the controller can set MetricsAvailable=False.
+			// This is a partial decision for metrics status only - other fields like
+			// TargetReplicas and AcceleratorName are left at zero values since we don't
+			// have enough information to set them.
+			common.DecisionCache.Set(va.Name, va.Namespace, interfaces.VariantDecision{
+				VariantName:      vaName,
+				Namespace:        va.Namespace,
+				MetricsAvailable: false,
+				MetricsReason:    MetricsReasonUnavailable,
+				MetricsMessage:   MetricsMessageUnavailable,
+			})
+			// Trigger reconciler to apply the condition
+			common.DecisionTrigger <- event.GenericEvent{
+				Object: &updateVa,
+			}
 			continue
 		}
 
@@ -627,14 +644,30 @@ func (e *Engine) applySaturationDecisions(
 		// This avoids any API server interaction from the Engine.
 
 		// 1. Update Cache
+		// Determine MetricsAvailable status for the cache.
+		// - hasAllocation is true when we successfully collected current replica metrics
+		//   for this variant during this loop (metrics pipeline is working).
+		// - hasDecision is true when the optimizer produced a scaling decision based on
+		//   saturation metrics in this run.
+		// Either condition implies saturation metrics were available and usable.
+		metricsAvailable := hasAllocation || hasDecision
+		metricsReason := MetricsReasonUnavailable
+		metricsMessage := MetricsMessageUnavailable
+		if metricsAvailable {
+			metricsReason = MetricsReasonAvailable
+			metricsMessage = MetricsMessageAvailable
+		}
+
 		common.DecisionCache.Set(va.Name, va.Namespace, interfaces.VariantDecision{
 			VariantName:       vaName,
 			Namespace:         va.Namespace,
 			TargetReplicas:    targetReplicas,
 			AcceleratorName:   acceleratorName,
 			LastRunTime:       metav1.Now(),
 			CurrentAllocation: currentAllocations[vaName],
-			// Pass other fields if needed, but these are crucial for Status
+			MetricsAvailable:  metricsAvailable,
+			MetricsReason:     metricsReason,
+			MetricsMessage:    metricsMessage,
 		})
 
 		// 2. Trigger Reconciler
diff --git a/internal/interfaces/saturation_analyzer.go b/internal/interfaces/saturation_analyzer.go
@@ -85,11 +85,16 @@ type VariantDecision struct {
 	LastRunTime        metav1.Time // Time when decision was made (for status updates)
 	SaturationOnly     bool        // True if operating in saturation-only mode (no model-based analysis)
 
-	// CurrentAllocation carries the collected metrics/allocation state
-	// This helps the Controller update status without re-collecting metrics
 	// CurrentAllocation carries the collected metrics/allocation state
 	// This helps the Controller update status without re-collecting metrics
 	CurrentAllocation *Allocation
+
+	// MetricsAvailable indicates whether saturation metrics were available for this decision
+	MetricsAvailable bool
+	// MetricsReason is the reason for the MetricsAvailable condition
+	MetricsReason string
+	// MetricsMessage is the human-readable message for the MetricsAvailable condition
+	MetricsMessage string
 }
 
 // SaturationAction represents the scaling action