Skip to content

Commit 2c3935a

Browse files
clubandersonmamy-CS
authored andcommitted
fix: always set MetricsAvailable condition in VA status (llm-d#567)
* fix: always set MetricsAvailable condition in VA status The MetricsAvailable condition was not showing in VA status because the code tried to copy it from a local VA object where it was never set. Instead of copying a potentially nil condition, we now directly set MetricsAvailable based on whether we have metrics data: - True if we have an allocation (from metrics collection) or a decision (from saturation analysis) - False otherwise, indicating pods may not be ready or metrics not yet scraped * fix: use more accurate MetricsAvailable condition message Address Copilot review feedback - the message now accurately reflects that metrics data is available rather than implying active collection. * fix: persist MetricsAvailable condition via decision cache The previous fix set the condition on a local object that was never persisted. The condition must flow through the DecisionCache to the controller which actually updates the API server. Changes: - Add MetricsAvailable fields to VariantDecision struct - Store metrics availability in the decision cache - Controller reads from cache and sets the condition on VA status * fix: set MetricsAvailable=False even when no accelerator info When pods aren't ready yet, the engine skips full status updates due to missing accelerator info. However, we still need to set MetricsAvailable=False so users can see the condition in the VA status. Now populates the cache and triggers reconciliation even in this case. * debug: add INFO logging for cache operations * fix: only update DesiredOptimizedAlloc if values are valid When cache entry only has MetricsAvailable=false (no accelerator/replicas), don't try to update DesiredOptimizedAlloc as it would fail CRD validation. Still apply MetricsAvailable condition in all cases. * refactor: extract MetricsAvailable constants and add explanatory comment Address Copilot review feedback: - Extract duplicated MetricsReason/MetricsMessage strings as constants - Add comment explaining hasAllocation || hasDecision logic * fix: address additional Copilot review feedback - Add comment explaining partial decision for metrics status only - Allow numReplicas=0 for scale-to-zero scenarios (only require accelerator)
1 parent dec0bfd commit 2c3935a

File tree

3 files changed

+72
-18
lines changed

3 files changed

+72
-18
lines changed

internal/controller/variantautoscaling_controller.go

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -187,18 +187,34 @@ func (r *VariantAutoscalingReconciler) Reconcile(ctx context.Context, req ctrl.R
187187
// Process Engine Decisions from Shared Cache
188188
// This mechanism allows the Engine to trigger updates without touching the API server directly.
189189
if decision, ok := common.DecisionCache.Get(va.Name, va.Namespace); ok {
190+
logger.Info("Found decision in cache", "va", va.Name, "namespace", va.Namespace, "metricsAvailable", decision.MetricsAvailable)
190191
// Only apply if the decision is fresher than the last one applied or if we haven't applied it
191192
// Note: We blindly apply for now, assuming the Engine acts as the source of truth for "Desired" state
192193
numReplicas, accelerator, lastRunTime := common.DecisionToOptimizedAlloc(decision)
193194

194-
va.Status.DesiredOptimizedAlloc.NumReplicas = numReplicas
195-
va.Status.DesiredOptimizedAlloc.Accelerator = accelerator
196-
va.Status.DesiredOptimizedAlloc.LastRunTime = lastRunTime
195+
// Only update DesiredOptimizedAlloc if we have a valid accelerator (required by CRD).
196+
// Note: numReplicas may legitimately be 0 for scale-to-zero scenarios.
197+
if accelerator != "" {
198+
va.Status.DesiredOptimizedAlloc.NumReplicas = numReplicas
199+
va.Status.DesiredOptimizedAlloc.Accelerator = accelerator
200+
va.Status.DesiredOptimizedAlloc.LastRunTime = lastRunTime
201+
}
202+
203+
// Always apply MetricsAvailable condition from cache
204+
metricsStatus := metav1.ConditionFalse
205+
if decision.MetricsAvailable {
206+
metricsStatus = metav1.ConditionTrue
207+
}
208+
llmdVariantAutoscalingV1alpha1.SetCondition(&va,
209+
llmdVariantAutoscalingV1alpha1.TypeMetricsAvailable,
210+
metricsStatus,
211+
decision.MetricsReason,
212+
decision.MetricsMessage)
197213

198214
// Note: CurrentAlloc is removed from Status.
199215
// Internal allocation state is managed by the Engine and Actuator.
200216
} else {
201-
logger.V(logging.DEBUG).Info("No decision found in cache for VA", "variant", va.Name)
217+
logger.Info("No decision found in cache for VA", "va", va.Name, "namespace", va.Namespace)
202218
}
203219

204220
// Update Status if we have changes (Conditions or OptimizedAlloc)

internal/engines/saturation/engine.go

Lines changed: 45 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,14 @@ import (
4545
"github.com/llm-d-incubation/workload-variant-autoscaler/internal/utils"
4646
)
4747

48+
// Constants for MetricsAvailable condition
49+
const (
50+
MetricsReasonAvailable = "MetricsAvailable"
51+
MetricsReasonUnavailable = "MetricsUnavailable"
52+
MetricsMessageAvailable = "Saturation metrics data is available for scaling decisions"
53+
MetricsMessageUnavailable = "No saturation metrics available - pods may not be ready or metrics not yet scraped"
54+
)
55+
4856
type Engine struct {
4957
client client.Client
5058
scheme *runtime.Scheme
@@ -511,15 +519,8 @@ func (e *Engine) applySaturationDecisions(
511519
// Now we just don't update status with it.
512520
}
513521

514-
// Copy MetricsAvailable condition from local analysis (set during metrics collection)
515-
// This condition was set on `va` but we fetched a fresh `updateVa` from API server
516-
if metricsCondition := llmdVariantAutoscalingV1alpha1.GetCondition(va, llmdVariantAutoscalingV1alpha1.TypeMetricsAvailable); metricsCondition != nil {
517-
llmdVariantAutoscalingV1alpha1.SetCondition(&updateVa,
518-
llmdVariantAutoscalingV1alpha1.TypeMetricsAvailable,
519-
metricsCondition.Status,
520-
metricsCondition.Reason,
521-
metricsCondition.Message)
522-
}
522+
// Check if we have metrics data for this VA (used for cache below)
523+
_, hasAllocation := currentAllocations[vaName]
523524

524525
// Determine target replicas and accelerator
525526
var targetReplicas int
@@ -548,9 +549,25 @@ func (e *Engine) applySaturationDecisions(
548549
}
549550

550551
// If we still don't have an accelerator name (e.g. new VA, no decision, no current alloc), we can't update status sensibly
552+
// But we still need to set MetricsAvailable condition via the cache
551553
if acceleratorName == "" {
552-
logger.Info("Skipping status update for VA without accelerator info",
553-
"variant", vaName)
554+
logger.Info("Skipping status update for VA without accelerator info, but setting MetricsAvailable=False",
555+
"variant", vaName, "cacheKey.name", va.Name, "cacheKey.namespace", va.Namespace)
556+
// Still set the cache entry so the controller can set MetricsAvailable=False.
557+
// This is a partial decision for metrics status only - other fields like
558+
// TargetReplicas and AcceleratorName are left at zero values since we don't
559+
// have enough information to set them.
560+
common.DecisionCache.Set(va.Name, va.Namespace, interfaces.VariantDecision{
561+
VariantName: vaName,
562+
Namespace: va.Namespace,
563+
MetricsAvailable: false,
564+
MetricsReason: MetricsReasonUnavailable,
565+
MetricsMessage: MetricsMessageUnavailable,
566+
})
567+
// Trigger reconciler to apply the condition
568+
common.DecisionTrigger <- event.GenericEvent{
569+
Object: &updateVa,
570+
}
554571
continue
555572
}
556573

@@ -627,14 +644,30 @@ func (e *Engine) applySaturationDecisions(
627644
// This avoids any API server interaction from the Engine.
628645

629646
// 1. Update Cache
647+
// Determine MetricsAvailable status for the cache.
648+
// - hasAllocation is true when we successfully collected current replica metrics
649+
// for this variant during this loop (metrics pipeline is working).
650+
// - hasDecision is true when the optimizer produced a scaling decision based on
651+
// saturation metrics in this run.
652+
// Either condition implies saturation metrics were available and usable.
653+
metricsAvailable := hasAllocation || hasDecision
654+
metricsReason := MetricsReasonUnavailable
655+
metricsMessage := MetricsMessageUnavailable
656+
if metricsAvailable {
657+
metricsReason = MetricsReasonAvailable
658+
metricsMessage = MetricsMessageAvailable
659+
}
660+
630661
common.DecisionCache.Set(va.Name, va.Namespace, interfaces.VariantDecision{
631662
VariantName: vaName,
632663
Namespace: va.Namespace,
633664
TargetReplicas: targetReplicas,
634665
AcceleratorName: acceleratorName,
635666
LastRunTime: metav1.Now(),
636667
CurrentAllocation: currentAllocations[vaName],
637-
// Pass other fields if needed, but these are crucial for Status
668+
MetricsAvailable: metricsAvailable,
669+
MetricsReason: metricsReason,
670+
MetricsMessage: metricsMessage,
638671
})
639672

640673
// 2. Trigger Reconciler

internal/interfaces/saturation_analyzer.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,16 @@ type VariantDecision struct {
8585
LastRunTime metav1.Time // Time when decision was made (for status updates)
8686
SaturationOnly bool // True if operating in saturation-only mode (no model-based analysis)
8787

88-
// CurrentAllocation carries the collected metrics/allocation state
89-
// This helps the Controller update status without re-collecting metrics
9088
// CurrentAllocation carries the collected metrics/allocation state
9189
// This helps the Controller update status without re-collecting metrics
9290
CurrentAllocation *Allocation
91+
92+
// MetricsAvailable indicates whether saturation metrics were available for this decision
93+
MetricsAvailable bool
94+
// MetricsReason is the reason for the MetricsAvailable condition
95+
MetricsReason string
96+
// MetricsMessage is the human-readable message for the MetricsAvailable condition
97+
MetricsMessage string
9398
}
9499

95100
// SaturationAction represents the scaling action

0 commit comments

Comments
 (0)