Skip to content

Commit 8e75498

Browse files
authored
Add scaling-decision logging and E2E failure/debug dumps (llm-d#796)
* more logging Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * update logging for scalefrom zero Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> --------- Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com>
1 parent 66148f3 commit 8e75498

8 files changed

Lines changed: 63 additions & 8 deletions

File tree

internal/controller/variantautoscaling_controller.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,15 @@ func (r *VariantAutoscalingReconciler) Reconcile(ctx context.Context, req ctrl.R
180180
// Process Engine Decisions from Shared Cache
181181
// This mechanism allows the Engine to trigger updates without touching the API server directly.
182182
if decision, ok := common.DecisionCache.Get(va.Name, va.Namespace); ok {
183-
logger.Info("Found decision in cache", "va", va.Name, "namespace", va.Namespace, "metricsAvailable", decision.MetricsAvailable)
183+
// Log scaling outcome and reason for E2E and operator debugging (why did/didn't scaling happen).
184+
logger.Info("Applying scaling decision from cache",
185+
"va", va.Name,
186+
"namespace", va.Namespace,
187+
"desiredReplicas", decision.TargetReplicas,
188+
"metricsAvailable", decision.MetricsAvailable,
189+
"metricsReason", decision.MetricsReason,
190+
"metricsMessage", decision.MetricsMessage,
191+
"reason", decision.Reason)
184192
// Only apply if the decision is fresher than the last one applied or if we haven't applied it
185193
// Note: We blindly apply for now, assuming the Engine acts as the source of truth for "Desired" state
186194
numReplicas, accelerator, lastRunTime := common.DecisionToOptimizedAlloc(decision)

internal/engines/saturation/engine.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1010,6 +1010,7 @@ func (e *Engine) applySaturationDecisions(
10101010
if hasDecision {
10111011
logger.Info("Applied saturation decision via shared cache",
10121012
"variant", vaName,
1013+
"namespace", updateVa.Namespace,
10131014
"action", decision.Action,
10141015
"target", targetReplicas,
10151016
"reason", reason)

internal/engines/scalefromzero/engine.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,10 @@ func (e *Engine) processInactiveVariant(ctx context.Context, va wvav1alpha1.Vari
238238
// Use EPP source from registry
239239
eppSource := e.Datastore.PoolGetMetricsSource(pool.Name)
240240
if eppSource == nil {
241+
logger.Info("Scale-from-zero: skipping VA, EPP metrics source not found in datastore",
242+
"va", va.Name,
243+
"namespace", va.Namespace,
244+
"pool", pool.Name)
241245
return errors.New("endpointpicker metrics source not found in datastore")
242246
}
243247

@@ -263,7 +267,11 @@ func (e *Engine) processInactiveVariant(ctx context.Context, va wvav1alpha1.Vari
263267
}
264268

265269
if !pendingRequestExist {
266-
logger.V(logging.DEBUG).Info("No pending requests found in the flowcontrol queue - skipping scaling up from zero")
270+
// Scale-from-zero loop runs every 100ms; log at DEBUG to avoid flooding (10/sec per inactive VA).
271+
logger.V(logging.DEBUG).Info("Scale-from-zero: skipping VA, no pending requests in flow control queue",
272+
"va", va.Name,
273+
"namespace", va.Namespace,
274+
"modelID", va.Spec.ModelID)
267275
return nil
268276
}
269277

@@ -353,5 +361,12 @@ func (e *Engine) processInactiveVariant(ctx context.Context, va wvav1alpha1.Vari
353361
Object: &va,
354362
}
355363

364+
// Log scaling decision for E2E and operators (mirrors saturation engine "Applied ... via shared cache").
365+
logger.Info("Scale-from-zero decision written to cache",
366+
"va", va.Name,
367+
"namespace", va.Namespace,
368+
"targetReplicas", targetWorkloadReplicas,
369+
"reason", reason)
370+
356371
return nil
357372
}

test/e2e/fixtures/model_service_builder.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -212,9 +212,9 @@ func buildModelServerArgs(modelID string, useSimulator bool, maxNumSeqs int) []s
212212
// text completion API.
213213
// Note: blockSize must be one of {8, 16, 32, 64, 128} per simulator validation.
214214
const (
215-
simulatorKVCacheSize = 1 // minimal cache: 1 unique block / 1 max block = 100% usage during load
216-
simulatorBlockSize = 8 // minimum valid block size; 8 tokens / 8 = 1 block per request
217-
simulatorMaxModelLen = 512 // must exceed prompt tokens + max_tokens (burst load uses ~9 + 400 = 409)
215+
simulatorKVCacheSize = 1 // minimal cache: 1 unique block / 1 max block = 100% usage during load
216+
simulatorBlockSize = 8 // minimum valid block size; 8 tokens / 8 = 1 block per request
217+
simulatorMaxModelLen = 512 // must exceed prompt tokens + max_tokens (burst load uses ~9 + 400 = 409)
218218
simulatorTTFT = "2000ms" // time-to-first-token (slow to hold KV cache)
219219
simulatorITL = "100ms" // inter-token latency (slow to keep requests active)
220220
)

test/e2e/parallel_load_scaleup_test.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -345,9 +345,15 @@ var _ = Describe("Parallel Load Scale-Up Test", Label("full"), Ordered, func() {
345345
It("should detect increased load and trigger scale-up", func() {
346346
By("Waiting for load generation to ramp up (30 seconds)")
347347
time.Sleep(30 * time.Second)
348+
GinkgoWriter.Println("Load ramp-up complete, monitoring VA for scale-up (up to 5m)")
348349

349350
By("Monitoring VariantAutoscaling for scale-up")
351+
start := time.Now()
352+
attempt := 0
350353
Eventually(func(g Gomega) {
354+
attempt++
355+
elapsed := time.Since(start)
356+
351357
va := &variantautoscalingv1alpha1.VariantAutoscaling{}
352358
err := crClient.Get(ctx, client.ObjectKey{
353359
Namespace: cfg.LLMDNamespace,
@@ -357,8 +363,8 @@ var _ = Describe("Parallel Load Scale-Up Test", Label("full"), Ordered, func() {
357363

358364
scaledOptimized = int32(va.Status.DesiredOptimizedAlloc.NumReplicas)
359365

360-
GinkgoWriter.Printf("VA optimized replicas: %d (initial: %d, minReplicas: %d)\n",
361-
scaledOptimized, initialOptimized, hpaMinReplicas)
366+
GinkgoWriter.Printf("VA check #%d (%v elapsed): optimized=%d (initial=%d, minReplicas=%d)\n",
367+
attempt, elapsed.Round(time.Second), scaledOptimized, initialOptimized, hpaMinReplicas)
362368

363369
if !lowLoad {
364370
// Scale-up means we should have MORE replicas than our initial stabilized state

test/e2e/scale_from_zero_test.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,16 @@ var _ = PDescribe("Scale-From-Zero Feature", Label("smoke", "full"), Ordered, fu
317317

318318
optimized := va.Status.DesiredOptimizedAlloc.NumReplicas
319319

320+
metricsCond := variantautoscalingv1alpha1.GetCondition(va, variantautoscalingv1alpha1.TypeMetricsAvailable)
321+
optCond := variantautoscalingv1alpha1.GetCondition(va, variantautoscalingv1alpha1.TypeOptimizationReady)
322+
320323
GinkgoWriter.Printf("VA DesiredOptimizedAlloc.NumReplicas: %d (waiting for > 0)\n", optimized)
324+
if metricsCond != nil {
325+
GinkgoWriter.Printf(" MetricsAvailable: %s/%s (%s)\n", metricsCond.Status, metricsCond.Reason, metricsCond.Message)
326+
}
327+
if optCond != nil {
328+
GinkgoWriter.Printf(" OptimizationReady: %s/%s (%s)\n", optCond.Status, optCond.Reason, optCond.Message)
329+
}
321330

322331
// Scale-from-zero engine should detect pending requests and recommend scaling up
323332
g.Expect(optimized).To(BeNumerically(">", 0),

test/e2e/suite_test.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"sigs.k8s.io/controller-runtime/pkg/log/zap"
2828

2929
variantautoscalingv1alpha1 "github.com/llm-d/llm-d-workload-variant-autoscaler/api/v1alpha1"
30+
"github.com/llm-d/llm-d-workload-variant-autoscaler/test/utils"
3031
// +kubebuilder:scaffold:imports
3132
)
3233

@@ -145,6 +146,21 @@ var _ = BeforeSuite(func() {
145146
GinkgoWriter.Println("BeforeSuite completed successfully - infrastructure ready")
146147
})
147148

149+
// ReportAfterEach dumps controller logs and VA status after a failed test.
150+
// This makes E2E failures self-contained and easier to debug (why scaling happened / didn't happen).
151+
var _ = ReportAfterEach(func(report SpecReport) {
152+
if !report.Failed() {
153+
return
154+
}
155+
if k8sClient == nil || crClient == nil {
156+
return
157+
}
158+
159+
GinkgoWriter.Printf("\n=== Failure diagnostics: %s ===\n", report.FullText())
160+
utils.DumpControllerLogs(context.Background(), k8sClient, cfg.WVANamespace, GinkgoWriter)
161+
utils.DumpVAStatus(context.Background(), crClient, GinkgoWriter)
162+
})
163+
148164
var _ = AfterSuite(func() {
149165
By("Cleaning up any leftover test resources")
150166
if k8sClient != nil && crClient != nil {

test/utils/debug_helpers.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ func DumpVAStatus(ctx context.Context, crClient client.Client, w io.Writer) {
6565
_, _ = fmt.Fprintf(w, " LastRunTime: %v\n", va.Status.DesiredOptimizedAlloc.LastRunTime)
6666
_, _ = fmt.Fprintf(w, " Conditions:\n")
6767
for _, cond := range va.Status.Conditions {
68-
_, _ = fmt.Fprintf(w, " - Type: %s, Status: %s, Reason: %s\n", cond.Type, cond.Status, cond.Reason)
68+
_, _ = fmt.Fprintf(w, " - Type: %s, Status: %s, Reason: %s, Message: %q\n", cond.Type, cond.Status, cond.Reason, cond.Message)
6969
}
7070
}
7171
}

0 commit comments

Comments
 (0)