Add scaling-decision logging and E2E failure/debug dumps (llm-d#796)

mamy-CS · web-flow · commit 8e75498217fc · 2026-03-03T10:38:46.000-05:00
* more logging

Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com>

* update logging for scale-from-zero

Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com>

---------

Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com>
diff --git a/internal/controller/variantautoscaling_controller.go b/internal/controller/variantautoscaling_controller.go
@@ -180,7 +180,15 @@ func (r *VariantAutoscalingReconciler) Reconcile(ctx context.Context, req ctrl.R
 	// Process Engine Decisions from Shared Cache
 	// This mechanism allows the Engine to trigger updates without touching the API server directly.
 	if decision, ok := common.DecisionCache.Get(va.Name, va.Namespace); ok {
-		logger.Info("Found decision in cache", "va", va.Name, "namespace", va.Namespace, "metricsAvailable", decision.MetricsAvailable)
+		// Log scaling outcome and reason for E2E and operator debugging (why did/didn't scaling happen).
+		logger.Info("Applying scaling decision from cache",
+			"va", va.Name,
+			"namespace", va.Namespace,
+			"desiredReplicas", decision.TargetReplicas,
+			"metricsAvailable", decision.MetricsAvailable,
+			"metricsReason", decision.MetricsReason,
+			"metricsMessage", decision.MetricsMessage,
+			"reason", decision.Reason)
 		// Only apply if the decision is fresher than the last one applied or if we haven't applied it
 		// Note: We blindly apply for now, assuming the Engine acts as the source of truth for "Desired" state
 		numReplicas, accelerator, lastRunTime := common.DecisionToOptimizedAlloc(decision)
diff --git a/internal/engines/saturation/engine.go b/internal/engines/saturation/engine.go
@@ -1010,6 +1010,7 @@ func (e *Engine) applySaturationDecisions(
 		if hasDecision {
 			logger.Info("Applied saturation decision via shared cache",
 				"variant", vaName,
+				"namespace", updateVa.Namespace,
 				"action", decision.Action,
 				"target", targetReplicas,
 				"reason", reason)
diff --git a/internal/engines/scalefromzero/engine.go b/internal/engines/scalefromzero/engine.go
@@ -238,6 +238,10 @@ func (e *Engine) processInactiveVariant(ctx context.Context, va wvav1alpha1.Vari
 	// Use EPP source from registry
 	eppSource := e.Datastore.PoolGetMetricsSource(pool.Name)
 	if eppSource == nil {
+		logger.Info("Scale-from-zero: skipping VA, EPP metrics source not found in datastore",
+			"va", va.Name,
+			"namespace", va.Namespace,
+			"pool", pool.Name)
 		return errors.New("endpointpicker metrics source not found in datastore")
 	}
 
@@ -263,7 +267,11 @@ func (e *Engine) processInactiveVariant(ctx context.Context, va wvav1alpha1.Vari
 	}
 
 	if !pendingRequestExist {
-		logger.V(logging.DEBUG).Info("No pending requests found in the flowcontrol queue - skipping scaling up from zero")
+		// Scale-from-zero loop runs every 100ms; log at DEBUG to avoid flooding (10/sec per inactive VA).
+		logger.V(logging.DEBUG).Info("Scale-from-zero: skipping VA, no pending requests in flow control queue",
+			"va", va.Name,
+			"namespace", va.Namespace,
+			"modelID", va.Spec.ModelID)
 		return nil
 	}
 
@@ -353,5 +361,12 @@ func (e *Engine) processInactiveVariant(ctx context.Context, va wvav1alpha1.Vari
 		Object: &va,
 	}
 
+	// Log scaling decision for E2E and operators (mirrors saturation engine "Applied ... via shared cache").
+	logger.Info("Scale-from-zero decision written to cache",
+		"va", va.Name,
+		"namespace", va.Namespace,
+		"targetReplicas", targetWorkloadReplicas,
+		"reason", reason)
+
 	return nil
 }
diff --git a/test/e2e/fixtures/model_service_builder.go b/test/e2e/fixtures/model_service_builder.go
@@ -212,9 +212,9 @@ func buildModelServerArgs(modelID string, useSimulator bool, maxNumSeqs int) []s
 		// text completion API.
 		// Note: blockSize must be one of {8, 16, 32, 64, 128} per simulator validation.
 		const (
-			simulatorKVCacheSize = 1   // minimal cache: 1 unique block / 1 max block = 100% usage during load
-			simulatorBlockSize   = 8   // minimum valid block size; 8 tokens / 8 = 1 block per request
-			simulatorMaxModelLen = 512 // must exceed prompt tokens + max_tokens (burst load uses ~9 + 400 = 409)
+			simulatorKVCacheSize = 1        // minimal cache: 1 unique block / 1 max block = 100% usage during load
+			simulatorBlockSize   = 8        // minimum valid block size; 8 tokens / 8 = 1 block per request
+			simulatorMaxModelLen = 512      // must exceed prompt tokens + max_tokens (burst load uses ~9 + 400 = 409)
 			simulatorTTFT        = "2000ms" // time-to-first-token (slow to hold KV cache)
 			simulatorITL         = "100ms"  // inter-token latency (slow to keep requests active)
 		)
diff --git a/test/e2e/parallel_load_scaleup_test.go b/test/e2e/parallel_load_scaleup_test.go
@@ -345,9 +345,15 @@ var _ = Describe("Parallel Load Scale-Up Test", Label("full"), Ordered, func() {
 	It("should detect increased load and trigger scale-up", func() {
 		By("Waiting for load generation to ramp up (30 seconds)")
 		time.Sleep(30 * time.Second)
+		GinkgoWriter.Println("Load ramp-up complete, monitoring VA for scale-up (up to 5m)")
 
 		By("Monitoring VariantAutoscaling for scale-up")
+		start := time.Now()
+		attempt := 0
 		Eventually(func(g Gomega) {
+			attempt++
+			elapsed := time.Since(start)
+
 			va := &variantautoscalingv1alpha1.VariantAutoscaling{}
 			err := crClient.Get(ctx, client.ObjectKey{
 				Namespace: cfg.LLMDNamespace,
@@ -357,8 +363,8 @@ var _ = Describe("Parallel Load Scale-Up Test", Label("full"), Ordered, func() {
 
 			scaledOptimized = int32(va.Status.DesiredOptimizedAlloc.NumReplicas)
 
-			GinkgoWriter.Printf("VA optimized replicas: %d (initial: %d, minReplicas: %d)\n",
-				scaledOptimized, initialOptimized, hpaMinReplicas)
+			GinkgoWriter.Printf("VA check #%d (%v elapsed): optimized=%d (initial=%d, minReplicas=%d)\n",
+				attempt, elapsed.Round(time.Second), scaledOptimized, initialOptimized, hpaMinReplicas)
 
 			if !lowLoad {
 				// Scale-up means we should have MORE replicas than our initial stabilized state
diff --git a/test/e2e/scale_from_zero_test.go b/test/e2e/scale_from_zero_test.go
@@ -317,7 +317,16 @@ var _ = PDescribe("Scale-From-Zero Feature", Label("smoke", "full"), Ordered, fu
 
 				optimized := va.Status.DesiredOptimizedAlloc.NumReplicas
 
+				metricsCond := variantautoscalingv1alpha1.GetCondition(va, variantautoscalingv1alpha1.TypeMetricsAvailable)
+				optCond := variantautoscalingv1alpha1.GetCondition(va, variantautoscalingv1alpha1.TypeOptimizationReady)
+
 				GinkgoWriter.Printf("VA DesiredOptimizedAlloc.NumReplicas: %d (waiting for > 0)\n", optimized)
+				if metricsCond != nil {
+					GinkgoWriter.Printf("  MetricsAvailable: %s/%s (%s)\n", metricsCond.Status, metricsCond.Reason, metricsCond.Message)
+				}
+				if optCond != nil {
+					GinkgoWriter.Printf("  OptimizationReady: %s/%s (%s)\n", optCond.Status, optCond.Reason, optCond.Message)
+				}
 
 				// Scale-from-zero engine should detect pending requests and recommend scaling up
 				g.Expect(optimized).To(BeNumerically(">", 0),
diff --git a/test/e2e/suite_test.go b/test/e2e/suite_test.go
@@ -27,6 +27,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/log/zap"
 
 	variantautoscalingv1alpha1 "github.com/llm-d/llm-d-workload-variant-autoscaler/api/v1alpha1"
+	"github.com/llm-d/llm-d-workload-variant-autoscaler/test/utils"
 	// +kubebuilder:scaffold:imports
 )
 
@@ -145,6 +146,21 @@ var _ = BeforeSuite(func() {
 	GinkgoWriter.Println("BeforeSuite completed successfully - infrastructure ready")
 })
 
+// ReportAfterEach dumps controller logs and VA status after a failed test.
+// This makes E2E failures self-contained and easier to debug (why scaling happened / didn't happen).
+var _ = ReportAfterEach(func(report SpecReport) {
+	if !report.Failed() {
+		return
+	}
+	if k8sClient == nil || crClient == nil {
+		return
+	}
+
+	GinkgoWriter.Printf("\n=== Failure diagnostics: %s ===\n", report.FullText())
+	utils.DumpControllerLogs(context.Background(), k8sClient, cfg.WVANamespace, GinkgoWriter)
+	utils.DumpVAStatus(context.Background(), crClient, GinkgoWriter)
+})
+
 var _ = AfterSuite(func() {
 	By("Cleaning up any leftover test resources")
 	if k8sClient != nil && crClient != nil {
diff --git a/test/utils/debug_helpers.go b/test/utils/debug_helpers.go
@@ -65,7 +65,7 @@ func DumpVAStatus(ctx context.Context, crClient client.Client, w io.Writer) {
 		_, _ = fmt.Fprintf(w, "    LastRunTime: %v\n", va.Status.DesiredOptimizedAlloc.LastRunTime)
 		_, _ = fmt.Fprintf(w, "  Conditions:\n")
 		for _, cond := range va.Status.Conditions {
-			_, _ = fmt.Fprintf(w, "    - Type: %s, Status: %s, Reason: %s\n", cond.Type, cond.Status, cond.Reason)
+			_, _ = fmt.Fprintf(w, "    - Type: %s, Status: %s, Reason: %s, Message: %q\n", cond.Type, cond.Status, cond.Reason, cond.Message)
 		}
 	}
 }

Original file line number	Diff line number	Diff line change
`@@ -65,7 +65,7 @@ func DumpVAStatus(ctx context.Context, crClient client.Client, w io.Writer) {`
`65`	`65`	`_, _ = fmt.Fprintf(w, " LastRunTime: %v\n", va.Status.DesiredOptimizedAlloc.LastRunTime)`
`66`	`66`	`_, _ = fmt.Fprintf(w, " Conditions:\n")`
`67`	`67`	`for _, cond := range va.Status.Conditions {`
`68`		`- _, _ = fmt.Fprintf(w, " - Type: %s, Status: %s, Reason: %s\n", cond.Type, cond.Status, cond.Reason)`
	`68`	`+ _, _ = fmt.Fprintf(w, " - Type: %s, Status: %s, Reason: %s, Message: %q\n", cond.Type, cond.Status, cond.Reason, cond.Message)`
`69`	`69`	`}`
`70`	`70`	`}`
`71`	`71`	`}`