
Commit 11c3130

add GIE queuing for scale from zero e2es (#849)
* add GIE queuing for scale from zero e2es
* makes sure leftover scale-from-zero resources are cleaned up
* autodetect inference_pool_api_group
* doc nits and clarify

Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com>
1 parent c05f24f commit 11c3130

12 files changed: 117 additions & 19 deletions


charts/workload-variant-autoscaler/templates/manager/wva-deployment-controller-manager.yaml

Lines changed: 4 additions & 0 deletions
@@ -62,6 +62,10 @@ spec:
             - name: CONTROLLER_INSTANCE
               value: {{ .Values.wva.controllerInstance | quote }}
             {{- end }}
+            {{- if .Values.wva.poolGroup }}
+            - name: POOL_GROUP
+              value: {{ .Values.wva.poolGroup | quote }}
+            {{- end }}
           name: manager
           ports:
             - name: healthz
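A quick render check for the new conditional (a sketch; `wva` is an illustrative release name, run from the repo root):

```bash
# Render the chart locally and confirm the POOL_GROUP env var is emitted
# only when wva.poolGroup is set (expect no output without the --set).
helm template wva charts/workload-variant-autoscaler \
  --set wva.poolGroup=inference.networking.k8s.io | grep -A1 'POOL_GROUP'
```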

charts/workload-variant-autoscaler/values.yaml

Lines changed: 6 additions & 0 deletions
@@ -56,6 +56,12 @@ wva:
   # Useful for parallel e2e tests where multiple WVA controllers run simultaneously
   controllerInstance: ""

+  # InferencePool API group to watch. Must match the API group used by InferencePools in the cluster.
+  # - inference.networking.x-k8s.io (default): v1alpha2 / x-k8s.io
+  # - inference.networking.k8s.io: v1 (used by GIE helm charts such as gaie-sim on kind-emulator)
+  # When empty, WVA defaults to inference.networking.x-k8s.io.
+  poolGroup: ""
+
   # Saturation-based scaling configuration
   # These thresholds determine when replicas are saturated and when to scale up
   capacityScaling:
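To pin the group explicitly rather than rely on the installer's auto-detection, the value can be set at deploy time (a sketch; the release and namespace names are illustrative):

```bash
# Install or upgrade WVA with a fixed InferencePool API group.
helm upgrade --install wva charts/workload-variant-autoscaler \
  -n workload-variant-autoscaler-system --create-namespace \
  --set wva.poolGroup=inference.networking.k8s.io
```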
deploy/inference-objective-e2e.yaml

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+# InferenceObjective for GIE queuing (scale-from-zero e2e and flow control).
+# Applied when E2E_TESTS_ENABLED or ENABLE_SCALE_TO_ZERO is true.
+# poolRef.name is templated by install.sh to match the deployed InferencePool.
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceObjective
+metadata:
+  name: e2e-default
+  namespace: NAMESPACE_PLACEHOLDER
+spec:
+  priority: 0
+  poolRef:
+    name: POOL_NAME_PLACEHOLDER
+    kind: InferencePool
+    group: inference.networking.x-k8s.io
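Outside install.sh, the placeholders can be substituted the same way the script does below (the namespace and pool name here are illustrative):

```bash
# Fill in the namespace and pool name, then apply the manifest.
sed -e 's/NAMESPACE_PLACEHOLDER/llm-d-sim/g' \
    -e 's/POOL_NAME_PLACEHOLDER/gaie-sim/g' \
    deploy/inference-objective-e2e.yaml | kubectl apply -f -
```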

deploy/install.sh

Lines changed: 56 additions & 4 deletions
@@ -503,6 +503,18 @@ set_wva_logging_level() {
   echo ""
 }

+# Detect which InferencePool API group is in use in the cluster (v1 vs v1alpha2).
+# Sets DETECTED_POOL_GROUP to inference.networking.k8s.io or inference.networking.x-k8s.io
+# so WVA can be upgraded to watch the correct group (required for scale-from-zero datastore).
+detect_inference_pool_api_group() {
+  DETECTED_POOL_GROUP=""
+  if [ -n "$(kubectl get inferencepools.inference.networking.k8s.io -A -o name --request-timeout=10s 2>/dev/null | head -1)" ]; then
+    DETECTED_POOL_GROUP="inference.networking.k8s.io"
+  elif [ -n "$(kubectl get inferencepools.inference.networking.x-k8s.io -A -o name --request-timeout=10s 2>/dev/null | head -1)" ]; then
+    DETECTED_POOL_GROUP="inference.networking.x-k8s.io"
+  fi
+}
+
 deploy_wva_controller() {
   log_info "Deploying Workload-Variant-Autoscaler..."
   log_info "Using image: $WVA_IMAGE_REPO:$WVA_IMAGE_TAG"
@@ -545,6 +557,7 @@ deploy_wva_controller() {
     --set wva.namespaceScoped=$NAMESPACE_SCOPED \
     --set wva.metrics.secure=$WVA_METRICS_SECURE \
     ${CONTROLLER_INSTANCE:+--set wva.controllerInstance=$CONTROLLER_INSTANCE} \
+    ${POOL_GROUP:+--set wva.poolGroup=$POOL_GROUP} \
     ${KV_SPARE_TRIGGER:+--set wva.capacityScaling.default.kvSpareTrigger=$KV_SPARE_TRIGGER} \
     ${QUEUE_SPARE_TRIGGER:+--set wva.capacityScaling.default.queueSpareTrigger=$QUEUE_SPARE_TRIGGER}

@@ -982,9 +995,9 @@ deploy_llm_d_infrastructure() {
     fi
   fi

-  # Patch llm-d-inference-scheduler deployment if scale-to-zero is enabled
-  if [ "$ENABLE_SCALE_TO_ZERO" == "true" ]; then
-    # Patch llm-d-inference-scheduler to enable flowcontrol and use new image
+  # Patch llm-d-inference-scheduler deployment to enable GIE flow control when scale-to-zero
+  # or e2e tests are enabled (required for scale-from-zero: queue metrics and queuing behavior).
+  if [ "$ENABLE_SCALE_TO_ZERO" == "true" ] || [ "$E2E_TESTS_ENABLED" == "true" ]; then
     log_info "Patching llm-d-inference-scheduler deployment to enable flowcontrol and use a new image"
     if kubectl get deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" &> /dev/null; then
       kubectl patch deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" --type='json' -p='[
@@ -1003,14 +1016,53 @@ deploy_llm_d_infrastructure() {
         }
       ]'
     else
-      log_warning "Skipping inference-scheduler patch for SCALE_TO_ZERO: Deployment $LLM_D_EPP_NAME not found in $LLMD_NS"
+      log_warning "Skipping inference-scheduler patch: Deployment $LLM_D_EPP_NAME not found in $LLMD_NS"
+    fi
+  fi
+
+  # Deploy InferenceObjective for GIE queuing when flow control is enabled (scale-from-zero / e2e).
+  # Enables gateway-level queuing so inference_extension_flow_control_queue_size is populated.
+  if [ "$ENABLE_SCALE_TO_ZERO" == "true" ] || [ "$E2E_TESTS_ENABLED" == "true" ]; then
+    if kubectl get crd inferenceobjectives.inference.networking.x-k8s.io &>/dev/null; then
+      local infobj_file="${WVA_PROJECT}/deploy/inference-objective-e2e.yaml"
+      if [ -f "$infobj_file" ]; then
+        local pool_ref_name="${RELEASE_NAME_POSTFIX:+gaie-$RELEASE_NAME_POSTFIX}"
+        pool_ref_name="${pool_ref_name:-gaie-$WELL_LIT_PATH_NAME}"
+        log_info "Applying InferenceObjective e2e-default (poolRef.name=$pool_ref_name) for GIE queuing"
+        if sed -e "s/NAMESPACE_PLACEHOLDER/${LLMD_NS}/g" -e "s/POOL_NAME_PLACEHOLDER/${pool_ref_name}/g" "$infobj_file" | kubectl apply -f -; then
+          log_success "InferenceObjective e2e-default applied"
+        else
+          log_warning "Failed to apply InferenceObjective (pool $pool_ref_name may not exist yet)"
+        fi
+      else
+        log_warning "InferenceObjective manifest not found at $infobj_file"
+      fi
+    else
+      log_warning "InferenceObjective CRD not found; GIE may not support InferenceObjective yet"
     fi
   fi

   log_info "Waiting for llm-d components to initialize..."
   kubectl wait --for=condition=Available deployment --all -n $LLMD_NS --timeout=60s || \
     log_warning "llm-d components are not ready yet - check 'kubectl get pods -n $LLMD_NS'"

+  # Align WVA with the InferencePool API group in use (scale-from-zero requires WVA to watch the same group).
+  # llm-d version determines whether pools are inference.networking.k8s.io (v1) or inference.networking.x-k8s.io (v1alpha2).
+  if [ "$DEPLOY_WVA" == "true" ]; then
+    detect_inference_pool_api_group
+    if [ -n "$DETECTED_POOL_GROUP" ]; then
+      log_info "Detected InferencePool API group: $DETECTED_POOL_GROUP; upgrading WVA to watch it (scale-from-zero)"
+      if helm upgrade "$WVA_RELEASE_NAME" ${WVA_PROJECT}/charts/workload-variant-autoscaler \
+        -n $WVA_NS --reuse-values --set wva.poolGroup=$DETECTED_POOL_GROUP --wait --timeout=60s; then
+        log_success "WVA upgraded with wva.poolGroup=$DETECTED_POOL_GROUP"
+      else
+        log_warning "WVA upgrade with poolGroup failed - scale-from-zero may not see the InferencePool"
+      fi
+    else
+      log_warning "Could not detect InferencePool API group - WVA may have empty datastore for scale-from-zero"
+    fi
+  fi
+
   # Deploy second model infrastructure for multi-model testing (limiter e2e tests)
   if [ "$MULTI_MODEL_TESTING" == "true" ]; then
     deploy_second_model_infrastructure
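After the upgrade path runs, the group WVA actually watches can be confirmed on the live deployment (a sketch; the namespace matches the chart defaults used in this script):

```bash
# Show the POOL_GROUP env var on the running WVA manager, if set.
kubectl get deployment -n workload-variant-autoscaler-system -o yaml \
  | grep -A1 'name: POOL_GROUP'
```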

deploy/kind-emulator/install.sh

Lines changed: 2 additions & 0 deletions
@@ -36,6 +36,8 @@ WVA_NS=${WVA_NS:-"workload-variant-autoscaler-system"}
 WVA_RECONCILE_INTERVAL=${WVA_RECONCILE_INTERVAL:-"60s"} # WVA controller reconcile interval - tests set 30s interval
 SKIP_TLS_VERIFY=true # Skip TLS verification in emulated environments
 WVA_LOG_LEVEL="debug" # WVA log level set to debug for emulated environments
+# Initial WVA pool group; install.sh auto-detects the actual InferencePool API group after llm-d deploy and upgrades WVA (scale-from-zero).
+POOL_GROUP=${POOL_GROUP:-"inference.networking.k8s.io"}

 # llm-d Configuration
 LLM_D_INFERENCE_SIM_IMG_REPO=${LLM_D_INFERENCE_SIM_IMG_REPO:-"ghcr.io/llm-d/llm-d-inference-sim"}
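Because the assignment uses `${POOL_GROUP:-...}` defaulting, the initial group can be overridden per run (a sketch assuming the script is invoked directly from the repo root):

```bash
# Start from the x-k8s.io group instead of the kind-emulator default.
POOL_GROUP=inference.networking.x-k8s.io ./deploy/kind-emulator/install.sh
```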

docs/developer-guide/testing.md

Lines changed: 2 additions & 0 deletions
@@ -159,6 +159,8 @@ This deploys:
 - Prometheus stack and Prometheus Adapter (or KEDA when `SCALER_BACKEND=keda`)
 - **No** VariantAutoscaling, HPA, or model services (tests create these)

+When `E2E_TESTS_ENABLED=true` (or `ENABLE_SCALE_TO_ZERO=true`), the deploy script also enables **GIE queuing** so scale-from-zero tests can run: it patches the EPP with `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER=true` and applies an **InferenceObjective** (`e2e-default`) that references the default InferencePool. This ensures the metric `inference_extension_flow_control_queue_size` is populated when requests hit the gateway.
+
 Alternatively, use the Makefile to deploy infra and run tests in one go:

 ```bash
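To confirm queuing is active after an infra deploy, the EPP metric can be scraped directly (a sketch; the service name follows the kind-emulator default, while the namespace and the 9090 metrics port are assumptions):

```bash
# Forward the EPP metrics port and look for the flow-control queue gauge.
kubectl port-forward -n llm-d-sim svc/gaie-sim-epp 9090:9090 &
curl -s localhost:9090/metrics | grep inference_extension_flow_control_queue_size
```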

docs/developer-guide/troubleshooting.md

Lines changed: 7 additions & 1 deletion
@@ -13,7 +13,9 @@
    kubectl get inferencepool
    ```

-   **Solution**: Ensure InferencePool is created and reconciled before creating VariantAutoscaling.
+   WVA watches a single InferencePool API group (`inference.networking.k8s.io` or `inference.networking.x-k8s.io`). If the cluster's pools use the other group, the datastore stays empty and scale-from-zero never gets a recommendation.
+
+   **Solution**: Ensure InferencePool is created and reconciled before creating VariantAutoscaling. When using `deploy/install.sh` with llm-d (e.g. kind-emulator or CI), the script auto-detects the pool API group after the llm-d deploy and upgrades WVA with the correct `wva.poolGroup`, so both local and CI runs work regardless of llm-d version.

 2. **Labels mismatch**:
    ```bash
@@ -54,6 +56,10 @@

   **Solution**: Verify requests are being sent to the correct model endpoint.

+### E2E and infra-only deploys
+
+For e2e and infra-only deploys, the install script enables EPP flow control and optionally applies an InferenceObjective when `E2E_TESTS_ENABLED=true` or `ENABLE_SCALE_TO_ZERO=true`. See [deploy/install.sh](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/install.sh) and [deploy/inference-objective-e2e.yaml](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/inference-objective-e2e.yaml).
+
 ## Slow Scale-Up Response

 **Symptom**: Deployment takes too long to scale up from zero.

docs/user-guide/scale-from-zero.md

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ The ScaleFromZero engine continuously monitors inactive VariantAutoscaling resou
 ## Prerequisites

 - WVA and llm-d installed and running - deployment options available for [kind](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/kind-emulator/README.md), [OpenShift](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/openshift/README.md) and [Kubernetes](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/kubernetes/README.md)
-- EndpointPicker (EPP) configured with flowcontrol enabled - required for queue metrics collection (set EPP env variable `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER`)
+- **EPP flow control**: EndpointPicker (EPP) with flow control enabled (set EPP env `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER=true`) so the queue metric `inference_extension_flow_control_queue_size` is collected. InferenceObjective is not required to enable this metric; it is a QoS policy for priority-based scheduling and optional for scale-from-zero.


 ## Usage
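For clusters deployed without the install script, the flow-control prerequisite above can be switched on directly (a sketch; the EPP deployment name and namespace are illustrative):

```bash
# Enable the experimental flow-control layer on the EPP deployment.
kubectl set env deployment/gaie-sim-epp -n llm-d-sim \
  ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER=true
```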

test/e2e/README.md

Lines changed: 2 additions & 1 deletion
@@ -216,7 +216,8 @@ ginkgo -v --label-filter="smoke" ./test/e2e/
    - Verify independent scaling per VA

 3. **Scale-From-Zero** (~7 min)
-   - Create HPA with minReplicas=0
+   - Requires EPP flow control enabled so the metric `inference_extension_flow_control_queue_size` is populated (InferenceObjective is not required for this metric). When deploying infra with `E2E_TESTS_ENABLED=true` (or `ENABLE_SCALE_TO_ZERO=true`), the install script enables flow control on the EPP and optionally applies an InferenceObjective for e2e.
+   - Create HPA (or KEDA ScaledObject) with minReplicas=0
    - Verify deployment scales to 0 when idle
    - Generate first request, verify scale-up from 0 → 1
    - Verify request queuing during cold start
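The minReplicas=0 step depends on the cluster allowing scale-to-zero HPAs, which requires the `HPAScaleToZero` feature gate; a minimal sketch of relaxing an existing HPA (the HPA name and namespace are illustrative):

```bash
# Fails with a validation error unless the HPAScaleToZero gate is enabled.
kubectl patch hpa vllm-hpa -n llm-d-sim \
  --type merge -p '{"spec":{"minReplicas":0}}'
```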

test/e2e/config.go

Lines changed: 10 additions & 3 deletions
@@ -57,9 +57,16 @@

 // LoadConfigFromEnv reads e2e test configuration from environment variables
 func LoadConfigFromEnv() E2EConfig {
+	env := getEnv("ENVIRONMENT", "kind-emulator")
+	eppServiceDefault := "gaie-inference-scheduling-epp"
+	if env == "kind-emulator" {
+		// kind-emulator deploy uses gaie-<NAMESPACE_SUFFIX>-epp with NAMESPACE_SUFFIX=sim
+		eppServiceDefault = "gaie-sim-epp"
+	}
+
 	cfg := E2EConfig{
 		// Cluster defaults
-		Environment: getEnv("ENVIRONMENT", "kind-emulator"),
+		Environment: env,
 		Kubeconfig:  getEnv("KUBECONFIG", os.Getenv("HOME")+"/.kube/config"),

 		// Namespace defaults
@@ -78,11 +85,11 @@ func LoadConfigFromEnv() E2EConfig {
 		ScalerBackend: getEnv("SCALER_BACKEND", "prometheus-adapter"),
 		KEDANamespace: getEnv("KEDA_NAMESPACE", "keda-system"),

-		// EPP defaults
+		// EPP defaults (kind-emulator uses gaie-sim-epp; other envs use gaie-inference-scheduling-epp)
 		EPPMode:          getEnv("EPP_MODE", "poolName"),
 		PoolName:         getEnv("POOL_NAME", ""),
 		EndpointSelector: parseEndpointSelector(getEnv("ENDPOINT_SELECTOR", "")),
-		EPPServiceName:   getEnv("EPP_SERVICE_NAME", "gaie-inference-scheduling-epp"),
+		EPPServiceName:   getEnv("EPP_SERVICE_NAME", eppServiceDefault),

 		// Model defaults
 		ModelID: getEnv("MODEL_ID", "unsloth/Meta-Llama-3.1-8B"),
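Since `getEnv` falls back to the default only when the variable is unset, non-standard layouts can still point the suite at their EPP service (the service name is illustrative; the ginkgo invocation matches the README above):

```bash
# Override the EPP service default when running the e2e suite.
ENVIRONMENT=kind-emulator EPP_SERVICE_NAME=my-custom-epp ginkgo -v ./test/e2e/
```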
