Skip to content

Commit 7196434

Browse files
authored
πŸ› Enable scale-from-zero E2E on CKS and OCP with KEDA support (#865)
* πŸ› Enable scale-from-zero on CKS and OCP with KEDA support - Remove environment skip in scale_from_zero_test.go β€” test now runs on all platforms (KEDA must be pre-installed on the cluster) - Add retry logic to detect_inference_pool_api_group() to handle the race where InferencePool instances haven't been created yet after helmfile deploy - Make deploy_keda() skip helm install when KEDA CRD already exists (pre-installed on OCP via CMA operator, on CKS via helm) - Remove environment guard on SCALER_BACKEND=keda β€” supported everywhere Signed-off-by: Andy Anderson <andy@clubanderson.com> Signed-off-by: Andrew Anderson <andy@clubanderson.com> * πŸ› Increase deploy wait timeout from 60s to 600s for model loading The kubectl wait --timeout=60s for all deployments in the llm-d namespace was too short for model-serving pods (vLLM) that need to download and load large models (e.g. Meta-Llama-3.1-8B) into GPU memory. This caused both OCP and CKS nightly E2E to fail at the "Deploy guide via WVA install.sh" step. Default is now 600s (10 min), overridable via DEPLOY_WAIT_TIMEOUT env var. The vLLM startupProbe already allows up to 30 minutes. 
Signed-off-by: Andrew Anderson <andy@clubanderson.com> * 🐛 Address Copilot review feedback on KEDA and scale-from-zero - deploy_keda(): Check operator pods + APIService, not just CRD, to avoid false skip when stale CRD remains after prior uninstall - detect_inference_pool_api_group(): Implement actual namespace-first then cluster-wide fallback (comment said fallback but code didn't) - Pin KEDA chart version (KEDA_CHART_VERSION, default 2.19.0) for reproducible installs - Fix ENABLE_SCALE_TO_ZERO default inconsistency in helm --set - Add Skip guard in scale-from-zero test for non-KEDA environments where HPA rejects minReplicas=0 - Fix misleading comment that said scale-from-zero requires KEDA - Document per-environment KEDA_NAMESPACE values in suite_test.go Signed-off-by: Andrew Anderson <andy@clubanderson.com> --------- Signed-off-by: Andy Anderson <andy@clubanderson.com> Signed-off-by: Andrew Anderson <andy@clubanderson.com>
1 parent 072ec8b commit 7196434

3 files changed

Lines changed: 69 additions & 20 deletions

File tree

deploy/install.sh

Lines changed: 56 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@ QUEUE_SPARE_TRIGGER=${QUEUE_SPARE_TRIGGER:-""}
118118
# When keda: do not deploy Prometheus Adapter; deploy KEDA instead (ScaledObjects, external metrics API)
119119
SCALER_BACKEND=${SCALER_BACKEND:-prometheus-adapter}
120120
KEDA_NAMESPACE=${KEDA_NAMESPACE:-keda-system}
121+
# Pin KEDA chart version for reproducible installs (only used when deploy_keda installs from helm)
122+
KEDA_CHART_VERSION=${KEDA_CHART_VERSION:-2.19.0}
121123

122124
# Environment-related variables
123125
SCRIPT_DIR=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
@@ -505,13 +507,39 @@ set_wva_logging_level() {
505507
# Detect which InferencePool API group is in use in the cluster (v1 vs v1alpha2).
506508
# Sets DETECTED_POOL_GROUP to inference.networking.k8s.io or inference.networking.x-k8s.io
507509
# so WVA can be upgraded to watch the correct group (required for scale-from-zero datastore).
510+
# Retries up to POOL_DETECT_RETRIES times (default 6, 10s apart) to handle the race where
511+
# InferencePool instances haven't been created yet after helmfile deploy.
508512
detect_inference_pool_api_group() {
509513
DETECTED_POOL_GROUP=""
510-
if [ -n "$(kubectl get inferencepools.inference.networking.k8s.io -A -o name --request-timeout=10s 2>/dev/null | head -1)" ]; then
511-
DETECTED_POOL_GROUP="inference.networking.k8s.io"
512-
elif [ -n "$(kubectl get inferencepools.inference.networking.x-k8s.io -A -o name --request-timeout=10s 2>/dev/null | head -1)" ]; then
513-
DETECTED_POOL_GROUP="inference.networking.x-k8s.io"
514-
fi
514+
local max_retries=${POOL_DETECT_RETRIES:-6}
515+
local retry_interval_s=10
516+
local attempt=0
517+
# Search in the target namespace first (avoids cluster-wide RBAC issues), then fall back to -A.
518+
while [ $attempt -lt $max_retries ]; do
519+
# Try namespace-scoped first if LLMD_NS is set
520+
if [ -n "${LLMD_NS:-}" ]; then
521+
if [ -n "$(kubectl get inferencepools.inference.networking.k8s.io -n "$LLMD_NS" -o name --request-timeout=10s 2>/dev/null | head -1)" ]; then
522+
DETECTED_POOL_GROUP="inference.networking.k8s.io"
523+
return
524+
elif [ -n "$(kubectl get inferencepools.inference.networking.x-k8s.io -n "$LLMD_NS" -o name --request-timeout=10s 2>/dev/null | head -1)" ]; then
525+
DETECTED_POOL_GROUP="inference.networking.x-k8s.io"
526+
return
527+
fi
528+
fi
529+
# Fall back to cluster-wide search
530+
if [ -n "$(kubectl get inferencepools.inference.networking.k8s.io -A -o name --request-timeout=10s 2>/dev/null | head -1)" ]; then
531+
DETECTED_POOL_GROUP="inference.networking.k8s.io"
532+
return
533+
elif [ -n "$(kubectl get inferencepools.inference.networking.x-k8s.io -A -o name --request-timeout=10s 2>/dev/null | head -1)" ]; then
534+
DETECTED_POOL_GROUP="inference.networking.x-k8s.io"
535+
return
536+
fi
537+
attempt=$((attempt + 1))
538+
if [ $attempt -lt $max_retries ]; then
539+
log_info "InferencePool not found yet, retrying in ${retry_interval_s}s ($attempt/$max_retries)..."
540+
sleep $retry_interval_s
541+
fi
542+
done
515543
}
516544

517545
deploy_wva_controller() {
@@ -555,6 +583,7 @@ deploy_wva_controller() {
555583
--set wva.prometheus.tls.insecureSkipVerify=$SKIP_TLS_VERIFY \
556584
--set wva.namespaceScoped=$NAMESPACE_SCOPED \
557585
--set wva.metrics.secure=$WVA_METRICS_SECURE \
586+
--set wva.scaleToZero=$ENABLE_SCALE_TO_ZERO \
558587
${CONTROLLER_INSTANCE:+--set wva.controllerInstance=$CONTROLLER_INSTANCE} \
559588
${POOL_GROUP:+--set wva.poolGroup=$POOL_GROUP} \
560589
${KV_SPARE_TRIGGER:+--set wva.capacityScaling.default.kvSpareTrigger=$KV_SPARE_TRIGGER} \
@@ -1041,8 +1070,12 @@ deploy_llm_d_infrastructure() {
10411070
fi
10421071
fi
10431072

1044-
log_info "Waiting for llm-d components to initialize..."
1045-
kubectl wait --for=condition=Available deployment --all -n $LLMD_NS --timeout=60s || \
1073+
# Model-serving pods (vLLM) can take several minutes to download and load
1074+
# large models into GPU memory. The startupProbe allows up to 30m, so the
1075+
# wait timeout here must be long enough for the model to finish loading.
1076+
local DEPLOY_WAIT_TIMEOUT="${DEPLOY_WAIT_TIMEOUT:-600s}"
1077+
log_info "Waiting for llm-d components to initialize (timeout=${DEPLOY_WAIT_TIMEOUT})..."
1078+
kubectl wait --for=condition=Available deployment --all -n $LLMD_NS --timeout="$DEPLOY_WAIT_TIMEOUT" || \
10461079
log_warning "llm-d components are not ready yet - check 'kubectl get pods -n $LLMD_NS'"
10471080

10481081
# Align WVA with the InferencePool API group in use (scale-from-zero requires WVA to watch the same group).
@@ -1074,12 +1107,26 @@ deploy_llm_d_infrastructure() {
10741107
deploy_keda() {
10751108
log_info "Deploying KEDA (scaler backend)..."
10761109

1110+
# Skip install if KEDA is already fully operational on the cluster.
1111+
# Check CRD + operator pods + external metrics APIService to avoid false positives
1112+
# from stale CRDs left behind after a prior uninstall.
1113+
if kubectl get crd scaledobjects.keda.sh >/dev/null 2>&1; then
1114+
if kubectl get pods -A -l app.kubernetes.io/name=keda-operator 2>/dev/null | grep -q Running; then
1115+
if kubectl get apiservice v1beta1.external.metrics.k8s.io >/dev/null 2>&1; then
1116+
log_success "KEDA CRD, operator, and metrics APIService detected — skipping helm install"
1117+
return
1118+
fi
1119+
fi
1120+
log_warning "KEDA ScaledObject CRD found but operator or metrics APIService not detected; proceeding with helm install"
1121+
fi
1122+
10771123
kubectl create namespace "$KEDA_NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
10781124

10791125
helm repo add kedacore https://kedacore.github.io/charts 2>/dev/null || true
10801126
helm repo update
10811127

10821128
if ! helm upgrade -i keda kedacore/keda \
1129+
--version "$KEDA_CHART_VERSION" \
10831130
-n "$KEDA_NAMESPACE" \
10841131
--set prometheus.metricServer.enabled=true \
10851132
--set prometheus.operator.enabled=true \
@@ -1689,12 +1736,9 @@ main() {
16891736
fi
16901737

16911738
# Deploy scaler backend: KEDA or Prometheus Adapter
1692-
# KEDA in this script is for kind-emulator e2e only; on OpenShift use the platform CMA / Prometheus Adapter.
1739+
# KEDA is supported on all environments. On OpenShift and CKS it is typically
1740+
# pre-installed on the cluster; deploy_keda will detect and skip the install.
16931741
if [ "$SCALER_BACKEND" = "keda" ]; then
1694-
if [ "$ENVIRONMENT" != "kind-emulator" ]; then
1695-
log_error "KEDA scaler backend is only supported for kind-emulator environment (ENVIRONMENT=kind-emulator). Current: ENVIRONMENT=$ENVIRONMENT. Use SCALER_BACKEND=prometheus-adapter or run with ENVIRONMENT=kind-emulator."
1696-
exit 1
1697-
fi
16981742
deploy_keda
16991743
elif [ "$DEPLOY_PROMETHEUS_ADAPTER" = "true" ]; then
17001744
deploy_prometheus_adapter

test/e2e/scale_from_zero_test.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ import (
2525
// Scale-from-zero test validates that the WVA controller correctly detects pending requests
2626
// and scales up deployments from zero replicas. Requires GIE queuing (ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER
2727
// on EPP and an InferenceObjective); deploy with E2E_TESTS_ENABLED=true or ENABLE_SCALE_TO_ZERO=true.
28-
// Uses KEDA ScaledObject when standard HPA rejects minReplicas=0 (e.g. OpenShift).
28+
// On platforms without the HPAScaleToZero feature gate (e.g. OpenShift), set SCALER_BACKEND=keda
29+
// so the test uses a KEDA ScaledObject (which supports minReplicas=0) instead of a native HPA.
2930
var _ = Describe("Scale-From-Zero Feature", Label("smoke", "full"), Ordered, func() {
3031
var (
3132
poolName = "scale-from-zero-pool"
@@ -35,9 +36,13 @@ var _ = Describe("Scale-From-Zero Feature", Label("smoke", "full"), Ordered, fun
3536
)
3637

3738
BeforeAll(func() {
38-
// Scale-from-zero is not validated on OpenShift (POOL_GROUP / flow control setup differs; HPA minReplicas=0 often unsupported).
39-
if cfg.Environment == "openshift" {
40-
Skip("Scale-from-zero test is disabled on OpenShift")
39+
// Scale-from-zero requires GIE flow control and an InferenceObjective.
40+
// On platforms where HPA rejects minReplicas=0 (e.g. OpenShift without
41+
// HPAScaleToZero feature gate), SCALER_BACKEND=keda must be set so the
42+
// test creates a KEDA ScaledObject instead of a native HPA.
43+
if cfg.ScalerBackend != "keda" && !cfg.ScaleToZeroEnabled {
44+
Skip("Scale-from-zero requires SCALER_BACKEND=\"keda\" or ENABLE_SCALE_TO_ZERO=true; " +
45+
"current configuration does not support HPA minReplicas=0")
4146
}
4247

4348
// Note: InferencePool should already exist from infra-only deployment

test/e2e/suite_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,10 @@ var _ = BeforeSuite(func() {
5353
By("Loading configuration from environment")
5454
cfg = LoadConfigFromEnv()
5555

56-
// KEDA scaler backend is only supported for kind-emulator (emulated) e2e; on OpenShift use platform CMA / Prometheus Adapter.
57-
if cfg.ScalerBackend == "keda" && cfg.Environment != "kind-emulator" {
58-
Fail("KEDA scaler backend is only supported for kind-emulator environment. Use ENVIRONMENT=kind-emulator or SCALER_BACKEND=prometheus-adapter.")
59-
}
56+
// KEDA is supported on all environments — pre-installed on OCP (Custom Metrics
57+
// Autoscaler operator, namespace: openshift-keda) and CKS (helm, namespace: keda),
58+
// installed at runtime on kind-emulator via install.sh (namespace: keda-system).
59+
// Set KEDA_NAMESPACE accordingly when running on OCP or CKS.
6060

6161
GinkgoWriter.Printf("=== E2E Test Configuration ===\n")
6262
GinkgoWriter.Printf("Environment: %s\n", cfg.Environment)

0 commit comments

Comments (0)