Skip to content

Commit 7196434

Browse files
authored
πŸ› Enable scale-from-zero E2E on CKS and OCP with KEDA support (#865)
* πŸ› Enable scale-from-zero on CKS and OCP with KEDA support - Remove environment skip in scale_from_zero_test.go β€” test now runs on all platforms (KEDA must be pre-installed on the cluster) - Add retry logic to detect_inference_pool_api_group() to handle the race where InferencePool instances haven't been created yet after helmfile deploy - Make deploy_keda() skip helm install when KEDA CRD already exists (pre-installed on OCP via CMA operator, on CKS via helm) - Remove environment guard on SCALER_BACKEND=keda β€” supported everywhere Signed-off-by: Andy Anderson <andy@clubanderson.com> Signed-off-by: Andrew Anderson <andy@clubanderson.com> * πŸ› Increase deploy wait timeout from 60s to 600s for model loading The kubectl wait --timeout=60s for all deployments in the llm-d namespace was too short for model-serving pods (vLLM) that need to download and load large models (e.g. Meta-Llama-3.1-8B) into GPU memory. This caused both OCP and CKS nightly E2E to fail at the "Deploy guide via WVA install.sh" step. Default is now 600s (10 min), overridable via DEPLOY_WAIT_TIMEOUT env var. The vLLM startupProbe already allows up to 30 minutes. 
Signed-off-by: Andrew Anderson <andy@clubanderson.com> * 🐛 Address Copilot review feedback on KEDA and scale-from-zero - deploy_keda(): Check operator pods + APIService, not just CRD, to avoid false skip when stale CRD remains after prior uninstall - detect_inference_pool_api_group(): Implement actual namespace-first then cluster-wide fallback (comment said fallback but code didn't) - Pin KEDA chart version (KEDA_CHART_VERSION, default 2.19.0) for reproducible installs - Fix ENABLE_SCALE_TO_ZERO default inconsistency in helm --set - Add Skip guard in scale-from-zero test for non-KEDA environments where HPA rejects minReplicas=0 - Fix misleading comment that said scale-from-zero requires KEDA - Document per-environment KEDA_NAMESPACE values in suite_test.go Signed-off-by: Andrew Anderson <andy@clubanderson.com> --------- Signed-off-by: Andy Anderson <andy@clubanderson.com> Signed-off-by: Andrew Anderson <andy@clubanderson.com>
1 parent 072ec8b commit 7196434

3 files changed

Lines changed: 69 additions & 20 deletions

File tree

deploy/install.sh

Lines changed: 56 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@ QUEUE_SPARE_TRIGGER=${QUEUE_SPARE_TRIGGER:-""}
118118
# When keda: do not deploy Prometheus Adapter; deploy KEDA instead (ScaledObjects, external metrics API)
119119
SCALER_BACKEND=${SCALER_BACKEND:-prometheus-adapter}
120120
KEDA_NAMESPACE=${KEDA_NAMESPACE:-keda-system}
121+
# Pin KEDA chart version for reproducible installs (only used when deploy_keda installs from helm)
122+
KEDA_CHART_VERSION=${KEDA_CHART_VERSION:-2.19.0}
121123

122124
# Environment-related variables
123125
SCRIPT_DIR=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
@@ -505,13 +507,39 @@ set_wva_logging_level() {
505507
# Detect which InferencePool API group is in use in the cluster (v1 vs v1alpha2).
506508
# Sets DETECTED_POOL_GROUP to inference.networking.k8s.io or inference.networking.x-k8s.io
507509
# so WVA can be upgraded to watch the correct group (required for scale-from-zero datastore).
510+
# Retries up to POOL_DETECT_RETRIES times (default 6, 10s apart) to handle the race where
511+
# InferencePool instances haven't been created yet after helmfile deploy.
508512
detect_inference_pool_api_group() {
509513
DETECTED_POOL_GROUP=""
510-
if [ -n "$(kubectl get inferencepools.inference.networking.k8s.io -A -o name --request-timeout=10s 2>/dev/null | head -1)" ]; then
511-
DETECTED_POOL_GROUP="inference.networking.k8s.io"
512-
elif [ -n "$(kubectl get inferencepools.inference.networking.x-k8s.io -A -o name --request-timeout=10s 2>/dev/null | head -1)" ]; then
513-
DETECTED_POOL_GROUP="inference.networking.x-k8s.io"
514-
fi
514+
local max_retries=${POOL_DETECT_RETRIES:-6}
515+
local retry_interval_s=10
516+
local attempt=0
517+
# Search in the target namespace first (avoids cluster-wide RBAC issues), then fall back to -A.
518+
while [ $attempt -lt $max_retries ]; do
519+
# Try namespace-scoped first if LLMD_NS is set
520+
if [ -n "${LLMD_NS:-}" ]; then
521+
if [ -n "$(kubectl get inferencepools.inference.networking.k8s.io -n "$LLMD_NS" -o name --request-timeout=10s 2>/dev/null | head -1)" ]; then
522+
DETECTED_POOL_GROUP="inference.networking.k8s.io"
523+
return
524+
elif [ -n "$(kubectl get inferencepools.inference.networking.x-k8s.io -n "$LLMD_NS" -o name --request-timeout=10s 2>/dev/null | head -1)" ]; then
525+
DETECTED_POOL_GROUP="inference.networking.x-k8s.io"
526+
return
527+
fi
528+
fi
529+
# Fall back to cluster-wide search
530+
if [ -n "$(kubectl get inferencepools.inference.networking.k8s.io -A -o name --request-timeout=10s 2>/dev/null | head -1)" ]; then
531+
DETECTED_POOL_GROUP="inference.networking.k8s.io"
532+
return
533+
elif [ -n "$(kubectl get inferencepools.inference.networking.x-k8s.io -A -o name --request-timeout=10s 2>/dev/null | head -1)" ]; then
534+
DETECTED_POOL_GROUP="inference.networking.x-k8s.io"
535+
return
536+
fi
537+
attempt=$((attempt + 1))
538+
if [ $attempt -lt $max_retries ]; then
539+
log_info "InferencePool not found yet, retrying in ${retry_interval_s}s ($attempt/$max_retries)..."
540+
sleep $retry_interval_s
541+
fi
542+
done
515543
}
516544

517545
deploy_wva_controller() {
@@ -555,6 +583,7 @@ deploy_wva_controller() {
555583
--set wva.prometheus.tls.insecureSkipVerify=$SKIP_TLS_VERIFY \
556584
--set wva.namespaceScoped=$NAMESPACE_SCOPED \
557585
--set wva.metrics.secure=$WVA_METRICS_SECURE \
586+
--set wva.scaleToZero=$ENABLE_SCALE_TO_ZERO \
558587
${CONTROLLER_INSTANCE:+--set wva.controllerInstance=$CONTROLLER_INSTANCE} \
559588
${POOL_GROUP:+--set wva.poolGroup=$POOL_GROUP} \
560589
${KV_SPARE_TRIGGER:+--set wva.capacityScaling.default.kvSpareTrigger=$KV_SPARE_TRIGGER} \
@@ -1041,8 +1070,12 @@ deploy_llm_d_infrastructure() {
10411070
fi
10421071
fi
10431072

1044-
log_info "Waiting for llm-d components to initialize..."
1045-
kubectl wait --for=condition=Available deployment --all -n $LLMD_NS --timeout=60s || \
1073+
# Model-serving pods (vLLM) can take several minutes to download and load
1074+
# large models into GPU memory. The startupProbe allows up to 30m, so the
1075+
# wait timeout here must be long enough for the model to finish loading.
1076+
local DEPLOY_WAIT_TIMEOUT="${DEPLOY_WAIT_TIMEOUT:-600s}"
1077+
log_info "Waiting for llm-d components to initialize (timeout=${DEPLOY_WAIT_TIMEOUT})..."
1078+
kubectl wait --for=condition=Available deployment --all -n $LLMD_NS --timeout="$DEPLOY_WAIT_TIMEOUT" || \
10461079
log_warning "llm-d components are not ready yet - check 'kubectl get pods -n $LLMD_NS'"
10471080

10481081
# Align WVA with the InferencePool API group in use (scale-from-zero requires WVA to watch the same group).
@@ -1074,12 +1107,26 @@ deploy_llm_d_infrastructure() {
10741107
deploy_keda() {
10751108
log_info "Deploying KEDA (scaler backend)..."
10761109

1110+
# Skip install if KEDA is already fully operational on the cluster.
1111+
# Check CRD + operator pods + external metrics APIService to avoid false positives
1112+
# from stale CRDs left behind after a prior uninstall.
1113+
if kubectl get crd scaledobjects.keda.sh >/dev/null 2>&1; then
1114+
if kubectl get pods -A -l app.kubernetes.io/name=keda-operator 2>/dev/null | grep -q Running; then
1115+
if kubectl get apiservice v1beta1.external.metrics.k8s.io >/dev/null 2>&1; then
1116+
log_success "KEDA CRD, operator, and metrics APIService detected — skipping helm install"
1117+
return
1118+
fi
1119+
fi
1120+
log_warning "KEDA ScaledObject CRD found but operator or metrics APIService not detected; proceeding with helm install"
1121+
fi
1122+
10771123
kubectl create namespace "$KEDA_NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
10781124

10791125
helm repo add kedacore https://kedacore.github.io/charts 2>/dev/null || true
10801126
helm repo update
10811127

10821128
if ! helm upgrade -i keda kedacore/keda \
1129+
--version "$KEDA_CHART_VERSION" \
10831130
-n "$KEDA_NAMESPACE" \
10841131
--set prometheus.metricServer.enabled=true \
10851132
--set prometheus.operator.enabled=true \
@@ -1689,12 +1736,9 @@ main() {
16891736
fi
16901737

16911738
# Deploy scaler backend: KEDA or Prometheus Adapter
1692-
# KEDA in this script is for kind-emulator e2e only; on OpenShift use the platform CMA / Prometheus Adapter.
1739+
# KEDA is supported on all environments. On OpenShift and CKS it is typically
1740+
# pre-installed on the cluster; deploy_keda will detect and skip the install.
16931741
if [ "$SCALER_BACKEND" = "keda" ]; then
1694-
if [ "$ENVIRONMENT" != "kind-emulator" ]; then
1695-
log_error "KEDA scaler backend is only supported for kind-emulator environment (ENVIRONMENT=kind-emulator). Current: ENVIRONMENT=$ENVIRONMENT. Use SCALER_BACKEND=prometheus-adapter or run with ENVIRONMENT=kind-emulator."
1696-
exit 1
1697-
fi
16981742
deploy_keda
16991743
elif [ "$DEPLOY_PROMETHEUS_ADAPTER" = "true" ]; then
17001744
deploy_prometheus_adapter

test/e2e/scale_from_zero_test.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ import (
2525
// Scale-from-zero test validates that the WVA controller correctly detects pending requests
2626
// and scales up deployments from zero replicas. Requires GIE queuing (ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER
2727
// on EPP and an InferenceObjective); deploy with E2E_TESTS_ENABLED=true or ENABLE_SCALE_TO_ZERO=true.
28-
// Uses KEDA ScaledObject when standard HPA rejects minReplicas=0 (e.g. OpenShift).
28+
// On platforms without the HPAScaleToZero feature gate (e.g. OpenShift), set SCALER_BACKEND=keda
29+
// so the test uses a KEDA ScaledObject (which supports minReplicas=0) instead of a native HPA.
2930
var _ = Describe("Scale-From-Zero Feature", Label("smoke", "full"), Ordered, func() {
3031
var (
3132
poolName = "scale-from-zero-pool"
@@ -35,9 +36,13 @@ var _ = Describe("Scale-From-Zero Feature", Label("smoke", "full"), Ordered, fun
3536
)
3637

3738
BeforeAll(func() {
38-
// Scale-from-zero is not validated on OpenShift (POOL_GROUP / flow control setup differs; HPA minReplicas=0 often unsupported).
39-
if cfg.Environment == "openshift" {
40-
Skip("Scale-from-zero test is disabled on OpenShift")
39+
// Scale-from-zero requires GIE flow control and an InferenceObjective.
40+
// On platforms where HPA rejects minReplicas=0 (e.g. OpenShift without
41+
// HPAScaleToZero feature gate), SCALER_BACKEND=keda must be set so the
42+
// test creates a KEDA ScaledObject instead of a native HPA.
43+
if cfg.ScalerBackend != "keda" && !cfg.ScaleToZeroEnabled {
44+
Skip("Scale-from-zero requires SCALER_BACKEND=\"keda\" or ENABLE_SCALE_TO_ZERO=true; " +
45+
"current configuration does not support HPA minReplicas=0")
4146
}
4247

4348
// Note: InferencePool should already exist from infra-only deployment

test/e2e/suite_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,10 @@ var _ = BeforeSuite(func() {
5353
By("Loading configuration from environment")
5454
cfg = LoadConfigFromEnv()
5555

56-
// KEDA scaler backend is only supported for kind-emulator (emulated) e2e; on OpenShift use platform CMA / Prometheus Adapter.
57-
if cfg.ScalerBackend == "keda" && cfg.Environment != "kind-emulator" {
58-
Fail("KEDA scaler backend is only supported for kind-emulator environment. Use ENVIRONMENT=kind-emulator or SCALER_BACKEND=prometheus-adapter.")
59-
}
56+
// KEDA is supported on all environments — pre-installed on OCP (Custom Metrics
57+
// Autoscaler operator, namespace: openshift-keda) and CKS (helm, namespace: keda),
58+
// installed at runtime on kind-emulator via install.sh (namespace: keda-system).
59+
// Set KEDA_NAMESPACE accordingly when running on OCP or CKS.
6060

6161
GinkgoWriter.Printf("=== E2E Test Configuration ===\n")
6262
GinkgoWriter.Printf("Environment: %s\n", cfg.Environment)

0 commit comments

Comments (0)