Skip to content

Commit 0ee5ba1

Browse files
authored
opt in for keda, env specific (llm-d#967)
Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com>
1 parent 669f5c4 commit 0ee5ba1

9 files changed

Lines changed: 93 additions & 18 deletions

File tree

.github/workflows/ci-e2e-openshift.yaml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -656,10 +656,14 @@ jobs:
656656
# may be owned by another namespace's release, causing Helm ownership conflicts.
657657
# Fix: adopt them for our namespace so helmfile can proceed. Post-cleanup will
658658
# delete them, and the next user's helmfile run will recreate them fresh.
659+
# Only adopt legacy helmfile-style names (release "workload-variant-autoscaler").
660+
# PR-specific Helm releases use names like wva-e2e-<run_id>; those live in WVA_NAMESPACE.
661+
# Re-annotating them to LLMD_NAMESPACE breaks Helm ownership and can leave the controller
662+
# ServiceAccount bound to a wrong or unmanaged ClusterRole (cluster-wide list/watch denied).
659663
echo "Adopting shared WVA cluster-scoped resources for namespace $LLMD_NAMESPACE..."
660664
for kind in clusterrole clusterrolebinding; do
661665
kubectl get "$kind" -o json 2>/dev/null | \
662-
jq -r '.items[] | select(.metadata.name | contains("workload-variant-autoscaler")) | select(.metadata.annotations["meta.helm.sh/release-namespace"] != null) | .metadata.name' 2>/dev/null | \
666+
jq -r '.items[] | select(.metadata.name | contains("workload-variant-autoscaler")) | select(.metadata.name | startswith("wva-e2e-") | not) | select(.metadata.annotations["meta.helm.sh/release-namespace"] != null) | .metadata.name' 2>/dev/null | \
663667
while read -r name; do
664668
current_ns=$(kubectl get "$kind" "$name" -o json 2>/dev/null | jq -r '.metadata.annotations["meta.helm.sh/release-namespace"] // ""')
665669
if [ "$current_ns" != "$LLMD_NAMESPACE" ]; then
@@ -688,6 +692,9 @@ jobs:
688692
ENVIRONMENT: openshift
689693
INSTALL_GATEWAY_CTRLPLANE: "false"
690694
E2E_TESTS_ENABLED: "true"
695+
# OpenShift typically lacks HPAScaleToZero; e2e forces SCALE_TO_ZERO_ENABLED off for openshift
696+
# (see test/e2e/config.go). KEDA ScaledObjects support minReplicas=0 for scale-from-zero tests.
697+
SCALER_BACKEND: keda
691698
NAMESPACE_SCOPED: "false"
692699
# Pass PR-specific namespaces to install script
693700
LLMD_NS: ${{ env.LLMD_NAMESPACE }}
@@ -793,6 +800,7 @@ jobs:
793800
ENVIRONMENT: openshift
794801
INSTALL_GATEWAY_CTRLPLANE: "false"
795802
E2E_TESTS_ENABLED: "true"
803+
SCALER_BACKEND: keda
796804
NAMESPACE_SCOPED: "false"
797805
# Override namespaces for Model B stack
798806
LLMD_NS: ${{ env.LLMD_NAMESPACE_B }}

deploy/install.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,13 +113,16 @@ QUEUE_SPARE_TRIGGER=${QUEUE_SPARE_TRIGGER:-""}
113113

114114
# Scaler backend: "prometheus-adapter" (default), "keda", or "none"
115115
# prometheus-adapter: deploy Prometheus Adapter + patch external metrics APIService
116-
# keda: deploy KEDA via Helm (or detect pre-installed) + configure ScaledObjects
116+
# keda: on kubernetes assume cluster-managed KEDA (no Helm; set KEDA_HELM_INSTALL=true to install);
117+
# on kind-emulator install via Helm when needed; OpenShift is always platform-managed (no Helm)
117118
# none: skip all scaler backend deployment; use when KEDA or another metrics API
118119
# is already installed on the cluster (e.g. llmd benchmark clusters)
119120
SCALER_BACKEND=${SCALER_BACKEND:-prometheus-adapter}
120121
KEDA_NAMESPACE=${KEDA_NAMESPACE:-keda-system}
121122
# Pin KEDA chart version for reproducible installs (only used when deploy_keda installs from helm)
122123
KEDA_CHART_VERSION=${KEDA_CHART_VERSION:-2.19.0}
124+
# kubernetes: default false (cluster-managed KEDA); set true to let this script install/upgrade KEDA via Helm
125+
KEDA_HELM_INSTALL=${KEDA_HELM_INSTALL:-false}
123126

124127
# Environment-related variables
125128
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

deploy/lib/cleanup.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,14 @@
88
#
99

1010
undeploy_keda() {
11+
if [ "$ENVIRONMENT" = "openshift" ]; then
12+
log_info "OpenShift: skipping KEDA uninstall (platform-managed)"
13+
return
14+
fi
15+
if [ "$ENVIRONMENT" = "kubernetes" ] && [ "${KEDA_HELM_INSTALL:-false}" != "true" ]; then
16+
log_info "Kubernetes: skipping KEDA uninstall (cluster-managed; set KEDA_HELM_INSTALL=true if this script installed KEDA)"
17+
return
18+
fi
1119
log_info "Uninstalling KEDA..."
1220
helm uninstall "$KEDA_RELEASE_NAME" -n "$KEDA_NAMESPACE" 2>/dev/null || \
1321
log_warning "KEDA not found or already uninstalled"

deploy/lib/cli.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,11 @@ Environment Variables:
3838
INFRA_ONLY Deploy only infrastructure (default: false, same as --infra-only flag)
3939
SCALER_BACKEND Scaler backend: "prometheus-adapter" (default), "keda", or "none".
4040
prometheus-adapter: installs Prometheus Adapter and patches the external metrics APIService.
41-
keda: installs KEDA (or detects pre-installed) and skips Prometheus Adapter.
41+
keda: skips Prometheus Adapter; on kubernetes assumes cluster-managed KEDA (KEDA_HELM_INSTALL=true for Helm);
42+
kind-emulator installs KEDA via Helm when needed; OpenShift is platform-managed only.
4243
none: skips all scaler backend deployment. Use this on clusters that already have
4344
KEDA or another external metrics API installed (e.g. llmd benchmark clusters).
45+
KEDA_HELM_INSTALL When true with ENVIRONMENT=kubernetes, install/upgrade KEDA via Helm (default: false)
4446
KEDA_NAMESPACE Namespace for KEDA (default: keda-system)
4547
UNDEPLOY Undeploy mode (default: false)
4648
DELETE_NAMESPACES Delete namespaces after undeploy (default: false)

deploy/lib/infra_llmd.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ deploy_llm_d_infrastructure() {
136136
if [ "$DEPLOY_WVA" == "true" ] && [ "$VLLM_SVC_ENABLED" == "true" ]; then
137137
helm upgrade "$WVA_RELEASE_NAME" ${WVA_PROJECT}/charts/workload-variant-autoscaler \
138138
-n "$WVA_NS" --reuse-values \
139+
--set wva.namespaceScoped="${NAMESPACE_SCOPED:-true}" \
139140
--set vllmService.port="$VLLM_SVC_PORT" \
140141
--set vllmService.targetPort="$VLLM_SVC_PORT"
141142
fi
@@ -325,7 +326,9 @@ deploy_llm_d_infrastructure() {
325326
if [ -n "$DETECTED_POOL_GROUP" ]; then
326327
log_info "Detected InferencePool API group: $DETECTED_POOL_GROUP; upgrading WVA to watch it (scale-from-zero)"
327328
if helm upgrade "$WVA_RELEASE_NAME" ${WVA_PROJECT}/charts/workload-variant-autoscaler \
328-
-n "$WVA_NS" --reuse-values --set wva.poolGroup="$DETECTED_POOL_GROUP" --wait --timeout=60s; then
329+
-n "$WVA_NS" --reuse-values \
330+
--set wva.namespaceScoped="${NAMESPACE_SCOPED:-true}" \
331+
--set wva.poolGroup="$DETECTED_POOL_GROUP" --wait --timeout=60s; then
329332
log_success "WVA upgraded with wva.poolGroup=$DETECTED_POOL_GROUP"
330333
else
331334
log_warning "WVA upgrade with poolGroup failed - scale-from-zero may not see the InferencePool"

deploy/lib/infra_scaler_backend.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@
88

99
deploy_scaler_backend() {
1010
# Deploy scaler backend: KEDA, Prometheus Adapter, or none.
11-
# KEDA is supported on all environments. On OpenShift and CKS it is typically
12-
# pre-installed on the cluster; deploy_keda will detect and skip the install.
11+
# OpenShift: KEDA is never Helm-installed (platform-managed); see deploy_keda in scaler_runtime.sh.
12+
# Kubernetes: deploy_keda skips Helm by default (cluster-managed); KEDA_HELM_INSTALL=true enables Helm.
13+
# kind-emulator: installs KEDA via Helm when needed; on the Helm path, a shared-cluster guard skips the install when ClusterRole keda-operator exists but is not Helm-managed.
1314
# Use SCALER_BACKEND=none on clusters that already have an external metrics API
1415
# (e.g. llmd benchmark clusters with KEDA pre-installed) to avoid conflicts.
1516
if [ "$SCALER_BACKEND" = "keda" ]; then

deploy/lib/scaler_runtime.sh

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,37 @@ stop_apiservice_guard() {
6868
deploy_keda() {
6969
log_info "Deploying KEDA (scaler backend)..."
7070

71+
# OpenShift: KEDA is cluster-managed (OLM/operator); never Helm-install — avoids
72+
# ClusterRole/release conflicts with an existing platform KEDA.
73+
if [ "$ENVIRONMENT" = "openshift" ]; then
74+
log_info "OpenShift: assuming platform-managed KEDA — skipping Helm install"
75+
if kubectl get crd scaledobjects.keda.sh >/dev/null 2>&1; then
76+
log_success "KEDA ScaledObject CRD is available on the cluster"
77+
else
78+
if [ "$E2E_TESTS_ENABLED" = "true" ]; then
79+
log_error "OpenShift: scaledobjects.keda.sh CRD not found — install cluster KEDA before E2E (SCALER_BACKEND=keda)"
80+
exit 1
81+
fi
82+
log_warning "KEDA ScaledObject CRD not found — ScaledObject-based scaling will not work"
83+
fi
84+
return
85+
fi
86+
87+
# Kubernetes (e.g. CKS, shared clusters): assume cluster-managed KEDA; never install via Helm unless opted in with KEDA_HELM_INSTALL=true.
88+
if [ "$ENVIRONMENT" = "kubernetes" ] && [ "${KEDA_HELM_INSTALL:-false}" != "true" ]; then
89+
log_info "Kubernetes: assuming cluster-managed KEDA — skipping Helm (set KEDA_HELM_INSTALL=true to install via Helm)"
90+
if kubectl get crd scaledobjects.keda.sh >/dev/null 2>&1; then
91+
log_success "KEDA ScaledObject CRD is available on the cluster"
92+
else
93+
if [ "$E2E_TESTS_ENABLED" = "true" ]; then
94+
log_error "Kubernetes: scaledobjects.keda.sh CRD not found — install KEDA on the cluster or set KEDA_HELM_INSTALL=true"
95+
exit 1
96+
fi
97+
log_warning "KEDA ScaledObject CRD not found — ScaledObject-based scaling will not work"
98+
fi
99+
return
100+
fi
101+
71102
# Skip install if KEDA is already fully operational on the cluster.
72103
# Check CRD + operator pods + external metrics APIService to avoid false positives
73104
# from stale CRDs left behind after a prior uninstall.
@@ -78,6 +109,16 @@ deploy_keda() {
78109
return
79110
fi
80111
fi
112+
# Shared clusters (e.g. CKS) often pre-install KEDA without the exact pod label / APIService
113+
# shape our probe expects, but ClusterRole keda-operator already exists without Helm metadata.
114+
# Helm install then fails with ownership errors — skip Helm when that pattern is present.
115+
if kubectl get clusterrole keda-operator >/dev/null 2>&1; then
116+
keda_cr_managed_by=$(kubectl get clusterrole keda-operator -o jsonpath='{.metadata.labels.app\.kubernetes\.io/managed-by}' 2>/dev/null || true)
117+
if [ "$keda_cr_managed_by" != "Helm" ]; then
118+
log_info "KEDA CRD present and ClusterRole keda-operator is not Helm-managed — skipping Helm install (pre-installed KEDA)"
119+
return
120+
fi
121+
fi
81122
log_warning "KEDA ScaledObject CRD found but operator or metrics APIService not detected; proceeding with helm install"
82123
fi
83124

test/e2e/config.go

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@ type E2EConfig struct {
1010
testconfig.SharedConfig
1111

1212
// Feature gates
13-
ScaleToZeroEnabled bool // HPAScaleToZero feature gate
13+
// ScaleToZeroEnabled (env SCALE_TO_ZERO_ENABLED): when true, e2e assumes native HPAs may use minReplicas=0
14+
// ("scale-to-zero", which requires the HPAScaleToZero feature gate). Distinct from scale-from-zero (scaling up from zero replicas).
15+
ScaleToZeroEnabled bool
1416

1517
// Timeouts (seconds unless noted)
1618
PodReadyTimeout int // Wait for deployment/model pods ready
@@ -53,10 +55,10 @@ func LoadConfigFromEnv() E2EConfig {
5355
PrometheusAdapterProbeSec: testconfig.GetEnvInt("E2E_PROM_ADAPTER_PROBE_SEC", 90),
5456
}
5557

56-
// OpenShift clusters typically don't have the HPAScaleToZero feature gate
57-
// enabled, so attempting to create HPAs with minReplicas=0 will fail with:
58-
// "spec.minReplicas: Invalid value: 0: must be greater than or equal to 1"
59-
// Override the env var to prevent test failures on OpenShift.
58+
// OpenShift clusters typically don't have the HPAScaleToZero feature gate enabled, so native HPAs
59+
// cannot use minReplicas=0 ("scale-to-zero" on the HPA). Ignore SCALE_TO_ZERO_ENABLED there so e2e
60+
// does not assume that path (creation fails with: minReplicas must be >= 1).
61+
// Scale-from-zero (scaling workloads up from zero replicas) is separate; this block does not configure SCALER_BACKEND.
6062
if cfg.Environment == "openshift" && cfg.ScaleToZeroEnabled {
6163
cfg.ScaleToZeroEnabled = false
6264
}

test/e2e/scale_from_zero_test.go

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,12 @@ func cleanupScaleFromZeroResources() {
135135
}
136136

137137
// Scale-from-zero test validates that the WVA controller correctly detects pending requests
138-
// and scales up deployments from zero replicas. Requires GIE queuing (ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER
138+
// and scales up scale targets from zero replicas. Requires GIE queuing (ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER
139139
// on EPP from install when E2E_TESTS_ENABLED=true) and an InferenceObjective (applied below in BeforeAll).
140+
// This suite needs a scaler that allows minReplicas=0 on the scaled workload: either
141+
// SCALE_TO_ZERO_ENABLED=true where native HPA supports it (HPAScaleToZero), or SCALER_BACKEND=keda
142+
// (ScaledObject). OpenShift usually lacks HPAScaleToZero; e2e config ignores SCALE_TO_ZERO_ENABLED there,
143+
// so use SCALER_BACKEND=keda for this Describe when running on OpenShift.
140144
// On platforms without the HPAScaleToZero feature gate (e.g. OpenShift), set SCALER_BACKEND=keda
141145
// so the test uses a KEDA ScaledObject (which supports minReplicas=0) instead of a native HPA.
142146
var _ = Describe("Scale-From-Zero Feature", Serial, Label("full"), Ordered, func() {
@@ -153,8 +157,9 @@ var _ = Describe("Scale-From-Zero Feature", Serial, Label("full"), Ordered, func
153157
// HPAScaleToZero feature gate), SCALER_BACKEND=keda must be set so the
154158
// test creates a KEDA ScaledObject instead of a native HPA.
155159
if cfg.ScalerBackend != "keda" && !cfg.ScaleToZeroEnabled {
156-
Skip("Scale-from-zero requires SCALER_BACKEND=\"keda\" or ENABLE_SCALE_TO_ZERO=true; " +
157-
"current configuration does not support HPA minReplicas=0")
160+
Skip("This suite needs minReplicas=0 on the scaler: set SCALER_BACKEND=\"keda\" " +
161+
"or SCALE_TO_ZERO_ENABLED=true (ignored on OpenShift without HPAScaleToZero — use KEDA); " +
162+
"current configuration does not support that scaler shape")
158163
}
159164

160165
By("Cleaning up any existing scale-from-zero test resources")
@@ -619,8 +624,9 @@ var _ = Describe("Scale-From-Zero Feature with LeaderWorkerSet", Serial, Label("
619624
// HPAScaleToZero feature gate), SCALER_BACKEND=keda must be set so the
620625
// test creates a KEDA ScaledObject instead of a native HPA.
621626
if cfg.ScalerBackend != "keda" && !cfg.ScaleToZeroEnabled {
622-
Skip("Scale-from-zero requires SCALER_BACKEND=\"keda\" or ENABLE_SCALE_TO_ZERO=true; " +
623-
"current configuration does not support HPA minReplicas=0")
627+
Skip("This suite needs minReplicas=0 on the scaler: set SCALER_BACKEND=\"keda\" " +
628+
"or SCALE_TO_ZERO_ENABLED=true (ignored on OpenShift without HPAScaleToZero — use KEDA); " +
629+
"current configuration does not support that scaler shape")
624630
}
625631

626632
By("Cleaning up any existing scale-from-zero test resources")
@@ -1039,8 +1045,9 @@ var _ = Describe("Scale-From-Zero Feature with LeaderWorkerSet (single-node)", S
10391045
// HPAScaleToZero feature gate), SCALER_BACKEND=keda must be set so the
10401046
// test creates a KEDA ScaledObject instead of a native HPA.
10411047
if cfg.ScalerBackend != "keda" && !cfg.ScaleToZeroEnabled {
1042-
Skip("Scale-from-zero requires SCALER_BACKEND=\"keda\" or ENABLE_SCALE_TO_ZERO=true; " +
1043-
"current configuration does not support HPA minReplicas=0")
1048+
Skip("This suite needs minReplicas=0 on the scaler: set SCALER_BACKEND=\"keda\" " +
1049+
"or SCALE_TO_ZERO_ENABLED=true (ignored on OpenShift without HPAScaleToZero — use KEDA); " +
1050+
"current configuration does not support that scaler shape")
10441051
}
10451052

10461053
By("Cleaning up any existing scale-from-zero test resources")

0 commit comments

Comments
 (0)