
Commit 11c3130

add GIE queuing for scale from zero e2es (#849)
* add GIE queuing for scale from zero e2es
* makes sure leftover scale-from-zero resources are cleaned up
* autodetect inference_pool_api_group
* doc nits and clarify

Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com>
1 parent c05f24f commit 11c3130

12 files changed: 117 additions & 19 deletions


charts/workload-variant-autoscaler/templates/manager/wva-deployment-controller-manager.yaml

Lines changed: 4 additions & 0 deletions
@@ -62,6 +62,10 @@ spec:
             - name: CONTROLLER_INSTANCE
               value: {{ .Values.wva.controllerInstance | quote }}
             {{- end }}
+            {{- if .Values.wva.poolGroup }}
+            - name: POOL_GROUP
+              value: {{ .Values.wva.poolGroup | quote }}
+            {{- end }}
           name: manager
           ports:
             - name: healthz
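A quick render check for the new conditional (a sketch; `wva` is an illustrative release name, run from the repo root):

```bash
# Render the chart locally and confirm the POOL_GROUP env var is emitted
# only when wva.poolGroup is set (expect no output without the --set).
helm template wva charts/workload-variant-autoscaler \
  --set wva.poolGroup=inference.networking.k8s.io | grep -A1 'POOL_GROUP'
```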

charts/workload-variant-autoscaler/values.yaml

Lines changed: 6 additions & 0 deletions
@@ -56,6 +56,12 @@ wva:
   # Useful for parallel e2e tests where multiple WVA controllers run simultaneously
   controllerInstance: ""

+  # InferencePool API group to watch. Must match the API group used by InferencePools in the cluster.
+  # - inference.networking.x-k8s.io (default): v1alpha2 / x-k8s.io
+  # - inference.networking.k8s.io: v1 (used by GIE helm charts such as gaie-sim on kind-emulator)
+  # When empty, WVA defaults to inference.networking.x-k8s.io.
+  poolGroup: ""
+
   # Saturation-based scaling configuration
   # These thresholds determine when replicas are saturated and when to scale up
   capacityScaling:
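To pin the group explicitly rather than rely on the installer's auto-detection, the value can be set at deploy time (a sketch; the release and namespace names are illustrative):

```bash
# Install or upgrade WVA with a fixed InferencePool API group.
helm upgrade --install wva charts/workload-variant-autoscaler \
  -n workload-variant-autoscaler-system --create-namespace \
  --set wva.poolGroup=inference.networking.k8s.io
```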
deploy/inference-objective-e2e.yaml

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+# InferenceObjective for GIE queuing (scale-from-zero e2e and flow control).
+# Applied when E2E_TESTS_ENABLED or ENABLE_SCALE_TO_ZERO is true.
+# poolRef.name is templated by install.sh to match the deployed InferencePool.
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceObjective
+metadata:
+  name: e2e-default
+  namespace: NAMESPACE_PLACEHOLDER
+spec:
+  priority: 0
+  poolRef:
+    name: POOL_NAME_PLACEHOLDER
+    kind: InferencePool
+    group: inference.networking.x-k8s.io
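Outside install.sh, the placeholders can be substituted the same way the script does below (the namespace and pool name here are illustrative):

```bash
# Fill in the namespace and pool name, then apply the manifest.
sed -e 's/NAMESPACE_PLACEHOLDER/llm-d-sim/g' \
    -e 's/POOL_NAME_PLACEHOLDER/gaie-sim/g' \
    deploy/inference-objective-e2e.yaml | kubectl apply -f -
```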

deploy/install.sh

Lines changed: 56 additions & 4 deletions
@@ -503,6 +503,18 @@ set_wva_logging_level() {
   echo ""
 }

+# Detect which InferencePool API group is in use in the cluster (v1 vs v1alpha2).
+# Sets DETECTED_POOL_GROUP to inference.networking.k8s.io or inference.networking.x-k8s.io
+# so WVA can be upgraded to watch the correct group (required for scale-from-zero datastore).
+detect_inference_pool_api_group() {
+  DETECTED_POOL_GROUP=""
+  if [ -n "$(kubectl get inferencepools.inference.networking.k8s.io -A -o name --request-timeout=10s 2>/dev/null | head -1)" ]; then
+    DETECTED_POOL_GROUP="inference.networking.k8s.io"
+  elif [ -n "$(kubectl get inferencepools.inference.networking.x-k8s.io -A -o name --request-timeout=10s 2>/dev/null | head -1)" ]; then
+    DETECTED_POOL_GROUP="inference.networking.x-k8s.io"
+  fi
+}
+
 deploy_wva_controller() {
   log_info "Deploying Workload-Variant-Autoscaler..."
   log_info "Using image: $WVA_IMAGE_REPO:$WVA_IMAGE_TAG"
@@ -545,6 +557,7 @@ deploy_wva_controller() {
     --set wva.namespaceScoped=$NAMESPACE_SCOPED \
     --set wva.metrics.secure=$WVA_METRICS_SECURE \
     ${CONTROLLER_INSTANCE:+--set wva.controllerInstance=$CONTROLLER_INSTANCE} \
+    ${POOL_GROUP:+--set wva.poolGroup=$POOL_GROUP} \
     ${KV_SPARE_TRIGGER:+--set wva.capacityScaling.default.kvSpareTrigger=$KV_SPARE_TRIGGER} \
     ${QUEUE_SPARE_TRIGGER:+--set wva.capacityScaling.default.queueSpareTrigger=$QUEUE_SPARE_TRIGGER}

@@ -982,9 +995,9 @@ deploy_llm_d_infrastructure() {
     fi
   fi

-  # Patch llm-d-inference-scheduler deployment if scale-to-zero is enabled
-  if [ "$ENABLE_SCALE_TO_ZERO" == "true" ]; then
-    # Patch llm-d-inference-scheduler to enable flowcontrol and use new image
+  # Patch llm-d-inference-scheduler deployment to enable GIE flow control when scale-to-zero
+  # or e2e tests are enabled (required for scale-from-zero: queue metrics and queuing behavior).
+  if [ "$ENABLE_SCALE_TO_ZERO" == "true" ] || [ "$E2E_TESTS_ENABLED" == "true" ]; then
     log_info "Patching llm-d-inference-scheduler deployment to enable flowcontrol and use a new image"
     if kubectl get deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" &> /dev/null; then
       kubectl patch deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" --type='json' -p='[
@@ -1003,14 +1016,53 @@ deploy_llm_d_infrastructure() {
         }
       ]'
     else
-      log_warning "Skipping inference-scheduler patch for SCALE_TO_ZERO: Deployment $LLM_D_EPP_NAME not found in $LLMD_NS"
+      log_warning "Skipping inference-scheduler patch: Deployment $LLM_D_EPP_NAME not found in $LLMD_NS"
+    fi
+  fi
+
+  # Deploy InferenceObjective for GIE queuing when flow control is enabled (scale-from-zero / e2e).
+  # Enables gateway-level queuing so inference_extension_flow_control_queue_size is populated.
+  if [ "$ENABLE_SCALE_TO_ZERO" == "true" ] || [ "$E2E_TESTS_ENABLED" == "true" ]; then
+    if kubectl get crd inferenceobjectives.inference.networking.x-k8s.io &>/dev/null; then
+      local infobj_file="${WVA_PROJECT}/deploy/inference-objective-e2e.yaml"
+      if [ -f "$infobj_file" ]; then
+        local pool_ref_name="${RELEASE_NAME_POSTFIX:+gaie-$RELEASE_NAME_POSTFIX}"
+        pool_ref_name="${pool_ref_name:-gaie-$WELL_LIT_PATH_NAME}"
+        log_info "Applying InferenceObjective e2e-default (poolRef.name=$pool_ref_name) for GIE queuing"
+        if sed -e "s/NAMESPACE_PLACEHOLDER/${LLMD_NS}/g" -e "s/POOL_NAME_PLACEHOLDER/${pool_ref_name}/g" "$infobj_file" | kubectl apply -f -; then
+          log_success "InferenceObjective e2e-default applied"
+        else
+          log_warning "Failed to apply InferenceObjective (pool $pool_ref_name may not exist yet)"
+        fi
+      else
+        log_warning "InferenceObjective manifest not found at $infobj_file"
+      fi
+    else
+      log_warning "InferenceObjective CRD not found; GIE may not support InferenceObjective yet"
     fi
   fi

   log_info "Waiting for llm-d components to initialize..."
   kubectl wait --for=condition=Available deployment --all -n $LLMD_NS --timeout=60s || \
     log_warning "llm-d components are not ready yet - check 'kubectl get pods -n $LLMD_NS'"

+  # Align WVA with the InferencePool API group in use (scale-from-zero requires WVA to watch the same group).
+  # llm-d version determines whether pools are inference.networking.k8s.io (v1) or inference.networking.x-k8s.io (v1alpha2).
+  if [ "$DEPLOY_WVA" == "true" ]; then
+    detect_inference_pool_api_group
+    if [ -n "$DETECTED_POOL_GROUP" ]; then
+      log_info "Detected InferencePool API group: $DETECTED_POOL_GROUP; upgrading WVA to watch it (scale-from-zero)"
+      if helm upgrade "$WVA_RELEASE_NAME" ${WVA_PROJECT}/charts/workload-variant-autoscaler \
+        -n $WVA_NS --reuse-values --set wva.poolGroup=$DETECTED_POOL_GROUP --wait --timeout=60s; then
+        log_success "WVA upgraded with wva.poolGroup=$DETECTED_POOL_GROUP"
+      else
+        log_warning "WVA upgrade with poolGroup failed - scale-from-zero may not see the InferencePool"
+      fi
+    else
+      log_warning "Could not detect InferencePool API group - WVA may have empty datastore for scale-from-zero"
+    fi
+  fi
+
   # Deploy second model infrastructure for multi-model testing (limiter e2e tests)
   if [ "$MULTI_MODEL_TESTING" == "true" ]; then
     deploy_second_model_infrastructure
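After the upgrade path runs, the group WVA actually watches can be confirmed on the live deployment (a sketch; the namespace matches the chart defaults used in this script):

```bash
# Show the POOL_GROUP env var on the running WVA manager, if set.
kubectl get deployment -n workload-variant-autoscaler-system -o yaml \
  | grep -A1 'name: POOL_GROUP'
```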

deploy/kind-emulator/install.sh

Lines changed: 2 additions & 0 deletions
@@ -36,6 +36,8 @@ WVA_NS=${WVA_NS:-"workload-variant-autoscaler-system"}
 WVA_RECONCILE_INTERVAL=${WVA_RECONCILE_INTERVAL:-"60s"} # WVA controller reconcile interval - tests set 30s interval
 SKIP_TLS_VERIFY=true # Skip TLS verification in emulated environments
 WVA_LOG_LEVEL="debug" # WVA log level set to debug for emulated environments
+# Initial WVA pool group; install.sh auto-detects the actual InferencePool API group after llm-d deploy and upgrades WVA (scale-from-zero).
+POOL_GROUP=${POOL_GROUP:-"inference.networking.k8s.io"}

 # llm-d Configuration
 LLM_D_INFERENCE_SIM_IMG_REPO=${LLM_D_INFERENCE_SIM_IMG_REPO:-"ghcr.io/llm-d/llm-d-inference-sim"}
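Because the assignment uses `${POOL_GROUP:-...}` defaulting, the initial group can be overridden per run (a sketch assuming the script is invoked directly from the repo root):

```bash
# Start from the x-k8s.io group instead of the kind-emulator default.
POOL_GROUP=inference.networking.x-k8s.io ./deploy/kind-emulator/install.sh
```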

docs/developer-guide/testing.md

Lines changed: 2 additions & 0 deletions
@@ -159,6 +159,8 @@ This deploys:
 - Prometheus stack and Prometheus Adapter (or KEDA when `SCALER_BACKEND=keda`)
 - **No** VariantAutoscaling, HPA, or model services (tests create these)

+When `E2E_TESTS_ENABLED=true` (or `ENABLE_SCALE_TO_ZERO=true`), the deploy script also enables **GIE queuing** so scale-from-zero tests can run: it patches the EPP with `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER=true` and applies an **InferenceObjective** (`e2e-default`) that references the default InferencePool. This ensures the metric `inference_extension_flow_control_queue_size` is populated when requests hit the gateway.
+
 Alternatively, use the Makefile to deploy infra and run tests in one go:

 ```bash
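To confirm queuing is active after an infra deploy, the EPP metric can be scraped directly (a sketch; the service name follows the kind-emulator default, while the namespace and the 9090 metrics port are assumptions):

```bash
# Forward the EPP metrics port and look for the flow-control queue gauge.
kubectl port-forward -n llm-d-sim svc/gaie-sim-epp 9090:9090 &
curl -s localhost:9090/metrics | grep inference_extension_flow_control_queue_size
```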

docs/developer-guide/troubleshooting.md

Lines changed: 7 additions & 1 deletion
@@ -13,7 +13,9 @@
    kubectl get inferencepool
    ```

-   **Solution**: Ensure InferencePool is created and reconciled before creating VariantAutoscaling.
+   WVA watches a single InferencePool API group (`inference.networking.k8s.io` or `inference.networking.x-k8s.io`). If the cluster's pools use the other group, the datastore stays empty and scale-from-zero never gets a recommendation.
+
+   **Solution**: Ensure InferencePool is created and reconciled before creating VariantAutoscaling. When using `deploy/install.sh` with llm-d (e.g. kind-emulator or CI), the script auto-detects the pool API group after the llm-d deploy and upgrades WVA with the correct `wva.poolGroup`, so both local and CI runs work regardless of llm-d version.

 2. **Labels mismatch**:
    ```bash
@@ -54,6 +56,10 @@

   **Solution**: Verify requests are being sent to the correct model endpoint.

+### E2E and infra-only deploys
+
+For e2e and infra-only deploys, the install script enables EPP flow control and optionally applies an InferenceObjective when `E2E_TESTS_ENABLED=true` or `ENABLE_SCALE_TO_ZERO=true`. See [deploy/install.sh](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/install.sh) and [deploy/inference-objective-e2e.yaml](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/inference-objective-e2e.yaml).
+
 ## Slow Scale-Up Response

 **Symptom**: Deployment takes too long to scale up from zero.

docs/user-guide/scale-from-zero.md

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ The ScaleFromZero engine continuously monitors inactive VariantAutoscaling resou
 ## Prerequisites

 - WVA and llm-d installed and running - deployment options available for [kind](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/kind-emulator/README.md), [OpenShift](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/openshift/README.md) and [Kubernetes](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/kubernetes/README.md)
-- EndpointPicker (EPP) configured with flowcontrol enabled - required for queue metrics collection (set EPP env variable `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER`)
+- **EPP flow control**: EndpointPicker (EPP) with flow control enabled (set EPP env `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER=true`) so the queue metric `inference_extension_flow_control_queue_size` is collected. InferenceObjective is not required to enable this metric; it is a QoS policy for priority-based scheduling and optional for scale-from-zero.


 ## Usage
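For clusters deployed without the install script, the flow-control prerequisite above can be switched on directly (a sketch; the EPP deployment name and namespace are illustrative):

```bash
# Enable the experimental flow-control layer on the EPP deployment.
kubectl set env deployment/gaie-sim-epp -n llm-d-sim \
  ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER=true
```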

test/e2e/README.md

Lines changed: 2 additions & 1 deletion
@@ -216,7 +216,8 @@ ginkgo -v --label-filter="smoke" ./test/e2e/
    - Verify independent scaling per VA

 3. **Scale-From-Zero** (~7 min)
-   - Create HPA with minReplicas=0
+   - Requires EPP flow control enabled so the metric `inference_extension_flow_control_queue_size` is populated (InferenceObjective is not required for this metric). When deploying infra with `E2E_TESTS_ENABLED=true` (or `ENABLE_SCALE_TO_ZERO=true`), the install script enables flow control on the EPP and optionally applies an InferenceObjective for e2e.
+   - Create HPA (or KEDA ScaledObject) with minReplicas=0
    - Verify deployment scales to 0 when idle
    - Generate first request, verify scale-up from 0 → 1
    - Verify request queuing during cold start
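The minReplicas=0 step depends on the cluster allowing scale-to-zero HPAs, which requires the `HPAScaleToZero` feature gate; a minimal sketch of relaxing an existing HPA (the HPA name and namespace are illustrative):

```bash
# Fails with a validation error unless the HPAScaleToZero gate is enabled.
kubectl patch hpa vllm-hpa -n llm-d-sim \
  --type merge -p '{"spec":{"minReplicas":0}}'
```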

test/e2e/config.go

Lines changed: 10 additions & 3 deletions
@@ -57,9 +57,16 @@

 // LoadConfigFromEnv reads e2e test configuration from environment variables
 func LoadConfigFromEnv() E2EConfig {
+	env := getEnv("ENVIRONMENT", "kind-emulator")
+	eppServiceDefault := "gaie-inference-scheduling-epp"
+	if env == "kind-emulator" {
+		// kind-emulator deploy uses gaie-<NAMESPACE_SUFFIX>-epp with NAMESPACE_SUFFIX=sim
+		eppServiceDefault = "gaie-sim-epp"
+	}
+
 	cfg := E2EConfig{
 		// Cluster defaults
-		Environment: getEnv("ENVIRONMENT", "kind-emulator"),
+		Environment: env,
 		Kubeconfig:  getEnv("KUBECONFIG", os.Getenv("HOME")+"/.kube/config"),

 		// Namespace defaults
@@ -78,11 +85,11 @@ func LoadConfigFromEnv() E2EConfig {
 		ScalerBackend: getEnv("SCALER_BACKEND", "prometheus-adapter"),
 		KEDANamespace: getEnv("KEDA_NAMESPACE", "keda-system"),

-		// EPP defaults
+		// EPP defaults (kind-emulator uses gaie-sim-epp; other envs use gaie-inference-scheduling-epp)
 		EPPMode:          getEnv("EPP_MODE", "poolName"),
 		PoolName:         getEnv("POOL_NAME", ""),
 		EndpointSelector: parseEndpointSelector(getEnv("ENDPOINT_SELECTOR", "")),
-		EPPServiceName:   getEnv("EPP_SERVICE_NAME", "gaie-inference-scheduling-epp"),
+		EPPServiceName:   getEnv("EPP_SERVICE_NAME", eppServiceDefault),

 		// Model defaults
 		ModelID: getEnv("MODEL_ID", "unsloth/Meta-Llama-3.1-8B"),
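Since `getEnv` falls back to the default only when the variable is unset, non-standard layouts can still point the suite at their EPP service (the service name is illustrative; the ginkgo invocation matches the README above):

```bash
# Override the EPP service default when running the e2e suite.
ENVIRONMENT=kind-emulator EPP_SERVICE_NAME=my-custom-epp ginkgo -v ./test/e2e/
```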
