Skip to content

Commit 89ef12b

Browse files
authored
test(e2e): remove load generation and keep deterministic correctness (llm-d#937)
* cleanup e2es and remove load tests Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * address review Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * add comment for clarity for prom adapter restarts Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * add saturation v1 focused tests Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * magic numbers to named constants Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * rm hard coded deployment name Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * add comment regarding workload builder for benchmarking Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> --------- Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com>
1 parent cac1a05 commit 89ef12b

26 files changed

Lines changed: 1295 additions & 2442 deletions

Makefile

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ deploy-e2e-infra: ## Deploy e2e test infrastructure (infra-only: WVA + llm-d, no
211211
# Deploy e2e infrastructure with KEDA as scaler backend (installs KEDA, skips Prometheus Adapter).
212212
# Runs a subset of smoke tests from the e2e suite.
213213
.PHONY: test-e2e-smoke
214-
test-e2e-smoke: manifests generate fmt vet ## Run smoke e2e tests
214+
test-e2e-smoke: ## Run smoke e2e tests
215215
@echo "Running smoke e2e tests..."
216216
$(eval FOCUS_ARGS := $(if $(FOCUS),-ginkgo.focus="$(FOCUS)",))
217217
$(eval SKIP_ARGS := $(if $(SKIP),-ginkgo.skip="$(SKIP)",))
@@ -224,8 +224,6 @@ test-e2e-smoke: manifests generate fmt vet ## Run smoke e2e tests
224224
SCALE_TO_ZERO_ENABLED=$(SCALE_TO_ZERO_ENABLED) \
225225
SCALER_BACKEND=$(SCALER_BACKEND) \
226226
MODEL_ID=$(MODEL_ID) \
227-
REQUEST_RATE=$(REQUEST_RATE) \
228-
NUM_PROMPTS=$(NUM_PROMPTS) \
229227
go test ./test/e2e/ -timeout 20m -v -ginkgo.v \
230228
-ginkgo.label-filter="smoke" $(FOCUS_ARGS) $(SKIP_ARGS); \
231229
TEST_EXIT_CODE=$$?; \
@@ -237,7 +235,7 @@ test-e2e-smoke: manifests generate fmt vet ## Run smoke e2e tests
237235

238236
# Runs the complete e2e test suite (excluding flaky tests).
239237
.PHONY: test-e2e-full
240-
test-e2e-full: manifests generate fmt vet ## Run full e2e test suite
238+
test-e2e-full: ## Run full e2e test suite
241239
@echo "Running full e2e test suite..."
242240
$(eval FOCUS_ARGS := $(if $(FOCUS),-ginkgo.focus="$(FOCUS)",))
243241
$(eval SKIP_ARGS := $(if $(SKIP),-ginkgo.skip="$(SKIP)",))
@@ -248,8 +246,6 @@ test-e2e-full: manifests generate fmt vet ## Run full e2e test suite
248246
SCALE_TO_ZERO_ENABLED=$(SCALE_TO_ZERO_ENABLED) \
249247
SCALER_BACKEND=$(SCALER_BACKEND) \
250248
MODEL_ID=$(MODEL_ID) \
251-
REQUEST_RATE=$(REQUEST_RATE) \
252-
NUM_PROMPTS=$(NUM_PROMPTS) \
253249
go test ./test/e2e/ -timeout 35m -v -ginkgo.v \
254250
-ginkgo.label-filter="full && !flaky" $(FOCUS_ARGS) $(SKIP_ARGS); \
255251
TEST_EXIT_CODE=$$?; \

deploy/install.sh

Lines changed: 49 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,14 @@ log_error() {
156156
exit 1
157157
}
158158

159+
# Helm repo update behavior:
160+
# - Default: DO NOT skip (`helm repo update` runs)
161+
# - Opt-in: set `SKIP_HELM_REPO_UPDATE=true` to skip (faster, but requires repo indexes to already exist)
162+
should_skip_helm_repo_update() {
163+
local skip="${SKIP_HELM_REPO_UPDATE:-false}"
164+
echo "$skip"
165+
}
166+
159167
# APIService guard: background loop that continuously ensures the
160168
# v1beta1.external.metrics.k8s.io APIService points to prometheus-adapter.
161169
# On clusters with KEDA, the operator continuously reconciles the APIService
@@ -1077,13 +1085,37 @@ deploy_llm_d_infrastructure() {
10771085
fi
10781086
fi
10791087

1080-
# Model-serving pods (vLLM) can take several minutes to download and load
1081-
# large models into GPU memory. The startupProbe allows up to 30m, so the
1082-
# wait timeout here must be long enough for the model to finish loading.
1083-
local DEPLOY_WAIT_TIMEOUT="${DEPLOY_WAIT_TIMEOUT:-600s}"
1084-
log_info "Waiting for llm-d components to initialize (timeout=${DEPLOY_WAIT_TIMEOUT})..."
1085-
kubectl wait --for=condition=Available deployment --all -n $LLMD_NS --timeout="$DEPLOY_WAIT_TIMEOUT" || \
1086-
log_warning "llm-d components are not ready yet - check 'kubectl get pods -n $LLMD_NS'"
1088+
# For deterministic e2e infra-only runs, avoid waiting on all llm-d deployments.
1089+
# The full wait often blocks on modelservice decode/prefill readiness, which is
1090+
# unnecessary for the e2e suite because tests create/manage their own workloads.
1091+
if [ "$E2E_TESTS_ENABLED" = "true" ] && [ "$INFRA_ONLY" = "true" ]; then
1092+
local E2E_DEPLOY_WAIT_TIMEOUT="${E2E_DEPLOY_WAIT_TIMEOUT:-120s}"
1093+
log_info "E2E infra-only mode: waiting for essential llm-d components (timeout=${E2E_DEPLOY_WAIT_TIMEOUT})..."
1094+
1095+
if kubectl get deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" &>/dev/null; then
1096+
kubectl wait --for=condition=Available "deployment/$LLM_D_EPP_NAME" -n "$LLMD_NS" --timeout="$E2E_DEPLOY_WAIT_TIMEOUT" || \
1097+
log_warning "EPP deployment not ready yet: $LLM_D_EPP_NAME"
1098+
else
1099+
log_warning "EPP deployment not found: $LLM_D_EPP_NAME"
1100+
fi
1101+
1102+
# Gateway deployment name includes release prefix and can vary by environment.
1103+
# Wait only if we can detect one, otherwise continue.
1104+
local gateway_deploy
1105+
gateway_deploy=$(kubectl get deployment -n "$LLMD_NS" -o name 2>/dev/null | grep "inference-gateway-istio" | head -1 || true)
1106+
if [ -n "$gateway_deploy" ]; then
1107+
kubectl wait --for=condition=Available "$gateway_deploy" -n "$LLMD_NS" --timeout="$E2E_DEPLOY_WAIT_TIMEOUT" || \
1108+
log_warning "Gateway deployment not ready yet: $gateway_deploy"
1109+
fi
1110+
else
1111+
# Model-serving pods (vLLM) can take several minutes to download and load
1112+
# large models into GPU memory. The startupProbe allows up to 30m, so the
1113+
# wait timeout here must be long enough for the model to finish loading.
1114+
local DEPLOY_WAIT_TIMEOUT="${DEPLOY_WAIT_TIMEOUT:-600s}"
1115+
log_info "Waiting for llm-d components to initialize (timeout=${DEPLOY_WAIT_TIMEOUT})..."
1116+
kubectl wait --for=condition=Available deployment --all -n $LLMD_NS --timeout="$DEPLOY_WAIT_TIMEOUT" || \
1117+
log_warning "llm-d components are not ready yet - check 'kubectl get pods -n $LLMD_NS'"
1118+
fi
10871119

10881120
# Align WVA with the InferencePool API group in use (scale-from-zero requires WVA to watch the same group).
10891121
# llm-d version determines whether pools are inference.networking.k8s.io (v1) or inference.networking.x-k8s.io (v1alpha2).
@@ -1130,7 +1162,11 @@ deploy_keda() {
11301162
kubectl create namespace "$KEDA_NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
11311163

11321164
helm repo add kedacore https://kedacore.github.io/charts 2>/dev/null || true
1133-
helm repo update
1165+
if [ "$(should_skip_helm_repo_update)" = "true" ]; then
1166+
log_info "Skipping helm repo update for KEDA (SKIP_HELM_REPO_UPDATE=true)"
1167+
else
1168+
helm repo update
1169+
fi
11341170

11351171
if ! helm upgrade -i keda kedacore/keda \
11361172
--version "$KEDA_CHART_VERSION" \
@@ -1193,7 +1229,11 @@ deploy_prometheus_adapter() {
11931229
# Add Prometheus community helm repo
11941230
log_info "Adding Prometheus community helm repo"
11951231
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts || true
1196-
helm repo update
1232+
if [ "$(should_skip_helm_repo_update)" = "true" ]; then
1233+
log_info "Skipping helm repo update for Prometheus Adapter (SKIP_HELM_REPO_UPDATE=true)"
1234+
else
1235+
helm repo update
1236+
fi
11971237

11981238
# Create prometheus-ca ConfigMap from the CA certificate
11991239
log_info "Creating prometheus-ca ConfigMap for Prometheus Adapter"

deploy/kind-emulator/README.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -192,21 +192,23 @@ kubectl port-forward -n llm-d-sim svc/infra-sim-inference-gateway 8000:80
192192
kubectl apply -f ../../config/samples/
193193
```
194194

195-
### 3. Generate Load
195+
### 3. Run E2E test
196196

197197
**Option A — Run E2E tests (recommended)**
198-
The e2e suite deploys infra, creates resources, generates load, and validates scaling. No manual load tool needed.
198+
The consolidated e2e suite (`test/e2e/`) exercises infra-only deploy, resource wiring, reconciliation, and deterministic correctness checks. For sustained load or benchmarking, use **Option B** or separate perf workflows — not required for e2e.
199199

200200
```bash
201201
# From repo root, after deploying (e.g. make deploy-wva-emulated-on-kind)
202202
make deploy-e2e-infra # if not already done
203203
make test-e2e-smoke # quick validation
204204
# or
205-
make test-e2e-full # full suite including saturation scaling
205+
make test-e2e-full # full suite (`full && !flaky`)
206206
```
207207

208208
See [Testing Guide](../../docs/developer-guide/testing.md) and [E2E Test Suite README](../../test/e2e/README.md).
209209

210+
### 4. Generate Load
211+
210212
**Option B — Manual load with burst script**
211213
Use the script in the e2e fixtures (requires only `curl`; no Python). After port-forwarding the inference gateway or vLLM service to `localhost:8000`:
212214

@@ -221,7 +223,7 @@ export BATCH_SIZE=10
221223

222224
Tune load with `TOTAL_REQUESTS`, `BATCH_SIZE`, and optional `BATCH_SLEEP`, `MAX_TOKENS`, `CURL_TIMEOUT` (see script header).
223225

224-
### 4. Monitor
226+
### 5. Monitor
225227

226228
```bash
227229
# Watch deployments scale

deploy/kind-emulator/install.sh

Lines changed: 5 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,6 @@ POOL_GROUP=${POOL_GROUP:-"inference.networking.k8s.io"}
4343
LLM_D_INFERENCE_SIM_IMG_REPO=${LLM_D_INFERENCE_SIM_IMG_REPO:-"ghcr.io/llm-d/llm-d-inference-sim"}
4444
LLM_D_INFERENCE_SIM_IMG_TAG=${LLM_D_INFERENCE_SIM_IMG_TAG:-"latest"}
4545

46-
# Load generator image (guidellm) - pre-loaded into Kind for faster e2e test startup
47-
GUIDELLM_IMG=${GUIDELLM_IMG:-"ghcr.io/vllm-project/guidellm:latest"}
4846
LLM_D_MODELSERVICE_NAME="ms-$NAMESPACE_SUFFIX-llm-d-modelservice"
4947
LLM_D_MODELSERVICE_VALUES="ms-$NAMESPACE_SUFFIX/values.yaml"
5048
LLM_D_EPP_NAME="gaie-$NAMESPACE_SUFFIX-epp"
@@ -132,9 +130,6 @@ check_specific_prerequisites() {
132130
# Load WVA image into KIND cluster
133131
load_image
134132

135-
# Pre-load guidellm image so e2e load jobs don't need to pull at runtime
136-
preload_e2e_images
137-
138133
log_success "All Kind emulated deployment prerequisites met"
139134
}
140135

@@ -212,42 +207,6 @@ load_image() {
212207
log_success "Image '$WVA_IMAGE_REPO:$WVA_IMAGE_TAG' loaded into KIND cluster '$CLUSTER_NAME'"
213208
}
214209

215-
# Pre-loads e2e test images (guidellm load generator) into the Kind cluster
216-
# so that load generation jobs start quickly without runtime image pulls.
217-
preload_e2e_images() {
218-
if [ "${PRELOAD_E2E_IMAGES:-true}" = "false" ]; then
219-
log_info "Skipping e2e image pre-loading (PRELOAD_E2E_IMAGES=false)"
220-
return
221-
fi
222-
223-
log_info "Pre-loading e2e test images into Kind cluster..."
224-
225-
local platform="${KIND_IMAGE_PLATFORM:-}"
226-
if [ -z "$platform" ]; then
227-
case "$(uname -m)" in
228-
aarch64|arm64) platform="linux/arm64" ;;
229-
*) platform="linux/amd64" ;;
230-
esac
231-
fi
232-
233-
# Pre-load guidellm image (used by CreateLoadJob in e2e tests)
234-
if docker image inspect "$GUIDELLM_IMG" >/dev/null 2>&1; then
235-
log_info "guidellm image already exists locally, loading into Kind..."
236-
else
237-
log_info "Pulling guidellm image '$GUIDELLM_IMG' (platform=$platform)..."
238-
if ! docker pull --platform "$platform" "$GUIDELLM_IMG"; then
239-
log_warning "Failed to pull guidellm image - e2e load jobs will pull at runtime (slower)"
240-
return
241-
fi
242-
fi
243-
244-
if kind load docker-image "$GUIDELLM_IMG" --name "$CLUSTER_NAME"; then
245-
log_success "guidellm image loaded into Kind cluster"
246-
else
247-
log_warning "Failed to load guidellm image into Kind - e2e load jobs will pull at runtime"
248-
fi
249-
}
250-
251210
#### REQUIRED FUNCTION used by deploy/install.sh ####
252211
create_namespaces() {
253212
log_info "Creating namespaces..."
@@ -269,7 +228,11 @@ deploy_prometheus_stack() {
269228

270229
# Add helm repo
271230
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts || true
272-
helm repo update
231+
if [ "${SKIP_HELM_REPO_UPDATE:-}" = "true" ]; then
232+
log_info "Skipping helm repo update (SKIP_HELM_REPO_UPDATE=true)"
233+
else
234+
helm repo update
235+
fi
273236

274237
# Create self-signed TLS certificate for Prometheus
275238
log_info "Creating self-signed TLS certificate for Prometheus"

docs/developer-guide/testing.md

Lines changed: 17 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,10 @@ WVA provides a **single consolidated E2E suite** that runs on multiple environme
140140
- **Environments**: Kind (emulated), OpenShift, or generic Kubernetes
141141
- **Tiers**: Smoke (~5–10 min) for PRs; full suite (~15–25 min) for comprehensive validation
142142

143+
### Scope
144+
145+
E2E is intended to be a **deterministic correctness signal**: resource wiring, reconciliation, and stable invariants (e.g., CRs reconcile, status conditions are set, scalers are created and point at the right targets/metrics). Traffic generation and performance/benchmarking scenarios should live outside `test/e2e/`.
146+
143147
### Infra-Only Setup (Required Before Running Tests)
144148

145149
Tests expect **only** the WVA controller and llm-d infrastructure to be deployed; they create VariantAutoscaling resources, HPAs, and model services themselves. Use the install script in **infra-only** mode:
@@ -161,6 +165,11 @@ This deploys:
161165

162166
When `E2E_TESTS_ENABLED=true` (or `ENABLE_SCALE_TO_ZERO=true`), the deploy script also enables **GIE queuing** so scale-from-zero tests can run: it patches the EPP with `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER=true` and applies an **InferenceObjective** (`e2e-default`) that references the default InferencePool. This ensures the metric `inference_extension_flow_control_queue_size` is populated when requests hit the gateway.
163167

168+
**Install script tuning (optional, same variables as `deploy/install.sh`):**
169+
170+
- **`SKIP_HELM_REPO_UPDATE`**: When set to **`true`**, `helm repo update` is skipped during installs (faster, less network churn). Default runs `helm repo update` to refresh repo indexes.
171+
- **`E2E_DEPLOY_WAIT_TIMEOUT`**: For infra-only e2e deploys (`INFRA_ONLY=true` with `E2E_TESTS_ENABLED=true`), caps the `kubectl wait` for the EPP and inference-gateway deployments (default **`120s`**). Raise it if image pulls or rollouts routinely exceed that window.
172+
164173
Alternatively, use the Makefile to deploy infra and run tests in one go:
165174

166175
```bash
@@ -196,7 +205,7 @@ FOCUS="Basic VA lifecycle" make test-e2e-smoke
196205
### What the Suite Validates
197206

198207
- **Smoke (label `smoke`)**: Infrastructure readiness, basic VA lifecycle, target condition validation
199-
- **Full (label `full`)**: Saturation scaling (single and multiple VAs), scale-from-zero, scale-to-zero (when `SCALE_TO_ZERO_ENABLED=true`), limiter, pod scraping, parallel load scale-up
208+
- **Full (label `full`)**: Smoke plus additional deterministic correctness checks (scale-from-zero, limiter, pod scraping, etc.)
200209

201210
### Configuration
202211

@@ -208,8 +217,11 @@ Key environment variables (see [E2E Test Suite README](../../test/e2e/README.md)
208217
| `USE_SIMULATOR` | `true` | Emulated GPUs (true) or real vLLM (false) |
209218
| `SCALE_TO_ZERO_ENABLED` | `false` | Enable scale-to-zero tests (Kind supports both enabled and disabled) |
210219
| `SCALER_BACKEND` | `prometheus-adapter` | `prometheus-adapter` or `keda` (KEDA only for kind-emulator) |
211-
| `REQUEST_RATE` | `8` | Load generation: requests per second |
212-
| `NUM_PROMPTS` | `1000` | Load generation: total prompts |
220+
| `POD_READY_TIMEOUT` / `SCALE_UP_TIMEOUT` | `300` / `600` | Model ready vs longest scale/job waits (seconds) |
221+
| `E2E_EVENTUALLY_STANDARD`, etc. | see README | Optional `Eventually` timeouts and poll intervals (`E2E_EVENTUALLY_*`, `E2E_EVENTUALLY_POLL*`) |
222+
| `RESTART_PROMETHEUS_ADAPTER` | `auto` | kind-emulator: `auto` probes adapter + API before restarting pods; `true`/`false` force always/never |
223+
224+
Deploy-time knobs (passed through when you run `./deploy/install.sh` or `make deploy-e2e-infra`): `SKIP_HELM_REPO_UPDATE`, `E2E_DEPLOY_WAIT_TIMEOUT` — see **Install script tuning** above.
213225

214226
For running multiple test runs in parallel, use [multi-controller isolation](../user-guide/multi-controller-isolation.md) (`CONTROLLER_INSTANCE`).
215227

@@ -524,30 +536,9 @@ kubectl get events -n <namespace> --sort-by='.lastTimestamp'
524536
kubectl top nodes
525537
```
526538

527-
## Performance Testing
528-
529-
### Load Testing
530-
531-
For load testing, use the consolidated E2E suite with custom load parameters:
532-
533-
```bash
534-
# Kind (emulated): low / medium / heavy load
535-
REQUEST_RATE=8 NUM_PROMPTS=2000 make test-e2e-full
536-
REQUEST_RATE=20 NUM_PROMPTS=3000 make test-e2e-full
537-
REQUEST_RATE=40 NUM_PROMPTS=5000 make test-e2e-full
538-
539-
# OpenShift (real cluster)
540-
export ENVIRONMENT=openshift
541-
REQUEST_RATE=20 NUM_PROMPTS=3000 make test-e2e-full
542-
```
543-
544-
### Stress Testing
539+
## Performance / Benchmarking
545540

546-
Test system behavior under extreme conditions:
547-
- High request rates (50+ req/s)
548-
- Long-running load (30+ minutes)
549-
- Rapid load changes
550-
- Multiple concurrent variants
541+
Performance and benchmarking scenarios (traffic generation, throughput/latency measurement, scale-up latency, etc.) are intentionally **out of scope** for `test/e2e/` so that e2e remains deterministic. Use the project’s dedicated benchmarking tooling/workflows instead.
551542

552543
## Test Coverage Goals
553544

hack/burst_load_generator.sh

Lines changed: 0 additions & 79 deletions
This file was deleted.

hack/burst_load_generator.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../test/e2e/fixtures/burst_load_generator.sh

0 commit comments

Comments
 (0)