Skip to content

Commit 89ef12b

Browse files
authored
test(e2e): remove load generation and keep deterministic correctness (llm-d#937)
* cleanup e2es and remove load tests Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * address review Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * add comment for clarity for prom adapter restarts Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * add saturation v1 focused tests Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * magic numbers to named constants Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * rm hard coded deployment name Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> * add comment regarding workload builder for benchmarking Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com> --------- Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com>
1 parent cac1a05 commit 89ef12b

26 files changed

Lines changed: 1295 additions & 2442 deletions

Makefile

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ deploy-e2e-infra: ## Deploy e2e test infrastructure (infra-only: WVA + llm-d, no
211211
# Deploy e2e infrastructure with KEDA as scaler backend (installs KEDA, skips Prometheus Adapter).
212212
# Runs a subset of smoke tests from the e2e suite.
213213
.PHONY: test-e2e-smoke
214-
test-e2e-smoke: manifests generate fmt vet ## Run smoke e2e tests
214+
test-e2e-smoke: ## Run smoke e2e tests
215215
@echo "Running smoke e2e tests..."
216216
$(eval FOCUS_ARGS := $(if $(FOCUS),-ginkgo.focus="$(FOCUS)",))
217217
$(eval SKIP_ARGS := $(if $(SKIP),-ginkgo.skip="$(SKIP)",))
@@ -224,8 +224,6 @@ test-e2e-smoke: manifests generate fmt vet ## Run smoke e2e tests
224224
SCALE_TO_ZERO_ENABLED=$(SCALE_TO_ZERO_ENABLED) \
225225
SCALER_BACKEND=$(SCALER_BACKEND) \
226226
MODEL_ID=$(MODEL_ID) \
227-
REQUEST_RATE=$(REQUEST_RATE) \
228-
NUM_PROMPTS=$(NUM_PROMPTS) \
229227
go test ./test/e2e/ -timeout 20m -v -ginkgo.v \
230228
-ginkgo.label-filter="smoke" $(FOCUS_ARGS) $(SKIP_ARGS); \
231229
TEST_EXIT_CODE=$$?; \
@@ -237,7 +235,7 @@ test-e2e-smoke: manifests generate fmt vet ## Run smoke e2e tests
237235

238236
# Runs the complete e2e test suite (excluding flaky tests).
239237
.PHONY: test-e2e-full
240-
test-e2e-full: manifests generate fmt vet ## Run full e2e test suite
238+
test-e2e-full: ## Run full e2e test suite
241239
@echo "Running full e2e test suite..."
242240
$(eval FOCUS_ARGS := $(if $(FOCUS),-ginkgo.focus="$(FOCUS)",))
243241
$(eval SKIP_ARGS := $(if $(SKIP),-ginkgo.skip="$(SKIP)",))
@@ -248,8 +246,6 @@ test-e2e-full: manifests generate fmt vet ## Run full e2e test suite
248246
SCALE_TO_ZERO_ENABLED=$(SCALE_TO_ZERO_ENABLED) \
249247
SCALER_BACKEND=$(SCALER_BACKEND) \
250248
MODEL_ID=$(MODEL_ID) \
251-
REQUEST_RATE=$(REQUEST_RATE) \
252-
NUM_PROMPTS=$(NUM_PROMPTS) \
253249
go test ./test/e2e/ -timeout 35m -v -ginkgo.v \
254250
-ginkgo.label-filter="full && !flaky" $(FOCUS_ARGS) $(SKIP_ARGS); \
255251
TEST_EXIT_CODE=$$?; \

deploy/install.sh

Lines changed: 49 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,14 @@ log_error() {
156156
exit 1
157157
}
158158

159+
# Helm repo update behavior:
160+
# - Default: DO NOT skip (`helm repo update` runs)
161+
# - Opt-in: set `SKIP_HELM_REPO_UPDATE=true` to skip (faster, but requires repo indexes to already exist)
162+
should_skip_helm_repo_update() {
163+
local skip="${SKIP_HELM_REPO_UPDATE:-false}"
164+
echo "$skip"
165+
}
166+
159167
# APIService guard: background loop that continuously ensures the
160168
# v1beta1.external.metrics.k8s.io APIService points to prometheus-adapter.
161169
# On clusters with KEDA, the operator continuously reconciles the APIService
@@ -1077,13 +1085,37 @@ deploy_llm_d_infrastructure() {
10771085
fi
10781086
fi
10791087

1080-
# Model-serving pods (vLLM) can take several minutes to download and load
1081-
# large models into GPU memory. The startupProbe allows up to 30m, so the
1082-
# wait timeout here must be long enough for the model to finish loading.
1083-
local DEPLOY_WAIT_TIMEOUT="${DEPLOY_WAIT_TIMEOUT:-600s}"
1084-
log_info "Waiting for llm-d components to initialize (timeout=${DEPLOY_WAIT_TIMEOUT})..."
1085-
kubectl wait --for=condition=Available deployment --all -n $LLMD_NS --timeout="$DEPLOY_WAIT_TIMEOUT" || \
1086-
log_warning "llm-d components are not ready yet - check 'kubectl get pods -n $LLMD_NS'"
1088+
# For deterministic e2e infra-only runs, avoid waiting on all llm-d deployments.
1089+
# The full wait often blocks on modelservice decode/prefill readiness, which is
1090+
# unnecessary for the e2e suite because tests create/manage their own workloads.
1091+
if [ "$E2E_TESTS_ENABLED" = "true" ] && [ "$INFRA_ONLY" = "true" ]; then
1092+
local E2E_DEPLOY_WAIT_TIMEOUT="${E2E_DEPLOY_WAIT_TIMEOUT:-120s}"
1093+
log_info "E2E infra-only mode: waiting for essential llm-d components (timeout=${E2E_DEPLOY_WAIT_TIMEOUT})..."
1094+
1095+
if kubectl get deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" &>/dev/null; then
1096+
kubectl wait --for=condition=Available "deployment/$LLM_D_EPP_NAME" -n "$LLMD_NS" --timeout="$E2E_DEPLOY_WAIT_TIMEOUT" || \
1097+
log_warning "EPP deployment not ready yet: $LLM_D_EPP_NAME"
1098+
else
1099+
log_warning "EPP deployment not found: $LLM_D_EPP_NAME"
1100+
fi
1101+
1102+
# Gateway deployment name includes release prefix and can vary by environment.
1103+
# Wait only if we can detect one, otherwise continue.
1104+
local gateway_deploy
1105+
gateway_deploy=$(kubectl get deployment -n "$LLMD_NS" -o name 2>/dev/null | grep "inference-gateway-istio" | head -1 || true)
1106+
if [ -n "$gateway_deploy" ]; then
1107+
kubectl wait --for=condition=Available "$gateway_deploy" -n "$LLMD_NS" --timeout="$E2E_DEPLOY_WAIT_TIMEOUT" || \
1108+
log_warning "Gateway deployment not ready yet: $gateway_deploy"
1109+
fi
1110+
else
1111+
# Model-serving pods (vLLM) can take several minutes to download and load
1112+
# large models into GPU memory. The startupProbe allows up to 30m, so the
1113+
# wait timeout here must be long enough for the model to finish loading.
1114+
local DEPLOY_WAIT_TIMEOUT="${DEPLOY_WAIT_TIMEOUT:-600s}"
1115+
log_info "Waiting for llm-d components to initialize (timeout=${DEPLOY_WAIT_TIMEOUT})..."
1116+
kubectl wait --for=condition=Available deployment --all -n $LLMD_NS --timeout="$DEPLOY_WAIT_TIMEOUT" || \
1117+
log_warning "llm-d components are not ready yet - check 'kubectl get pods -n $LLMD_NS'"
1118+
fi
10871119

10881120
# Align WVA with the InferencePool API group in use (scale-from-zero requires WVA to watch the same group).
10891121
# llm-d version determines whether pools are inference.networking.k8s.io (v1) or inference.networking.x-k8s.io (v1alpha2).
@@ -1130,7 +1162,11 @@ deploy_keda() {
11301162
kubectl create namespace "$KEDA_NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
11311163

11321164
helm repo add kedacore https://kedacore.github.io/charts 2>/dev/null || true
1133-
helm repo update
1165+
if [ "$(should_skip_helm_repo_update)" = "true" ]; then
1166+
log_info "Skipping helm repo update for KEDA (SKIP_HELM_REPO_UPDATE=true)"
1167+
else
1168+
helm repo update
1169+
fi
11341170

11351171
if ! helm upgrade -i keda kedacore/keda \
11361172
--version "$KEDA_CHART_VERSION" \
@@ -1193,7 +1229,11 @@ deploy_prometheus_adapter() {
11931229
# Add Prometheus community helm repo
11941230
log_info "Adding Prometheus community helm repo"
11951231
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts || true
1196-
helm repo update
1232+
if [ "$(should_skip_helm_repo_update)" = "true" ]; then
1233+
log_info "Skipping helm repo update for Prometheus Adapter (SKIP_HELM_REPO_UPDATE=true)"
1234+
else
1235+
helm repo update
1236+
fi
11971237

11981238
# Create prometheus-ca ConfigMap from the CA certificate
11991239
log_info "Creating prometheus-ca ConfigMap for Prometheus Adapter"

deploy/kind-emulator/README.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -192,21 +192,23 @@ kubectl port-forward -n llm-d-sim svc/infra-sim-inference-gateway 8000:80
192192
kubectl apply -f ../../config/samples/
193193
```
194194

195-
### 3. Generate Load
195+
### 3. Run E2E test
196196

197197
**Option A — Run E2E tests (recommended)**
198-
The e2e suite deploys infra, creates resources, generates load, and validates scaling. No manual load tool needed.
198+
The consolidated e2e suite (`test/e2e/`) exercises infra-only deploy, resource wiring, reconciliation, and deterministic correctness checks. For sustained load or benchmarking, use **Option B** or separate perf workflows — not required for e2e.
199199

200200
```bash
201201
# From repo root, after deploying (e.g. make deploy-wva-emulated-on-kind)
202202
make deploy-e2e-infra # if not already done
203203
make test-e2e-smoke # quick validation
204204
# or
205-
make test-e2e-full # full suite including saturation scaling
205+
make test-e2e-full # full suite (`full && !flaky`)
206206
```
207207

208208
See [Testing Guide](../../docs/developer-guide/testing.md) and [E2E Test Suite README](../../test/e2e/README.md).
209209

210+
### 4. Generate Load
211+
210212
**Option B — Manual load with burst script**
211213
Use the script in the e2e fixtures (requires only `curl`; no Python). After port-forwarding the inference gateway or vLLM service to `localhost:8000`:
212214

@@ -221,7 +223,7 @@ export BATCH_SIZE=10
221223

222224
Tune load with `TOTAL_REQUESTS`, `BATCH_SIZE`, and optional `BATCH_SLEEP`, `MAX_TOKENS`, `CURL_TIMEOUT` (see script header).
223225

224-
### 4. Monitor
226+
### 5. Monitor
225227

226228
```bash
227229
# Watch deployments scale

deploy/kind-emulator/install.sh

Lines changed: 5 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,6 @@ POOL_GROUP=${POOL_GROUP:-"inference.networking.k8s.io"}
4343
LLM_D_INFERENCE_SIM_IMG_REPO=${LLM_D_INFERENCE_SIM_IMG_REPO:-"ghcr.io/llm-d/llm-d-inference-sim"}
4444
LLM_D_INFERENCE_SIM_IMG_TAG=${LLM_D_INFERENCE_SIM_IMG_TAG:-"latest"}
4545

46-
# Load generator image (guidellm) - pre-loaded into Kind for faster e2e test startup
47-
GUIDELLM_IMG=${GUIDELLM_IMG:-"ghcr.io/vllm-project/guidellm:latest"}
4846
LLM_D_MODELSERVICE_NAME="ms-$NAMESPACE_SUFFIX-llm-d-modelservice"
4947
LLM_D_MODELSERVICE_VALUES="ms-$NAMESPACE_SUFFIX/values.yaml"
5048
LLM_D_EPP_NAME="gaie-$NAMESPACE_SUFFIX-epp"
@@ -132,9 +130,6 @@ check_specific_prerequisites() {
132130
# Load WVA image into KIND cluster
133131
load_image
134132

135-
# Pre-load guidellm image so e2e load jobs don't need to pull at runtime
136-
preload_e2e_images
137-
138133
log_success "All Kind emulated deployment prerequisites met"
139134
}
140135

@@ -212,42 +207,6 @@ load_image() {
212207
log_success "Image '$WVA_IMAGE_REPO:$WVA_IMAGE_TAG' loaded into KIND cluster '$CLUSTER_NAME'"
213208
}
214209

215-
# Pre-loads e2e test images (guidellm load generator) into the Kind cluster
216-
# so that load generation jobs start quickly without runtime image pulls.
217-
preload_e2e_images() {
218-
if [ "${PRELOAD_E2E_IMAGES:-true}" = "false" ]; then
219-
log_info "Skipping e2e image pre-loading (PRELOAD_E2E_IMAGES=false)"
220-
return
221-
fi
222-
223-
log_info "Pre-loading e2e test images into Kind cluster..."
224-
225-
local platform="${KIND_IMAGE_PLATFORM:-}"
226-
if [ -z "$platform" ]; then
227-
case "$(uname -m)" in
228-
aarch64|arm64) platform="linux/arm64" ;;
229-
*) platform="linux/amd64" ;;
230-
esac
231-
fi
232-
233-
# Pre-load guidellm image (used by CreateLoadJob in e2e tests)
234-
if docker image inspect "$GUIDELLM_IMG" >/dev/null 2>&1; then
235-
log_info "guidellm image already exists locally, loading into Kind..."
236-
else
237-
log_info "Pulling guidellm image '$GUIDELLM_IMG' (platform=$platform)..."
238-
if ! docker pull --platform "$platform" "$GUIDELLM_IMG"; then
239-
log_warning "Failed to pull guidellm image - e2e load jobs will pull at runtime (slower)"
240-
return
241-
fi
242-
fi
243-
244-
if kind load docker-image "$GUIDELLM_IMG" --name "$CLUSTER_NAME"; then
245-
log_success "guidellm image loaded into Kind cluster"
246-
else
247-
log_warning "Failed to load guidellm image into Kind - e2e load jobs will pull at runtime"
248-
fi
249-
}
250-
251210
#### REQUIRED FUNCTION used by deploy/install.sh ####
252211
create_namespaces() {
253212
log_info "Creating namespaces..."
@@ -269,7 +228,11 @@ deploy_prometheus_stack() {
269228

270229
# Add helm repo
271230
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts || true
272-
helm repo update
231+
if [ "${SKIP_HELM_REPO_UPDATE:-}" = "true" ]; then
232+
log_info "Skipping helm repo update (SKIP_HELM_REPO_UPDATE=true)"
233+
else
234+
helm repo update
235+
fi
273236

274237
# Create self-signed TLS certificate for Prometheus
275238
log_info "Creating self-signed TLS certificate for Prometheus"

docs/developer-guide/testing.md

Lines changed: 17 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,10 @@ WVA provides a **single consolidated E2E suite** that runs on multiple environme
140140
- **Environments**: Kind (emulated), OpenShift, or generic Kubernetes
141141
- **Tiers**: Smoke (~5–10 min) for PRs; full suite (~15–25 min) for comprehensive validation
142142

143+
### Scope
144+
145+
E2E is intended to be a **deterministic correctness signal**: resource wiring, reconciliation, and stable invariants (e.g., CRs reconcile, status conditions are set, scalers are created and point at the right targets/metrics). Traffic generation and performance/benchmarking scenarios should live outside `test/e2e/`.
146+
143147
### Infra-Only Setup (Required Before Running Tests)
144148

145149
Tests expect **only** the WVA controller and llm-d infrastructure to be deployed; they create VariantAutoscaling resources, HPAs, and model services themselves. Use the install script in **infra-only** mode:
@@ -161,6 +165,11 @@ This deploys:
161165

162166
When `E2E_TESTS_ENABLED=true` (or `ENABLE_SCALE_TO_ZERO=true`), the deploy script also enables **GIE queuing** so scale-from-zero tests can run: it patches the EPP with `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER=true` and applies an **InferenceObjective** (`e2e-default`) that references the default InferencePool. This ensures the metric `inference_extension_flow_control_queue_size` is populated when requests hit the gateway.
163167

168+
**Install script tuning (optional, same variables as `deploy/install.sh`):**
169+
170+
- **`SKIP_HELM_REPO_UPDATE`**: When set to **`true`**, `helm repo update` is skipped during installs (faster, less network churn). Default runs `helm repo update` to refresh repo indexes.
171+
- **`E2E_DEPLOY_WAIT_TIMEOUT`**: For infra-only e2e deploys (`INFRA_ONLY=true` with `E2E_TESTS_ENABLED=true`), caps the `kubectl wait` for the EPP and inference-gateway deployments (default **`120s`**). Raise it if image pulls or rollouts routinely exceed that window.
172+
164173
Alternatively, use the Makefile to deploy infra and run tests in one go:
165174

166175
```bash
@@ -196,7 +205,7 @@ FOCUS="Basic VA lifecycle" make test-e2e-smoke
196205
### What the Suite Validates
197206

198207
- **Smoke (label `smoke`)**: Infrastructure readiness, basic VA lifecycle, target condition validation
199-
- **Full (label `full`)**: Saturation scaling (single and multiple VAs), scale-from-zero, scale-to-zero (when `SCALE_TO_ZERO_ENABLED=true`), limiter, pod scraping, parallel load scale-up
208+
- **Full (label `full`)**: Smoke plus additional deterministic correctness checks (scale-from-zero, limiter, pod scraping, etc.)
200209

201210
### Configuration
202211

@@ -208,8 +217,11 @@ Key environment variables (see [E2E Test Suite README](../../test/e2e/README.md)
208217
| `USE_SIMULATOR` | `true` | Emulated GPUs (true) or real vLLM (false) |
209218
| `SCALE_TO_ZERO_ENABLED` | `false` | Enable scale-to-zero tests (Kind supports both enabled and disabled) |
210219
| `SCALER_BACKEND` | `prometheus-adapter` | `prometheus-adapter` or `keda` (KEDA only for kind-emulator) |
211-
| `REQUEST_RATE` | `8` | Load generation: requests per second |
212-
| `NUM_PROMPTS` | `1000` | Load generation: total prompts |
220+
| `POD_READY_TIMEOUT` / `SCALE_UP_TIMEOUT` | `300` / `600` | Model ready vs longest scale/job waits (seconds) |
221+
| `E2E_EVENTUALLY_STANDARD`, etc. | see README | Optional `Eventually` timeouts and poll intervals (`E2E_EVENTUALLY_*`, `E2E_EVENTUALLY_POLL*`) |
222+
| `RESTART_PROMETHEUS_ADAPTER` | `auto` | kind-emulator: `auto` probes adapter + API before restarting pods; `true`/`false` force always/never |
223+
224+
Deploy-time knobs (passed through when you run `./deploy/install.sh` or `make deploy-e2e-infra`): `SKIP_HELM_REPO_UPDATE`, `E2E_DEPLOY_WAIT_TIMEOUT` — see **Install script tuning** above.
213225

214226
For running multiple test runs in parallel, use [multi-controller isolation](../user-guide/multi-controller-isolation.md) (`CONTROLLER_INSTANCE`).
215227

@@ -524,30 +536,9 @@ kubectl get events -n <namespace> --sort-by='.lastTimestamp'
524536
kubectl top nodes
525537
```
526538

527-
## Performance Testing
528-
529-
### Load Testing
530-
531-
For load testing, use the consolidated E2E suite with custom load parameters:
532-
533-
```bash
534-
# Kind (emulated): low / medium / heavy load
535-
REQUEST_RATE=8 NUM_PROMPTS=2000 make test-e2e-full
536-
REQUEST_RATE=20 NUM_PROMPTS=3000 make test-e2e-full
537-
REQUEST_RATE=40 NUM_PROMPTS=5000 make test-e2e-full
538-
539-
# OpenShift (real cluster)
540-
export ENVIRONMENT=openshift
541-
REQUEST_RATE=20 NUM_PROMPTS=3000 make test-e2e-full
542-
```
543-
544-
### Stress Testing
539+
## Performance / Benchmarking
545540

546-
Test system behavior under extreme conditions:
547-
- High request rates (50+ req/s)
548-
- Long-running load (30+ minutes)
549-
- Rapid load changes
550-
- Multiple concurrent variants
541+
Performance and benchmarking scenarios (traffic generation, throughput/latency measurement, scale-up latency, etc.) are intentionally **out of scope** for `test/e2e/` so that e2e remains deterministic. Use the project’s dedicated benchmarking tooling/workflows instead.
551542

552543
## Test Coverage Goals
553544

hack/burst_load_generator.sh

Lines changed: 0 additions & 79 deletions
This file was deleted.

hack/burst_load_generator.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../test/e2e/fixtures/burst_load_generator.sh

0 commit comments

Comments
 (0)