Skip to content

Commit 9ae201e

Browse files
committed
upgrade infra versions
Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com>
1 parent 0ee5ba1 commit 9ae201e

File tree

5 files changed

+161
-25
lines changed

5 files changed

+161
-25
lines changed

deploy/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -697,7 +697,7 @@ HPA_STABILIZATION_SECONDS=30 ./deploy/install.sh
697697
| `WVA_LOG_LEVEL` | WVA logging level | `info` |
698698
| `VLLM_SVC_ENABLED` | Enable vLLM Service | `true` |
699699
| `VLLM_SVC_NODEPORT` | vLLM NodePort | `30000` |
700-
| `LLM_D_RELEASE` | llm-d version | `v0.3.0` |
700+
| `LLM_D_RELEASE` | llm-d git ref (guides / Helmfile); newest tag is often older than default scheduler images — see `install.sh` comment | `v0.5.1` |
701701
| `VLLM_MAX_NUM_SEQS` | vLLM max concurrent sequences per replica | (unset - uses vLLM default) |
702702

703703
**vLLM Performance Tuning:**

deploy/install.sh

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,9 @@ CONTROLLER_INSTANCE=${CONTROLLER_INSTANCE:-""}
4747
# llm-d Configuration
4848
LLM_D_OWNER=${LLM_D_OWNER:-"llm-d"}
4949
LLM_D_PROJECT=${LLM_D_PROJECT:-"llm-d"}
50-
LLM_D_RELEASE=${LLM_D_RELEASE:-"v0.3.0"}
50+
# Git ref for cloning llm-d (guides, Helmfile, chart pins). Use the newest *tag* of llm-d; check the github.com/llm-d/llm-d repo for the latest tag.
51+
# Default EPP/sidecar images below therefore track newer *published* containers, while the charts come from this ref.
52+
LLM_D_RELEASE=${LLM_D_RELEASE:-"v0.5.1"}
5153
LLM_D_MODELSERVICE_NAME=${LLM_D_MODELSERVICE_NAME:-"ms-$WELL_LIT_PATH_NAME-llm-d-modelservice"}
5254
LLM_D_EPP_NAME=${LLM_D_EPP_NAME:-"gaie-$WELL_LIT_PATH_NAME-epp"}
5355
CLIENT_PREREQ_DIR=${CLIENT_PREREQ_DIR:-"$WVA_PROJECT/$LLM_D_PROJECT/guides/prereq/client-setup"}
@@ -57,9 +59,10 @@ LLM_D_MODELSERVICE_VALUES=${LLM_D_MODELSERVICE_VALUES:-"$EXAMPLE_DIR/ms-$WELL_LI
5759
ITL_AVERAGE_LATENCY_MS=${ITL_AVERAGE_LATENCY_MS:-20}
5860
TTFT_AVERAGE_LATENCY_MS=${TTFT_AVERAGE_LATENCY_MS:-200}
5961
ENABLE_SCALE_TO_ZERO=${ENABLE_SCALE_TO_ZERO:-true}
60-
# llm-d-inference scheduler with image with flowcontrol support
61-
# TODO: update once the llm-d-inference-scheduler v0.5.0 is released
62-
LLM_D_INFERENCE_SCHEDULER_IMG=${LLM_D_INFERENCE_SCHEDULER_IMG:-"ghcr.io/llm-d/llm-d-inference-scheduler:v0.5.0-rc.1"}
62+
# llm-d-inference-scheduler (EPP) image patched onto the helm deployment when flow control / e2e is enabled.
63+
LLM_D_INFERENCE_SCHEDULER_IMG=${LLM_D_INFERENCE_SCHEDULER_IMG:-"ghcr.io/llm-d/llm-d-inference-scheduler:v0.7.0"}
64+
# Routing sidecar must stay on the same minor release as the inference scheduler (shared gRPC / gateway integration).
65+
LLM_D_ROUTING_SIDECAR_IMG=${LLM_D_ROUTING_SIDECAR_IMG:-"ghcr.io/llm-d/llm-d-routing-sidecar:v0.7.0"}
6366

6467
# Gateway Configuration
6568
GATEWAY_PROVIDER=${GATEWAY_PROVIDER:-"istio"} # Options: kgateway, istio

deploy/lib/infra_llmd.sh

Lines changed: 151 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,140 @@
77
# containsElement(), wait_deployment_available_nonfatal(), detect_inference_pool_api_group().
88
#
99

10+
# Helm may reconcile the EPP Deployment before the inference-gateway-istio Deployment exists.
11+
# Without waiting, patch_llm_d_routing_sidecar_image runs once, finds nothing, and never retries.
12+
wait_for_inference_gateway_deployment() {
13+
local ns="$1"
14+
local max_sec="${2:-180}"
15+
local interval="${3:-5}"
16+
local elapsed=0
17+
local gw_deploy=""
18+
while [ "$elapsed" -lt "$max_sec" ]; do
19+
gw_deploy=$(kubectl get deployment -n "$ns" -o name 2>/dev/null | grep "inference-gateway-istio" | head -1 || true)
20+
if [ -n "$gw_deploy" ]; then
21+
log_info "Gateway deployment visible: $gw_deploy (waited ${elapsed}s)"
22+
return 0
23+
fi
24+
sleep "$interval"
25+
elapsed=$((elapsed + interval))
26+
done
27+
log_warning "No inference-gateway-istio deployment in $ns after ${max_sec}s"
28+
return 1
29+
}
30+
31+
# Patch EPP (llm-d-inference-scheduler) image, strip deprecated metric CLI flags, and ensure flow-control env.
32+
# GIE/EPP v1.4+ (scheduler v0.7.x) errors on flags like --kv-cache-usage-percentage-metric; older inferencepool
33+
# Helm charts still inject them (see gateway-api-inference-extension runner flag validation).
34+
patch_llm_d_inference_scheduler_epp() {
35+
local ns="$1"
36+
local deploy_name="$2"
37+
local img="$3"
38+
39+
if ! command -v jq &>/dev/null; then
40+
log_error "jq is required to patch EPP args when using scheduler v0.7+ (deprecated metric flags must be stripped)."
41+
return 1
42+
fi
43+
44+
local dep_json
45+
dep_json=$(kubectl get deployment "$deploy_name" -n "$ns" -o json) || return 1
46+
47+
local args_json env_json env_op patch_json
48+
# Named list: flags that take a single following argument (AGENTS.md / GIE v1.4 deprecations).
49+
args_json=$(echo "$dep_json" | jq -c '
50+
["--kv-cache-usage-percentage-metric","--total-queued-requests-metric","--total-running-requests-metric","--lora-info-metric","--cache-info-metric"] as $d |
51+
(.spec.template.spec.containers[0].args // []) |
52+
def walk($args; $i):
53+
if ($i >= ($args|length)) then []
54+
elif ($args[$i] as $f | ($d | index($f))) != null
55+
then walk($args; $i+2)
56+
else [$args[$i]] + walk($args; $i+1)
57+
end;
58+
walk(.; 0)
59+
') || return 1
60+
61+
env_json=$(echo "$dep_json" | jq -c '
62+
(.spec.template.spec.containers[0].env // []) as $e |
63+
if ($e | map(.name) | index("ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER")) != null then
64+
$e | map(if .name == "ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER" then .value = "true" else . end)
65+
else
66+
$e + [{"name":"ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER","value":"true"}]
67+
end
68+
') || return 1
69+
70+
env_op=$(echo "$dep_json" | jq -r 'if (.spec.template.spec.containers[0].env // null) | type == "array" then "replace" else "add" end')
71+
72+
patch_json=$(jq -n \
73+
--arg img "$img" \
74+
--argjson args "$args_json" \
75+
--argjson env "$env_json" \
76+
--arg env_op "$env_op" \
77+
'[{"op":"replace","path":"/spec/template/spec/containers/0/image","value":$img},
78+
{"op":"replace","path":"/spec/template/spec/containers/0/args","value":$args},
79+
{"op":$env_op,"path":"/spec/template/spec/containers/0/env","value":$env}]')
80+
81+
kubectl patch deployment "$deploy_name" -n "$ns" --type=json -p "$patch_json"
82+
}
83+
84+
# Add feature gate "flowControl" to the EPP plugins ConfigMap (default-plugins.yaml).
85+
# GIE v1.4+ sets admission mode from r.featureGates in parseConfigurationPhaseOne; ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER
86+
# only mutates rawConfig in phase two and does not refresh r.featureGates, so env alone leaves legacy admission on
87+
# (no inference_extension_flow_control_queue_size). See gateway-api-inference-extension cmd/epp/runner/runner.go.
88+
patch_epp_configmap_flow_control_feature_gate() {
89+
local ns="$1"
90+
local epp_name="$2"
91+
local cm_key="default-plugins.yaml"
92+
93+
if ! command -v yq &>/dev/null; then
94+
log_error "yq is required to patch EPP ConfigMap featureGates for flow control (scale-from-zero)."
95+
return 1
96+
fi
97+
if ! kubectl get cm "$epp_name" -n "$ns" &>/dev/null; then
98+
log_warning "ConfigMap $epp_name not found in $ns; skipping flow-control feature gate patch"
99+
return 0
100+
fi
101+
if ! kubectl get cm "$epp_name" -n "$ns" -o json | jq -e --arg k "$cm_key" '.data[$k] | type == "string"' &>/dev/null; then
102+
log_warning "ConfigMap $epp_name has no data key '$cm_key'; skipping flow-control feature gate patch"
103+
return 0
104+
fi
105+
106+
local original updated
107+
original=$(kubectl get cm "$epp_name" -n "$ns" -o json | jq -r --arg k "$cm_key" '.data[$k]') || return 1
108+
updated=$(echo "$original" | yq eval '.featureGates = ((.featureGates // []) + ["flowControl"] | unique)' -) || return 1
109+
110+
if [ "$original" = "$updated" ]; then
111+
log_info "EPP ConfigMap $epp_name already includes flowControl feature gate"
112+
return 0
113+
fi
114+
115+
local merge_patch
116+
merge_patch=$(jq -n --arg k "$cm_key" --arg v "$updated" '{data: {($k): $v}}') || return 1
117+
kubectl patch configmap "$epp_name" -n "$ns" --type merge -p "$merge_patch"
118+
log_success "Patched EPP ConfigMap $epp_name: featureGates += flowControl (GIE queue metrics for scale-from-zero)"
119+
}
120+
121+
# Keep Istio inference-gateway routing sidecar aligned with the EPP image (llm-d-inference-scheduler release).
122+
patch_llm_d_routing_sidecar_image() {
123+
local ns="$1"
124+
local img="$2"
125+
local gw_deploy
126+
gw_deploy=$(kubectl get deployment -n "$ns" -o name 2>/dev/null | grep "inference-gateway-istio" | head -1 || true)
127+
if [ -z "$gw_deploy" ]; then
128+
log_warning "No inference-gateway-istio deployment found in $ns; skipping routing sidecar image patch"
129+
return 0
130+
fi
131+
local patched=false
132+
for cname in sidecar routing-sidecar; do
133+
if kubectl set image "$gw_deploy" "${cname}=${img}" -n "$ns" &>/dev/null; then
134+
log_info "Patched routing sidecar container '$cname' on $gw_deploy to $img"
135+
patched=true
136+
break
137+
fi
138+
done
139+
if [ "$patched" = false ]; then
140+
log_warning "Could not patch routing sidecar on $gw_deploy (expected container name sidecar or routing-sidecar)"
141+
fi
142+
}
143+
10144
deploy_llm_d_infrastructure() {
11145
log_info "Deploying llm-d infrastructure..."
12146

@@ -44,7 +178,7 @@ deploy_llm_d_infrastructure() {
44178
# Only install Gateway API Inference Extension (GAIE) CRDs directly.
45179
if [[ "$ENVIRONMENT" == "openshift" ]]; then
46180
log_info "Skipping Gateway API base CRDs on OpenShift (managed by Ingress Operator)"
47-
GAIE_CRD_REV=${GATEWAY_API_INFERENCE_EXTENSION_CRD_REVISION:-"v1.3.0"}
181+
GAIE_CRD_REV=${GATEWAY_API_INFERENCE_EXTENSION_CRD_REVISION:-"v1.4.0"}
48182
log_info "Installing Gateway API Inference Extension CRDs (${GAIE_CRD_REV})"
49183
kubectl apply -k "https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd/?ref=${GAIE_CRD_REV}" \
50184
&& log_success "GAIE CRDs installed" \
@@ -243,23 +377,21 @@ deploy_llm_d_infrastructure() {
243377
# Patch llm-d-inference-scheduler deployment to enable GIE flow control when scale-to-zero
244378
# or e2e tests are enabled (required for scale-from-zero: queue metrics and queuing behavior).
245379
if [ "$ENABLE_SCALE_TO_ZERO" == "true" ] || [ "$E2E_TESTS_ENABLED" == "true" ]; then
246-
log_info "Patching llm-d-inference-scheduler deployment to enable flowcontrol and use a new image"
380+
log_info "Patching EPP ConfigMap + llm-d-inference-scheduler deployment for flow control (scale-from-zero)"
247381
if kubectl get deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" &> /dev/null; then
248-
kubectl patch deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" --type='json' -p='[
249-
{
250-
"op": "replace",
251-
"path": "/spec/template/spec/containers/0/image",
252-
"value": "'$LLM_D_INFERENCE_SCHEDULER_IMG'"
253-
},
254-
{
255-
"op": "add",
256-
"path": "/spec/template/spec/containers/0/env/-",
257-
"value": {
258-
"name": "ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER",
259-
"value": "true"
260-
}
261-
}
262-
]'
382+
# Must run before deployment rollout so new pods load EndpointPickerConfig with featureGates (not env alone).
383+
patch_epp_configmap_flow_control_feature_gate "$LLMD_NS" "$LLM_D_EPP_NAME" || {
384+
log_error "Failed to patch EPP ConfigMap $LLM_D_EPP_NAME for flowControl feature gate"
385+
exit 1
386+
}
387+
patch_llm_d_inference_scheduler_epp "$LLMD_NS" "$LLM_D_EPP_NAME" "$LLM_D_INFERENCE_SCHEDULER_IMG" || {
388+
log_error "Failed to patch EPP deployment $LLM_D_EPP_NAME"
389+
exit 1
390+
}
391+
# Sidecar must match scheduler; gateway object often appears shortly after EPP in the same apply.
392+
local gw_wait_sec="${E2E_GATEWAY_APPEAR_WAIT_SEC:-180}"
393+
wait_for_inference_gateway_deployment "$LLMD_NS" "$gw_wait_sec" 5 || true
394+
patch_llm_d_routing_sidecar_image "$LLMD_NS" "$LLM_D_ROUTING_SIDECAR_IMG"
263395
else
264396
log_warning "Skipping inference-scheduler patch: Deployment $LLM_D_EPP_NAME not found in $LLMD_NS"
265397
fi
@@ -291,7 +423,8 @@ deploy_llm_d_infrastructure() {
291423
# The full wait often blocks on modelservice decode/prefill readiness, which is
292424
# unnecessary for the e2e suite because tests create/manage their own workloads.
293425
if [ "$E2E_TESTS_ENABLED" = "true" ] && [ "$INFRA_ONLY" = "true" ]; then
294-
local E2E_DEPLOY_WAIT_TIMEOUT="${E2E_DEPLOY_WAIT_TIMEOUT:-120s}"
426+
# EPP image overrides (e.g. v0.7.0) can trigger Recreate rollouts + large pulls; 120s is often too tight.
427+
local E2E_DEPLOY_WAIT_TIMEOUT="${E2E_DEPLOY_WAIT_TIMEOUT:-300s}"
295428
log_info "E2E infra-only mode: waiting for essential llm-d components (timeout=${E2E_DEPLOY_WAIT_TIMEOUT})..."
296429

297430
if kubectl get deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" &>/dev/null; then

docs/developer-guide/testing.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ This deploys:
163163
- Prometheus stack and Prometheus Adapter (or KEDA when `SCALER_BACKEND=keda`)
164164
- **No** VariantAutoscaling, HPA, or model services (tests create these)
165165

166-
When `E2E_TESTS_ENABLED=true` (or `ENABLE_SCALE_TO_ZERO=true`), the deploy script enables **GIE queuing** by patching the EPP with `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER=true`. For **e2e**, the **InferenceObjective** `e2e-default` is created by the scale-from-zero tests (`test/e2e/fixtures`), not by `install.sh`. For non-e2e scale-to-zero (`ENABLE_SCALE_TO_ZERO=true` without e2e), `install.sh` still applies `deploy/inference-objective-e2e.yaml`. Queuing helps populate `inference_extension_flow_control_queue_size` when requests hit the gateway.
166+
When `E2E_TESTS_ENABLED=true` (or `ENABLE_SCALE_TO_ZERO=true`), the deploy script enables **GIE flow control** by patching the EPP **ConfigMap** (`default-plugins.yaml`) with `featureGates: [flowControl]` (required for GIE v1.4+ / scheduler v0.7.x: env-only `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER` does not flip the early feature-gate map) and patching the EPP Deployment (image, args, and the same env for compatibility). For **e2e**, the **InferenceObjective** `e2e-default` is created by the scale-from-zero tests (`test/e2e/fixtures`), not by `install.sh`. For non-e2e scale-to-zero (`ENABLE_SCALE_TO_ZERO=true` without e2e), `install.sh` still applies `deploy/inference-objective-e2e.yaml`. Queuing populates `inference_extension_flow_control_queue_size` when requests hit the gateway.
167167

168168
**Install script tuning (optional, same variables as `deploy/install.sh`):**
169169

docs/user-guide/scale-from-zero.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ The ScaleFromZero engine continuously monitors inactive VariantAutoscaling resou
4242
## Prerequisites
4343

4444
- WVA and llm-d installed and running - deployment options available for [kind](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/kind-emulator/README.md), [OpenShift](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/openshift/README.md) and [Kubernetes](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/kubernetes/README.md)
45-
- **EPP flow control**: EndpointPicker (EPP) with flow control enabled (set EPP env `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER=true`) so the queue metric `inference_extension_flow_control_queue_size` is collected. InferenceObjective is not required to enable this metric; it is a QoS policy for priority-based scheduling and optional for scale-from-zero.
45+
- **EPP flow control**: EndpointPicker (EPP) must run with the **flow control** feature enabled so the queue metric `inference_extension_flow_control_queue_size` is collected. For GIE v1.4+ (e.g. `llm-d-inference-scheduler` v0.7.x), enable the `flowControl` entry in **EndpointPickerConfig** `featureGates` (the WVA deploy scripts patch the EPP `default-plugins.yaml` ConfigMap accordingly). Env-only `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER=true` is not sufficient on those versions. InferenceObjective is not required to enable this metric; it is a QoS policy for priority-based scheduling and optional for scale-from-zero.
4646

4747

4848
## Usage

0 commit comments

Comments
 (0)