Skip to content

Commit 9ae201e

Browse files
committed
upgrade infra versions
Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com>
1 parent 0ee5ba1 commit 9ae201e

File tree

5 files changed

+161
-25
lines changed

5 files changed

+161
-25
lines changed

deploy/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -697,7 +697,7 @@ HPA_STABILIZATION_SECONDS=30 ./deploy/install.sh
697697
| `WVA_LOG_LEVEL` | WVA logging level | `info` |
698698
| `VLLM_SVC_ENABLED` | Enable vLLM Service | `true` |
699699
| `VLLM_SVC_NODEPORT` | vLLM NodePort | `30000` |
700-
| `LLM_D_RELEASE` | llm-d version | `v0.3.0` |
700+
| `LLM_D_RELEASE` | llm-d git ref (guides / Helmfile); newest tag is often older than default scheduler images — see `install.sh` comment | `v0.5.1` |
701701
| `VLLM_MAX_NUM_SEQS` | vLLM max concurrent sequences per replica | (unset - uses vLLM default) |
702702

703703
**vLLM Performance Tuning:**

deploy/install.sh

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,9 @@ CONTROLLER_INSTANCE=${CONTROLLER_INSTANCE:-""}
4747
# llm-d Configuration
4848
LLM_D_OWNER=${LLM_D_OWNER:-"llm-d"}
4949
LLM_D_PROJECT=${LLM_D_PROJECT:-"llm-d"}
50-
LLM_D_RELEASE=${LLM_D_RELEASE:-"v0.3.0"}
50+
# Git ref for cloning llm-d (guides, Helmfile, chart pins). Use the newest *tag* of llm-d; check the github.com/llm-d/llm-d repo for the latest tag.
51+
# Default EPP/sidecar images below therefore track newer *published* containers, while the charts come from this ref.
52+
LLM_D_RELEASE=${LLM_D_RELEASE:-"v0.5.1"}
5153
LLM_D_MODELSERVICE_NAME=${LLM_D_MODELSERVICE_NAME:-"ms-$WELL_LIT_PATH_NAME-llm-d-modelservice"}
5254
LLM_D_EPP_NAME=${LLM_D_EPP_NAME:-"gaie-$WELL_LIT_PATH_NAME-epp"}
5355
CLIENT_PREREQ_DIR=${CLIENT_PREREQ_DIR:-"$WVA_PROJECT/$LLM_D_PROJECT/guides/prereq/client-setup"}
@@ -57,9 +59,10 @@ LLM_D_MODELSERVICE_VALUES=${LLM_D_MODELSERVICE_VALUES:-"$EXAMPLE_DIR/ms-$WELL_LI
5759
ITL_AVERAGE_LATENCY_MS=${ITL_AVERAGE_LATENCY_MS:-20}
5860
TTFT_AVERAGE_LATENCY_MS=${TTFT_AVERAGE_LATENCY_MS:-200}
5961
ENABLE_SCALE_TO_ZERO=${ENABLE_SCALE_TO_ZERO:-true}
60-
# llm-d-inference scheduler with image with flowcontrol support
61-
# TODO: update once the llm-d-inference-scheduler v0.5.0 is released
62-
LLM_D_INFERENCE_SCHEDULER_IMG=${LLM_D_INFERENCE_SCHEDULER_IMG:-"ghcr.io/llm-d/llm-d-inference-scheduler:v0.5.0-rc.1"}
62+
# llm-d-inference-scheduler (EPP) image patched onto the helm deployment when flow control / e2e is enabled.
63+
LLM_D_INFERENCE_SCHEDULER_IMG=${LLM_D_INFERENCE_SCHEDULER_IMG:-"ghcr.io/llm-d/llm-d-inference-scheduler:v0.7.0"}
64+
# Routing sidecar must stay on the same minor release as the inference scheduler (shared gRPC / gateway integration).
65+
LLM_D_ROUTING_SIDECAR_IMG=${LLM_D_ROUTING_SIDECAR_IMG:-"ghcr.io/llm-d/llm-d-routing-sidecar:v0.7.0"}
6366

6467
# Gateway Configuration
6568
GATEWAY_PROVIDER=${GATEWAY_PROVIDER:-"istio"} # Options: kgateway, istio

deploy/lib/infra_llmd.sh

Lines changed: 151 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,140 @@
77
# containsElement(), wait_deployment_available_nonfatal(), detect_inference_pool_api_group().
88
#
99

10+
# Helm may reconcile the EPP Deployment before the inference-gateway-istio Deployment exists.
11+
# Without waiting, patch_llm_d_routing_sidecar_image runs once, finds nothing, and never retries.
12+
wait_for_inference_gateway_deployment() {
13+
local ns="$1"
14+
local max_sec="${2:-180}"
15+
local interval="${3:-5}"
16+
local elapsed=0
17+
local gw_deploy=""
18+
while [ "$elapsed" -lt "$max_sec" ]; do
19+
gw_deploy=$(kubectl get deployment -n "$ns" -o name 2>/dev/null | grep "inference-gateway-istio" | head -1 || true)
20+
if [ -n "$gw_deploy" ]; then
21+
log_info "Gateway deployment visible: $gw_deploy (waited ${elapsed}s)"
22+
return 0
23+
fi
24+
sleep "$interval"
25+
elapsed=$((elapsed + interval))
26+
done
27+
log_warning "No inference-gateway-istio deployment in $ns after ${max_sec}s"
28+
return 1
29+
}
30+
31+
# Patch EPP (llm-d-inference-scheduler) image, strip deprecated metric CLI flags, and ensure flow-control env.
32+
# GIE/EPP v1.4+ (scheduler v0.7.x) errors on flags like --kv-cache-usage-percentage-metric; older inferencepool
33+
# Helm charts still inject them (see gateway-api-inference-extension runner flag validation).
34+
patch_llm_d_inference_scheduler_epp() {
35+
local ns="$1"
36+
local deploy_name="$2"
37+
local img="$3"
38+
39+
if ! command -v jq &>/dev/null; then
40+
log_error "jq is required to patch EPP args when using scheduler v0.7+ (deprecated metric flags must be stripped)."
41+
return 1
42+
fi
43+
44+
local dep_json
45+
dep_json=$(kubectl get deployment "$deploy_name" -n "$ns" -o json) || return 1
46+
47+
local args_json env_json env_op patch_json
48+
# Named list: flags that take a single following argument (AGENTS.md / GIE v1.4 deprecations).
49+
args_json=$(echo "$dep_json" | jq -c '
50+
["--kv-cache-usage-percentage-metric","--total-queued-requests-metric","--total-running-requests-metric","--lora-info-metric","--cache-info-metric"] as $d |
51+
(.spec.template.spec.containers[0].args // []) |
52+
def walk($args; $i):
53+
if ($i >= ($args|length)) then []
54+
elif ($args[$i] as $f | ($d | index($f))) != null
55+
then walk($args; $i+2)
56+
else [$args[$i]] + walk($args; $i+1)
57+
end;
58+
walk(.; 0)
59+
') || return 1
60+
61+
env_json=$(echo "$dep_json" | jq -c '
62+
(.spec.template.spec.containers[0].env // []) as $e |
63+
if ($e | map(.name) | index("ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER")) != null then
64+
$e | map(if .name == "ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER" then .value = "true" else . end)
65+
else
66+
$e + [{"name":"ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER","value":"true"}]
67+
end
68+
') || return 1
69+
70+
env_op=$(echo "$dep_json" | jq -r 'if (.spec.template.spec.containers[0].env // null) | type == "array" then "replace" else "add" end')
71+
72+
patch_json=$(jq -n \
73+
--arg img "$img" \
74+
--argjson args "$args_json" \
75+
--argjson env "$env_json" \
76+
--arg env_op "$env_op" \
77+
'[{"op":"replace","path":"/spec/template/spec/containers/0/image","value":$img},
78+
{"op":"replace","path":"/spec/template/spec/containers/0/args","value":$args},
79+
{"op":$env_op,"path":"/spec/template/spec/containers/0/env","value":$env}]')
80+
81+
kubectl patch deployment "$deploy_name" -n "$ns" --type=json -p "$patch_json"
82+
}
83+
84+
# Add feature gate "flowControl" to the EPP plugins ConfigMap (default-plugins.yaml).
85+
# GIE v1.4+ sets admission mode from r.featureGates in parseConfigurationPhaseOne; ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER
86+
# only mutates rawConfig in phase two and does not refresh r.featureGates, so env alone leaves legacy admission on
87+
# (no inference_extension_flow_control_queue_size). See gateway-api-inference-extension cmd/epp/runner/runner.go.
88+
patch_epp_configmap_flow_control_feature_gate() {
89+
local ns="$1"
90+
local epp_name="$2"
91+
local cm_key="default-plugins.yaml"
92+
93+
if ! command -v yq &>/dev/null; then
94+
log_error "yq is required to patch EPP ConfigMap featureGates for flow control (scale-from-zero)."
95+
return 1
96+
fi
97+
if ! kubectl get cm "$epp_name" -n "$ns" &>/dev/null; then
98+
log_warning "ConfigMap $epp_name not found in $ns; skipping flow-control feature gate patch"
99+
return 0
100+
fi
101+
if ! kubectl get cm "$epp_name" -n "$ns" -o json | jq -e --arg k "$cm_key" '.data[$k] | type == "string"' &>/dev/null; then
102+
log_warning "ConfigMap $epp_name has no data key '$cm_key'; skipping flow-control feature gate patch"
103+
return 0
104+
fi
105+
106+
local original updated
107+
original=$(kubectl get cm "$epp_name" -n "$ns" -o json | jq -r --arg k "$cm_key" '.data[$k]') || return 1
108+
updated=$(echo "$original" | yq eval '.featureGates = ((.featureGates // []) + ["flowControl"] | unique)' -) || return 1
109+
110+
if [ "$original" = "$updated" ]; then
111+
log_info "EPP ConfigMap $epp_name already includes flowControl feature gate"
112+
return 0
113+
fi
114+
115+
local merge_patch
116+
merge_patch=$(jq -n --arg k "$cm_key" --arg v "$updated" '{data: {($k): $v}}') || return 1
117+
kubectl patch configmap "$epp_name" -n "$ns" --type merge -p "$merge_patch"
118+
log_success "Patched EPP ConfigMap $epp_name: featureGates += flowControl (GIE queue metrics for scale-from-zero)"
119+
}
120+
121+
# Keep Istio inference-gateway routing sidecar aligned with the EPP image (llm-d-inference-scheduler release).
122+
patch_llm_d_routing_sidecar_image() {
123+
local ns="$1"
124+
local img="$2"
125+
local gw_deploy
126+
gw_deploy=$(kubectl get deployment -n "$ns" -o name 2>/dev/null | grep "inference-gateway-istio" | head -1 || true)
127+
if [ -z "$gw_deploy" ]; then
128+
log_warning "No inference-gateway-istio deployment found in $ns; skipping routing sidecar image patch"
129+
return 0
130+
fi
131+
local patched=false
132+
for cname in sidecar routing-sidecar; do
133+
if kubectl set image "$gw_deploy" "${cname}=${img}" -n "$ns" &>/dev/null; then
134+
log_info "Patched routing sidecar container '$cname' on $gw_deploy to $img"
135+
patched=true
136+
break
137+
fi
138+
done
139+
if [ "$patched" = false ]; then
140+
log_warning "Could not patch routing sidecar on $gw_deploy (expected container name sidecar or routing-sidecar)"
141+
fi
142+
}
143+
10144
deploy_llm_d_infrastructure() {
11145
log_info "Deploying llm-d infrastructure..."
12146

@@ -44,7 +178,7 @@ deploy_llm_d_infrastructure() {
44178
# Only install Gateway API Inference Extension (GAIE) CRDs directly.
45179
if [[ "$ENVIRONMENT" == "openshift" ]]; then
46180
log_info "Skipping Gateway API base CRDs on OpenShift (managed by Ingress Operator)"
47-
GAIE_CRD_REV=${GATEWAY_API_INFERENCE_EXTENSION_CRD_REVISION:-"v1.3.0"}
181+
GAIE_CRD_REV=${GATEWAY_API_INFERENCE_EXTENSION_CRD_REVISION:-"v1.4.0"}
48182
log_info "Installing Gateway API Inference Extension CRDs (${GAIE_CRD_REV})"
49183
kubectl apply -k "https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd/?ref=${GAIE_CRD_REV}" \
50184
&& log_success "GAIE CRDs installed" \
@@ -243,23 +377,21 @@ deploy_llm_d_infrastructure() {
243377
# Patch llm-d-inference-scheduler deployment to enable GIE flow control when scale-to-zero
244378
# or e2e tests are enabled (required for scale-from-zero: queue metrics and queuing behavior).
245379
if [ "$ENABLE_SCALE_TO_ZERO" == "true" ] || [ "$E2E_TESTS_ENABLED" == "true" ]; then
246-
log_info "Patching llm-d-inference-scheduler deployment to enable flowcontrol and use a new image"
380+
log_info "Patching EPP ConfigMap + llm-d-inference-scheduler deployment for flow control (scale-from-zero)"
247381
if kubectl get deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" &> /dev/null; then
248-
kubectl patch deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" --type='json' -p='[
249-
{
250-
"op": "replace",
251-
"path": "/spec/template/spec/containers/0/image",
252-
"value": "'$LLM_D_INFERENCE_SCHEDULER_IMG'"
253-
},
254-
{
255-
"op": "add",
256-
"path": "/spec/template/spec/containers/0/env/-",
257-
"value": {
258-
"name": "ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER",
259-
"value": "true"
260-
}
261-
}
262-
]'
382+
# Must run before deployment rollout so new pods load EndpointPickerConfig with featureGates (not env alone).
383+
patch_epp_configmap_flow_control_feature_gate "$LLMD_NS" "$LLM_D_EPP_NAME" || {
384+
log_error "Failed to patch EPP ConfigMap $LLM_D_EPP_NAME for flowControl feature gate"
385+
exit 1
386+
}
387+
patch_llm_d_inference_scheduler_epp "$LLMD_NS" "$LLM_D_EPP_NAME" "$LLM_D_INFERENCE_SCHEDULER_IMG" || {
388+
log_error "Failed to patch EPP deployment $LLM_D_EPP_NAME"
389+
exit 1
390+
}
391+
# Sidecar must match scheduler; gateway object often appears shortly after EPP in the same apply.
392+
local gw_wait_sec="${E2E_GATEWAY_APPEAR_WAIT_SEC:-180}"
393+
wait_for_inference_gateway_deployment "$LLMD_NS" "$gw_wait_sec" 5 || true
394+
patch_llm_d_routing_sidecar_image "$LLMD_NS" "$LLM_D_ROUTING_SIDECAR_IMG"
263395
else
264396
log_warning "Skipping inference-scheduler patch: Deployment $LLM_D_EPP_NAME not found in $LLMD_NS"
265397
fi
@@ -291,7 +423,8 @@ deploy_llm_d_infrastructure() {
291423
# The full wait often blocks on modelservice decode/prefill readiness, which is
292424
# unnecessary for the e2e suite because tests create/manage their own workloads.
293425
if [ "$E2E_TESTS_ENABLED" = "true" ] && [ "$INFRA_ONLY" = "true" ]; then
294-
local E2E_DEPLOY_WAIT_TIMEOUT="${E2E_DEPLOY_WAIT_TIMEOUT:-120s}"
426+
# EPP image overrides (e.g. v0.7.0) can trigger Recreate rollouts + large pulls; 120s is often too tight.
427+
local E2E_DEPLOY_WAIT_TIMEOUT="${E2E_DEPLOY_WAIT_TIMEOUT:-300s}"
295428
log_info "E2E infra-only mode: waiting for essential llm-d components (timeout=${E2E_DEPLOY_WAIT_TIMEOUT})..."
296429

297430
if kubectl get deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" &>/dev/null; then

docs/developer-guide/testing.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ This deploys:
163163
- Prometheus stack and Prometheus Adapter (or KEDA when `SCALER_BACKEND=keda`)
164164
- **No** VariantAutoscaling, HPA, or model services (tests create these)
165165

166-
When `E2E_TESTS_ENABLED=true` (or `ENABLE_SCALE_TO_ZERO=true`), the deploy script enables **GIE queuing** by patching the EPP with `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER=true`. For **e2e**, the **InferenceObjective** `e2e-default` is created by the scale-from-zero tests (`test/e2e/fixtures`), not by `install.sh`. For non-e2e scale-to-zero (`ENABLE_SCALE_TO_ZERO=true` without e2e), `install.sh` still applies `deploy/inference-objective-e2e.yaml`. Queuing helps populate `inference_extension_flow_control_queue_size` when requests hit the gateway.
166+
When `E2E_TESTS_ENABLED=true` (or `ENABLE_SCALE_TO_ZERO=true`), the deploy script enables **GIE flow control** by patching the EPP **ConfigMap** (`default-plugins.yaml`) with `featureGates: [flowControl]` (required for GIE v1.4+ / scheduler v0.7.x: env-only `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER` does not flip the early feature-gate map) and patching the EPP Deployment (image, args, and the same env for compatibility). For **e2e**, the **InferenceObjective** `e2e-default` is created by the scale-from-zero tests (`test/e2e/fixtures`), not by `install.sh`. For non-e2e scale-to-zero (`ENABLE_SCALE_TO_ZERO=true` without e2e), `install.sh` still applies `deploy/inference-objective-e2e.yaml`. Queuing populates `inference_extension_flow_control_queue_size` when requests hit the gateway.
167167

168168
**Install script tuning (optional, same variables as `deploy/install.sh`):**
169169

docs/user-guide/scale-from-zero.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ The ScaleFromZero engine continuously monitors inactive VariantAutoscaling resou
4242
## Prerequisites
4343

4444
- WVA and llm-d installed and running - deployment options available for [kind](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/kind-emulator/README.md), [OpenShift](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/openshift/README.md) and [Kubernetes](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/kubernetes/README.md)
45-
- **EPP flow control**: EndpointPicker (EPP) with flow control enabled (set EPP env `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER=true`) so the queue metric `inference_extension_flow_control_queue_size` is collected. InferenceObjective is not required to enable this metric; it is a QoS policy for priority-based scheduling and optional for scale-from-zero.
45+
- **EPP flow control**: EndpointPicker (EPP) must run with the **flow control** feature enabled so the queue metric `inference_extension_flow_control_queue_size` is collected. For GIE v1.4+ (e.g. `llm-d-inference-scheduler` v0.7.x), enable the `flowControl` entry in **EndpointPickerConfig** `featureGates` (the WVA deploy scripts patch the EPP `default-plugins.yaml` ConfigMap accordingly). Env-only `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER=true` is not sufficient on those versions. InferenceObjective is not required to enable this metric; it is a QoS policy for priority-based scheduling and optional for scale-from-zero.
4646

4747

4848
## Usage

0 commit comments

Comments
 (0)