|
7 | 7 | # containsElement(), wait_deployment_available_nonfatal(), detect_inference_pool_api_group(). |
8 | 8 | # |
9 | 9 |
|
# Helm may reconcile the EPP Deployment before the inference-gateway-istio Deployment exists.
# Without waiting, patch_llm_d_routing_sidecar_image runs once, finds nothing, and never retries.
#
# Polls the namespace until a Deployment whose name contains "inference-gateway-istio"
# becomes visible, or until the timeout elapses.
#   $1 - namespace to watch
#   $2 - maximum seconds to wait (default: 180)
#   $3 - poll interval in seconds (default: 5)
# Returns 0 as soon as the Deployment appears, 1 on timeout.
wait_for_inference_gateway_deployment() {
  local namespace="$1"
  local timeout_sec="${2:-180}"
  local poll_sec="${3:-5}"
  local waited=0
  local found

  while [ "$waited" -lt "$timeout_sec" ]; do
    # `|| true` keeps `set -e`-style callers alive when grep matches nothing.
    found=$(kubectl get deployment -n "$namespace" -o name 2>/dev/null | grep "inference-gateway-istio" | head -1 || true)
    if [ -n "$found" ]; then
      log_info "Gateway deployment visible: $found (waited ${waited}s)"
      return 0
    fi
    sleep "$poll_sec"
    waited=$((waited + poll_sec))
  done

  log_warning "No inference-gateway-istio deployment in $namespace after ${timeout_sec}s"
  return 1
}
| 30 | + |
# Patch EPP (llm-d-inference-scheduler) image, strip deprecated metric CLI flags, and ensure flow-control env.
# GIE/EPP v1.4+ (scheduler v0.7.x) errors on flags like --kv-cache-usage-percentage-metric; older inferencepool
# Helm charts still inject them (see gateway-api-inference-extension runner flag validation).
#
#   $1 - namespace of the EPP Deployment
#   $2 - Deployment name
#   $3 - replacement container image
# Returns non-zero when jq is missing or any kubectl/jq step fails; callers treat that as fatal.
patch_llm_d_inference_scheduler_epp() {
  local ns="$1"
  local deploy_name="$2"
  local img="$3"

  if ! command -v jq &>/dev/null; then
    log_error "jq is required to patch EPP args when using scheduler v0.7+ (deprecated metric flags must be stripped)."
    return 1
  fi

  local dep_json
  dep_json=$(kubectl get deployment "$deploy_name" -n "$ns" -o json) || return 1

  local args_json env_json env_op patch_json
  # Named list: flags that take a single following argument (AGENTS.md / GIE v1.4 deprecations).
  # Charts may render a flag either as two tokens ("--flag value") or as one ("--flag=value");
  # both forms are dropped, otherwise the merged form would survive and still crash the scheduler.
  # NOTE: the recursive helper is named `strip` so it cannot be confused with jq's builtin `walk`.
  args_json=$(echo "$dep_json" | jq -c '
    ["--kv-cache-usage-percentage-metric","--total-queued-requests-metric","--total-running-requests-metric","--lora-info-metric","--cache-info-metric"] as $deprecated |
    (.spec.template.spec.containers[0].args // []) |
    def strip($args; $i):
      if $i >= ($args | length) then []
      elif ($deprecated | index($args[$i])) != null
      then strip($args; $i + 2)
      elif any($deprecated[]; . as $flag | $args[$i] | startswith($flag + "="))
      then strip($args; $i + 1)
      else [$args[$i]] + strip($args; $i + 1)
      end;
    strip(.; 0)
  ') || return 1

  # Idempotent env handling: update ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER in place if present,
  # otherwise append it, so repeated runs never duplicate the entry.
  env_json=$(echo "$dep_json" | jq -c '
    (.spec.template.spec.containers[0].env // []) as $e |
    if ($e | map(.name) | index("ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER")) != null then
      $e | map(if .name == "ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER" then .value = "true" else . end)
    else
      $e + [{"name":"ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER","value":"true"}]
    end
  ') || return 1

  # JSON-Patch "replace" fails when /env is absent, so pick "add" for that case.
  env_op=$(echo "$dep_json" | jq -r 'if (.spec.template.spec.containers[0].env // null) | type == "array" then "replace" else "add" end')

  patch_json=$(jq -n \
    --arg img "$img" \
    --argjson args "$args_json" \
    --argjson env "$env_json" \
    --arg env_op "$env_op" \
    '[{"op":"replace","path":"/spec/template/spec/containers/0/image","value":$img},
      {"op":"replace","path":"/spec/template/spec/containers/0/args","value":$args},
      {"op":$env_op,"path":"/spec/template/spec/containers/0/env","value":$env}]')

  kubectl patch deployment "$deploy_name" -n "$ns" --type=json -p "$patch_json"
}
| 83 | + |
# Add feature gate "flowControl" to the EPP plugins ConfigMap (default-plugins.yaml).
# GIE v1.4+ sets admission mode from r.featureGates in parseConfigurationPhaseOne; ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER
# only mutates rawConfig in phase two and does not refresh r.featureGates, so env alone leaves legacy admission on
# (no inference_extension_flow_control_queue_size). See gateway-api-inference-extension cmd/epp/runner/runner.go.
#
#   $1 - namespace of the ConfigMap
#   $2 - ConfigMap name (same as the EPP deployment name)
# Returns 0 when the gate is present (patched or already set) or when the ConfigMap/key is
# absent (best-effort skip); returns 1 when a required tool is missing or the patch fails.
patch_epp_configmap_flow_control_feature_gate() {
  local ns="$1"
  local epp_name="$2"
  local cm_key="default-plugins.yaml"

  if ! command -v yq &>/dev/null; then
    log_error "yq is required to patch EPP ConfigMap featureGates for flow control (scale-from-zero)."
    return 1
  fi
  # jq is also used below; guard it explicitly instead of failing mid-pipeline.
  if ! command -v jq &>/dev/null; then
    log_error "jq is required to patch EPP ConfigMap featureGates for flow control (scale-from-zero)."
    return 1
  fi

  # Fetch once: the existence check, key check, and value extraction all read the same
  # snapshot, avoiding redundant API round-trips and a race between separate gets.
  local cm_json
  if ! cm_json=$(kubectl get cm "$epp_name" -n "$ns" -o json 2>/dev/null); then
    log_warning "ConfigMap $epp_name not found in $ns; skipping flow-control feature gate patch"
    return 0
  fi
  if ! echo "$cm_json" | jq -e --arg k "$cm_key" '.data[$k] | type == "string"' &>/dev/null; then
    log_warning "ConfigMap $epp_name has no data key '$cm_key'; skipping flow-control feature gate patch"
    return 0
  fi

  local original updated
  original=$(echo "$cm_json" | jq -r --arg k "$cm_key" '.data[$k]') || return 1
  # Append + unique keeps the edit idempotent across reruns.
  updated=$(echo "$original" | yq eval '.featureGates = ((.featureGates // []) + ["flowControl"] | unique)' -) || return 1

  if [ "$original" = "$updated" ]; then
    log_info "EPP ConfigMap $epp_name already includes flowControl feature gate"
    return 0
  fi

  local merge_patch
  merge_patch=$(jq -n --arg k "$cm_key" --arg v "$updated" '{data: {($k): $v}}') || return 1
  # Check the patch result so we never log success after a failed apply.
  kubectl patch configmap "$epp_name" -n "$ns" --type merge -p "$merge_patch" || return 1
  log_success "Patched EPP ConfigMap $epp_name: featureGates += flowControl (GIE queue metrics for scale-from-zero)"
}
| 120 | + |
# Keep Istio inference-gateway routing sidecar aligned with the EPP image (llm-d-inference-scheduler release).
#   $1 - namespace to search for the gateway Deployment
#   $2 - sidecar image to set
# Best-effort: missing gateway or unpatchable container only logs a warning.
patch_llm_d_routing_sidecar_image() {
  local namespace="$1"
  local image="$2"

  # First Deployment whose name contains "inference-gateway-istio", if any.
  local gateway
  gateway=$(kubectl get deployment -n "$namespace" -o name 2>/dev/null | grep "inference-gateway-istio" | head -1 || true)
  if [ -z "$gateway" ]; then
    log_warning "No inference-gateway-istio deployment found in $namespace; skipping routing sidecar image patch"
    return 0
  fi

  # The sidecar's container name differs between chart versions; probe the known
  # candidates and stop at the first one kubectl accepts.
  local container
  for container in sidecar routing-sidecar; do
    if kubectl set image "$gateway" "${container}=${image}" -n "$namespace" &>/dev/null; then
      log_info "Patched routing sidecar container '$container' on $gateway to $image"
      return 0
    fi
  done

  log_warning "Could not patch routing sidecar on $gateway (expected container name sidecar or routing-sidecar)"
}
| 143 | + |
10 | 144 | deploy_llm_d_infrastructure() { |
11 | 145 | log_info "Deploying llm-d infrastructure..." |
12 | 146 |
|
@@ -44,7 +178,7 @@ deploy_llm_d_infrastructure() { |
44 | 178 | # Only install Gateway API Inference Extension (GAIE) CRDs directly. |
45 | 179 | if [[ "$ENVIRONMENT" == "openshift" ]]; then |
46 | 180 | log_info "Skipping Gateway API base CRDs on OpenShift (managed by Ingress Operator)" |
47 | | - GAIE_CRD_REV=${GATEWAY_API_INFERENCE_EXTENSION_CRD_REVISION:-"v1.3.0"} |
| 181 | + GAIE_CRD_REV=${GATEWAY_API_INFERENCE_EXTENSION_CRD_REVISION:-"v1.4.0"} |
48 | 182 | log_info "Installing Gateway API Inference Extension CRDs (${GAIE_CRD_REV})" |
49 | 183 | kubectl apply -k "https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd/?ref=${GAIE_CRD_REV}" \ |
50 | 184 | && log_success "GAIE CRDs installed" \ |
@@ -243,23 +377,21 @@ deploy_llm_d_infrastructure() { |
243 | 377 | # Patch llm-d-inference-scheduler deployment to enable GIE flow control when scale-to-zero |
244 | 378 | # or e2e tests are enabled (required for scale-from-zero: queue metrics and queuing behavior). |
245 | 379 | if [ "$ENABLE_SCALE_TO_ZERO" == "true" ] || [ "$E2E_TESTS_ENABLED" == "true" ]; then |
246 | | - log_info "Patching llm-d-inference-scheduler deployment to enable flowcontrol and use a new image" |
| 380 | + log_info "Patching EPP ConfigMap + llm-d-inference-scheduler deployment for flow control (scale-from-zero)" |
247 | 381 | if kubectl get deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" &> /dev/null; then |
248 | | - kubectl patch deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" --type='json' -p='[ |
249 | | - { |
250 | | - "op": "replace", |
251 | | - "path": "/spec/template/spec/containers/0/image", |
252 | | - "value": "'$LLM_D_INFERENCE_SCHEDULER_IMG'" |
253 | | - }, |
254 | | - { |
255 | | - "op": "add", |
256 | | - "path": "/spec/template/spec/containers/0/env/-", |
257 | | - "value": { |
258 | | - "name": "ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER", |
259 | | - "value": "true" |
260 | | - } |
261 | | - } |
262 | | - ]' |
| 382 | + # Must run before deployment rollout so new pods load EndpointPickerConfig with featureGates (not env alone). |
| 383 | + patch_epp_configmap_flow_control_feature_gate "$LLMD_NS" "$LLM_D_EPP_NAME" || { |
| 384 | + log_error "Failed to patch EPP ConfigMap $LLM_D_EPP_NAME for flowControl feature gate" |
| 385 | + exit 1 |
| 386 | + } |
| 387 | + patch_llm_d_inference_scheduler_epp "$LLMD_NS" "$LLM_D_EPP_NAME" "$LLM_D_INFERENCE_SCHEDULER_IMG" || { |
| 388 | + log_error "Failed to patch EPP deployment $LLM_D_EPP_NAME" |
| 389 | + exit 1 |
| 390 | + } |
| 391 | + # Sidecar must match scheduler; gateway object often appears shortly after EPP in the same apply. |
| 392 | + local gw_wait_sec="${E2E_GATEWAY_APPEAR_WAIT_SEC:-180}" |
| 393 | + wait_for_inference_gateway_deployment "$LLMD_NS" "$gw_wait_sec" 5 || true |
| 394 | + patch_llm_d_routing_sidecar_image "$LLMD_NS" "$LLM_D_ROUTING_SIDECAR_IMG" |
263 | 395 | else |
264 | 396 | log_warning "Skipping inference-scheduler patch: Deployment $LLM_D_EPP_NAME not found in $LLMD_NS" |
265 | 397 | fi |
@@ -291,7 +423,8 @@ deploy_llm_d_infrastructure() { |
291 | 423 | # The full wait often blocks on modelservice decode/prefill readiness, which is |
292 | 424 | # unnecessary for the e2e suite because tests create/manage their own workloads. |
293 | 425 | if [ "$E2E_TESTS_ENABLED" = "true" ] && [ "$INFRA_ONLY" = "true" ]; then |
294 | | - local E2E_DEPLOY_WAIT_TIMEOUT="${E2E_DEPLOY_WAIT_TIMEOUT:-120s}" |
| 426 | + # EPP image overrides (e.g. v0.7.0) can trigger Recreate rollouts + large pulls; 120s is often too tight. |
| 427 | + local E2E_DEPLOY_WAIT_TIMEOUT="${E2E_DEPLOY_WAIT_TIMEOUT:-300s}" |
295 | 428 | log_info "E2E infra-only mode: waiting for essential llm-d components (timeout=${E2E_DEPLOY_WAIT_TIMEOUT})..." |
296 | 429 |
|
297 | 430 | if kubectl get deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" &>/dev/null; then |
|
0 commit comments