Skip to content

Commit 79e72a8

Browse files
authored
Using featureGates to enable EPP flowControl feature (#973)
Signed-off-by: Braulio Dumba <Braulio.Dumba@ibm.com>
1 parent aa7e6d2 commit 79e72a8

File tree

5 files changed

+59
-24
lines changed

5 files changed

+59
-24
lines changed

deploy/install.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,7 @@ ITL_AVERAGE_LATENCY_MS=${ITL_AVERAGE_LATENCY_MS:-20}
5858
TTFT_AVERAGE_LATENCY_MS=${TTFT_AVERAGE_LATENCY_MS:-200}
5959
ENABLE_SCALE_TO_ZERO=${ENABLE_SCALE_TO_ZERO:-true}
6060
# llm-d-inference-scheduler image with flowcontrol support
61-
# TODO: update once the llm-d-inference-scheduler v0.5.0 is released
62-
LLM_D_INFERENCE_SCHEDULER_IMG=${LLM_D_INFERENCE_SCHEDULER_IMG:-"ghcr.io/llm-d/llm-d-inference-scheduler:v0.5.0-rc.1"}
61+
LLM_D_INFERENCE_SCHEDULER_IMG=${LLM_D_INFERENCE_SCHEDULER_IMG:-"ghcr.io/llm-d/llm-d-inference-scheduler:v0.7.0"}
6362

6463
# Gateway Configuration
6564
GATEWAY_PROVIDER=${GATEWAY_PROVIDER:-"istio"} # Options: kgateway, istio

deploy/lib/infra_llmd.sh

Lines changed: 54 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -240,26 +240,63 @@ deploy_llm_d_infrastructure() {
240240
fi
241241
fi
242242

243-
# Patch llm-d-inference-scheduler deployment to enable GIE flow control when scale-to-zero
244-
# or e2e tests are enabled (required for scale-from-zero: queue metrics and queuing behavior).
243+
# Patch llm-d-inference-scheduler deployment image and enable flowControl when scale-to-zero or e2e tests are enabled
244+
# (required for scale-from-zero: the image must support flow control for queue metrics).
245245
if [ "$ENABLE_SCALE_TO_ZERO" == "true" ] || [ "$E2E_TESTS_ENABLED" == "true" ]; then
246-
log_info "Patching llm-d-inference-scheduler deployment to enable flowcontrol and use a new image"
247246
if kubectl get deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" &> /dev/null; then
248-
kubectl patch deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" --type='json' -p='[
249-
{
250-
"op": "replace",
251-
"path": "/spec/template/spec/containers/0/image",
252-
"value": "'$LLM_D_INFERENCE_SCHEDULER_IMG'"
253-
},
254-
{
255-
"op": "add",
256-
"path": "/spec/template/spec/containers/0/env/-",
257-
"value": {
258-
"name": "ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER",
259-
"value": "true"
247+
# Get the current image from the deployment
248+
local CURRENT_IMAGE=$(kubectl get deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" -o jsonpath='{.spec.template.spec.containers[0].image}')
249+
250+
# Only patch if the image is different
251+
if [ "$CURRENT_IMAGE" != "$LLM_D_INFERENCE_SCHEDULER_IMG" ]; then
252+
log_info "Patching llm-d-inference-scheduler deployment: updating image from $CURRENT_IMAGE to $LLM_D_INFERENCE_SCHEDULER_IMG"
253+
kubectl patch deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" --type='json' -p='[
254+
{
255+
"op": "replace",
256+
"path": "/spec/template/spec/containers/0/image",
257+
"value": "'$LLM_D_INFERENCE_SCHEDULER_IMG'"
260258
}
261-
}
262-
]'
259+
]'
260+
else
261+
log_info "Skipping image patch: llm-d-inference-scheduler already using $LLM_D_INFERENCE_SCHEDULER_IMG"
262+
fi
263+
264+
# Enable flowControl feature gate in the EPP ConfigMap
265+
if kubectl get configmap "$LLM_D_EPP_NAME" -n "$LLMD_NS" &> /dev/null; then
266+
# Check if flowControl is already enabled
267+
local CURRENT_CONFIG=$(kubectl get configmap "$LLM_D_EPP_NAME" -n "$LLMD_NS" -o jsonpath='{.data.default-plugins\.yaml}')
268+
269+
if echo "$CURRENT_CONFIG" | yq eval '.featureGates // [] | contains(["flowControl"])' - | grep -q 'true'; then
270+
log_info "flowControl feature gate already enabled in EPP ConfigMap"
271+
else
272+
log_info "Enabling flowControl feature gate in EPP ConfigMap $LLM_D_EPP_NAME"
273+
274+
# Use yq to properly add flowControl to featureGates array (creates array if missing, appends if exists)
275+
local UPDATED_CONFIG=$(echo "$CURRENT_CONFIG" | yq eval '.featureGates += ["flowControl"] | .featureGates |= unique' -)
276+
277+
# Validate that flowControl was successfully added
278+
if echo "$UPDATED_CONFIG" | yq eval '.featureGates // [] | contains(["flowControl"])' - | grep -q 'true'; then
279+
# Apply the updated config
280+
kubectl patch configmap "$LLM_D_EPP_NAME" -n "$LLMD_NS" --type='json' -p='[
281+
{
282+
"op": "replace",
283+
"path": "/data/default-plugins.yaml",
284+
"value": "'"$(echo "$UPDATED_CONFIG" | sed 's/"/\\"/g' | tr '\n' '\r' | sed 's/\r/\\n/g')"'"
285+
}
286+
]'
287+
288+
# Restart deployment to pick up the config change
289+
log_info "Restarting $LLM_D_EPP_NAME deployment to apply flowControl feature gate"
290+
kubectl rollout restart deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS"
291+
else
292+
log_error "Failed to add flowControl to featureGates in EPP ConfigMap - YAML structure may be invalid or unexpected"
293+
log_error "Current config structure: $(echo "$CURRENT_CONFIG" | yq eval '.' - 2>&1 | head -5)"
294+
exit 1
295+
fi
296+
fi
297+
else
298+
log_warning "ConfigMap $LLM_D_EPP_NAME not found in $LLMD_NS"
299+
fi
263300
else
264301
log_warning "Skipping inference-scheduler patch: Deployment $LLM_D_EPP_NAME not found in $LLMD_NS"
265302
fi

docs/developer-guide/testing.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ This deploys:
163163
- Prometheus stack and Prometheus Adapter (or KEDA when `SCALER_BACKEND=keda`)
164164
- **No** VariantAutoscaling, HPA, or model services (tests create these)
165165

166-
When `E2E_TESTS_ENABLED=true` (or `ENABLE_SCALE_TO_ZERO=true`), the deploy script enables **GIE queuing** by patching the EPP with `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER=true`. For **e2e**, the **InferenceObjective** `e2e-default` is created by the scale-from-zero tests (`test/e2e/fixtures`), not by `install.sh`. For non-e2e scale-to-zero (`ENABLE_SCALE_TO_ZERO=true` without e2e), `install.sh` still applies `deploy/inference-objective-e2e.yaml`. Queuing helps populate `inference_extension_flow_control_queue_size` when requests hit the gateway.
166+
When `E2E_TESTS_ENABLED=true` (or `ENABLE_SCALE_TO_ZERO=true`), the deploy script enables **GIE queuing** by adding the `flowControl` feature gate to the EPP ConfigMap and updating the EPP image to a version that supports flow control. For **e2e**, the **InferenceObjective** `e2e-default` is created by the scale-from-zero tests (`test/e2e/fixtures`), not by `install.sh`. For non-e2e scale-to-zero (`ENABLE_SCALE_TO_ZERO=true` without e2e), `install.sh` still applies `deploy/inference-objective-e2e.yaml`. Queuing helps populate `inference_extension_flow_control_queue_size` when requests hit the gateway.
167167

168168
**Install script tuning (optional, same variables as `deploy/install.sh`):**
169169

docs/user-guide/scale-from-zero.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ The ScaleFromZero engine continuously monitors inactive VariantAutoscaling resou
4242
## Prerequisites
4343

4444
- WVA and llm-d installed and running - deployment options available for [kind](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/kind-emulator/README.md), [OpenShift](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/openshift/README.md) and [Kubernetes](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/kubernetes/README.md)
45-
- **EPP flow control**: EndpointPicker (EPP) with flow control enabled (set EPP env `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER=true`) so the queue metric `inference_extension_flow_control_queue_size` is collected. InferenceObjective is not required to enable this metric; it is a QoS policy for priority-based scheduling and optional for scale-from-zero.
45+
- **EPP flow control**: EndpointPicker (EPP) with flow control enabled (via the `flowControl` feature gate in the EPP ConfigMap) so the queue metric `inference_extension_flow_control_queue_size` is collected. InferenceObjective is not required to enable this metric; it is a QoS policy for priority-based scheduling and optional for scale-from-zero.
4646

4747

4848
## Usage

test/e2e/scale_from_zero_test.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,8 @@ func cleanupScaleFromZeroResources() {
135135
}
136136

137137
// Scale-from-zero test validates that the WVA controller correctly detects pending requests
138-
// and scales up scale targets from zero replicas. Requires GIE queuing (ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER
139-
// on EPP from install when E2E_TESTS_ENABLED=true) and an InferenceObjective (applied below in BeforeAll).
138+
// and scales up scale targets from zero replicas. Requires GIE queuing (flowControl feature gate
139+
// enabled on EPP from install when E2E_TESTS_ENABLED=true) and an InferenceObjective (applied below in BeforeAll).
140140
// This suite needs a scaler that allows minReplicas=0 on the scaled workload: either
141141
// SCALE_TO_ZERO_ENABLED=true where native HPA supports it (HPAScaleToZero), or SCALER_BACKEND=keda
142142
// (ScaledObject). OpenShift usually lacks HPAScaleToZero; e2e config ignores SCALE_TO_ZERO_ENABLED there,
@@ -1448,4 +1448,3 @@ var _ = Describe("Scale-From-Zero Feature with LeaderWorkerSet (single-node)", S
14481448
})
14491449
})
14501450
})
1451-

0 commit comments

Comments (0)