Skip to content

Commit 79e72a8

Browse files
authored
Using featureGates to enable EPP flowControl feature (#973)
Signed-off-by: Braulio Dumba <Braulio.Dumba@ibm.com>
1 parent aa7e6d2 commit 79e72a8

File tree

5 files changed

+59
-24
lines changed

5 files changed

+59
-24
lines changed

deploy/install.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,7 @@ ITL_AVERAGE_LATENCY_MS=${ITL_AVERAGE_LATENCY_MS:-20}
5858
TTFT_AVERAGE_LATENCY_MS=${TTFT_AVERAGE_LATENCY_MS:-200}
5959
ENABLE_SCALE_TO_ZERO=${ENABLE_SCALE_TO_ZERO:-true}
6060
# llm-d-inference-scheduler image with flowcontrol support
61-
# TODO: update once the llm-d-inference-scheduler v0.5.0 is released
62-
LLM_D_INFERENCE_SCHEDULER_IMG=${LLM_D_INFERENCE_SCHEDULER_IMG:-"ghcr.io/llm-d/llm-d-inference-scheduler:v0.5.0-rc.1"}
61+
LLM_D_INFERENCE_SCHEDULER_IMG=${LLM_D_INFERENCE_SCHEDULER_IMG:-"ghcr.io/llm-d/llm-d-inference-scheduler:v0.7.0"}
6362

6463
# Gateway Configuration
6564
GATEWAY_PROVIDER=${GATEWAY_PROVIDER:-"istio"} # Options: kgateway, istio

deploy/lib/infra_llmd.sh

Lines changed: 54 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -240,26 +240,63 @@ deploy_llm_d_infrastructure() {
240240
fi
241241
fi
242242

243-
# Patch llm-d-inference-scheduler deployment to enable GIE flow control when scale-to-zero
244-
# or e2e tests are enabled (required for scale-from-zero: queue metrics and queuing behavior).
243+
# Patch llm-d-inference-scheduler deployment image and enable flowControl when scale-to-zero or e2e tests are enabled
244+
# (required for scale-from-zero: the image must support flow control for queue metrics).
245245
if [ "$ENABLE_SCALE_TO_ZERO" == "true" ] || [ "$E2E_TESTS_ENABLED" == "true" ]; then
246-
log_info "Patching llm-d-inference-scheduler deployment to enable flowcontrol and use a new image"
247246
if kubectl get deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" &> /dev/null; then
248-
kubectl patch deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" --type='json' -p='[
249-
{
250-
"op": "replace",
251-
"path": "/spec/template/spec/containers/0/image",
252-
"value": "'$LLM_D_INFERENCE_SCHEDULER_IMG'"
253-
},
254-
{
255-
"op": "add",
256-
"path": "/spec/template/spec/containers/0/env/-",
257-
"value": {
258-
"name": "ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER",
259-
"value": "true"
247+
# Get the current image from the deployment
248+
local CURRENT_IMAGE=$(kubectl get deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" -o jsonpath='{.spec.template.spec.containers[0].image}')
249+
250+
# Only patch if the image is different
251+
if [ "$CURRENT_IMAGE" != "$LLM_D_INFERENCE_SCHEDULER_IMG" ]; then
252+
log_info "Patching llm-d-inference-scheduler deployment: updating image from $CURRENT_IMAGE to $LLM_D_INFERENCE_SCHEDULER_IMG"
253+
kubectl patch deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS" --type='json' -p='[
254+
{
255+
"op": "replace",
256+
"path": "/spec/template/spec/containers/0/image",
257+
"value": "'$LLM_D_INFERENCE_SCHEDULER_IMG'"
260258
}
261-
}
262-
]'
259+
]'
260+
else
261+
log_info "Skipping image patch: llm-d-inference-scheduler already using $LLM_D_INFERENCE_SCHEDULER_IMG"
262+
fi
263+
264+
# Enable flowControl feature gate in the EPP ConfigMap
265+
if kubectl get configmap "$LLM_D_EPP_NAME" -n "$LLMD_NS" &> /dev/null; then
266+
# Check if flowControl is already enabled
267+
local CURRENT_CONFIG=$(kubectl get configmap "$LLM_D_EPP_NAME" -n "$LLMD_NS" -o jsonpath='{.data.default-plugins\.yaml}')
268+
269+
if echo "$CURRENT_CONFIG" | yq eval '.featureGates // [] | contains(["flowControl"])' - | grep -q 'true'; then
270+
log_info "flowControl feature gate already enabled in EPP ConfigMap"
271+
else
272+
log_info "Enabling flowControl feature gate in EPP ConfigMap $LLM_D_EPP_NAME"
273+
274+
# Use yq to properly add flowControl to featureGates array (creates array if missing, appends if exists)
275+
local UPDATED_CONFIG=$(echo "$CURRENT_CONFIG" | yq eval '.featureGates += ["flowControl"] | .featureGates |= unique' -)
276+
277+
# Validate that flowControl was successfully added
278+
if echo "$UPDATED_CONFIG" | yq eval '.featureGates // [] | contains(["flowControl"])' - | grep -q 'true'; then
279+
# Apply the updated config
280+
kubectl patch configmap "$LLM_D_EPP_NAME" -n "$LLMD_NS" --type='json' -p='[
281+
{
282+
"op": "replace",
283+
"path": "/data/default-plugins.yaml",
284+
"value": "'"$(echo "$UPDATED_CONFIG" | sed 's/"/\\"/g' | tr '\n' '\r' | sed 's/\r/\\n/g')"'"
285+
}
286+
]'
287+
288+
# Restart deployment to pick up the config change
289+
log_info "Restarting $LLM_D_EPP_NAME deployment to apply flowControl feature gate"
290+
kubectl rollout restart deployment "$LLM_D_EPP_NAME" -n "$LLMD_NS"
291+
else
292+
log_error "Failed to add flowControl to featureGates in EPP ConfigMap - YAML structure may be invalid or unexpected"
293+
log_error "Current config structure: $(echo "$CURRENT_CONFIG" | yq eval '.' - 2>&1 | head -5)"
294+
exit 1
295+
fi
296+
fi
297+
else
298+
log_warning "ConfigMap $LLM_D_EPP_NAME not found in $LLMD_NS"
299+
fi
263300
else
264301
log_warning "Skipping inference-scheduler patch: Deployment $LLM_D_EPP_NAME not found in $LLMD_NS"
265302
fi

docs/developer-guide/testing.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ This deploys:
163163
- Prometheus stack and Prometheus Adapter (or KEDA when `SCALER_BACKEND=keda`)
164164
- **No** VariantAutoscaling, HPA, or model services (tests create these)
165165

166-
When `E2E_TESTS_ENABLED=true` (or `ENABLE_SCALE_TO_ZERO=true`), the deploy script enables **GIE queuing** by patching the EPP with `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER=true`. For **e2e**, the **InferenceObjective** `e2e-default` is created by the scale-from-zero tests (`test/e2e/fixtures`), not by `install.sh`. For non-e2e scale-to-zero (`ENABLE_SCALE_TO_ZERO=true` without e2e), `install.sh` still applies `deploy/inference-objective-e2e.yaml`. Queuing helps populate `inference_extension_flow_control_queue_size` when requests hit the gateway.
166+
When `E2E_TESTS_ENABLED=true` (or `ENABLE_SCALE_TO_ZERO=true`), the deploy script enables **GIE queuing** by adding the `flowControl` feature gate to the EPP ConfigMap and updating the EPP image to a version that supports flow control. For **e2e**, the **InferenceObjective** `e2e-default` is created by the scale-from-zero tests (`test/e2e/fixtures`), not by `install.sh`. For non-e2e scale-to-zero (`ENABLE_SCALE_TO_ZERO=true` without e2e), `install.sh` still applies `deploy/inference-objective-e2e.yaml`. Queuing helps populate `inference_extension_flow_control_queue_size` when requests hit the gateway.
167167

168168
**Install script tuning (optional, same variables as `deploy/install.sh`):**
169169

docs/user-guide/scale-from-zero.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ The ScaleFromZero engine continuously monitors inactive VariantAutoscaling resou
4242
## Prerequisites
4343

4444
- WVA and llm-d installed and running - deployment options available for [kind](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/kind-emulator/README.md), [OpenShift](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/openshift/README.md) and [Kubernetes](https://github.com/llm-d/llm-d-workload-variant-autoscaler/blob/main/deploy/kubernetes/README.md)
45-
- **EPP flow control**: EndpointPicker (EPP) with flow control enabled (set EPP env `ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER=true`) so the queue metric `inference_extension_flow_control_queue_size` is collected. InferenceObjective is not required to enable this metric; it is a QoS policy for priority-based scheduling and optional for scale-from-zero.
45+
- **EPP flow control**: EndpointPicker (EPP) with flow control enabled (via the `flowControl` feature gate in the EPP ConfigMap) so the queue metric `inference_extension_flow_control_queue_size` is collected. InferenceObjective is not required to enable this metric; it is a QoS policy for priority-based scheduling and optional for scale-from-zero.
4646

4747

4848
## Usage

test/e2e/scale_from_zero_test.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,8 @@ func cleanupScaleFromZeroResources() {
135135
}
136136

137137
// Scale-from-zero test validates that the WVA controller correctly detects pending requests
138-
// and scales up scale targets from zero replicas. Requires GIE queuing (ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER
139-
// on EPP from install when E2E_TESTS_ENABLED=true) and an InferenceObjective (applied below in BeforeAll).
138+
// and scales up scale targets from zero replicas. Requires GIE queuing (flowControl feature gate
139+
// enabled on EPP from install when E2E_TESTS_ENABLED=true) and an InferenceObjective (applied below in BeforeAll).
140140
// This suite needs a scaler that allows minReplicas=0 on the scaled workload: either
141141
// SCALE_TO_ZERO_ENABLED=true where native HPA supports it (HPAScaleToZero), or SCALER_BACKEND=keda
142142
// (ScaledObject). OpenShift usually lacks HPAScaleToZero; e2e config ignores SCALE_TO_ZERO_ENABLED there,
@@ -1448,4 +1448,3 @@ var _ = Describe("Scale-From-Zero Feature with LeaderWorkerSet (single-node)", S
14481448
})
14491449
})
14501450
})
1451-

0 commit comments

Comments (0)