Skip to content

Commit 7b9b77a

Browse files
clubanderson and claude committed
🐛 Fix e2e tests by adding DECODE_REPLICAS override
PR llm-d/llm-d#619 changed decode.replicas from 2 to 8, requiring 16 GPUs (8 pods × 2 GPUs each). This breaks e2e tests which don't have enough GPU resources available. Add DECODE_REPLICAS environment variable support to install.sh, similar to the existing VLLM_MAX_NUM_SEQS pattern. Set it to 1 in the OpenShift e2e workflow so tests start with minimal resources and let the HPA scale as needed. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: Andrew Anderson <andy@clubanderson.com>
1 parent e40d310 commit 7b9b77a

2 files changed

Lines changed: 14 additions & 0 deletions

File tree

.github/workflows/ci-e2e-openshift.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,8 @@ jobs:
475475
CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
476476
# vLLM max-num-seqs for e2e testing (lower = easier to saturate)
477477
VLLM_MAX_NUM_SEQS: ${{ env.MAX_NUM_SEQS }}
478+
# Decode replicas for e2e testing (start with 1 replica, let HPA scale)
479+
DECODE_REPLICAS: "1"
478480
run: |
479481
echo "Deploying WVA and llm-d infrastructure..."
480482
echo " MODEL_ID: $MODEL_ID"
@@ -485,6 +487,7 @@ jobs:
485487
echo " WVA_IMAGE_TAG: $WVA_IMAGE_TAG"
486488
echo " CONTROLLER_INSTANCE: $CONTROLLER_INSTANCE"
487489
echo " VLLM_MAX_NUM_SEQS: $VLLM_MAX_NUM_SEQS"
490+
echo " DECODE_REPLICAS: $DECODE_REPLICAS"
488491
echo " HF token configuration: ✓"
489492
./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --release-name "$WVA_RELEASE_NAME" --environment openshift
490493
@@ -530,11 +533,14 @@ jobs:
530533
DEPLOY_HPA: "false"
531534
# vLLM max-num-seqs for e2e testing (lower = easier to saturate)
532535
VLLM_MAX_NUM_SEQS: ${{ env.MAX_NUM_SEQS }}
536+
# Decode replicas for e2e testing (start with 1 replica, let HPA scale)
537+
DECODE_REPLICAS: "1"
533538
run: |
534539
echo "Deploying Model B infrastructure in $LLMD_NAMESPACE_B..."
535540
echo " MODEL_ID: $MODEL_ID"
536541
echo " ACCELERATOR_TYPE: $ACCELERATOR_TYPE"
537542
echo " VLLM_MAX_NUM_SEQS: $VLLM_MAX_NUM_SEQS"
543+
echo " DECODE_REPLICAS: $DECODE_REPLICAS"
538544
539545
# Deploy llm-d infrastructure only (no WVA controller, no VA/HPA)
540546
./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --environment openshift

deploy/install.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ SKIP_CHECKS=${SKIP_CHECKS:-false}
9494
E2E_TESTS_ENABLED=${E2E_TESTS_ENABLED:-false}
9595
# vLLM max-num-seqs (max concurrent sequences per replica, lower = easier to saturate for testing)
9696
VLLM_MAX_NUM_SEQS=${VLLM_MAX_NUM_SEQS:-""}
97+
# Decode replicas override (useful for e2e testing with limited GPUs)
98+
DECODE_REPLICAS=${DECODE_REPLICAS:-""}
9799

98100
# Environment-related variables
99101
SCRIPT_DIR=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
@@ -758,6 +760,12 @@ deploy_llm_d_infrastructure() {
758760
yq eval ".decode.containers[0].args += [\"--max-num-seqs=$VLLM_MAX_NUM_SEQS\"]" -i "$LLM_D_MODELSERVICE_VALUES"
759761
fi
760762

763+
# Configure decode replicas if set (useful for e2e testing with limited GPUs)
764+
if [ -n "$DECODE_REPLICAS" ]; then
765+
log_info "Setting decode replicas to $DECODE_REPLICAS"
766+
yq eval ".decode.replicas = $DECODE_REPLICAS" -i "$LLM_D_MODELSERVICE_VALUES"
767+
fi
768+
761769
# Deploy llm-d core components
762770
log_info "Deploying llm-d core components"
763771
helmfile apply -e $GATEWAY_PROVIDER -n ${LLMD_NS}

0 commit comments

Comments (0)