Skip to content

Commit 7b9b77a

Browse files
clubanderson and claude committed
🐛 Fix e2e tests by adding DECODE_REPLICAS override
PR llm-d/llm-d#619 changed decode.replicas from 2 to 8, requiring 16 GPUs (8 pods × 2 GPUs each). This breaks e2e tests which don't have enough GPU resources available. Add DECODE_REPLICAS environment variable support to install.sh, similar to the existing VLLM_MAX_NUM_SEQS pattern. Set it to 1 in the OpenShift e2e workflow so tests start with minimal resources and let the HPA scale as needed. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: Andrew Anderson <andy@clubanderson.com>
1 parent e40d310 commit 7b9b77a

2 files changed

Lines changed: 14 additions & 0 deletions

File tree

.github/workflows/ci-e2e-openshift.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,8 @@ jobs:
475475
CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
476476
# vLLM max-num-seqs for e2e testing (lower = easier to saturate)
477477
VLLM_MAX_NUM_SEQS: ${{ env.MAX_NUM_SEQS }}
478+
# Decode replicas for e2e testing (start with 1 replica, let HPA scale)
479+
DECODE_REPLICAS: "1"
478480
run: |
479481
echo "Deploying WVA and llm-d infrastructure..."
480482
echo " MODEL_ID: $MODEL_ID"
@@ -485,6 +487,7 @@ jobs:
485487
echo " WVA_IMAGE_TAG: $WVA_IMAGE_TAG"
486488
echo " CONTROLLER_INSTANCE: $CONTROLLER_INSTANCE"
487489
echo " VLLM_MAX_NUM_SEQS: $VLLM_MAX_NUM_SEQS"
490+
echo " DECODE_REPLICAS: $DECODE_REPLICAS"
488491
echo " HF token configuration: ✓"
489492
./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --release-name "$WVA_RELEASE_NAME" --environment openshift
490493
@@ -530,11 +533,14 @@ jobs:
530533
DEPLOY_HPA: "false"
531534
# vLLM max-num-seqs for e2e testing (lower = easier to saturate)
532535
VLLM_MAX_NUM_SEQS: ${{ env.MAX_NUM_SEQS }}
536+
# Decode replicas for e2e testing (start with 1 replica, let HPA scale)
537+
DECODE_REPLICAS: "1"
533538
run: |
534539
echo "Deploying Model B infrastructure in $LLMD_NAMESPACE_B..."
535540
echo " MODEL_ID: $MODEL_ID"
536541
echo " ACCELERATOR_TYPE: $ACCELERATOR_TYPE"
537542
echo " VLLM_MAX_NUM_SEQS: $VLLM_MAX_NUM_SEQS"
543+
echo " DECODE_REPLICAS: $DECODE_REPLICAS"
538544
539545
# Deploy llm-d infrastructure only (no WVA controller, no VA/HPA)
540546
./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --environment openshift

deploy/install.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ SKIP_CHECKS=${SKIP_CHECKS:-false}
9494
E2E_TESTS_ENABLED=${E2E_TESTS_ENABLED:-false}
9595
# vLLM max-num-seqs (max concurrent sequences per replica, lower = easier to saturate for testing)
9696
VLLM_MAX_NUM_SEQS=${VLLM_MAX_NUM_SEQS:-""}
97+
# Decode replicas override (useful for e2e testing with limited GPUs)
98+
DECODE_REPLICAS=${DECODE_REPLICAS:-""}
9799

98100
# Environment-related variables
99101
SCRIPT_DIR=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
@@ -758,6 +760,12 @@ deploy_llm_d_infrastructure() {
758760
yq eval ".decode.containers[0].args += [\"--max-num-seqs=$VLLM_MAX_NUM_SEQS\"]" -i "$LLM_D_MODELSERVICE_VALUES"
759761
fi
760762

763+
# Configure decode replicas if set (useful for e2e testing with limited GPUs)
764+
if [ -n "$DECODE_REPLICAS" ]; then
765+
log_info "Setting decode replicas to $DECODE_REPLICAS"
766+
yq eval ".decode.replicas = $DECODE_REPLICAS" -i "$LLM_D_MODELSERVICE_VALUES"
767+
fi
768+
761769
# Deploy llm-d core components
762770
log_info "Deploying llm-d core components"
763771
helmfile apply -e $GATEWAY_PROVIDER -n ${LLMD_NS}

0 commit comments

Comments (0)