Skip to content

Commit 2ca4c66

Browse files
clubanderson and claude authored
🐛 Fix e2e tests by adding DECODE_REPLICAS override (llm-d#648)
PR llm-d/llm-d#619 changed decode.replicas from 2 to 8 and the default model from Qwen/Qwen3-0.6B to Qwen/Qwen3-32B. This breaks e2e tests. Changes: - Add DECODE_REPLICAS environment variable support to install.sh - Set DECODE_REPLICAS=1 in the OpenShift e2e workflow - Fix gaie-sim-epp patch to only run when the deployment exists - Update DEFAULT_MODEL_ID to Qwen/Qwen3-32B to match llm-d values.yaml (fixes silent model substitution failure) - Increase model-storage size from 30Gi to 100Gi to handle larger models Signed-off-by: Andrew Anderson <andy@clubanderson.com> Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 660fcc0 commit 2ca4c66

2 files changed

Lines changed: 18 additions & 4 deletions

File tree

.github/workflows/ci-e2e-openshift.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,8 @@ jobs:
475475
CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
476476
# vLLM max-num-seqs for e2e testing (lower = easier to saturate)
477477
VLLM_MAX_NUM_SEQS: ${{ env.MAX_NUM_SEQS }}
478+
# Decode replicas for e2e testing (start with 1 replica, let HPA scale)
479+
DECODE_REPLICAS: "1"
478480
run: |
479481
echo "Deploying WVA and llm-d infrastructure..."
480482
echo " MODEL_ID: $MODEL_ID"
@@ -485,6 +487,7 @@ jobs:
485487
echo " WVA_IMAGE_TAG: $WVA_IMAGE_TAG"
486488
echo " CONTROLLER_INSTANCE: $CONTROLLER_INSTANCE"
487489
echo " VLLM_MAX_NUM_SEQS: $VLLM_MAX_NUM_SEQS"
490+
echo " DECODE_REPLICAS: $DECODE_REPLICAS"
488491
echo " HF token configuration: ✓"
489492
./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --release-name "$WVA_RELEASE_NAME" --environment openshift
490493
@@ -530,11 +533,14 @@ jobs:
530533
DEPLOY_HPA: "false"
531534
# vLLM max-num-seqs for e2e testing (lower = easier to saturate)
532535
VLLM_MAX_NUM_SEQS: ${{ env.MAX_NUM_SEQS }}
536+
# Decode replicas for e2e testing (start with 1 replica, let HPA scale)
537+
DECODE_REPLICAS: "1"
533538
run: |
534539
echo "Deploying Model B infrastructure in $LLMD_NAMESPACE_B..."
535540
echo " MODEL_ID: $MODEL_ID"
536541
echo " ACCELERATOR_TYPE: $ACCELERATOR_TYPE"
537542
echo " VLLM_MAX_NUM_SEQS: $VLLM_MAX_NUM_SEQS"
543+
echo " DECODE_REPLICAS: $DECODE_REPLICAS"
538544
539545
# Deploy llm-d infrastructure only (no WVA controller, no VA/HPA)
540546
./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --environment openshift

deploy/install.sh

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ INSTALL_GATEWAY_CTRLPLANE_ORIGINAL="${INSTALL_GATEWAY_CTRLPLANE:-}"
6464
INSTALL_GATEWAY_CTRLPLANE="${INSTALL_GATEWAY_CTRLPLANE:-false}"
6565

6666
# Model and SLO Configuration
67-
DEFAULT_MODEL_ID=${DEFAULT_MODEL_ID:-"Qwen/Qwen3-0.6B"}
67+
DEFAULT_MODEL_ID=${DEFAULT_MODEL_ID:-"Qwen/Qwen3-32B"}
6868
MODEL_ID=${MODEL_ID:-"unsloth/Meta-Llama-3.1-8B"}
6969
ACCELERATOR_TYPE=${ACCELERATOR_TYPE:-"H100"}
7070
SLO_TPOT=${SLO_TPOT:-10} # Target time-per-output-token SLO (in ms)
@@ -94,6 +94,8 @@ SKIP_CHECKS=${SKIP_CHECKS:-false}
9494
E2E_TESTS_ENABLED=${E2E_TESTS_ENABLED:-false}
9595
# vLLM max-num-seqs (max concurrent sequences per replica, lower = easier to saturate for testing)
9696
VLLM_MAX_NUM_SEQS=${VLLM_MAX_NUM_SEQS:-""}
97+
# Decode replicas override (useful for e2e testing with limited GPUs)
98+
DECODE_REPLICAS=${DECODE_REPLICAS:-""}
9799

98100
# Environment-related variables
99101
SCRIPT_DIR=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
@@ -737,7 +739,7 @@ deploy_llm_d_infrastructure() {
737739

738740
# Increase model-storage volume size
739741
log_info "Increasing model-storage volume size for model: $MODEL_ID"
740-
yq eval '.modelArtifacts.size = "30Gi"' -i "$LLM_D_MODELSERVICE_VALUES"
742+
yq eval '.modelArtifacts.size = "100Gi"' -i "$LLM_D_MODELSERVICE_VALUES"
741743
fi
742744

743745
# Configure llm-d-inference-simulator if needed
@@ -758,6 +760,12 @@ deploy_llm_d_infrastructure() {
758760
yq eval ".decode.containers[0].args += [\"--max-num-seqs=$VLLM_MAX_NUM_SEQS\"]" -i "$LLM_D_MODELSERVICE_VALUES"
759761
fi
760762

763+
# Configure decode replicas if set (useful for e2e testing with limited GPUs)
764+
if [ -n "$DECODE_REPLICAS" ]; then
765+
log_info "Setting decode replicas to $DECODE_REPLICAS"
766+
yq eval ".decode.replicas = $DECODE_REPLICAS" -i "$LLM_D_MODELSERVICE_VALUES"
767+
fi
768+
761769
# Deploy llm-d core components
762770
log_info "Deploying llm-d core components"
763771
helmfile apply -e $GATEWAY_PROVIDER -n ${LLMD_NS}
@@ -772,8 +780,8 @@ deploy_llm_d_infrastructure() {
772780
# -p '{"spec":{"kube":{"service":{"type":"NodePort"}}}}'
773781
# fi
774782

775-
# Patch llm-d-inference-simulator deployment if scale-to-zero is enabled
776-
if [ "$ENABLE_SCALE_TO_ZERO" == "true" ]; then
783+
# Patch llm-d-inference-simulator deployment if scale-to-zero is enabled and simulator is deployed
784+
if [ "$ENABLE_SCALE_TO_ZERO" == "true" ] && kubectl get deployment gaie-sim-epp -n $LLMD_NS &>/dev/null; then
777785
# Patch llm-d-inference-simulator deployment to use the correct image
778786
log_info "Patching llm-d-inference-simulator deployment to enable flowcontrol and use a new image"
779787
export DEPLOYMENT_NAME="gaie-sim-epp"

0 commit comments

Comments (0)