# Source: .github/workflows/nightly-e2e-pd-disaggregation-gke.yaml (run #33)
# NOTE(review): lines above were GitHub web-UI page chrome captured in a
# copy-paste ("Skip to content" + repeated filename headers); converted to
# comments so the file parses as YAML.
---
name: Nightly - PD Disaggregation E2E (GKE)
# Nightly regression test for the pd-disaggregation guide on GKE.
# Deploys via helmfile (gke env) and validates with e2e-validate.sh.
# Uses a slim model transformation to reduce GPU requirements.
on:
  schedule:
    - cron: '30 10 * * *' # 10:30 UTC daily (staggered)
  # Manual trigger with an opt-out of post-test cleanup for debugging.
  workflow_dispatch:
    inputs:
      skip_cleanup:
        description: 'Skip cleanup after tests (for debugging)'
        required: false
        # String, not boolean: workflow_dispatch inputs arrive as strings
        # and are compared against 'true' below in the jobs section.
        default: 'false'
# Least-privilege token: this workflow only checks out code; the reusable
# workflow inherits secrets separately.
permissions:
  contents: read

# Only one nightly run at a time; a newer trigger cancels an in-flight run.
concurrency:
  group: nightly-e2e-pd-disaggregation-gke
  cancel-in-progress: true
jobs:
  nightly:
    # Guard: don't run scheduled nightlies on forks.
    if: github.repository == 'llm-d/llm-d'
    uses: llm-d/llm-d-infra/.github/workflows/reusable-nightly-e2e-gke-helmfile.yaml@main
    with:
      guide_name: pd-disaggregation
      namespace: llm-d-nightly-pd-gke
      helmfile_env: gke_pd_rdma
      gateway_type: gke
      gke_cluster_name: llm-d-e2e-us-east5
      gke_cluster_zone: us-east5
      required_gpus: 2
      recommended_gpus: 4
      accelerator_type: H100
      pod_wait_timeout: '30m'
      pod_readiness_delay: 180
      httproute_file: httproute.gke.yaml
      # Slim transform: reduce model to Qwen3-0.6B, 1 GPU per pod, reduce memory
      pre_deploy_script: |
        echo "Applying pd-disaggregation slim transforms..."
        cd guides/pd-disaggregation
        # Swap the model artifact and routed model name to a small model.
        yq e '.modelArtifacts.uri = "hf://Qwen/Qwen3-0.6B"' -i ms-pd/values.yaml
        yq e '.routing.modelName = "Qwen/Qwen3-0.6B"' -i ms-pd/values.yaml
        # Drop the --max-model-len flag (and its value) from both pods.
        yq e 'del(.decode.containers[0].args[] | select(. == "--max-model-len" or . == "32000"))' -i ms-pd/values.yaml
        yq e 'del(.prefill.containers[0].args[] | select(. == "--max-model-len" or . == "32000"))' -i ms-pd/values.yaml
        # Remove memory/cpu/rdma requests+limits so the slim pods schedule
        # on smaller nodes; GPU counts are set explicitly below.
        yq e 'del(.decode.containers[0].resources.limits.memory)' -i ms-pd/values.yaml
        yq e 'del(.decode.containers[0].resources.requests.memory)' -i ms-pd/values.yaml
        yq e 'del(.decode.containers[0].resources.limits.cpu)' -i ms-pd/values.yaml
        yq e 'del(.decode.containers[0].resources.requests.cpu)' -i ms-pd/values.yaml
        yq e 'del(.decode.containers[0].resources.limits."rdma/ib")' -i ms-pd/values.yaml
        yq e 'del(.decode.containers[0].resources.requests."rdma/ib")' -i ms-pd/values.yaml
        yq e 'del(.prefill.containers[0].resources.limits.memory)' -i ms-pd/values.yaml
        yq e 'del(.prefill.containers[0].resources.requests.memory)' -i ms-pd/values.yaml
        yq e 'del(.prefill.containers[0].resources.limits.cpu)' -i ms-pd/values.yaml
        yq e 'del(.prefill.containers[0].resources.requests.cpu)' -i ms-pd/values.yaml
        yq e 'del(.prefill.containers[0].resources.limits."rdma/ib")' -i ms-pd/values.yaml
        yq e 'del(.prefill.containers[0].resources.requests."rdma/ib")' -i ms-pd/values.yaml
        # Single GPU for decode; single prefill replica; TP=1.
        yq e '.decode.containers[0].resources.limits["nvidia.com/gpu"] = "1"' -i ms-pd/values.yaml
        yq e '.decode.containers[0].resources.requests["nvidia.com/gpu"] = "1"' -i ms-pd/values.yaml
        yq e '.decode.parallelism.tensor = 1' -i ms-pd/values.yaml
        yq e '.prefill.replicas = 1' -i ms-pd/values.yaml
        # Shrink the second (emptyDir) volume on both pods.
        yq e '.decode.volumes[1].emptyDir.sizeLimit = "2Gi"' -i ms-pd/values.yaml
        yq e '.prefill.volumes[1].emptyDir.sizeLimit = "2Gi"' -i ms-pd/values.yaml
        echo "Slim transform applied — model: Qwen3-0.6B, 1 GPU decode, 1 prefill replica"
        cd "$GITHUB_WORKSPACE"
      image_override: 'ghcr.io/llm-d/llm-d-cuda-dev:latest'
      allow_gpu_preemption: true
      # Evaluates to a boolean; the input is a string, hence the == 'true'.
      skip_cleanup: ${{ github.event.inputs.skip_cleanup == 'true' }}
    secrets: inherit