---
# .github/workflows/nightly-e2e-pd-disaggregation-gke.yaml
#
# Nightly regression test for the pd-disaggregation guide on GKE.
# Deploys via helmfile (gke env) and validates with e2e-validate.sh.
# Uses a slim model transformation to reduce GPU requirements.
name: Nightly - PD Disaggregation E2E (GKE)

on:
  schedule:
    - cron: '30 10 * * *'  # 10:30 UTC daily (staggered)
  workflow_dispatch:
    inputs:
      skip_cleanup:
        description: 'Skip cleanup after tests (for debugging)'
        required: false
        default: 'false'

# Least-privilege token: nothing in this workflow writes back to the repo.
permissions:
  contents: read

# At most one nightly run at a time; a newer trigger cancels an in-flight run.
concurrency:
  group: nightly-e2e-pd-disaggregation-gke
  cancel-in-progress: true

jobs:
  nightly:
    # Guard: only run in the canonical repo, never on forks.
    if: github.repository == 'llm-d/llm-d'
    uses: llm-d/llm-d-infra/.github/workflows/reusable-nightly-e2e-gke-helmfile.yaml@main
    with:
      guide_name: pd-disaggregation
      namespace: llm-d-nightly-pd-gke
      helmfile_env: gke_pd_rdma
      gateway_type: gke
      gke_cluster_name: llm-d-e2e-us-east5
      gke_cluster_zone: us-east5
      required_gpus: 2
      recommended_gpus: 4
      accelerator_type: H100
      pod_wait_timeout: '30m'
      pod_readiness_delay: 180
      httproute_file: httproute.gke.yaml
      # Slim transform: reduce model to Qwen3-0.6B, 1 GPU per pod, reduce memory
      pre_deploy_script: |
        # Fail fast: if any transform below errors, abort instead of deploying
        # a partially edited values.yaml. (Harmless if the caller already runs
        # this under `bash -e`.)
        set -euo pipefail
        echo "Applying pd-disaggregation slim transforms..."
        cd guides/pd-disaggregation
        # Swap in the small model for both artifact download and routing.
        yq e '.modelArtifacts.uri = "hf://Qwen/Qwen3-0.6B"' -i ms-pd/values.yaml
        yq e '.routing.modelName = "Qwen/Qwen3-0.6B"' -i ms-pd/values.yaml
        # Drop the "--max-model-len 32000" vLLM arg pair from both pod specs.
        yq e 'del(.decode.containers[0].args[] | select(. == "--max-model-len" or . == "32000"))' -i ms-pd/values.yaml
        yq e 'del(.prefill.containers[0].args[] | select(. == "--max-model-len" or . == "32000"))' -i ms-pd/values.yaml
        # Strip the guide's full-size memory/cpu/RDMA requests and limits so
        # the slim pods can schedule on the shared e2e cluster.
        yq e 'del(.decode.containers[0].resources.limits.memory)' -i ms-pd/values.yaml
        yq e 'del(.decode.containers[0].resources.requests.memory)' -i ms-pd/values.yaml
        yq e 'del(.decode.containers[0].resources.limits.cpu)' -i ms-pd/values.yaml
        yq e 'del(.decode.containers[0].resources.requests.cpu)' -i ms-pd/values.yaml
        yq e 'del(.decode.containers[0].resources.limits."rdma/ib")' -i ms-pd/values.yaml
        yq e 'del(.decode.containers[0].resources.requests."rdma/ib")' -i ms-pd/values.yaml
        yq e 'del(.prefill.containers[0].resources.limits.memory)' -i ms-pd/values.yaml
        yq e 'del(.prefill.containers[0].resources.requests.memory)' -i ms-pd/values.yaml
        yq e 'del(.prefill.containers[0].resources.limits.cpu)' -i ms-pd/values.yaml
        yq e 'del(.prefill.containers[0].resources.requests.cpu)' -i ms-pd/values.yaml
        yq e 'del(.prefill.containers[0].resources.limits."rdma/ib")' -i ms-pd/values.yaml
        yq e 'del(.prefill.containers[0].resources.requests."rdma/ib")' -i ms-pd/values.yaml
        # Single-GPU decode, tensor-parallel 1, and a single prefill replica.
        # (GPU count stays a quoted string — Kubernetes resource quantity.)
        yq e '.decode.containers[0].resources.limits["nvidia.com/gpu"] = "1"' -i ms-pd/values.yaml
        yq e '.decode.containers[0].resources.requests["nvidia.com/gpu"] = "1"' -i ms-pd/values.yaml
        yq e '.decode.parallelism.tensor = 1' -i ms-pd/values.yaml
        yq e '.prefill.replicas = 1' -i ms-pd/values.yaml
        # Shrink the second emptyDir volume's size cap for the small model.
        # NOTE(review): assumes volumes[1] is the cache/shm volume in both pod
        # specs — confirm the index against ms-pd/values.yaml if it changes.
        yq e '.decode.volumes[1].emptyDir.sizeLimit = "2Gi"' -i ms-pd/values.yaml
        yq e '.prefill.volumes[1].emptyDir.sizeLimit = "2Gi"' -i ms-pd/values.yaml
        echo "Slim transform applied — model: Qwen3-0.6B, 1 GPU decode, 1 prefill replica"
        cd "$GITHUB_WORKSPACE"
      image_override: 'ghcr.io/llm-d/llm-d-cuda-dev:latest'
      allow_gpu_preemption: true
      # String-to-boolean coercion: dispatch input is a string; on scheduled
      # runs github.event.inputs is empty, so this evaluates to false.
      skip_cleanup: ${{ github.event.inputs.skip_cleanup == 'true' }}
    secrets: inherit