Nightly - Precise Prefix Cache E2E (OpenShift) #52
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Nightly - Precise Prefix Cache E2E (OpenShift)

# Nightly regression test for the precise-prefix-cache-aware guide on OpenShift.
# Deploys the guide via custom_deploy_script (kustomize + standalone chart with
# UDS tokenizer post-renderer) and validates with e2e-validate.sh against the
# scheduler EPP service. Mirrors the optimized-baseline CKS/GKE migration in
# llm-d/llm-d#1268; the OCP helmfile reusable workflow doesn't expose a
# `gateway_host` input yet, so pre_deploy_script seeds GATEWAY_HOST via
# GITHUB_ENV.
# Triggers: a daily cron schedule, plus manual dispatch with one debugging
# switch that is forwarded to the job below as `skip_cleanup`.
on:
  schedule:
    - cron: '0 1 * * *'  # 01:00 UTC daily
  workflow_dispatch:
    inputs:
      skip_cleanup:
        description: 'Skip cleanup after tests (for debugging)'
        required: false
        default: 'false'

# Token scope granted to this workflow: read-only access to repository contents.
permissions:
  contents: read

# All runs share one fixed group; starting a new run cancels one in progress.
concurrency:
  group: nightly-e2e-precise-prefix-cache
  cancel-in-progress: true
jobs:
  # Single job that delegates the run to the reusable OpenShift helmfile
  # nightly workflow in llm-d-infra; everything under `with:` is an input to it.
  nightly:
    # Run only when the repository is llm-d/llm-d.
    if: github.repository == 'llm-d/llm-d'
    # NOTE(review): tracks the moving `main` ref of llm-d-infra; pinning to a
    # commit SHA would make nightly results reproducible - confirm policy.
    uses: llm-d/llm-d-infra/.github/workflows/reusable-nightly-e2e-openshift-helmfile.yaml@main
    with:
      guide_name: precise-prefix-cache-aware
      namespace: llm-d-nightly-prefix-cache
      accelerator_type: H100
      required_gpus: 2
      recommended_gpus: 4
      pod_wait_timeout: '30m'
      pod_readiness_delay: 180
      image_override: 'ghcr.io/llm-d/llm-d-cuda-dev:latest'
      allow_gpu_preemption: true
      install_gateway_provider: false
      # True only when a manual dispatch sets skip_cleanup to 'true'.
      skip_cleanup: ${{ github.event.inputs.skip_cleanup == 'true' }}
      # Patches the gpu/vllm overlay with an H100 nodeSelector, prints the
      # result, and exports GATEWAY_HOST through GITHUB_ENV (see header comment).
      pre_deploy_script: |
        echo "Adding H100 nodeSelector to gpu/vllm overlay..."
        yq e '.spec.template.spec.nodeSelector["nvidia.com/gpu.product"] = "NVIDIA-H100-80GB-HBM3"' \
          -i "${GUIDE_PATH}/modelserver/gpu/vllm/patch-vllm.yaml"
        yq e '.spec.template.spec.nodeSelector' "${GUIDE_PATH}/modelserver/gpu/vllm/patch-vllm.yaml"
        echo "GATEWAY_HOST=precise-prefix-cache-aware-epp" >> "$GITHUB_ENV"
      # Applies the model-server kustomization, then installs the standalone
      # chart with the UDS tokenizer post-renderer.
      # NOTE(review): paths here are relative `guides/...` while the pre-deploy
      # script uses ${GUIDE_PATH}; presumably both resolve to the same guide
      # directory - confirm against the reusable workflow's working directory.
      custom_deploy_script: |
        kubectl apply -k guides/precise-prefix-cache-aware/modelserver/gpu/vllm -n "${NAMESPACE}"
        helm install precise-prefix-cache-aware \
          oci://registry.k8s.io/gateway-api-inference-extension/charts/standalone \
          -f guides/recipes/scheduler/base.values.yaml \
          -f guides/precise-prefix-cache-aware/scheduler/precise-prefix-cache-aware.values.yaml \
          --post-renderer ./guides/precise-prefix-cache-aware/scheduler/patches/uds-tokenizer/post-renderer.sh \
          -n "${NAMESPACE}" --version v1.4.0
    # Pass all of the caller's secrets through to the reusable workflow.
    secrets: inherit