XPU PD Test #52
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# XPU PD Test: runs on a nightly schedule, on manual dispatch, or when called
# from another workflow.
name: XPU PD Test

on:
  # Nightly run.
  schedule:
    - cron: "0 10 * * *"  # 3AM PST (10:00 UTC)

  # Manual run; both inputs are offered in the dispatch form.
  workflow_dispatch:
    inputs:
      pr_or_branch:
        type: string
        required: true
        default: "main"
        description: 'PR number (e.g. 123 or pr-123) or branch name (e.g. main, feature/xyz) to test'
      custom_image_tag:
        type: string
        required: false
        default: ""
        description: 'Tag only (e.g. pr-123, auto-expanded to ghcr.io/<org>/<repo>-xpu-dev:pr-123) or full image address (e.g. ghcr.io/llm-d/llm-d-xpu:v0.5.1). Leave empty to use default from values file'

  # Reusable-workflow entry point; mirrors the dispatch inputs.
  workflow_call:
    inputs:
      pr_or_branch:
        type: string
        required: true
        default: "main"
        description: 'Pull-request number or branch name to test'
      custom_image_tag:
        type: string
        required: false
        default: ""
        description: 'Custom XPU image tag to use (optional)'
jobs:
  deploy_and_validate:
    # Skipped unless the workflow runs in the llm-d/llm-d repository.
    if: github.repository == 'llm-d/llm-d'
    runs-on: xpu
    # Job-wide settings; every step below sees these as environment variables.
    env:
      NAMESPACE: 'llm-d-xpu-pd'
      GATEWAY_TYPE: 'istio'
      RELEASE_NAME_POSTFIX: 'pd-xpu'
      # Release names whose manifests are dumped after deployment.
      INFRA_RELEASE_NAME: 'infra-pd-xpu'
      GAIE_RELEASE_NAME: 'gaie-pd-xpu'
      MS_RELEASE_NAME: 'ms-pd-xpu'
    steps:
# Check out the repository; the token is not persisted in the local git config.
- name: Checkout
  uses: actions/checkout@v6
  with:
    persist-credentials: false
# Decide whether the requested ref is a pull request or a branch.
#
# Exports PR_OR_BRANCH (PR number or branch name, default "main") to later
# steps via GITHUB_ENV and sets the step output `is_pr` to "true"/"false".
- name: Determine if pr_or_branch is a PR number
  id: check_pr
  shell: bash
  env:
    # The `inputs` context is populated for both workflow_dispatch and
    # workflow_call; `github.event.inputs` is not set for workflow_call, so a
    # caller-supplied value would otherwise be ignored.
    PR_OR_BRANCH: ${{ inputs.pr_or_branch }}
    # Passed through env rather than interpolated into the script text.
    EVENT_NAME: ${{ github.event_name }}
    EVENT_PR_NUMBER: ${{ github.event.pull_request.number }}
  run: |
    ref="${PR_OR_BRANCH:-main}"
    is_pr=false

    # Strip optional 'pr-' prefix so both '704' and 'pr-704' work
    candidate="${ref#pr-}"
    if [[ "$candidate" =~ ^[0-9]+$ ]]; then
      ref="$candidate"
      is_pr=true
    elif [[ "$EVENT_NAME" == "pull_request" ]]; then
      # Triggered by a pull_request event: test that PR.
      ref="$EVENT_PR_NUMBER"
      is_pr=true
    fi

    echo "PR_OR_BRANCH=${ref}" >> "$GITHUB_ENV"
    echo "is_pr=${is_pr}" >> "$GITHUB_OUTPUT"
# Fetch the pull request head into a local branch named pr-<number> and switch to it.
- name: Fetch and checkout PR
  if: steps.check_pr.outputs.is_pr == 'true'
  run: |
    pr_branch="pr-${PR_OR_BRANCH}"
    git fetch origin "pull/${PR_OR_BRANCH}/head:${pr_branch}"
    git checkout "${pr_branch}"
# Switch the work tree to the requested branch.
#
# The Checkout step uses actions/checkout defaults, which fetch only the ref
# that triggered the run. Any other branch must be fetched explicitly before
# it can be checked out.
- name: Checkout branch
  if: steps.check_pr.outputs.is_pr == 'false'
  run: |
    remote_ref="refs/remotes/origin/${PR_OR_BRANCH}"
    if git fetch --depth=1 origin "+refs/heads/${PR_OR_BRANCH}:${remote_ref}"; then
      # -B creates the local branch, or resets it to the freshly fetched tip.
      git checkout -B "${PR_OR_BRANCH}" "${remote_ref}"
    else
      # Not a branch on origin: fall back to whatever git can resolve locally.
      echo "Could not fetch branch '${PR_OR_BRANCH}' from origin; trying local ref"
      git checkout "${PR_OR_BRANCH}"
    fi
# Run the client dependency installer and keep its output for the log artifact.
- name: Install prerequisites idempotently
  # An explicit `shell: bash` runs with `-eo pipefail`; without pipefail the
  # exit status of `tee` would hide a failing installer.
  shell: bash
  run: |
    ./helpers/client-setup/install-deps.sh | tee ~/install-deps.log
# Install the gateway-provider dependencies, then apply the Istio helmfile.
- name: Install chart dependencies (CRDs and Istio)
  working-directory: guides/prereq/gateway-provider
  run: |
    ./install-gateway-provider-dependencies.sh
    helmfile apply -f istio.helmfile.yaml
# Best effort: a failing monitoring install does not fail the job.
- name: Install monitoring stack
  working-directory: docs/monitoring
  run: |
    ./scripts/install-prometheus-grafana.sh || true
# Create the test namespace unless it already exists.
- name: Create namespace
  run: |
    # Only the "already exists" case is tolerated; any other failure of
    # `kubectl create` now fails the step instead of being reported as
    # "Namespace already exists".
    if kubectl get namespace "${NAMESPACE}" >/dev/null 2>&1; then
      echo "Namespace already exists"
    else
      kubectl create namespace "${NAMESPACE}"
    fi
# Create or update the llm-d-hf-token secret in the test namespace.
- name: Create llm-d-hf-token secret
  # Explicit bash enables pipefail, so a failing `kubectl create` is not
  # masked by the `kubectl apply` that follows it in the pipe.
  shell: bash
  env:
    # Hand the secret over as an environment variable rather than expanding
    # it into the script text, where shell metacharacters would be interpreted.
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
  run: |
    # Render the manifest client-side and apply it so reruns stay idempotent.
    kubectl create secret generic llm-d-hf-token \
      --from-literal="HF_TOKEN=${HF_TOKEN}" \
      --namespace "${NAMESPACE}" \
      --dry-run=client -o yaml | kubectl apply -f -
# Translate the optional custom_image_tag input into helm --set arguments.
#
# Exports HELM_SET_IMAGE (empty when no custom image was requested) and, for
# a custom image, IMAGE_TAG, to later steps via GITHUB_ENV.
- name: Set image tag for deployment
  id: set_image_tag
  shell: bash
  env:
    # User-controlled input: passed via env so it is never parsed as shell code.
    CUSTOM_TAG: ${{ inputs.custom_image_tag }}
    REPOSITORY: ${{ github.repository }}
  run: |
    if [ -z "${CUSTOM_TAG}" ]; then
      echo "Using default image tag from values file"
      echo "HELM_SET_IMAGE=" >> "$GITHUB_ENV"
      exit 0
    fi

    # Check if it's a full image address (contains '/') or just a tag
    if [[ "${CUSTOM_TAG}" == *"/"* ]]; then
      echo "Using custom full image address: ${CUSTOM_TAG}"
      FULL_IMAGE="${CUSTOM_TAG}"
    else
      echo "Using custom image tag: ${CUSTOM_TAG}"
      FULL_IMAGE="ghcr.io/${REPOSITORY}-xpu-dev:${CUSTOM_TAG}"
    fi

    echo "IMAGE_TAG=${CUSTOM_TAG}" >> "$GITHUB_ENV"
    # Override the image and pull policy of both the decode and prefill containers.
    echo "HELM_SET_IMAGE=--set decode.containers[0].image=${FULL_IMAGE} --set decode.containers[0].imagePullPolicy=Always --set prefill.containers[0].image=${FULL_IMAGE} --set prefill.containers[0].imagePullPolicy=Always" >> "$GITHUB_ENV"
# Deploy the pd-disaggregation guide with the xpu environment and record the output.
- name: Deploy guide
  # Explicit bash enables pipefail: without it the pipeline's status is that
  # of `tee`, and a failed `helmfile apply` would let the job carry on.
  shell: bash
  run: |
    cd guides/pd-disaggregation
    # HELM_SET_IMAGE is deliberately unquoted: it holds zero or more
    # whitespace-separated --set arguments.
    RELEASE_NAME_POSTFIX=${RELEASE_NAME_POSTFIX} \
      helmfile apply -e xpu -n "${NAMESPACE}" ${HELM_SET_IMAGE} \
      --skip-schema-validation \
      | tee ~/pd-xpu-deployment.log
    echo "---------------------------------------" >> ~/pd-xpu-deployment.log
# Render httproute.yaml with this job's release names and apply it.
- name: Deploy HTTPRoute
  # Explicit bash enables pipefail so a failing `kubectl apply` is not hidden
  # by the exit status of `tee`.
  shell: bash
  run: |
    cd guides/pd-disaggregation
    echo "Deploying HTTPRoute with RELEASE_NAME_POSTFIX=${RELEASE_NAME_POSTFIX}..."

    # Create a temporary HTTPRoute file with the correct names
    sed -e "s/infra-pd-inference-gateway/${INFRA_RELEASE_NAME}-inference-gateway/g" \
        -e "s/gaie-pd/${GAIE_RELEASE_NAME}/g" \
        -e "s/name: llm-d-pd-disaggregation/name: llm-d-xpu-pd-${RELEASE_NAME_POSTFIX}/g" \
        httproute.yaml > httproute-xpu.yaml

    echo "Generated HTTPRoute configuration:"
    cat httproute-xpu.yaml

    # Append to the deployment log started by the guide deployment.
    kubectl apply -f httproute-xpu.yaml -n "${NAMESPACE}" \
      | tee -a ~/pd-xpu-deployment.log
    echo "---------------------------------------" >> ~/pd-xpu-deployment.log
# Run helm-get-all.sh once per release, passing the deployment log path,
# the release name and the namespace.
- name: fetch helm manifests - prepare for upload
  run: |
    releases=("${INFRA_RELEASE_NAME}" "${GAIE_RELEASE_NAME}" "${MS_RELEASE_NAME}")
    for rel in "${releases[@]}"; do
      bash .github/scripts/e2e/helm-get-all.sh ~/pd-xpu-deployment.log "${rel}" "$NAMESPACE"
    done
# Block until every pod in the namespace is Ready (10 minute limit).
# On timeout, print pod status and descriptions, then fail the step.
- name: Wait for all pods to be ready
  id: wait_pods
  run: |
    echo "Waiting for all pods to be ready..."
    if ! kubectl wait pod --for=condition=Ready --all -n "${NAMESPACE}" --timeout=10m; then
      echo "❌ Pods failed to become ready within timeout"
      echo "=== Pod Status ==="
      kubectl get pods -n "${NAMESPACE}"
      echo "=== Pod Descriptions ==="
      kubectl describe pods -n "${NAMESPACE}"
      exit 1
    fi
    echo "✅ All pods are ready."
    kubectl get pods -n "${NAMESPACE}"
# Informational: report whether the inference-gateway pod shows READY 1/1.
# The step only prints the result; it does not fail when the pod is missing.
- name: Check gateway pod is up
  if: always()
  run: |
    gateway_ready="$(kubectl get pods -n "${NAMESPACE}" | grep "inference-gateway" | awk '{print $2}')"
    case "${gateway_ready}" in
      "1/1") echo "✅ Gateway pod ready." ;;
      *) echo "❌ Missing gateway pod" ;;
    esac
# Dump the state of the namespace for debugging; runs even after failures.
- name: Show deployment status
  if: always()
  run: |
    # Run a command whose failure must not fail this step.
    best_effort() { "$@" || true; }

    # Core workload objects: a failure here fails the step.
    echo "=== Deployments ==="
    kubectl get deployments -n "${NAMESPACE}"
    echo ""
    echo "=== Replica Sets ==="
    kubectl get replicasets -n "${NAMESPACE}"
    echo ""
    echo "=== Pods ==="
    kubectl get pods -n "${NAMESPACE}"
    echo ""
    echo "=== Services ==="
    kubectl get svc -n "${NAMESPACE}"
    echo ""

    # Remaining listings are best effort.
    echo "=== Helm releases ==="
    best_effort helm list -n "${NAMESPACE}"
    echo ""
    echo "=== Inference Pools ==="
    best_effort kubectl get InferencePool.inference.networking.k8s.io -n "${NAMESPACE}"
    echo ""
    echo "=== HTTPRoutes ==="
    best_effort kubectl get httproutes -n "${NAMESPACE}"
    echo ""
    echo "=== Gateway ==="
    best_effort kubectl get Gateway -n "${NAMESPACE}"
    echo ""
    echo "=== Destination Rule ==="
    best_effort kubectl get destinationrule -n "${NAMESPACE}"
    echo ""
# Run the e2e validation script, but only when the pod wait step succeeded.
- name: Verify installation and run inference tests
  if: steps.wait_pods.outcome == 'success'
  working-directory: .github/scripts/e2e
  run: |
    ./e2e-validate.sh -n "${NAMESPACE}" -v
# Gather pod manifests, logs, descriptions and events into pod-logs-pd-xpu/.
# Every collection command is best effort so that one failure does not lose
# the rest of the diagnostics.
- name: Collect and upload Kubernetes pod logs
  if: always()
  run: |
    mkdir -p pod-logs-pd-xpu
    cd pod-logs-pd-xpu

    # Pods selected by the llm-d.ai/inferenceServing label.
    echo "Fetching ${NAMESPACE} pods log..."
    kubectl get pods -n "${NAMESPACE}" -l "llm-d.ai/inferenceServing" -o yaml > ./inference-pods.yaml || true
    kubectl logs -n "${NAMESPACE}" -l "llm-d.ai/inferenceServing" 2>&1 | grep -v "waiting for vLLM to be ready" > ./inference-pod-logs.log || true
    kubectl describe pod -n "${NAMESPACE}" -l "llm-d.ai/inferenceServing" > ./inference-describe-pod-logs.log || true

    # One log file per pod; NAMESPACE reaches the inner `sh -c` through the
    # environment.
    echo "Collecting logs from all pods..."
    kubectl get pods -n "${NAMESPACE}" --no-headers -o custom-columns=":metadata.name" \
      | xargs -I{} sh -c 'kubectl logs --all-containers=true -n "${NAMESPACE}" {} > "{}.log" 2>&1' || true

    echo "Fetching ${NAMESPACE} pods descriptions..."
    kubectl get pods -n "${NAMESPACE}" --no-headers -o custom-columns=":metadata.name" \
      | xargs -I{} sh -c 'kubectl describe pod -n "${NAMESPACE}" {} > "{}-describe.log" 2>&1' || true

    echo "Collecting events..."
    kubectl get events -n "${NAMESPACE}" --sort-by='.lastTimestamp' > events.log 2>&1 || true

    # Pull in the logs that earlier steps wrote to the home directory.
    mv ~/pd-xpu-deployment.log . || true
    mv ~/install-deps.log . || true

    # Reported once; the original repeated this message and listing twice.
    echo "Log collection completed."
    ls -la
# Publish the pod-logs-pd-xpu directory as a workflow artifact, even on failure.
- name: Upload pod logs as artifact
  if: always()
  uses: actions/upload-artifact@v7
  with:
    path: pod-logs-pd-xpu
    name: llmd-pod-logs-pd-xpu
# Tear down the helmfile releases and delete the test namespace.
- name: Cleanup deployment
  if: always()
  run: |
    status=0

    # Run in a subshell and capture the failure: under `bash -e` a failing
    # `cd` or `helmfile destroy` would otherwise abort the step before the
    # namespace is deleted.
    (cd guides/pd-disaggregation && helmfile destroy -e xpu -n "${NAMESPACE}") || status=$?

    # Always attempt the namespace deletion; a namespace that was never
    # created is not an error during cleanup.
    kubectl delete ns "${NAMESPACE}" --ignore-not-found || status=$?

    # Still surface any cleanup failure in the step result.
    exit "${status}"