XPU PD Test #52

Workflow file for this run

.github/workflows/e2e-pd-xpu.yaml at 19ff2e8

	# End-to-end test of the prefill/decode (PD) disaggregation guide on the XPU runner.
	name: XPU PD Test

	# Triggers: nightly schedule, manual dispatch, and reuse from other workflows.
	# The dispatch and call triggers expose the same two inputs.
	on:
	  schedule:
	    # Daily at 10:00 UTC (2AM PST / 3AM PDT).
	    - cron: '0 10 * * *'
	  workflow_dispatch:
	    inputs:
	      pr_or_branch:
	        description: 'PR number (e.g. 123 or pr-123) or branch name (e.g. main, feature/xyz) to test'
	        required: true
	        default: 'main'
	        type: string
	      custom_image_tag:
	        description: 'Tag only (e.g. pr-123, auto-expanded to ghcr.io/<org>/<repo>-xpu-dev:pr-123) or full image address (e.g. ghcr.io/llm-d/llm-d-xpu:v0.5.1). Leave empty to use default from values file'
	        required: false
	        default: ''
	        type: string
	  workflow_call:
	    inputs:
	      pr_or_branch:
	        description: 'Pull-request number or branch name to test'
	        required: true
	        default: 'main'
	        type: string
	      custom_image_tag:
	        description: 'Custom XPU image tag to use (optional)'
	        required: false
	        default: ''
	        type: string

	jobs:
	  # Single job: deploy the PD guide into a dedicated namespace, validate it,
	  # collect logs, and tear everything down again.
	  deploy_and_validate:
	    # Only run in the upstream repository (skipped on forks).
	    if: github.repository == 'llm-d/llm-d'
	    # NOTE(review): 'xpu' is presumably a self-hosted runner label - confirm.
	    runs-on: xpu
	    # Job-level variables; they are exported to every step's shell.
	    env:
	      NAMESPACE: "llm-d-xpu-pd"
	      GATEWAY_TYPE: "istio"
	      RELEASE_NAME_POSTFIX: "pd-xpu"
	      INFRA_RELEASE_NAME: "infra-pd-xpu"
	      GAIE_RELEASE_NAME: "gaie-pd-xpu"
	      MS_RELEASE_NAME: "ms-pd-xpu"

	    steps:
	# Check out the triggering ref; later steps switch to the requested PR or branch.
	- name: Checkout
	  uses: actions/checkout@v6
	  with:
	    # Do not leave the workflow token in the local git config.
	    persist-credentials: false

	# Classify the pr_or_branch input.
	# Exports PR_OR_BRANCH (to GITHUB_ENV) and the step output is_pr ('true'/'false').
	- name: Determine if pr_or_branch is a PR number
	  id: check_pr
	  env:
	    # The `inputs` context is populated for both workflow_dispatch and
	    # workflow_call; `github.event.inputs` only reflects the dispatch payload.
	    PR_OR_BRANCH: ${{ inputs.pr_or_branch }}
	    # Passed through env so expressions are never spliced into the script text.
	    EVENT_NAME: ${{ github.event_name }}
	    EVENT_PR_NUMBER: ${{ github.event.pull_request.number }}
	  shell: bash
	  run: |
	    # Fall back to 'main' when no input was supplied (e.g. scheduled runs).
	    echo "PR_OR_BRANCH=${PR_OR_BRANCH:-main}" >> "$GITHUB_ENV"
	    # Strip optional 'pr-' prefix so both '704' and 'pr-704' work
	    CLEAN="${PR_OR_BRANCH#pr-}"
	    if [[ "$CLEAN" =~ ^[0-9]+$ ]]; then
	      # A later PR_OR_BRANCH entry in GITHUB_ENV overrides the default written above.
	      echo "PR_OR_BRANCH=$CLEAN" >> "$GITHUB_ENV"
	      echo "is_pr=true" >> "$GITHUB_OUTPUT"
	    elif [[ "$EVENT_NAME" = "pull_request" ]]; then
	      echo "PR_OR_BRANCH=$EVENT_PR_NUMBER" >> "$GITHUB_ENV"
	      echo "is_pr=true" >> "$GITHUB_OUTPUT"
	    else
	      echo "is_pr=false" >> "$GITHUB_OUTPUT"
	    fi

	# Fetch the pull request head into a local branch pr-<number> and switch to it.
	- name: Fetch and checkout PR
	  if: steps.check_pr.outputs.is_pr == 'true'
	  run: |
	    git fetch origin pull/"$PR_OR_BRANCH"/head:pr-"$PR_OR_BRANCH"
	    git checkout pr-"$PR_OR_BRANCH"

	# Switch to the requested branch when the input is not a PR number.
	- name: Checkout branch
	  if: steps.check_pr.outputs.is_pr == 'false'
	  run: |
	    # actions/checkout only fetches the triggering ref (depth 1), so any other
	    # branch has to be fetched explicitly before it can be checked out.
	    git fetch --depth=1 origin "+refs/heads/${PR_OR_BRANCH}:refs/remotes/origin/${PR_OR_BRANCH}"
	    git checkout -B "$PR_OR_BRANCH" "refs/remotes/origin/${PR_OR_BRANCH}"

	# Run the client dependency installer and keep its output for the log artifact.
	- name: Install prerequisites idempotently
	  run: |
	    # Without pipefail the exit status of the pipeline is tee's, which would
	    # hide a failing installer.
	    set -o pipefail
	    ./helpers/client-setup/install-deps.sh | tee ~/install-deps.log

	# Install the gateway provider prerequisites, then apply the Istio helmfile.
	- name: Install chart dependencies (CRDs and Istio)
	  run: |
	    cd guides/prereq/gateway-provider
	    ./install-gateway-provider-dependencies.sh
	    helmfile apply -f istio.helmfile.yaml

	# Best-effort install of the Prometheus/Grafana stack.
	- name: Install monitoring stack
	  run: |
	    cd docs/monitoring
	    # '|| true': a monitoring install failure must not fail the test run.
	    ./scripts/install-prometheus-grafana.sh || true

	# Ensure the test namespace exists.
	- name: Create namespace
	  run: |
	    # Render-then-apply is idempotent and, unlike `create ... || echo`, does not
	    # mask real failures (e.g. an unreachable cluster) as "already exists".
	    set -o pipefail
	    kubectl create namespace "${NAMESPACE}" \
	      --dry-run=client -o yaml | kubectl apply -f -

	# Create or update the llm-d-hf-token secret in the test namespace.
	- name: Create llm-d-hf-token secret
	  env:
	    # Passed via the environment so the token is never spliced into the
	    # generated script text.
	    HF_TOKEN: ${{ secrets.HF_TOKEN }}
	  run: |
	    set -o pipefail
	    kubectl create secret generic llm-d-hf-token \
	      --from-literal="HF_TOKEN=${HF_TOKEN}" \
	      --namespace "${NAMESPACE}" \
	      --dry-run=client -o yaml | kubectl apply -f -

	# Translate the optional custom_image_tag input into helm --set overrides.
	# Exports HELM_SET_IMAGE (always) and IMAGE_TAG (only when a custom tag is given).
	- name: Set image tag for deployment
	  id: set_image_tag
	  env:
	    # User-controlled input: pass through env instead of interpolating into the script.
	    CUSTOM_TAG: ${{ inputs.custom_image_tag }}
	  run: |
	    if [ -n "${CUSTOM_TAG}" ]; then
	      # A value containing '/' is a full image address; anything else is a bare tag.
	      # (The pattern must be a glob; comparing against the literal "/" never matches.)
	      if [[ "${CUSTOM_TAG}" == */* ]]; then
	        echo "Using custom full image address: ${CUSTOM_TAG}"
	        FULL_IMAGE="${CUSTOM_TAG}"
	      else
	        echo "Using custom image tag: ${CUSTOM_TAG}"
	        FULL_IMAGE="ghcr.io/${{ github.repository }}-xpu-dev:${CUSTOM_TAG}"
	      fi
	      echo "IMAGE_TAG=${CUSTOM_TAG}" >> "$GITHUB_ENV"
	      # Same image and pull policy for both the decode and prefill containers.
	      echo "HELM_SET_IMAGE=--set decode.containers[0].image=${FULL_IMAGE} --set decode.containers[0].imagePullPolicy=Always --set prefill.containers[0].image=${FULL_IMAGE} --set prefill.containers[0].imagePullPolicy=Always" >> "$GITHUB_ENV"
	    else
	      echo "Using default image tag from values file"
	      echo "HELM_SET_IMAGE=" >> "$GITHUB_ENV"
	    fi

	# Apply the PD disaggregation helmfile for the xpu environment and log its output.
	- name: Deploy guide
	  run: |
	    # Without pipefail a failed `helmfile apply` would be masked by tee's exit status.
	    set -o pipefail
	    cd guides/pd-disaggregation
	    # HELM_SET_IMAGE is intentionally unquoted: it holds several space-separated flags.
	    RELEASE_NAME_POSTFIX=${RELEASE_NAME_POSTFIX} \
	    helmfile apply -e xpu -n "${NAMESPACE}" ${HELM_SET_IMAGE} \
	      --skip-schema-validation \
	      | tee ~/pd-xpu-deployment.log
	    echo "---------------------------------------" >> ~/pd-xpu-deployment.log

	# Rewrite the guide's httproute.yaml to the XPU release names and apply it.
	- name: Deploy HTTPRoute
	  run: |
	    # Without pipefail a failed `kubectl apply` would be masked by tee's exit status.
	    set -o pipefail
	    cd guides/pd-disaggregation
	    echo "Deploying HTTPRoute with RELEASE_NAME_POSTFIX=${RELEASE_NAME_POSTFIX}..."
	    # Create a temporary HTTPRoute file with the correct names
	    sed -e "s/infra-pd-inference-gateway/${INFRA_RELEASE_NAME}-inference-gateway/g" \
	      -e "s/gaie-pd/${GAIE_RELEASE_NAME}/g" \
	      -e "s/name: llm-d-pd-disaggregation/name: llm-d-xpu-pd-${RELEASE_NAME_POSTFIX}/g" \
	      httproute.yaml > httproute-xpu.yaml
	    echo "Generated HTTPRoute configuration:"
	    cat httproute-xpu.yaml
	    kubectl apply -f httproute-xpu.yaml -n "${NAMESPACE}" \
	      | tee -a ~/pd-xpu-deployment.log
	    echo "---------------------------------------" >> ~/pd-xpu-deployment.log

	# Run helm-get-all.sh once per release, passing the deployment log path,
	# the release name and the namespace.
	- name: fetch helm manifests - prepare for upload
	  run: |
	    for release_name in "${INFRA_RELEASE_NAME}" "${GAIE_RELEASE_NAME}" "${MS_RELEASE_NAME}"; do
	      bash .github/scripts/e2e/helm-get-all.sh \
	        ~/pd-xpu-deployment.log \
	        "$release_name" \
	        "$NAMESPACE"
	    done

	# Block until every pod in the namespace is Ready (10 minute limit).
	# On timeout, dump pod status and descriptions, then fail the step.
	- name: Wait for all pods to be ready
	  id: wait_pods
	  run: |
	    echo "Waiting for all pods to be ready..."
	    if kubectl wait pod \
	      --for=condition=Ready \
	      --all \
	      -n "${NAMESPACE}" \
	      --timeout=10m; then
	      echo "✅ All pods are ready."
	      kubectl get pods -n "${NAMESPACE}"
	    else
	      echo "❌ Pods failed to become ready within timeout"
	      echo "=== Pod Status ==="
	      kubectl get pods -n "${NAMESPACE}"
	      echo "=== Pod Descriptions ==="
	      kubectl describe pods -n "${NAMESPACE}"
	      exit 1
	    fi

	# Informational check: report whether the inference-gateway pod shows 1/1 ready.
	# This step only prints a message; it does not fail the job.
	- name: Check gateway pod is up
	  if: always()
	  run: |
	    # Second column of `kubectl get pods` is the READY count (e.g. "1/1").
	    GATEWAY_POD_READY=$(kubectl get pods -n "${NAMESPACE}" | grep "inference-gateway" | awk '{print $2}')
	    if [ "${GATEWAY_POD_READY}" = "1/1" ]; then
	      echo "✅ Gateway pod ready."
	    else
	      echo "❌ Missing gateway pod"
	    fi

	# Print a snapshot of the namespace for debugging; runs even after failures.
	- name: Show deployment status
	  if: always()
	  run: |
	    echo "=== Deployments ==="
	    kubectl get deployments -n "${NAMESPACE}"
	    echo ""
	    echo "=== Replica Sets ==="
	    kubectl get replicasets -n "${NAMESPACE}"
	    echo ""
	    echo "=== Pods ==="
	    kubectl get pods -n "${NAMESPACE}"
	    echo ""
	    echo "=== Services ==="
	    kubectl get svc -n "${NAMESPACE}"
	    echo ""
	    # The remaining queries are best-effort ('|| true').
	    echo "=== Helm releases ==="
	    helm list -n "${NAMESPACE}" || true
	    echo ""
	    echo "=== Inference Pools ==="
	    kubectl get InferencePool.inference.networking.k8s.io -n "${NAMESPACE}" || true
	    echo ""
	    echo "=== HTTPRoutes ==="
	    kubectl get httproutes -n "${NAMESPACE}" || true
	    echo ""
	    echo "=== Gateway ==="
	    kubectl get Gateway -n "${NAMESPACE}" || true
	    echo ""
	    echo "=== Destination Rule ==="
	    kubectl get destinationrule -n "${NAMESPACE}" || true
	    echo ""

	# Run the e2e validation script, but only if the pods became ready.
	- name: Verify installation and run inference tests
	  if: steps.wait_pods.outcome == 'success'
	  run: |
	    cd .github/scripts/e2e
	    ./e2e-validate.sh -n "${NAMESPACE}" -v

	# Gather pod manifests, logs, descriptions and events into pod-logs-pd-xpu/
	# for the artifact upload step. Every command is best-effort ('|| true').
	- name: Collect and upload Kubernetes pod logs
	  if: always()
	  run: |
	    mkdir -p pod-logs-pd-xpu
	    cd pod-logs-pd-xpu
	    echo "Fetching ${NAMESPACE} pods log..."
	    kubectl get pods -n "${NAMESPACE}" -l "llm-d.ai/inferenceServing" -o yaml > ./inference-pods.yaml || true
	    # NOTE(review): with a label selector kubectl logs defaults to the last 10
	    # lines per pod; full logs come from the per-pod loop below.
	    kubectl logs -n "${NAMESPACE}" -l "llm-d.ai/inferenceServing" 2>&1 | grep -v "waiting for vLLM to be ready" > ./inference-pod-logs.log || true
	    kubectl describe pod -n "${NAMESPACE}" -l "llm-d.ai/inferenceServing" > ./inference-describe-pod-logs.log || true

	    echo "Collecting logs from all pods..."
	    # NAMESPACE is a job-level env var, so the child `sh -c` can expand it.
	    kubectl get pods -n "${NAMESPACE}" --no-headers -o custom-columns=":metadata.name" \
	      | xargs -I{} sh -c 'kubectl logs --all-containers=true -n "${NAMESPACE}" {} > "{}.log" 2>&1' || true

	    echo "Fetching ${NAMESPACE} pods descriptions..."
	    kubectl get pods -n "${NAMESPACE}" --no-headers -o custom-columns=":metadata.name" \
	      | xargs -I{} sh -c 'kubectl describe pod -n "${NAMESPACE}" {} > "{}-describe.log" 2>&1' || true

	    echo "Collecting events..."
	    kubectl get events -n "${NAMESPACE}" --sort-by='.lastTimestamp' > events.log 2>&1 || true

	    # Pull in the logs written to the home directory by earlier steps.
	    mv ~/pd-xpu-deployment.log . || true
	    mv ~/install-deps.log . || true

	    echo "Log collection completed."
	    ls -la

	# Publish the collected log directory as a run artifact, even after failures.
	- name: Upload pod logs as artifact
	  uses: actions/upload-artifact@v7
	  if: always()
	  with:
	    name: llmd-pod-logs-pd-xpu
	    path: pod-logs-pd-xpu

	# Tear down the helm releases and delete the test namespace.
	- name: Cleanup deployment
	  if: always()
	  run: |
	    # Attempt both teardown actions even if the first fails, so a broken
	    # `helmfile destroy` cannot leave the namespace behind; the step still
	    # reports failure afterwards.
	    status=0
	    (cd guides/pd-disaggregation && helmfile destroy -e xpu -n "${NAMESPACE}") || status=$?
	    kubectl delete namespace "${NAMESPACE}" --ignore-not-found || status=$?
	    exit "${status}"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

XPU PD Test #52

Workflow file

XPU PD Test #52

Uh oh!

Workflow file for this run