E2E

E2E #207

Workflow file for this run

	# Trusted workflow: runs in the base repo context so self-hosted runners
	# and secrets are never exposed to untrusted fork code. Triggers whenever
	# the CI workflow completes (any conclusion); every job below gates on
	# github.event.workflow_run.conclusion.
	#
	# No untrusted code is checked out or compiled here. All binaries and
	# scripts are pre-built artifacts uploaded by the CI workflow. K8s
	# manifests and test scripts are checked out from the default branch.
	#
	# NOTE(review): the artifacts downloaded from the CI run are executed on
	# the self-hosted runner; confirm CI only publishes them for commits that
	# are trusted to run there.
	name: E2E

	on:
	  workflow_run:
	    workflows: ["CI"]
	    types: [completed]

	# Token scopes shared by all jobs:
	#   statuses: write  - publish commit statuses on the CI head SHA
	#   actions: read    - download artifacts from the triggering CI run
	#   contents: read   - sparse checkouts of trusted files
	#   packages: write  - push the CI image to GHCR
	permissions:
	  statuses: write
	  actions: read
	  contents: read
	  packages: write

	jobs:
	# Runs only when CI did not succeed: publishes a failing commit status for
	# every E2E status context so the head SHA is not left without a result.
	mark-skipped:
	  name: Mark E2E skipped
	  runs-on: ubuntu-latest
	  if: github.event.workflow_run.conclusion != 'success'
	  permissions:
	    statuses: write
	    contents: read
	  steps:
	    # Only this workflow file is needed; it is parsed below to discover
	    # the status contexts declared by the E2E jobs.
	    - uses: actions/checkout@v6
	      with:
	        persist-credentials: false
	        sparse-checkout: .github/workflows/e2e.yml
	        sparse-checkout-cone-mode: false

	    - name: Mark E2E statuses
	      uses: actions/github-script@v9
	      with:
	        script: |
	          const fs = require('fs');
	          const text = fs.readFileSync('.github/workflows/e2e.yml', 'utf8');
	          // Match every `STATUS_CONTEXT: "<value>"` line regardless of its
	          // indentation. The `\s*` quantifiers are required: job-level env
	          // entries are indented by several spaces.
	          const matches = [...text.matchAll(/^\s*STATUS_CONTEXT:\s*["']([^"']+)["']\s*$/gm)];
	          // De-duplicate in case several jobs share a context.
	          const contexts = [...new Set(matches.map((m) => m[1]))];

	          if (contexts.length === 0) {
	            core.setFailed('No STATUS_CONTEXT values found in .github/workflows/e2e.yml');
	            return;
	          }

	          const sha = '${{ github.event.workflow_run.head_sha }}';
	          const conclusion = '${{ github.event.workflow_run.conclusion }}';
	          const desc = conclusion === 'failure'
	            ? 'Skipped: CI did not pass'
	            : `Skipped: CI ${conclusion}`;

	          // One failing status per context, linking back to the CI run.
	          for (const ctx of contexts) {
	            await github.rest.repos.createCommitStatus({
	              owner: context.repo.owner, repo: context.repo.repo, sha,
	              state: 'failure',
	              context: ctx,
	              description: desc,
	              target_url: '${{ github.event.workflow_run.html_url }}'
	            });
	          }

	# Native-host (bare-metal) E2E: drives the CI-built binaries against the
	# nodes listed in BM_NODES over SSH, then cleans up and reports a commit
	# status on the CI head SHA.
	native-host:
	  name: Native-Host
	  runs-on: [self-hosted, amd-aac02-rocm]
	  if: github.event.workflow_run.conclusion == 'success'
	  concurrency:
	    group: native-host-cluster
	    queue: max
	  env:
	    # Keep this on one line in quoted form: the mark-skipped job scans
	    # this file for `STATUS_CONTEXT: "..."` lines.
	    STATUS_CONTEXT: "E2E / Native-Host"
	    # Comma-separated list of node hostnames or IPs.
	    BM_NODES: ${{ secrets.BM_NODES }}
	    BM_SSH_USER: ${{ secrets.BM_SSH_USER }}
	    # Per-run remote working directory, removed in "Cleanup remote dirs".
	    BM_REMOTE_BASE: /tmp/spur-bm-${{ github.run_id }}
	  steps:
	    - name: Set pending status
	      uses: actions/github-script@v9
	      with:
	        script: |
	          await github.rest.repos.createCommitStatus({
	            owner: context.repo.owner, repo: context.repo.repo,
	            sha: '${{ github.event.workflow_run.head_sha }}',
	            state: 'pending',
	            target_url: '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}',
	            context: '${{ env.STATUS_CONTEXT }}'
	          });

	    # Artifacts come from the CI run that triggered this workflow.
	    - name: Download release binaries
	      uses: actions/download-artifact@v8
	      with:
	        name: release-binaries
	        path: /tmp/release-binaries
	        run-id: ${{ github.event.workflow_run.id }}
	        github-token: ${{ secrets.GITHUB_TOKEN }}

	    - name: Download E2E test binary
	      uses: actions/download-artifact@v8
	      with:
	        name: k8s-test-binary
	        path: /tmp/e2e-bin
	        run-id: ${{ github.event.workflow_run.id }}
	        github-token: ${{ secrets.GITHUB_TOKEN }}

	    - name: Checkout trusted deploy assets and scripts from main
	      uses: actions/checkout@v6
	      with:
	        persist-credentials: false
	        sparse-checkout: |
	          deploy/native-host
	          scripts
	        sparse-checkout-cone-mode: false
	        path: trusted-repo

	    # The key is piped straight into ssh-agent and never written to disk;
	    # the agent socket/pid are exported so later steps can reuse it.
	    - name: Load SSH key into agent
	      env:
	        BM_SSH_KEY: ${{ secrets.BM_SSH_KEY }}
	      run: |
	        eval "$(ssh-agent -s)"
	        printf '%s\n' "$BM_SSH_KEY" | ssh-add -
	        echo "SSH_AUTH_SOCK=$SSH_AUTH_SOCK" >> "$GITHUB_ENV"
	        echo "SSH_AGENT_PID=$SSH_AGENT_PID" >> "$GITHUB_ENV"

	    # Register each individual node name (whitespace-trimmed) and the SSH
	    # user as masked values so they are redacted from the logs.
	    - name: Mask sensitive values in console
	      run: |
	        IFS=',' read -ra NODES <<< "$BM_NODES"
	        for node in "${NODES[@]}"; do
	          node="${node#"${node%%[![:space:]]*}"}"
	          node="${node%"${node##*[![:space:]]}"}"
	          [ -n "$node" ] && echo "::add-mask::${node}"
	        done
	        echo "::add-mask::${BM_SSH_USER}"

	    # Probe every node before failing so the log lists all unreachable ones.
	    - name: Verify node connectivity
	      run: |
	        IFS=',' read -ra NODES <<< "$BM_NODES"
	        FAILED=0
	        for node in "${NODES[@]}"; do
	          if ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes -o ConnectTimeout=10 \
	            "${BM_SSH_USER}@${node}" "echo ok" >/dev/null 2>&1; then
	            echo "PASS: ${node}"
	          else
	            echo "FAIL: ${node} unreachable"
	            FAILED=1
	          fi
	        done
	        [ "$FAILED" -eq 0 ] || { echo "ERROR: not all nodes reachable"; exit 1; }

	    # DNS injection is needed as soon as one entry is not a dotted-quad IP.
	    - name: Check if DNS injection needed
	      id: dns-check
	      run: |
	        IFS=',' read -ra NODES <<< "$BM_NODES"
	        for node in "${NODES[@]}"; do
	          if ! [[ "$node" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
	            echo "needed=true" >> "$GITHUB_OUTPUT"
	            exit 0
	          fi
	        done
	        echo "needed=false" >> "$GITHUB_OUTPUT"

	    # Resolve every node name on the runner, then append the resulting
	    # block (headed by MARKER) to /etc/hosts on each node unless the
	    # marker is already present there.
	    - name: Inject node DNS on cluster
	      if: steps.dns-check.outputs.needed == 'true'
	      run: |
	        MARKER="## SPUR_CI_DYNAMIC_NODES"
	        IFS=',' read -ra NODES <<< "$BM_NODES"
	        HOSTS_BLOCK="${MARKER}"
	        for node in "${NODES[@]}"; do
	          IP=$(getent ahosts "$node" 2>/dev/null | awk 'NR==1{print $1}')
	          if [ -z "$IP" ]; then
	            echo "ERROR: cannot resolve $node from runner"
	            exit 1
	          fi
	          # Explicit newline keeps the entry independent of YAML indentation.
	          HOSTS_BLOCK="${HOSTS_BLOCK}"$'\n'"${IP} ${node}"
	        done
	        for node in "${NODES[@]}"; do
	          ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes "${BM_SSH_USER}@${node}" "
	            grep -q '${MARKER}' /etc/hosts 2>/dev/null && exit 0
	            echo '${HOSTS_BLOCK}' | sudo tee -a /etc/hosts >/dev/null
	          "
	        done

	    # Best-effort: load one unconfined+userns profile per test stage for
	    # the spurd binary path that stage uses. Failures are ignored.
	    - name: Load AppArmor profiles for rootless containers
	      run: |
	        IFS=',' read -ra NODES <<< "$BM_NODES"
	        for step in single-node multi-node gpu-single-node gpu-multi-node; do
	          SPURD="${BM_REMOTE_BASE}/${step}/bin/spurd"
	          PROF="abi <abi/4.0>,
	        profile spur-ci-${step} ${SPURD} flags=(unconfined) {
	          userns,
	        }"
	          for node in "${NODES[@]}"; do
	            ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes "${BM_SSH_USER}@${node}" \
	              "echo '${PROF}' | sudo apparmor_parser -r 2>/dev/null || true"
	          done
	        done

	    # The four test stages share the same binary; each uses its own remote
	    # directory under BM_REMOTE_BASE. chmod is only needed once.
	    - name: Run single-node native-host tests
	      env:
	        SPUR_TEST_BM_NODES: ${{ env.BM_NODES }}
	        SPUR_TEST_BM_SSH_USER: ${{ secrets.BM_SSH_USER }}
	        SPUR_TEST_BM_BINARIES_DIR: /tmp/release-binaries
	        SPUR_TEST_BM_DEPLOY_DIR: trusted-repo/deploy/native-host
	        SPUR_TEST_BM_REMOTE_DIR: ${{ env.BM_REMOTE_BASE }}/single-node
	        RUST_LOG: info
	      run: |
	        chmod +x /tmp/e2e-bin/spur-k8s-tests
	        /tmp/e2e-bin/spur-k8s-tests native_host::single_node --ignored --test-threads=1

	    - name: Run multi-node native-host tests
	      env:
	        SPUR_TEST_BM_NODES: ${{ env.BM_NODES }}
	        SPUR_TEST_BM_SSH_USER: ${{ secrets.BM_SSH_USER }}
	        SPUR_TEST_BM_BINARIES_DIR: /tmp/release-binaries
	        SPUR_TEST_BM_DEPLOY_DIR: trusted-repo/deploy/native-host
	        SPUR_TEST_BM_REMOTE_DIR: ${{ env.BM_REMOTE_BASE }}/multi-node
	        RUST_LOG: info
	      run: /tmp/e2e-bin/spur-k8s-tests native_host::multi_node --ignored --test-threads=1

	    - name: Run single-node GPU native-host tests
	      env:
	        SPUR_TEST_BM_NODES: ${{ env.BM_NODES }}
	        SPUR_TEST_BM_SSH_USER: ${{ secrets.BM_SSH_USER }}
	        SPUR_TEST_BM_BINARIES_DIR: /tmp/release-binaries
	        SPUR_TEST_BM_DEPLOY_DIR: trusted-repo/deploy/native-host
	        SPUR_TEST_BM_REMOTE_DIR: ${{ env.BM_REMOTE_BASE }}/gpu-single-node
	        SPUR_TEST_BM_GPU_VENV: /opt/spur-ci/gpu-venv
	        RUST_LOG: info
	      run: /tmp/e2e-bin/spur-k8s-tests native_host::gpu::single_node --ignored --test-threads=1

	    - name: Run multi-node GPU native-host tests
	      env:
	        SPUR_TEST_BM_NODES: ${{ env.BM_NODES }}
	        SPUR_TEST_BM_SSH_USER: ${{ secrets.BM_SSH_USER }}
	        SPUR_TEST_BM_BINARIES_DIR: /tmp/release-binaries
	        SPUR_TEST_BM_DEPLOY_DIR: trusted-repo/deploy/native-host
	        SPUR_TEST_BM_REMOTE_DIR: ${{ env.BM_REMOTE_BASE }}/gpu-multi-node
	        SPUR_TEST_BM_GPU_VENV: /opt/spur-ci/gpu-venv
	        RUST_LOG: info
	      run: /tmp/e2e-bin/spur-k8s-tests native_host::gpu::multi_node --ignored --test-threads=1

	    # On failure, copy each stage's *.log files from every node into
	    # /tmp/bm-logs/node-<idx>/<stage>/ (index instead of hostname so the
	    # node name does not appear in paths). Missing logs are ignored.
	    - name: Collect cluster logs
	      if: failure()
	      run: |
	        set -euo pipefail
	        mkdir -p /tmp/bm-logs
	        IFS=',' read -ra NODES <<< "$BM_NODES"
	        idx=0
	        for node in "${NODES[@]}"; do
	          node="${node#"${node%%[![:space:]]*}"}"
	          node="${node%"${node##*[![:space:]]}"}"
	          [ -n "$node" ] || continue
	          node_dir="/tmp/bm-logs/node-${idx}"
	          idx=$((idx + 1))
	          mkdir -p "$node_dir"
	          for step in single-node multi-node gpu-single-node gpu-multi-node; do
	            remote="${BM_REMOTE_BASE}/${step}/log"
	            local_step="${node_dir}/${step}"
	            mkdir -p "$local_step"
	            scp -o StrictHostKeyChecking=accept-new -o BatchMode=yes \
	              "${BM_SSH_USER}@${node}:${remote}/*.log" "$local_step/" 2>/dev/null || true
	          done
	        done
	        echo "Collected logs:"
	        # sed consumes all of find's output, so find is never killed by
	        # SIGPIPE (which `head -20` would cause under `pipefail` when more
	        # than 20 logs exist).
	        find /tmp/bm-logs -type f -name '*.log' | sed -n '1,20p'

	    - name: Scrub cluster logs
	      id: scrub-logs
	      if: failure()
	      run: |
	        chmod +x trusted-repo/scripts/scrub-ci-logs.sh
	        trusted-repo/scripts/scrub-ci-logs.sh /tmp/bm-logs \
	          --hostnames "$BM_NODES" \
	          --ssh-user "$BM_SSH_USER"

	    # Upload only when scrubbing succeeded, so unscrubbed logs never leave
	    # the runner.
	    - name: Upload cluster logs
	      if: failure() && steps.scrub-logs.outcome == 'success'
	      uses: actions/upload-artifact@v7
	      with:
	        name: native-host-logs
	        path: /tmp/bm-logs/
	        retention-days: 3
	        if-no-files-found: ignore

	    # The bracketed patterns match the daemons' command lines but not the
	    # remote shell that carries this command text; with a plain
	    # `pkill -f spurctld` the shell itself matches and can be killed
	    # before the second pkill runs.
	    - name: Cleanup remote processes
	      if: always()
	      run: |
	        IFS=',' read -ra NODES <<< "$BM_NODES"
	        for node in "${NODES[@]}"; do
	          ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes \
	            "${BM_SSH_USER}@${node}" \
	            "pkill -f '[s]purctld' 2>/dev/null; pkill -f '[s]purd' 2>/dev/null" || true
	        done

	    - name: Cleanup remote dirs
	      if: always()
	      run: |
	        IFS=',' read -ra NODES <<< "$BM_NODES"
	        for node in "${NODES[@]}"; do
	          ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes \
	            "${BM_SSH_USER}@${node}" "rm -rf '${BM_REMOTE_BASE}'" || true
	        done

	    # Unload every loaded profile whose name starts with spur-ci-.
	    - name: Cleanup AppArmor profiles
	      if: always()
	      run: |
	        IFS=',' read -ra NODES <<< "$BM_NODES"
	        for node in "${NODES[@]}"; do
	          ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes "${BM_SSH_USER}@${node}" "
	            sudo aa-status 2>/dev/null | grep -o 'spur-ci-[^ ]*' | sort -u | while read -r prof; do
	              sudo apparmor_parser -R /dev/stdin 2>/dev/null <<< \"profile \$prof /dev/null {}\" || true
	            done
	          " || true
	        done

	    # Deletes from the marker line to the end of /etc/hosts; the injected
	    # block was appended, so it is the tail of the file.
	    - name: Cleanup injected DNS
	      if: always() && steps.dns-check.outputs.needed == 'true'
	      run: |
	        IFS=',' read -ra NODES <<< "$BM_NODES"
	        for node in "${NODES[@]}"; do
	          ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes \
	            "${BM_SSH_USER}@${node}" \
	            "sudo sed -i '/## SPUR_CI_DYNAMIC_NODES/,\$d' /etc/hosts 2>/dev/null" || true
	        done

	    - name: Kill SSH agent
	      if: always()
	      run: ssh-agent -k || true

	    # Any job status other than 'success' is reported as 'failure'.
	    - name: Report status
	      if: always()
	      uses: actions/github-script@v9
	      with:
	        script: |
	          await github.rest.repos.createCommitStatus({
	            owner: context.repo.owner, repo: context.repo.repo,
	            sha: '${{ github.event.workflow_run.head_sha }}',
	            state: '${{ job.status }}' === 'success' ? 'success' : 'failure',
	            target_url: '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}',
	            context: '${{ env.STATUS_CONTEXT }}',
	            description: '${{ job.status }}'
	          });

	# Publishes the image built by CI to GHCR, tagged with the CI head SHA,
	# so the K8s cluster can pull it.
	push-image:
	  name: Push Image to GHCR
	  runs-on: ubuntu-latest
	  if: github.event.workflow_run.conclusion == 'success'
	  steps:
	    # Registry paths must be lowercase, hence the toLowerCase() on the
	    # repository name. The result is exposed as steps.image.outputs.result.
	    - name: Compute GHCR image reference
	      id: image
	      uses: actions/github-script@v9
	      with:
	        result-encoding: string
	        script: |
	          const sha = '${{ github.event.workflow_run.head_sha }}';
	          const repo = '${{ github.repository }}'.toLowerCase();
	          return `ghcr.io/${repo}:${sha}`;

	    # Downloaded from the CI run that triggered this workflow.
	    - name: Download image artifact
	      uses: actions/download-artifact@v8
	      with:
	        name: spur-image
	        path: /tmp
	        run-id: ${{ github.event.workflow_run.id }}
	        github-token: ${{ secrets.GITHUB_TOKEN }}

	    - name: Log in to GHCR
	      uses: docker/login-action@v4
	      with:
	        registry: ghcr.io
	        username: ${{ github.actor }}
	        password: ${{ secrets.GITHUB_TOKEN }}

	    # The tarball is expected to contain an image tagged spur:ci.
	    - name: Load, retag, and push
	      env:
	        IMAGE_TAG: ${{ steps.image.outputs.result }}
	      run: |
	        docker load -i /tmp/spur-image.tar
	        docker tag spur:ci "$IMAGE_TAG"
	        docker push "$IMAGE_TAG"

	# K8s E2E: runs the CI-built test binary against the cluster reachable
	# from the self-hosted runner, using the image pushed by push-image, in a
	# per-run namespace. Skipped automatically when push-image is skipped or
	# fails.
	k8s:
	  name: K8s
	  runs-on: [self-hosted, amd-aac02-rocm]
	  needs: push-image
	  concurrency:
	    group: k8s-integration
	    queue: max
	  env:
	    # Keep this on one line in quoted form: the mark-skipped job scans
	    # this file for `STATUS_CONTEXT: "..."` lines.
	    STATUS_CONTEXT: "E2E / K8s"
	    KUBECONFIG: /home/amd/.kube/config
	    SPUR_TEST_NS: spur-ci-${{ github.event.workflow_run.id }}
	    RUST_LOG: info
	  steps:
	    # Same computation as in push-image, so both jobs agree on the tag.
	    - name: Compute GHCR image reference
	      id: image
	      uses: actions/github-script@v9
	      with:
	        result-encoding: string
	        script: |
	          const sha = '${{ github.event.workflow_run.head_sha }}';
	          const repo = '${{ github.repository }}'.toLowerCase();
	          return `ghcr.io/${repo}:${sha}`;

	    # The step output is passed through env instead of being expanded
	    # inside the script text.
	    - name: Set SPUR_CI_IMAGE
	      env:
	        IMAGE_REF: ${{ steps.image.outputs.result }}
	      run: |
	        echo "SPUR_CI_IMAGE=${IMAGE_REF}" >> "$GITHUB_ENV"
	        echo "SPUR_CI_IMAGE=${IMAGE_REF}"

	    - name: Set pending status
	      uses: actions/github-script@v9
	      with:
	        script: |
	          await github.rest.repos.createCommitStatus({
	            owner: context.repo.owner, repo: context.repo.repo,
	            sha: '${{ github.event.workflow_run.head_sha }}',
	            state: 'pending',
	            target_url: '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}',
	            context: '${{ env.STATUS_CONTEXT }}'
	          });

	    - name: Download K8s test binary
	      uses: actions/download-artifact@v8
	      with:
	        name: k8s-test-binary
	        path: /tmp/k8s-bin
	        run-id: ${{ github.event.workflow_run.id }}
	        github-token: ${{ secrets.GITHUB_TOKEN }}

	    - name: Checkout K8s manifests from main
	      uses: actions/checkout@v6
	      with:
	        persist-credentials: false
	        sparse-checkout: deploy/k8s
	        sparse-checkout-cone-mode: false
	        path: trusted-repo

	    - name: Select kubectl context
	      run: kubectl config use-context kubernetes-admin@kubernetes

	    # Delete any spur-ci-* namespaces left by earlier runs and wait up to
	    # two minutes for them to disappear; fail if they do not.
	    - name: Remove leftover spur-ci namespaces
	      run: |
	        set -euo pipefail
	        leftover=$(kubectl get ns -o jsonpath='{.items[*].metadata.name}' \
	          | tr ' ' '\n' | grep '^spur-ci-' || true)
	        if [ -z "$leftover" ]; then
	          echo "No leftover spur-ci namespaces"
	          exit 0
	        fi
	        echo "Deleting leftover spur-ci namespaces:"
	        echo "$leftover"
	        for ns in $leftover; do
	          kubectl delete ns "$ns" --ignore-not-found --wait=false
	        done
	        deadline=$((SECONDS + 120))
	        while [ "$SECONDS" -lt "$deadline" ]; do
	          remaining=$(kubectl get ns -o jsonpath='{.items[*].metadata.name}' \
	            | tr ' ' '\n' | grep '^spur-ci-' || true)
	          if [ -z "$remaining" ]; then
	            echo "All spur-ci namespaces removed"
	            exit 0
	          fi
	          echo "Waiting for namespaces to terminate: $remaining"
	          sleep 5
	        done
	        echo "ERROR: timed out after 2m waiting for spur-ci namespace deletion:"
	        kubectl get ns -o wide $(echo "$remaining" | tr '\n' ' ')
	        exit 1

	    - name: Clean cluster-scoped Spur RBAC (pre)
	      run: |
	        set -euo pipefail
	        kubectl delete clusterrolebinding spur-operator --ignore-not-found
	        kubectl delete clusterrole spur-operator --ignore-not-found

	    # create --dry-run | apply makes the step idempotent on re-runs.
	    - name: Create test namespace
	      run: |
	        kubectl create namespace "$SPUR_TEST_NS" \
	          --dry-run=client -o yaml | kubectl apply -f -

	    # Credentials are passed through env rather than interpolated into
	    # the script text.
	    - name: Create registry pull secret
	      env:
	        GHCR_USER: ${{ github.actor }}
	        GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	      run: |
	        set -euo pipefail
	        kubectl create secret docker-registry regcred \
	          --docker-server=ghcr.io \
	          --docker-username="$GHCR_USER" \
	          --docker-password="$GHCR_TOKEN" \
	          -n "$SPUR_TEST_NS" \
	          --dry-run=client -o yaml | kubectl apply -f -

	    - name: Attach pull secret to default ServiceAccount
	      run: |
	        kubectl patch serviceaccount default -n "$SPUR_TEST_NS" \
	          -p '{"imagePullSecrets": [{"name": "regcred"}]}'

	    # Rewrites the manifest's `namespace: spur` references to the per-run
	    # namespace before applying.
	    - name: Apply RBAC
	      run: |
	        set -euo pipefail
	        sed "s/namespace: spur/namespace: $SPUR_TEST_NS/g" \
	          trusted-repo/deploy/k8s/rbac.yaml | kubectl apply -f -

	    # Fail fast with diagnostics if the cluster cannot pull the image,
	    # instead of letting the test suite time out later.
	    - name: Verify cluster can pull CI image
	      run: |
	        set -euo pipefail
	        kubectl run image-pull-check \
	          --namespace="$SPUR_TEST_NS" \
	          --image="$SPUR_CI_IMAGE" \
	          --restart=Never \
	          --command -- sleep 30
	        if ! kubectl wait --namespace="$SPUR_TEST_NS" \
	          --for=condition=Ready pod/image-pull-check --timeout=120s; then
	          echo "ERROR: cluster failed to pull $SPUR_CI_IMAGE"
	          kubectl describe pod image-pull-check -n "$SPUR_TEST_NS" || true
	          kubectl get events -n "$SPUR_TEST_NS" --sort-by='.lastTimestamp' | tail -30 || true
	          exit 1
	        fi
	        kubectl delete pod image-pull-check -n "$SPUR_TEST_NS" --ignore-not-found --wait=true

	    - name: Run single-node K8s tests
	      env:
	        SPUR_DEPLOY_DIR: trusted-repo/deploy/k8s
	      run: |
	        chmod +x /tmp/k8s-bin/spur-k8s-tests
	        /tmp/k8s-bin/spur-k8s-tests k8s::single_node --ignored --test-threads=1

	    - name: Run multi-node K8s tests
	      env:
	        SPUR_DEPLOY_DIR: trusted-repo/deploy/k8s
	      run: |
	        /tmp/k8s-bin/spur-k8s-tests k8s::multi_node --ignored --test-threads=1

	    # Every delete is attempted even if an earlier one fails (no `set -e`).
	    # Namespace and CRD deletion stay best-effort; a failed RBAC delete
	    # still fails the step, but only after all cleanup has run.
	    - name: Cleanup test resources
	      if: always()
	      run: |
	        set -uo pipefail
	        rc=0
	        kubectl delete namespace "$SPUR_TEST_NS" --ignore-not-found --timeout=120s || true
	        kubectl delete clusterrolebinding spur-operator --ignore-not-found || rc=1
	        kubectl delete clusterrole spur-operator --ignore-not-found || rc=1
	        kubectl delete crd spurjobs.spur.ai --ignore-not-found --timeout=120s || true
	        exit "$rc"

	    # Any job status other than 'success' is reported as 'failure'.
	    - name: Report status
	      if: always()
	      uses: actions/github-script@v9
	      with:
	        script: |
	          await github.rest.repos.createCommitStatus({
	            owner: context.repo.owner, repo: context.repo.repo,
	            sha: '${{ github.event.workflow_run.head_sha }}',
	            state: '${{ job.status }}' === 'success' ? 'success' : 'failure',
	            target_url: '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}',
	            context: '${{ env.STATUS_CONTEXT }}',
	            description: '${{ job.status }}'
	          });

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

E2E #207

Workflow file

E2E #207

Uh oh!

Workflow file for this run