# Skip to content

# E2E

# E2E #207

# Workflow file for this run

# Trusted workflow: runs in the base repo context so self-hosted runners
# and secrets are never exposed to untrusted fork code. Triggers after
# the CI workflow completes successfully.
#
# No untrusted code is checked out or compiled here. All binaries are
# pre-built artifacts uploaded by the CI workflow. K8s manifests, native-host
# deploy assets, and helper scripts are checked out from the default branch.
name: E2E

# Fires whenever a run of the "CI" workflow completes, whatever its
# conclusion; each job below filters on that conclusion itself.
on:
  workflow_run:
    workflows:
      - "CI"
    types:
      - completed

# Default GITHUB_TOKEN scopes for every job that does not declare its own
# `permissions` block.
permissions:
  actions: read
  contents: read
  packages: write
  statuses: write

jobs:
# Job: mark-skipped. Runs only when the triggering CI run did not conclude
# with 'success'. It posts a 'failure' commit status on the CI run's head
# commit for every status context declared in this workflow file.
  mark-skipped:
    name: Mark E2E skipped
    runs-on: ubuntu-latest
    if: github.event.workflow_run.conclusion != 'success'
    permissions:
      statuses: write
      contents: read
    steps:
      # Only this workflow file is fetched; the script below scans it for the
      # quoted status-context values declared in the jobs' `env` blocks.
      # The token is not persisted, matching the other checkouts in this file.
      - uses: actions/checkout@v6
        with:
          persist-credentials: false
          sparse-checkout: .github/workflows/e2e.yml
          sparse-checkout-cone-mode: false
      - name: Mark E2E statuses
        uses: actions/github-script@v9
        with:
          script: |
            const fs = require('fs');
            const text = fs.readFileSync('.github/workflows/e2e.yml', 'utf8');
            // Collect every distinct, quoted STATUS_CONTEXT value in the file.
            const matches = [...text.matchAll(/^\s*STATUS_CONTEXT:\s*["']([^"']+)["']\s*$/gm)];
            const contexts = [...new Set(matches.map((m) => m[1]))];
            if (contexts.length === 0) {
              core.setFailed('No STATUS_CONTEXT values found in .github/workflows/e2e.yml');
              return;
            }
            // Read event fields from the payload object rather than templating
            // them into the script source.
            const run = context.payload.workflow_run;
            const sha = run.head_sha;
            const conclusion = run.conclusion;
            const desc = conclusion === 'failure'
              ? 'Skipped: CI did not pass'
              : `Skipped: CI ${conclusion}`;
            for (const ctx of contexts) {
              await github.rest.repos.createCommitStatus({
                owner: context.repo.owner, repo: context.repo.repo, sha,
                state: 'failure',
                context: ctx,
                description: desc,
                target_url: run.html_url
              });
            }
# Job: native-host. Runs on a self-hosted runner after a successful CI run.
# It downloads the CI run's release binaries and E2E test binary, reaches the
# nodes listed in the BM_NODES secret over SSH, runs four native-host test
# suites, and then cleans the nodes up. A commit status is posted on the CI
# run's head commit at the start (pending) and at the end (success/failure).
#
# NOTE(review): the downloaded test binary is executed on this runner with
# SSH access to the nodes; confirm the CI workflow that builds it cannot be
# driven by untrusted fork code.
  native-host:
    name: Native-Host
    runs-on: [self-hosted, amd-aac02-rocm]
    if: github.event.workflow_run.conclusion == 'success'
    concurrency:
      group: native-host-cluster
      queue: max
    env:
      # Read by the status steps below and scanned for by the mark-skipped job;
      # keep the value quoted and on a line of its own.
      STATUS_CONTEXT: "E2E / Native-Host"
      # Comma-separated list of node hostnames or IPv4 addresses.
      BM_NODES: ${{ secrets.BM_NODES }}
      BM_SSH_USER: ${{ secrets.BM_SSH_USER }}
      # Per-run scratch directory on each node; removed in "Cleanup remote dirs".
      BM_REMOTE_BASE: /tmp/spur-bm-${{ github.run_id }}
    steps:
      - name: Set pending status
        uses: actions/github-script@v9
        with:
          script: |
            await github.rest.repos.createCommitStatus({
              owner: context.repo.owner, repo: context.repo.repo,
              sha: '${{ github.event.workflow_run.head_sha }}',
              state: 'pending',
              target_url: '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}',
              context: '${{ env.STATUS_CONTEXT }}'
            });
      - name: Download release binaries
        uses: actions/download-artifact@v8
        with:
          name: release-binaries
          path: /tmp/release-binaries
          run-id: ${{ github.event.workflow_run.id }}
          github-token: ${{ secrets.GITHUB_TOKEN }}
      - name: Download E2E test binary
        uses: actions/download-artifact@v8
        with:
          name: k8s-test-binary
          path: /tmp/e2e-bin
          run-id: ${{ github.event.workflow_run.id }}
          github-token: ${{ secrets.GITHUB_TOKEN }}
      - name: Checkout trusted deploy assets and scripts from main
        uses: actions/checkout@v6
        with:
          persist-credentials: false
          sparse-checkout: |
            deploy/native-host
            scripts
          sparse-checkout-cone-mode: false
          path: trusted-repo
      # The agent's socket and PID are exported through GITHUB_ENV so that
      # later steps (and the final "Kill SSH agent" step) can use them.
      - name: Load SSH key into agent
        run: |
          eval "$(ssh-agent -s)"
          printf '%s\n' "$BM_SSH_KEY" | ssh-add -
          echo "SSH_AUTH_SOCK=$SSH_AUTH_SOCK" >> "$GITHUB_ENV"
          echo "SSH_AGENT_PID=$SSH_AGENT_PID" >> "$GITHUB_ENV"
        env:
          BM_SSH_KEY: ${{ secrets.BM_SSH_KEY }}
      # Registers each individual node name (whitespace-trimmed) and the SSH
      # user as log masks.
      - name: Mask sensitive values in console
        run: |
          IFS=',' read -ra NODES <<< "$BM_NODES"
          for node in "${NODES[@]}"; do
            node="${node#"${node%%[![:space:]]*}"}"
            node="${node%"${node##*[![:space:]]}"}"
            [ -n "$node" ] && echo "::add-mask::${node}"
          done
          echo "::add-mask::${BM_SSH_USER}"
      # NOTE(review): unlike the masking and log-collection steps, the SSH
      # loops below do not trim whitespace around node names; confirm the
      # BM_NODES secret contains no spaces after commas.
      - name: Verify node connectivity
        run: |
          IFS=',' read -ra NODES <<< "$BM_NODES"
          FAILED=0
          for node in "${NODES[@]}"; do
            if ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes -o ConnectTimeout=10 \
                "${BM_SSH_USER}@${node}" "echo ok" >/dev/null 2>&1; then
              echo "PASS: ${node}"
            else
              echo "FAIL: ${node} unreachable"
              FAILED=1
            fi
          done
          [ "$FAILED" -eq 0 ] || { echo "ERROR: not all nodes reachable"; exit 1; }
      # Sets output `needed=true` as soon as one node is not a dotted-quad
      # IPv4 literal, `needed=false` otherwise.
      - name: Check if DNS injection needed
        id: dns-check
        run: |
          IFS=',' read -ra NODES <<< "$BM_NODES"
          for node in "${NODES[@]}"; do
            if ! [[ "$node" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
              echo "needed=true" >> "$GITHUB_OUTPUT"
              exit 0
            fi
          done
          echo "needed=false" >> "$GITHUB_OUTPUT"
      # Resolves every node from the runner and appends a marker line plus
      # "IP name" lines to /etc/hosts on each node, unless the marker is
      # already present there.
      - name: Inject node DNS on cluster
        if: steps.dns-check.outputs.needed == 'true'
        run: |
          MARKER="## SPUR_CI_DYNAMIC_NODES"
          IFS=',' read -ra NODES <<< "$BM_NODES"
          HOSTS_BLOCK="${MARKER}"
          for node in "${NODES[@]}"; do
            IP=$(getent ahosts "$node" 2>/dev/null | awk 'NR==1{print $1}')
            if [ -z "$IP" ]; then
              echo "ERROR: cannot resolve $node from runner"
              exit 1
            fi
            # The continuation line is deliberately not indented further: its
            # leading whitespace would become part of the hosts entry.
            HOSTS_BLOCK="${HOSTS_BLOCK}
          ${IP} ${node}"
          done
          for node in "${NODES[@]}"; do
            ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes "${BM_SSH_USER}@${node}" "
              grep -q '${MARKER}' /etc/hosts 2>/dev/null && exit 0
              echo '${HOSTS_BLOCK}' | sudo tee -a /etc/hosts >/dev/null
            "
          done
      # Loads one AppArmor profile per test suite on every node, attached to
      # that suite's spurd path under BM_REMOTE_BASE. Failures are ignored.
      - name: Load AppArmor profiles for rootless containers
        run: |
          IFS=',' read -ra NODES <<< "$BM_NODES"
          for step in single-node multi-node gpu-single-node gpu-multi-node; do
            SPURD="${BM_REMOTE_BASE}/${step}/bin/spurd"
            PROF="abi <abi/4.0>,
          profile spur-ci-${step} ${SPURD} flags=(unconfined) {
            userns,
          }"
            for node in "${NODES[@]}"; do
              ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes "${BM_SSH_USER}@${node}" \
                "echo '${PROF}' | sudo apparmor_parser -r 2>/dev/null || true"
            done
          done
      - name: Run single-node native-host tests
        env:
          SPUR_TEST_BM_NODES: ${{ env.BM_NODES }}
          SPUR_TEST_BM_SSH_USER: ${{ secrets.BM_SSH_USER }}
          SPUR_TEST_BM_BINARIES_DIR: /tmp/release-binaries
          SPUR_TEST_BM_DEPLOY_DIR: trusted-repo/deploy/native-host
          SPUR_TEST_BM_REMOTE_DIR: ${{ env.BM_REMOTE_BASE }}/single-node
          RUST_LOG: info
        run: |
          chmod +x /tmp/e2e-bin/spur-k8s-tests
          /tmp/e2e-bin/spur-k8s-tests native_host::single_node --ignored --test-threads=1
      - name: Run multi-node native-host tests
        env:
          SPUR_TEST_BM_NODES: ${{ env.BM_NODES }}
          SPUR_TEST_BM_SSH_USER: ${{ secrets.BM_SSH_USER }}
          SPUR_TEST_BM_BINARIES_DIR: /tmp/release-binaries
          SPUR_TEST_BM_DEPLOY_DIR: trusted-repo/deploy/native-host
          SPUR_TEST_BM_REMOTE_DIR: ${{ env.BM_REMOTE_BASE }}/multi-node
          RUST_LOG: info
        run: /tmp/e2e-bin/spur-k8s-tests native_host::multi_node --ignored --test-threads=1
      - name: Run single-node GPU native-host tests
        env:
          SPUR_TEST_BM_NODES: ${{ env.BM_NODES }}
          SPUR_TEST_BM_SSH_USER: ${{ secrets.BM_SSH_USER }}
          SPUR_TEST_BM_BINARIES_DIR: /tmp/release-binaries
          SPUR_TEST_BM_DEPLOY_DIR: trusted-repo/deploy/native-host
          SPUR_TEST_BM_REMOTE_DIR: ${{ env.BM_REMOTE_BASE }}/gpu-single-node
          SPUR_TEST_BM_GPU_VENV: /opt/spur-ci/gpu-venv
          RUST_LOG: info
        run: /tmp/e2e-bin/spur-k8s-tests native_host::gpu::single_node --ignored --test-threads=1
      - name: Run multi-node GPU native-host tests
        env:
          SPUR_TEST_BM_NODES: ${{ env.BM_NODES }}
          SPUR_TEST_BM_SSH_USER: ${{ secrets.BM_SSH_USER }}
          SPUR_TEST_BM_BINARIES_DIR: /tmp/release-binaries
          SPUR_TEST_BM_DEPLOY_DIR: trusted-repo/deploy/native-host
          SPUR_TEST_BM_REMOTE_DIR: ${{ env.BM_REMOTE_BASE }}/gpu-multi-node
          SPUR_TEST_BM_GPU_VENV: /opt/spur-ci/gpu-venv
          RUST_LOG: info
        run: /tmp/e2e-bin/spur-k8s-tests native_host::gpu::multi_node --ignored --test-threads=1
      # On failure, copies each suite's *.log files from every node into
      # /tmp/bm-logs/node-<index>/<suite>/. Directories are numbered rather
      # than named after the node; copy errors are ignored.
      - name: Collect cluster logs
        if: failure()
        run: |
          set -euo pipefail
          mkdir -p /tmp/bm-logs
          IFS=',' read -ra NODES <<< "$BM_NODES"
          idx=0
          for node in "${NODES[@]}"; do
            node="${node#"${node%%[![:space:]]*}"}"
            node="${node%"${node##*[![:space:]]}"}"
            [ -n "$node" ] || continue
            node_dir="/tmp/bm-logs/node-${idx}"
            idx=$((idx + 1))
            mkdir -p "$node_dir"
            for step in single-node multi-node gpu-single-node gpu-multi-node; do
              remote="${BM_REMOTE_BASE}/${step}/log"
              local_step="${node_dir}/${step}"
              mkdir -p "$local_step"
              scp -o StrictHostKeyChecking=accept-new -o BatchMode=yes \
                "${BM_SSH_USER}@${node}:${remote}/*.log" "$local_step/" 2>/dev/null || true
            done
          done
          echo "Collected logs:"
          find /tmp/bm-logs -type f -name '*.log' | head -20
      - name: Scrub cluster logs
        id: scrub-logs
        if: failure()
        run: |
          chmod +x trusted-repo/scripts/scrub-ci-logs.sh
          trusted-repo/scripts/scrub-ci-logs.sh /tmp/bm-logs \
            --hostnames "$BM_NODES" \
            --ssh-user "$BM_SSH_USER"
      # Logs are uploaded only when the scrub step itself succeeded.
      - name: Upload cluster logs
        if: failure() && steps.scrub-logs.outcome == 'success'
        uses: actions/upload-artifact@v7
        with:
          name: native-host-logs
          path: /tmp/bm-logs/
          retention-days: 3
          if-no-files-found: ignore
      # `pkill -f` matches against full command lines, and the remote shell
      # that runs this command string carries the patterns in its own command
      # line. With plain patterns the first pkill terminates that shell, so
      # the second pkill never runs. The bracket form `[s]pur...` still
      # matches the daemons but not the literal text of this command.
      - name: Cleanup remote processes
        if: always()
        run: |
          IFS=',' read -ra NODES <<< "$BM_NODES"
          for node in "${NODES[@]}"; do
            ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes \
              "${BM_SSH_USER}@${node}" \
              "pkill -f '[s]purctld' 2>/dev/null; pkill -f '[s]purd' 2>/dev/null" || true
          done
      - name: Cleanup remote dirs
        if: always()
        run: |
          IFS=',' read -ra NODES <<< "$BM_NODES"
          for node in "${NODES[@]}"; do
            ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes \
              "${BM_SSH_USER}@${node}" "rm -rf '${BM_REMOTE_BASE}'" || true
          done
      # Unloads every loaded profile whose name starts with "spur-ci-".
      # NOTE(review): the remote command uses a `<<<` here-string, so it
      # presumably needs bash as the remote user's shell; verify.
      - name: Cleanup AppArmor profiles
        if: always()
        run: |
          IFS=',' read -ra NODES <<< "$BM_NODES"
          for node in "${NODES[@]}"; do
            ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes "${BM_SSH_USER}@${node}" "
              sudo aa-status 2>/dev/null | grep -o 'spur-ci-[^ ]*' | sort -u | while read -r prof; do
                sudo apparmor_parser -R /dev/stdin 2>/dev/null <<< \"profile \$prof /dev/null {}\" || true
              done
            " || true
          done
      # Deletes everything from the marker line to the end of /etc/hosts.
      - name: Cleanup injected DNS
        if: always() && steps.dns-check.outputs.needed == 'true'
        run: |
          IFS=',' read -ra NODES <<< "$BM_NODES"
          for node in "${NODES[@]}"; do
            ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes \
              "${BM_SSH_USER}@${node}" \
              "sudo sed -i '/## SPUR_CI_DYNAMIC_NODES/,\$d' /etc/hosts 2>/dev/null" || true
          done
      - name: Kill SSH agent
        if: always()
        run: ssh-agent -k || true
      # Any job status other than 'success' is reported as 'failure'.
      - name: Report status
        if: always()
        uses: actions/github-script@v9
        with:
          script: |
            await github.rest.repos.createCommitStatus({
              owner: context.repo.owner, repo: context.repo.repo,
              sha: '${{ github.event.workflow_run.head_sha }}',
              state: '${{ job.status }}' === 'success' ? 'success' : 'failure',
              target_url: '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}',
              context: '${{ env.STATUS_CONTEXT }}',
              description: '${{ job.status }}'
            });
# Job: push-image. After a successful CI run, downloads the `spur-image`
# artifact from that run, loads the image tarball, retags `spur:ci` as
# ghcr.io/<lower-cased repository>:<CI head SHA>, and pushes it to GHCR.
  push-image:
    name: Push Image to GHCR
    runs-on: ubuntu-latest
    if: github.event.workflow_run.conclusion == 'success'
    # Least privilege: this job only reads another run's artifacts and writes
    # a package, so it does not inherit the workflow-level status/contents
    # scopes.
    permissions:
      actions: read
      packages: write
    steps:
      # GHCR references must be lower-case, hence the toLowerCase() call.
      - name: Compute GHCR image reference
        id: image
        uses: actions/github-script@v9
        with:
          result-encoding: string
          script: |
            const sha = '${{ github.event.workflow_run.head_sha }}';
            const repo = '${{ github.repository }}'.toLowerCase();
            return `ghcr.io/${repo}:${sha}`;
      - name: Download image artifact
        uses: actions/download-artifact@v8
        with:
          name: spur-image
          path: /tmp
          run-id: ${{ github.event.workflow_run.id }}
          github-token: ${{ secrets.GITHUB_TOKEN }}
      - name: Log in to GHCR
        uses: docker/login-action@v4
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Load, retag, and push
        env:
          IMAGE_TAG: ${{ steps.image.outputs.result }}
        run: |
          docker load -i /tmp/spur-image.tar
          docker tag spur:ci "$IMAGE_TAG"
          docker push "$IMAGE_TAG"
# Job: k8s. Runs on a self-hosted runner after push-image. It creates a
# per-run namespace (spur-ci-<CI run id>) with a GHCR pull secret, applies the
# RBAC manifest from the default branch, checks that the cluster can pull the
# CI image, runs the single-node and multi-node K8s test suites, and cleans
# up. A commit status is posted on the CI run's head commit at the start
# (pending) and at the end (success/failure).
#
# NOTE(review): this job has no `if:` of its own. When push-image fails or is
# skipped, this job is skipped as well, so neither the pending nor the final
# "E2E / K8s" status is posted by this job; confirm that is acceptable for
# branch protection.
  k8s:
    name: K8s
    runs-on: [self-hosted, amd-aac02-rocm]
    needs: push-image
    concurrency:
      group: k8s-integration
      queue: max
    env:
      # Read by the status steps below and scanned for by the mark-skipped job;
      # keep the value quoted and on a line of its own.
      STATUS_CONTEXT: "E2E / K8s"
      KUBECONFIG: /home/amd/.kube/config
      SPUR_TEST_NS: spur-ci-${{ github.event.workflow_run.id }}
      RUST_LOG: info
    steps:
      # Same computation as in the push-image job: lower-cased repository
      # plus the CI run's head SHA as the tag.
      - name: Compute GHCR image reference
        id: image
        uses: actions/github-script@v9
        with:
          result-encoding: string
          script: |
            const sha = '${{ github.event.workflow_run.head_sha }}';
            const repo = '${{ github.repository }}'.toLowerCase();
            return `ghcr.io/${repo}:${sha}`;
      # The step output is handed to the shell through `env` instead of being
      # expanded into the script text.
      - name: Set SPUR_CI_IMAGE
        env:
          IMAGE_REF: ${{ steps.image.outputs.result }}
        run: |
          echo "SPUR_CI_IMAGE=${IMAGE_REF}" >> "$GITHUB_ENV"
          echo "SPUR_CI_IMAGE=${IMAGE_REF}"
      - name: Set pending status
        uses: actions/github-script@v9
        with:
          script: |
            await github.rest.repos.createCommitStatus({
              owner: context.repo.owner, repo: context.repo.repo,
              sha: '${{ github.event.workflow_run.head_sha }}',
              state: 'pending',
              target_url: '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}',
              context: '${{ env.STATUS_CONTEXT }}'
            });
      - name: Download K8s test binary
        uses: actions/download-artifact@v8
        with:
          name: k8s-test-binary
          path: /tmp/k8s-bin
          run-id: ${{ github.event.workflow_run.id }}
          github-token: ${{ secrets.GITHUB_TOKEN }}
      - name: Checkout K8s manifests from main
        uses: actions/checkout@v6
        with:
          persist-credentials: false
          sparse-checkout: deploy/k8s
          sparse-checkout-cone-mode: false
          path: trusted-repo
      - name: Select kubectl context
        run: kubectl config use-context kubernetes-admin@kubernetes
      # Deletes every namespace whose name starts with "spur-ci-" and polls
      # every 5 seconds, for up to 120 seconds, until none remain.
      - name: Remove leftover spur-ci namespaces
        run: |
          set -euo pipefail
          leftover=$(kubectl get ns -o jsonpath='{.items[*].metadata.name}' \
            | tr ' ' '\n' | grep '^spur-ci-' || true)
          if [ -z "$leftover" ]; then
            echo "No leftover spur-ci namespaces"
            exit 0
          fi
          echo "Deleting leftover spur-ci namespaces:"
          echo "$leftover"
          for ns in $leftover; do
            kubectl delete ns "$ns" --ignore-not-found --wait=false
          done
          deadline=$((SECONDS + 120))
          while [ "$SECONDS" -lt "$deadline" ]; do
            remaining=$(kubectl get ns -o jsonpath='{.items[*].metadata.name}' \
              | tr ' ' '\n' | grep '^spur-ci-' || true)
            if [ -z "$remaining" ]; then
              echo "All spur-ci namespaces removed"
              exit 0
            fi
            echo "Waiting for namespaces to terminate: $remaining"
            sleep 5
          done
          echo "ERROR: timed out after 2m waiting for spur-ci namespace deletion:"
          # Intentionally unquoted: each remaining namespace is one argument.
          kubectl get ns -o wide $(echo "$remaining" | tr '\n' ' ')
          exit 1
      - name: Clean cluster-scoped Spur RBAC (pre)
        run: |
          set -euo pipefail
          kubectl delete clusterrolebinding spur-operator --ignore-not-found
          kubectl delete clusterrole spur-operator --ignore-not-found
      - name: Create test namespace
        run: |
          kubectl create namespace "$SPUR_TEST_NS" \
            --dry-run=client -o yaml | kubectl apply -f -
      # Credentials reach the shell as environment variables rather than being
      # expanded into the script source.
      - name: Create registry pull secret
        env:
          REGISTRY_USER: ${{ github.actor }}
          REGISTRY_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          set -euo pipefail
          kubectl create secret docker-registry regcred \
            --docker-server=ghcr.io \
            --docker-username="$REGISTRY_USER" \
            --docker-password="$REGISTRY_TOKEN" \
            -n "$SPUR_TEST_NS" \
            --dry-run=client -o yaml | kubectl apply -f -
      - name: Attach pull secret to default ServiceAccount
        run: |
          kubectl patch serviceaccount default -n "$SPUR_TEST_NS" \
            -p '{"imagePullSecrets": [{"name": "regcred"}]}'
      # Rewrites "namespace: spur" in the manifest to the per-run namespace
      # before applying it.
      - name: Apply RBAC
        run: |
          set -euo pipefail
          sed "s/namespace: spur/namespace: $SPUR_TEST_NS/g" \
            trusted-repo/deploy/k8s/rbac.yaml | kubectl apply -f -
      # Starts a throwaway pod from the CI image and waits up to 120s for it
      # to become Ready; on failure prints the pod description and the last
      # 30 events before failing the step.
      - name: Verify cluster can pull CI image
        run: |
          set -euo pipefail
          kubectl run image-pull-check \
            --namespace="$SPUR_TEST_NS" \
            --image="$SPUR_CI_IMAGE" \
            --restart=Never \
            --command -- sleep 30
          if ! kubectl wait --namespace="$SPUR_TEST_NS" \
              --for=condition=Ready pod/image-pull-check --timeout=120s; then
            echo "ERROR: cluster failed to pull $SPUR_CI_IMAGE"
            kubectl describe pod image-pull-check -n "$SPUR_TEST_NS" || true
            kubectl get events -n "$SPUR_TEST_NS" --sort-by='.lastTimestamp' | tail -30 || true
            exit 1
          fi
          kubectl delete pod image-pull-check -n "$SPUR_TEST_NS" --ignore-not-found --wait=true
      - name: Run single-node K8s tests
        env:
          SPUR_DEPLOY_DIR: trusted-repo/deploy/k8s
        run: |
          chmod +x /tmp/k8s-bin/spur-k8s-tests
          /tmp/k8s-bin/spur-k8s-tests k8s::single_node --ignored --test-threads=1
      - name: Run multi-node K8s tests
        env:
          SPUR_DEPLOY_DIR: trusted-repo/deploy/k8s
        run: |
          /tmp/k8s-bin/spur-k8s-tests k8s::multi_node --ignored --test-threads=1
      # Every delete is attempted even if an earlier one fails. Namespace and
      # CRD deletion failures are tolerated; a failed RBAC deletion still
      # fails the step, but only after the remaining cleanup has run.
      - name: Cleanup test resources
        if: always()
        run: |
          set -uo pipefail
          rc=0
          kubectl delete namespace "$SPUR_TEST_NS" --ignore-not-found --timeout=120s || true
          kubectl delete clusterrolebinding spur-operator --ignore-not-found || rc=1
          kubectl delete clusterrole spur-operator --ignore-not-found || rc=1
          kubectl delete crd spurjobs.spur.ai --ignore-not-found --timeout=120s || true
          exit "$rc"
      # Any job status other than 'success' is reported as 'failure'.
      - name: Report status
        if: always()
        uses: actions/github-script@v9
        with:
          script: |
            await github.rest.repos.createCommitStatus({
              owner: context.repo.owner, repo: context.repo.repo,
              sha: '${{ github.event.workflow_run.head_sha }}',
              state: '${{ job.status }}' === 'success' ? 'success' : 'failure',
              target_url: '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}',
              context: '${{ env.STATUS_CONTEXT }}',
              description: '${{ job.status }}'
            });