Skip to content

fix(exec): preserve UTF-8 codepoints split across attach chunks #417

fix(exec): preserve UTF-8 codepoints split across attach chunks

fix(exec): preserve UTF-8 codepoints split across attach chunks #417

Workflow file for this run

# Run VM-based E2E integration tests on a persistent AWS EC2 runner.
#
# GitHub-hosted runners do not support nested virtualization / KVM.
# This workflow starts a pre-configured EC2 instance, runs integration tests,
# then stops it. The instance is never terminated — build caches persist.
#
# Architecture:
#   start-runner (ubuntu-latest) → e2e-tests (self-hosted) → stop-runner (ubuntu-latest)
#   Start stopped EC2              Build + run tests         Stop EC2
#   Wait for runner online         50-min timeout            if: always()
#
# Concurrency: only ONE e2e run at a time (single persistent instance).
# Queued runs wait; in-progress runs are cancelled by newer pushes.
#
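# Cost: ~$0.17/hr (quoted for c8i.xlarge) only while running + ~$4/mo EBS storage.
# NOTE(review): EC2_INSTANCE_TYPE below is c8i.4xlarge, so these cost figures
# look stale — confirm against current pricing.
# Subsequent runs skip setup/compilation (cached) → ~5-10 min → ~$0.03/run.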
#
# Authentication:
# AWS: GitHub OIDC → AWS STS (no stored AWS credentials)
# GitHub: GitHub App → short-lived installation token (no PAT)
#
# Setup: ./scripts/ci/setup-ci-runner.sh
name: E2E Tests

# Triggers:
#   push              — to main, when code or this workflow file changes.
#   pull_request      — on new commits or label changes; the should-run job
#                       additionally requires the 'e2e-test' label.
#   workflow_dispatch — manual run, optionally with a tmate debug session.
on:
  push:
    branches: [main]
    paths:
      - 'src/boxlite/**'
      - 'src/shared/**'
      - 'src/cli/**'
      - 'src/guest/**'
      - 'sdks/**'
      - '**/Cargo.toml'
      - 'Cargo.lock'
      - '.github/workflows/e2e-test.yml'
  pull_request:
    types: [labeled, synchronize]
    branches: [main]
    paths:
      - 'src/boxlite/**'
      - 'src/shared/**'
      - 'src/cli/**'
      - 'src/guest/**'
      - 'sdks/**'
      - '**/Cargo.toml'
      - 'Cargo.lock'
  workflow_dispatch:
    inputs:
      debug:
        description: 'Open SSH session on failure (via tmate)'
        type: boolean
        default: false

# Single global concurrency group — only one e2e run at a time.
# The persistent instance can only serve one job. Newer pushes cancel older runs.
#
# Workflow-level concurrency is applied before any job (including the
# should-run gate) is evaluated. With a constant group name, a pull_request
# run that the gate is going to skip (no 'e2e-test' label) would still cancel
# an in-progress real run. Such runs are therefore placed in a unique per-run
# group so they never displace a run that actually uses the instance.
concurrency:
  group: ${{ (github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'e2e-test')) && 'e2e-runner' || format('e2e-gated-out-{0}', github.run_id) }}
  cancel-in-progress: true

# Shared settings for the start-runner / stop-runner jobs.
env:
  AWS_REGION: us-east-1
  AWS_ROLE_ARN: arn:aws:iam::${{ vars.AWS_ACCOUNT_ID }}:role/boxlite-e2e-github-actions
  # Preferred instance to start; when empty, start-runner discovers it by tag.
  EC2_INSTANCE_ID: ${{ vars.EC2_E2E_INSTANCE_ID }}
  # The settings below are only used when a new instance has to be created.
  EC2_INSTANCE_TYPE: c8i.4xlarge
  EC2_AMI_ID: ami-05cf1e9f73fbad2e2
  # Comma-separated list; each subnet is tried in order until a launch succeeds.
  EC2_SUBNET_IDS: ${{ vars.AWS_SUBNET_IDS || vars.AWS_SUBNET_ID }}
  EC2_SECURITY_GROUP_ID: ${{ vars.AWS_SECURITY_GROUP_ID }}
  RUNNER_VERSION: '2.334.0'
  RUNNER_LABEL: boxlite-e2e
jobs:
# =========================================================================
# Gate: PRs require 'e2e-test' label (only maintainers can add it)
# =========================================================================
# Decides whether the rest of the workflow runs. Output `run` is 'true' for
# every non-pull_request event, and for pull_request events only when the PR
# carries the 'e2e-test' label.
should-run:
  runs-on: ubuntu-latest
  # This job only inspects the event payload; it needs no token permissions.
  permissions: {}
  outputs:
    run: ${{ steps.check.outputs.run }}
  steps:
    - name: Check trigger conditions
      id: check
      # Event data is handed to the script through environment variables
      # rather than being interpolated into the script text: a label name
      # containing a quote would otherwise escape the shell quoting and run
      # as code.
      env:
        EVENT_NAME: ${{ github.event_name }}
        PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
      run: |
        if [ "$EVENT_NAME" = "pull_request" ]; then
          # `.[]?` yields nothing for a null/non-array payload, so a missing
          # label list evaluates to false instead of erroring.
          if printf '%s' "$PR_LABELS" | jq -e 'any(.[]?; . == "e2e-test")' > /dev/null 2>&1; then
            echo "run=true" >> "$GITHUB_OUTPUT"
          else
            echo "run=false" >> "$GITHUB_OUTPUT"
          fi
        else
          echo "run=true" >> "$GITHUB_OUTPUT"
        fi
# =========================================================================
# Job 1: Start the persistent EC2 instance and wait for runner
# =========================================================================
# Brings the persistent EC2 runner up (starting it, or creating it if it no
# longer exists) and blocks until a self-hosted runner with RUNNER_LABEL is
# reported online. Output `instance-id` is consumed by stop-runner.
start-runner:
  name: Start E2E Runner
  needs: should-run
  if: needs.should-run.outputs.run == 'true'
  runs-on: ubuntu-latest
  permissions:
    id-token: write
    contents: read
  outputs:
    instance-id: ${{ steps.ensure-instance.outputs.instance-id }}
  steps:
    - name: Generate GitHub App token
      id: app-token
      uses: actions/create-github-app-token@v1
      with:
        app-id: ${{ vars.GH_APP_ID }}
        private-key: ${{ secrets.GH_APP_PRIVATE_KEY }}

    - name: Configure AWS credentials (OIDC)
      uses: aws-actions/configure-aws-credentials@v4
      with:
        role-to-assume: ${{ env.AWS_ROLE_ARN }}
        aws-region: ${{ env.AWS_REGION }}
        role-session-name: e2e-start-${{ github.run_id }}

    - name: Start or create EC2 instance
      id: ensure-instance
      env:
        # Used by curl (registration token) and by `gh variable set`.
        GH_TOKEN: ${{ steps.app-token.outputs.token }}
      run: |
        INSTANCE_ID="${{ env.EC2_INSTANCE_ID }}"

        # Start the instance and block until EC2 reports it running.
        start_and_wait() {
          aws ec2 start-instances --instance-ids "$INSTANCE_ID"
          aws ec2 wait instance-running --instance-ids "$INSTANCE_ID"
        }

        # Fallback: discover by tag if variable is empty
        if [ -z "$INSTANCE_ID" ]; then
          INSTANCE_ID=$(aws ec2 describe-instances \
            --filters "Name=tag:Name,Values=boxlite-e2e" "Name=instance-state-name,Values=running,stopped,stopping,pending" \
            --query "Reservations[0].Instances[0].InstanceId" --output text 2>/dev/null || echo "")
          [ "$INSTANCE_ID" = "None" ] && INSTANCE_ID=""
        fi

        # Check if instance exists and get its state
        if [ -n "$INSTANCE_ID" ]; then
          STATE=$(aws ec2 describe-instances \
            --instance-ids "$INSTANCE_ID" \
            --query "Reservations[0].Instances[0].State.Name" --output text 2>/dev/null || echo "not-found")
        else
          STATE="not-found"
        fi

        if [ "$STATE" = "running" ]; then
          echo "Instance already running"
          echo "instance-id=$INSTANCE_ID" >> "$GITHUB_OUTPUT"
        elif [ "$STATE" = "stopped" ]; then
          echo "Starting instance..."
          start_and_wait
          echo "Instance started"
          echo "instance-id=$INSTANCE_ID" >> "$GITHUB_OUTPUT"
        elif [ "$STATE" = "stopping" ] || [ "$STATE" = "pending" ]; then
          echo "Instance in transitional state ($STATE), waiting..."
          # Wait for the state this transition leads to. A waiter failure is
          # tolerated here because the state is re-read and validated below.
          if [ "$STATE" = "stopping" ]; then
            aws ec2 wait instance-stopped --instance-ids "$INSTANCE_ID" || true
          else
            aws ec2 wait instance-running --instance-ids "$INSTANCE_ID" || true
          fi
          # Re-check and start if stopped
          STATE=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" \
            --query "Reservations[0].Instances[0].State.Name" --output text)
          if [ "$STATE" = "stopped" ]; then
            start_and_wait
          elif [ "$STATE" != "running" ]; then
            echo "::error::Instance stuck in state: $STATE"
            exit 1
          fi
          echo "instance-id=$INSTANCE_ID" >> "$GITHUB_OUTPUT"
        else
          echo "Instance not found or terminated. Creating new one..."

          # Generate runner registration token for user-data.
          # Fail before launching anything if the token cannot be obtained:
          # without it the new instance would boot but never register, and
          # the job would only notice when the runner-online wait times out.
          REG_TOKEN=$(curl -sf -X POST \
            -H "Authorization: token ${GH_TOKEN}" \
            -H "Accept: application/vnd.github+json" \
            "https://api.github.com/repos/${{ github.repository }}/actions/runners/registration-token" \
            | jq -r '.token // empty') || REG_TOKEN=""
          if [ -z "$REG_TOKEN" ]; then
            echo "::error::Could not obtain a runner registration token; not launching an instance."
            exit 1
          fi

          # Build user-data. The heredoc is quoted, so nothing is expanded
          # here; the __PLACEHOLDER__ markers are filled in by sed below.
          cat > /tmp/user-data.sh << 'USERDATA'
        #!/bin/bash
        set -euo pipefail
        export HOME=/root
        modprobe kvm_intel 2>/dev/null || modprobe kvm_amd 2>/dev/null || true
        chmod 666 /dev/kvm 2>/dev/null || true
        usermod -aG kvm root 2>/dev/null || true
        # User-data only installs the actions-runner. Build deps
        # (apt + rustup + cargo-nextest) are installed by the
        # e2e-tests job's "Install build dependencies" step so they
        # stream to the GitHub Actions log and don't race the
        # runner-online poll.
        apt-get update -qq && apt-get install -y -qq curl jq tar
        RUNNER_VERSION="__RUNNER_VERSION__"
        mkdir -p /opt/actions-runner && cd /opt/actions-runner
        if [ ! -f ./config.sh ]; then
          curl -fsSL "https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz" | tar xz
        fi
        RUNNER_ALLOW_RUNASROOT=1 ./config.sh \
          --url "https://github.com/__REPO__" \
          --token "__REG_TOKEN__" \
          --name "boxlite-e2e" \
          --labels "self-hosted,linux,x64,kvm,boxlite-e2e" \
          --unattended --disableupdate --replace || true
        ./svc.sh install root || true
        ./svc.sh start || true
        USERDATA
          sed -i "s|__REPO__|${{ github.repository }}|g" /tmp/user-data.sh
          sed -i "s|__RUNNER_VERSION__|${{ env.RUNNER_VERSION }}|g" /tmp/user-data.sh
          sed -i "s|__REG_TOKEN__|${REG_TOKEN}|g" /tmp/user-data.sh

          # Launch instance — try each subnet (AZ) until one has capacity.
          # NOTE(review): a recreated instance gets a 50 GB root volume;
          # confirm this is still large enough for the build caches the
          # e2e-tests job keeps on it.
          INSTANCE_ID=""
          IFS=',' read -ra SUBNET_LIST <<< "${{ env.EC2_SUBNET_IDS }}"
          for SUBNET in "${SUBNET_LIST[@]}"; do
            # xargs trims surrounding whitespace from the list entry.
            SUBNET=$(echo "$SUBNET" | xargs)
            echo "Attempting launch in subnet $SUBNET..."
            if RESULT=$(aws ec2 run-instances \
              --instance-type "${{ env.EC2_INSTANCE_TYPE }}" \
              --image-id "${{ env.EC2_AMI_ID }}" \
              --subnet-id "$SUBNET" \
              --security-group-ids "${{ env.EC2_SECURITY_GROUP_ID }}" \
              --iam-instance-profile Name=boxlite-e2e-runner \
              --cpu-options "NestedVirtualization=enabled" \
              --user-data file:///tmp/user-data.sh \
              --block-device-mappings '[{"DeviceName":"/dev/sda1","Ebs":{"VolumeSize":50,"VolumeType":"gp3","DeleteOnTermination":false}}]' \
              --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=boxlite-e2e},{Key=Purpose,Value=boxlite-e2e}]" \
              --metadata-options "HttpTokens=required,HttpEndpoint=enabled" \
              --query "Instances[0].InstanceId" --output text 2>&1); then
              INSTANCE_ID="$RESULT"
              echo "Launched $INSTANCE_ID in subnet $SUBNET"
              break
            else
              echo "::warning::Subnet $SUBNET unavailable: $RESULT"
            fi
          done

          if [ -z "$INSTANCE_ID" ]; then
            echo "::error::Failed to launch instance in any availability zone"
            exit 1
          fi

          echo "Created instance: $INSTANCE_ID"
          aws ec2 wait instance-running --instance-ids "$INSTANCE_ID"
          echo "instance-id=$INSTANCE_ID" >> "$GITHUB_OUTPUT"

          # Try to save instance ID for faster lookup on the next run.
          # Best-effort: tag-based fallback (Name=boxlite-e2e) handles rediscovery
          # if this fails (e.g., the auth token lacks variables:write).
          if ! gh variable set EC2_E2E_INSTANCE_ID --body "$INSTANCE_ID" -R "${{ github.repository }}"; then
            echo "::warning::Could not persist EC2_E2E_INSTANCE_ID. Future runs will rediscover by tag."
          fi
        fi

    - name: Wait for runner to come online
      env:
        GH_TOKEN: ${{ steps.app-token.outputs.token }}
        WANTED_LABEL: ${{ env.RUNNER_LABEL }}
      run: |
        TIMEOUT=180
        ELAPSED=0
        INTERVAL=10
        echo "Waiting for runner '${WANTED_LABEL}' to come online..."
        while [ "$ELAPSED" -lt "$TIMEOUT" ]; do
          # Succeeds as soon as ANY runner carrying the label is online.
          # Comparing the raw jq output to "online" would break whenever more
          # than one runner matches (e.g. a stale offline registration next
          # to the live one), because the output then spans several lines.
          # A failed request leaves jq without input, which also counts as
          # "not online yet".
          if curl -sf \
            -H "Authorization: token ${GH_TOKEN}" \
            -H "Accept: application/vnd.github+json" \
            "https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runners?per_page=100" \
            | jq -e --arg label "$WANTED_LABEL" \
              'any(.runners[]; .status == "online" and any(.labels[]; .name == $label))' \
              > /dev/null 2>&1; then
            echo "Runner is online"
            exit 0
          fi
          echo " Waiting... (${ELAPSED}s / ${TIMEOUT}s)"
          sleep "$INTERVAL"
          ELAPSED=$((ELAPSED + INTERVAL))
        done
        echo "::error::Runner did not come online within ${TIMEOUT}s"
        exit 1
# =========================================================================
# Job 2: Run integration tests on the persistent runner
# =========================================================================
# Runs the integration suite on the persistent self-hosted runner. Because
# the machine survives between runs, the job first tidies the workspace,
# rescues logs left by earlier runs, and checks disk headroom before testing.
e2e-tests:
  name: E2E Integration Tests
  needs: start-runner
  runs-on: [self-hosted, boxlite-e2e]
  timeout-minutes: 50
  env:
    CARGO_TERM_COLOR: always
    CARGO_INCREMENTAL: '0'
    HOME: /root
  steps:
    - name: Clean workspace (persistent runner)
      # Drop temp dirs from earlier runs and reset the existing checkout, if
      # there is one. Every command is best-effort.
      run: |
        rm -rf /tmp/boxlite-* 2>/dev/null || true
        if [ -d "$GITHUB_WORKSPACE/.git" ]; then
          git -C "$GITHUB_WORKSPACE" clean -ffd 2>/dev/null || true
          git -C "$GITHUB_WORKSPACE" checkout -- . 2>/dev/null || true
        fi

    - name: Stage prior-run logs for rescue
      # Logs from a run that died mid-step (e.g. Worker ENOSPC) remain under
      # /var/log/boxlite-ci/ on this runner. Every directory there that does
      # not belong to the current run is moved into a staging directory so
      # the following step can upload it as its own artifact.
      #
      # Directories are keyed "<run_id>-<run_attempt>": a rerun keeps
      # github.run_id and only bumps github.run_attempt, so keying on run_id
      # alone would make attempt N treat attempt N-1's directory as its own
      # and leave it unrescued.
      env:
        GH_RUN_ID: ${{ github.run_id }}
        GH_RUN_ATTEMPT: ${{ github.run_attempt }}
      run: |
        set -u
        LOG_ROOT=/var/log/boxlite-ci
        RESCUE_DIR=/tmp/boxlite-rescue
        THIS_RUN="${GH_RUN_ID}-${GH_RUN_ATTEMPT}"
        rm -rf "$RESCUE_DIR" && mkdir -p "$RESCUE_DIR"
        if [ -d "$LOG_ROOT" ]; then
          for entry in "$LOG_ROOT"/*; do
            if [ -d "$entry" ] && [ "$(basename "$entry")" != "$THIS_RUN" ]; then
              mv "$entry" "$RESCUE_DIR/"
            fi
          done
        fi

    - name: Upload rescued prior-run logs
      uses: actions/upload-artifact@v4
      with:
        name: e2e-test-logs-rescued-${{ github.run_id }}-${{ github.run_attempt }}
        path: /tmp/boxlite-rescue/
        retention-days: 7
        if-no-files-found: ignore

    - name: Pre-flight runner cleanup (runner internals only — never caches)
      # Deletes actions-runner diagnostic logs older than a day so they do
      # not pile up over the runner's lifetime. Build and image caches
      # (~/.boxlite/, ~/.cargo/, target/) are deliberately untouched — they
      # are the reason this runner is persistent.
      #
      # Background: in run 25726812554, _diag was the first write to hit
      # ENOSPC and so acted as the canary for a 50→500GB root-volume
      # migration, but the disk was actually filled by build artifacts under
      # _work/.../target/. This step is preventative housekeeping, not the
      # fix for that incident.
      run: |
        for pattern in 'Worker_*.log' 'Runner_*.log'; do
          find /opt/actions-runner/_diag -maxdepth 1 -type f -name "$pattern" -mtime +1 -delete 2>/dev/null || true
        done

    - name: Checkout code
      uses: actions/checkout@v5
      with:
        submodules: recursive
        clean: false

    - name: Verify KVM access
      # /dev/kvm must be a character device that is both readable and
      # writable by the runner user.
      run: |
        if [ -c /dev/kvm ] && [ -r /dev/kvm ] && [ -w /dev/kvm ]; then
          echo "/dev/kvm is accessible"
        else
          echo "::error::/dev/kvm not accessible"
          exit 1
        fi

    - name: Install build dependencies (make setup)
      run: |
        # Fresh instance: ~5–10 min (apt + rustup + cargo-nextest; CI=true
        # so prek install is skipped). Later runs reuse the persistent EBS
        # volume and take the idempotent fast path (~10–30 s).
        bash scripts/setup/setup-ubuntu.sh

    - name: Disk-pressure cache eviction
      # Successor to the earlier hard-fail precheck (run 25726812554
      # post-mortem). When root usage is at or above THRESHOLD_PCT, the e2e
      # caches are removed in place and usage is measured again; the step
      # fails only if usage is still over the threshold, or if free space is
      # below MIN_FREE_GB. Caches re-warm on the next run. Housekeeping of
      # runner internals is handled separately by the "Pre-flight runner
      # cleanup" step.
      #
      # The parsed df values are validated before any comparison, so a
      # change in df's output format raises ::error:: rather than passing
      # as "disk OK".
      env:
        THRESHOLD_PCT: '80'
        MIN_FREE_GB: '20'
      run: |
        set -euo pipefail

        # Refresh FREE_GB and USED_PCT for /; abort the step if either value
        # is not a plain integer after stripping non-digits.
        read_df() {
          DF_AVAIL=$(df --output=avail -BG / | tail -n 1)
          DF_PCENT=$(df --output=pcent / | tail -n 1)
          FREE_GB=$(printf '%s' "$DF_AVAIL" | tr -dc '0-9')
          USED_PCT=$(printf '%s' "$DF_PCENT" | tr -dc '0-9')
          if [[ "$FREE_GB" =~ ^[0-9]+$ && "$USED_PCT" =~ ^[0-9]+$ ]]; then
            return 0
          fi
          echo "::error::Could not parse df output (FREE_GB='${FREE_GB}' USED_PCT='${USED_PCT}') — df contract may have changed."
          echo "Raw df --output=avail -BG / tail: ${DF_AVAIL}"
          echo "Raw df --output=pcent / tail: ${DF_PCENT}"
          exit 1
        }

        # Remove one cache path if it exists, logging its size first.
        evict() {
          local victim=$1 size
          [ -e "$victim" ] || return 0
          size=$(du -sh "$victim" 2>/dev/null | cut -f1 || echo "?")
          echo " clearing $victim (was: $size)"
          rm -rf "$victim" 2>/dev/null || true
        }

        read_df
        echo "Root volume: ${FREE_GB} GB free (${USED_PCT}% used)"

        if [ "${USED_PCT}" -ge "${THRESHOLD_PCT}" ]; then
          echo "── Disk >=${THRESHOLD_PCT}% used — evicting e2e caches ──"
          for pattern in target/boxlite-test /tmp/boxlite-* "$HOME/.boxlite"; do
            # Intentionally unquoted: each entry is expanded once more here.
            for hit in $pattern; do
              evict "$hit"
            done
          done
          read_df
          echo "Post-eviction: ${FREE_GB} GB free (${USED_PCT}% used)"
          if [ "${USED_PCT}" -ge "${THRESHOLD_PCT}" ]; then
            echo "::error::Disk still ${USED_PCT}% used after clearing all e2e caches — manual cleanup needed."
            exit 1
          fi
        fi

        if [ "${FREE_GB}" -lt "${MIN_FREE_GB}" ]; then
          echo "::error::Only ${FREE_GB} GB free on / (floor=${MIN_FREE_GB}) — refusing to run integration tests."
          exit 1
        fi

    - name: Run integration tests
      env:
        GH_RUN_ID: ${{ github.run_id }}
        GH_RUN_ATTEMPT: ${{ github.run_attempt }}
      run: |
        # pipefail makes a failing `make` fail the step despite the tee.
        set -o pipefail
        source "$HOME/.cargo/env" 2>/dev/null || true
        export PATH="/usr/local/go/bin:$HOME/go/bin:$PATH"
        # Output is tee'd to the persistent volume so it survives the death
        # of the runner Worker; a later run's "Stage prior-run logs" step
        # rescues and uploads this directory.
        LOG_DIR="/var/log/boxlite-ci/${GH_RUN_ID}-${GH_RUN_ATTEMPT}"
        mkdir -p "$LOG_DIR"
        make test:integration 2>&1 | tee "${LOG_DIR}/integration.log"

    - name: Upload test logs on failure or cancellation
      # Logs are wanted for every non-success outcome. `failure()` on its
      # own misses cancelled jobs (concurrency cancellation, host reboot,
      # timeout — all surface as `cancelled()`, not failure).
      if: failure() || cancelled()
      uses: actions/upload-artifact@v4
      with:
        name: e2e-test-logs-${{ github.run_id }}-${{ github.run_attempt }}
        path: |
          target/nextest/
          /tmp/boxlite-*/
          !/tmp/boxlite-rescue/
          /var/log/boxlite-ci/${{ github.run_id }}-${{ github.run_attempt }}/
        retention-days: 7
        if-no-files-found: ignore

    - name: Debug via SSH on failure
      # Only when the run was dispatched manually with debug enabled.
      if: failure() && inputs.debug
      uses: mxschmitt/action-tmate@v3
      with:
        limit-access-to-actor: true
      timeout-minutes: 30
# =========================================================================
# Job 3: Stop the instance (always runs)
# =========================================================================
# Stops (never terminates) the EC2 instance once the tests are done. Runs
# even when earlier jobs failed or were cancelled, as long as start-runner
# reported an instance id.
stop-runner:
  name: Stop E2E Runner
  needs:
    - start-runner
    - e2e-tests
  if: always() && needs.start-runner.outputs.instance-id != ''
  runs-on: ubuntu-latest
  permissions:
    id-token: write
    contents: read
  steps:
    - name: Configure AWS credentials (OIDC)
      uses: aws-actions/configure-aws-credentials@v4
      with:
        role-to-assume: ${{ env.AWS_ROLE_ARN }}
        aws-region: ${{ env.AWS_REGION }}
        role-session-name: e2e-stop-${{ github.run_id }}

    - name: Stop EC2 instance
      # Skipped only when the tests failed AND debug mode was requested.
      if: ${{ needs.e2e-tests.result != 'failure' || !inputs.debug }}
      env:
        INSTANCE_ID: ${{ needs.start-runner.outputs.instance-id }}
      run: |
        echo "Stopping instance ${INSTANCE_ID}..."
        # A failed stop is reported as a warning; it does not fail the job.
        if aws ec2 stop-instances --instance-ids "$INSTANCE_ID"; then
          echo "Instance stopping"
        else
          echo "::warning::Failed to stop instance"
        fi

    - name: Skip stop (debug mode — tests failed with SSH active)
      # NOTE(review): nothing in this workflow stops the instance on this
      # path, and this job only starts after e2e-tests (including its tmate
      # step) has finished. The auto-stop mentioned in the warning below is
      # presumably configured on the instance itself — confirm, otherwise
      # the instance keeps running.
      if: ${{ needs.e2e-tests.result == 'failure' && inputs.debug }}
      run: echo "::warning::Instance left running for SSH debug session. Will auto-stop after 30 min idle."