fix(exec): preserve UTF-8 codepoints split across attach chunks #416
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Run VM-based E2E integration tests on a persistent AWS EC2 runner. | |
| # | |
| # GitHub-hosted runners do not support nested virtualization / KVM. | |
| # This workflow starts a pre-configured EC2 instance, runs integration tests, | |
| # then stops it. The instance is never terminated — build caches persist. | |
| # | |
| # Architecture: | |
| # start-runner (ubuntu-latest) → e2e-tests (self-hosted) → stop-runner (ubuntu-latest) | |
| # Start stopped EC2 Build + run tests Stop EC2 | |
| # Wait for runner online 50-min timeout if: always() | |
| # | |
| # Concurrency: only ONE e2e run at a time (single persistent instance). | |
| # Queued runs wait; in-progress runs are cancelled by newer pushes. | |
| # | |
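| # Cost: billed only while running + ~$4/mo EBS storage. The original ~$0.17/hr estimate was for c8i.xlarge; EC2_INSTANCE_TYPE below is now c8i.4xlarge, so the hourly rate is higher (TODO: re-estimate). | |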
| # Subsequent runs skip setup/compilation (cached) → ~5-10 min → ~$0.03/run. | |
| # | |
| # Authentication: | |
| # AWS: GitHub OIDC → AWS STS (no stored AWS credentials) | |
| # GitHub: GitHub App → short-lived installation token (no PAT) | |
| # | |
| # Setup: ./scripts/ci/setup-ci-runner.sh | |
name: E2E Tests

# Triggers:
#   - push to main touching runtime/SDK sources, Cargo manifests, or this workflow
#   - pull requests against main (labeled / synchronize) touching the same sources;
#     whether a PR actually runs is decided by the `should-run` gate job
#   - manual dispatch, optionally with an SSH debug session on failure
on:
  push:
    branches:
      - main
    paths:
      - 'src/boxlite/**'
      - 'src/shared/**'
      - 'src/cli/**'
      - 'src/guest/**'
      - 'sdks/**'
      - '**/Cargo.toml'
      - 'Cargo.lock'
      - '.github/workflows/e2e-test.yml'
  pull_request:
    types:
      - labeled
      - synchronize
    branches:
      - main
    paths:
      - 'src/boxlite/**'
      - 'src/shared/**'
      - 'src/cli/**'
      - 'src/guest/**'
      - 'sdks/**'
      - '**/Cargo.toml'
      - 'Cargo.lock'
  workflow_dispatch:
    inputs:
      debug:
        description: 'Open SSH session on failure (via tmate)'
        type: boolean
        default: false
# Single global concurrency group — only one e2e run at a time.
# The persistent instance can only serve one job. Newer pushes cancel older runs.
#
# Concurrency applies to the whole workflow run, not just to the jobs that
# use the runner. A pull_request event without the 'e2e-test' label only
# executes the gate job and never touches the instance, so it is placed in a
# throwaway per-run group. Otherwise any unlabeled PR push (or an unrelated
# label being added) would cancel a genuine in-progress E2E run.
concurrency:
  group: ${{ (github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'e2e-test')) && format('e2e-skipped-{0}', github.run_id) || 'e2e-runner' }}
  cancel-in-progress: true
# Workflow-wide settings shared by all jobs.
env:
  # --- AWS account / auth ---
  AWS_REGION: "us-east-1"
  AWS_ROLE_ARN: "arn:aws:iam::${{ vars.AWS_ACCOUNT_ID }}:role/boxlite-e2e-github-actions"

  # --- EC2 instance ---
  # May be empty; the start-runner job then discovers the instance by tag.
  EC2_INSTANCE_ID: "${{ vars.EC2_E2E_INSTANCE_ID }}"
  EC2_INSTANCE_TYPE: "c8i.4xlarge"
  EC2_AMI_ID: "ami-05cf1e9f73fbad2e2"
  # Comma-separated list of subnets; falls back to the single-subnet variable.
  EC2_SUBNET_IDS: "${{ vars.AWS_SUBNET_IDS || vars.AWS_SUBNET_ID }}"
  EC2_SECURITY_GROUP_ID: "${{ vars.AWS_SECURITY_GROUP_ID }}"

  # --- GitHub Actions runner ---
  # Quoted so the version is always treated as a string.
  RUNNER_VERSION: "2.334.0"
  RUNNER_LABEL: "boxlite-e2e"
| jobs: | |
# =========================================================================
# Gate: PRs require 'e2e-test' label (only maintainers can add it)
# =========================================================================
# Output `run` is 'true' for every non-PR event, and for PR events only when
# the pull request carries the 'e2e-test' label.
should-run:
  runs-on: ubuntu-latest
  outputs:
    run: ${{ steps.check.outputs.run }}
  steps:
    - name: Check trigger conditions
      id: check
      # Event data reaches the script through environment variables instead
      # of being interpolated into the script text, so a label name that
      # contains quotes or shell metacharacters cannot alter the command.
      env:
        EVENT_NAME: ${{ github.event_name }}
        PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
      run: |
        run=true
        if [ "$EVENT_NAME" = "pull_request" ]; then
          # index() yields null when the label is absent; `jq -e` turns a
          # null/false result into a non-zero exit status.
          if ! printf '%s' "$PR_LABELS" | jq -e 'index("e2e-test")' > /dev/null 2>&1; then
            run=false
          fi
        fi
        echo "run=$run" >> "$GITHUB_OUTPUT"
# =========================================================================
# Job 1: Start the persistent EC2 instance and wait for runner
# =========================================================================
# Resolves the instance (repo variable first, then Name tag), starts it if
# stopped, or launches a replacement if none exists. Exposes the instance id
# as the `instance-id` job output, then polls the GitHub API until a runner
# carrying RUNNER_LABEL reports "online".
start-runner:
  name: Start E2E Runner
  needs: should-run
  if: needs.should-run.outputs.run == 'true'
  runs-on: ubuntu-latest
  permissions:
    id-token: write
    contents: read
  outputs:
    instance-id: ${{ steps.ensure-instance.outputs.instance-id }}
  steps:
    - name: Generate GitHub App token
      id: app-token
      uses: actions/create-github-app-token@v1
      with:
        app-id: ${{ vars.GH_APP_ID }}
        private-key: ${{ secrets.GH_APP_PRIVATE_KEY }}

    - name: Configure AWS credentials (OIDC)
      uses: aws-actions/configure-aws-credentials@v4
      with:
        role-to-assume: ${{ env.AWS_ROLE_ARN }}
        aws-region: ${{ env.AWS_REGION }}
        role-session-name: e2e-start-${{ github.run_id }}

    - name: Start or create EC2 instance
      id: ensure-instance
      env:
        # Used by `gh variable set` and by the registration-token request.
        GH_TOKEN: ${{ steps.app-token.outputs.token }}
      run: |
        INSTANCE_ID="${{ env.EC2_INSTANCE_ID }}"

        # Fallback: discover by tag if variable is empty
        if [ -z "$INSTANCE_ID" ]; then
          INSTANCE_ID=$(aws ec2 describe-instances \
            --filters "Name=tag:Name,Values=boxlite-e2e" "Name=instance-state-name,Values=running,stopped,stopping,pending" \
            --query "Reservations[0].Instances[0].InstanceId" --output text 2>/dev/null || echo "")
          # `--output text` prints the literal "None" when nothing matched.
          if [ "$INSTANCE_ID" = "None" ]; then
            INSTANCE_ID=""
          fi
        fi

        # Check if instance exists and get its state
        if [ -n "$INSTANCE_ID" ]; then
          STATE=$(aws ec2 describe-instances \
            --instance-ids "$INSTANCE_ID" \
            --query "Reservations[0].Instances[0].State.Name" --output text 2>/dev/null || echo "not-found")
        else
          STATE="not-found"
        fi

        if [ "$STATE" = "running" ]; then
          echo "Instance already running"
          echo "instance-id=$INSTANCE_ID" >> "$GITHUB_OUTPUT"
        elif [ "$STATE" = "stopped" ]; then
          echo "Starting instance..."
          aws ec2 start-instances --instance-ids "$INSTANCE_ID"
          aws ec2 wait instance-running --instance-ids "$INSTANCE_ID"
          echo "Instance started"
          echo "instance-id=$INSTANCE_ID" >> "$GITHUB_OUTPUT"
        elif [ "$STATE" = "stopping" ] || [ "$STATE" = "pending" ]; then
          echo "Instance in transitional state ($STATE), waiting..."
          # Whichever waiter matches the transition succeeds; failures are
          # tolerated because the state is re-read right after.
          aws ec2 wait instance-stopped --instance-ids "$INSTANCE_ID" 2>/dev/null || \
            aws ec2 wait instance-running --instance-ids "$INSTANCE_ID" 2>/dev/null || true
          # Re-check and start if stopped
          STATE=$(aws ec2 describe-instances --instance-ids "$INSTANCE_ID" \
            --query "Reservations[0].Instances[0].State.Name" --output text)
          if [ "$STATE" = "stopped" ]; then
            aws ec2 start-instances --instance-ids "$INSTANCE_ID"
            aws ec2 wait instance-running --instance-ids "$INSTANCE_ID"
          elif [ "$STATE" != "running" ]; then
            echo "::error::Instance stuck in state: $STATE"
            exit 1
          fi
          echo "instance-id=$INSTANCE_ID" >> "$GITHUB_OUTPUT"
        else
          echo "Instance not found or terminated. Creating new one..."

          # Generate runner registration token for user-data.
          # Validate it BEFORE launching: user-data runs config.sh with
          # "|| true", so an empty or invalid token would yield an instance
          # that boots but never registers a runner — and that instance
          # would then be picked up by the tag lookup on later runs.
          REG_TOKEN=$(curl -sf -X POST \
            -H "Authorization: token ${GH_TOKEN}" \
            -H "Accept: application/vnd.github+json" \
            "https://api.github.com/repos/${{ github.repository }}/actions/runners/registration-token" \
            | jq -r '.token // empty') || REG_TOKEN=""
          if [ -z "$REG_TOKEN" ]; then
            echo "::error::Could not obtain a runner registration token; refusing to launch an instance that cannot register."
            exit 1
          fi

          # Build user-data. The heredoc is quoted, so nothing is expanded
          # here; the __PLACEHOLDER__ values are substituted by sed below.
          cat > /tmp/user-data.sh << 'USERDATA'
        #!/bin/bash
        set -euo pipefail
        export HOME=/root
        modprobe kvm_intel 2>/dev/null || modprobe kvm_amd 2>/dev/null || true
        chmod 666 /dev/kvm 2>/dev/null || true
        usermod -aG kvm root 2>/dev/null || true
        # User-data only installs the actions-runner. Build deps
        # (apt + rustup + cargo-nextest) are installed by the
        # e2e-tests job's "Install build dependencies" step so they
        # stream to the GitHub Actions log and don't race the
        # runner-online poll.
        apt-get update -qq && apt-get install -y -qq curl jq tar
        RUNNER_VERSION="__RUNNER_VERSION__"
        mkdir -p /opt/actions-runner && cd /opt/actions-runner
        if [ ! -f ./config.sh ]; then
          curl -fsSL "https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz" | tar xz
        fi
        RUNNER_ALLOW_RUNASROOT=1 ./config.sh \
          --url "https://github.com/__REPO__" \
          --token "__REG_TOKEN__" \
          --name "boxlite-e2e" \
          --labels "self-hosted,linux,x64,kvm,boxlite-e2e" \
          --unattended --disableupdate --replace || true
        ./svc.sh install root || true
        ./svc.sh start || true
        USERDATA
          sed -i "s|__REPO__|${{ github.repository }}|g" /tmp/user-data.sh
          sed -i "s|__RUNNER_VERSION__|${{ env.RUNNER_VERSION }}|g" /tmp/user-data.sh
          sed -i "s|__REG_TOKEN__|${REG_TOKEN}|g" /tmp/user-data.sh

          # Launch instance — try each subnet (AZ) until one has capacity
          # NOTE(review): the root volume is created at 50 GB here; confirm
          # that still matches what the e2e job's disk checks expect.
          INSTANCE_ID=""
          IFS=',' read -ra SUBNET_LIST <<< "${{ env.EC2_SUBNET_IDS }}"
          for SUBNET in "${SUBNET_LIST[@]}"; do
            # xargs trims surrounding whitespace from each list entry.
            SUBNET=$(echo "$SUBNET" | xargs)
            echo "Attempting launch in subnet $SUBNET..."
            if RESULT=$(aws ec2 run-instances \
              --instance-type "${{ env.EC2_INSTANCE_TYPE }}" \
              --image-id "${{ env.EC2_AMI_ID }}" \
              --subnet-id "$SUBNET" \
              --security-group-ids "${{ env.EC2_SECURITY_GROUP_ID }}" \
              --iam-instance-profile Name=boxlite-e2e-runner \
              --cpu-options "NestedVirtualization=enabled" \
              --user-data file:///tmp/user-data.sh \
              --block-device-mappings '[{"DeviceName":"/dev/sda1","Ebs":{"VolumeSize":50,"VolumeType":"gp3","DeleteOnTermination":false}}]' \
              --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=boxlite-e2e},{Key=Purpose,Value=boxlite-e2e}]" \
              --metadata-options "HttpTokens=required,HttpEndpoint=enabled" \
              --query "Instances[0].InstanceId" --output text 2>&1); then
              INSTANCE_ID="$RESULT"
              echo "Launched $INSTANCE_ID in subnet $SUBNET"
              break
            else
              echo "::warning::Subnet $SUBNET unavailable: $RESULT"
            fi
          done

          if [ -z "$INSTANCE_ID" ]; then
            echo "::error::Failed to launch instance in any availability zone"
            exit 1
          fi

          echo "Created instance: $INSTANCE_ID"
          aws ec2 wait instance-running --instance-ids "$INSTANCE_ID"
          echo "instance-id=$INSTANCE_ID" >> "$GITHUB_OUTPUT"

          # Try to save instance ID for faster lookup on the next run.
          # Best-effort: tag-based fallback (Name=boxlite-e2e) handles rediscovery
          # if this fails (e.g., the auth token lacks variables:write).
          if ! gh variable set EC2_E2E_INSTANCE_ID --body "$INSTANCE_ID" -R "${{ github.repository }}"; then
            echo "::warning::Could not persist EC2_E2E_INSTANCE_ID. Future runs will rediscover by tag."
          fi
        fi

    - name: Wait for runner to come online
      env:
        GH_TOKEN: ${{ steps.app-token.outputs.token }}
      run: |
        TIMEOUT=180
        ELAPSED=0
        INTERVAL=10
        echo "Waiting for runner '${{ env.RUNNER_LABEL }}' to come online..."
        while [ $ELAPSED -lt $TIMEOUT ]; do
          # Reduce the runner list to one word. A per-runner/per-label
          # stream would produce multi-line output as soon as more than one
          # entry matches, and that would never compare equal to "online".
          STATUS=$(curl -sf \
            -H "Authorization: token ${GH_TOKEN}" \
            -H "Accept: application/vnd.github+json" \
            "https://api.github.com/repos/${{ github.repository }}/actions/runners" \
            | jq -r --arg label "$RUNNER_LABEL" \
              'if any(.runners[]; .status == "online" and any(.labels[]; .name == $label)) then "online" else "offline" end' \
            2>/dev/null || echo "")
          if [ "$STATUS" = "online" ]; then
            echo "Runner is online"
            exit 0
          fi
          echo " Waiting... (${ELAPSED}s / ${TIMEOUT}s)"
          sleep $INTERVAL
          ELAPSED=$((ELAPSED + INTERVAL))
        done
        echo "::error::Runner did not come online within ${TIMEOUT}s"
        exit 1
# =========================================================================
# Job 2: Run integration tests on the persistent runner
# =========================================================================
e2e-tests:
  name: E2E Integration Tests
  needs: start-runner
  runs-on: [self-hosted, boxlite-e2e]
  timeout-minutes: 50
  env:
    CARGO_TERM_COLOR: always
    CARGO_INCREMENTAL: '0'
    HOME: /root
  steps:
    - name: Clean workspace (persistent runner)
      run: |
        # Leftover scratch dirs from earlier runs.
        rm -rf /tmp/boxlite-* 2>/dev/null || true
        # Nothing more to do until a checkout exists.
        [ -d "$GITHUB_WORKSPACE/.git" ] || exit 0
        cd "$GITHUB_WORKSPACE"
        git clean -ffd 2>/dev/null || true
        git checkout -- . 2>/dev/null || true

    - name: Stage prior-run logs for rescue
      # Logs of a run that died mid-step (e.g. Worker ENOSPC) remain on this
      # persistent runner under /var/log/boxlite-ci/. They are moved to a
      # staging dir so the following upload step can publish them as their
      # own artifact, which also takes them off the log directory.
      #
      # Directories are keyed "<run_id>-<run_attempt>". Re-runs keep the same
      # github.run_id and only bump github.run_attempt, so keying on run_id
      # alone would make attempt N treat attempt N-1's directory as its own
      # and leave it unrescued.
      env:
        GH_RUN_ID: ${{ github.run_id }}
        GH_RUN_ATTEMPT: ${{ github.run_attempt }}
      run: |
        set -u
        log_root=/var/log/boxlite-ci
        rescue_dir=/tmp/boxlite-rescue
        own_key="${GH_RUN_ID}-${GH_RUN_ATTEMPT}"
        rm -rf "$rescue_dir" && mkdir -p "$rescue_dir"
        if [ -d "$log_root" ]; then
          for entry in "$log_root"/*; do
            # Skip non-directories and this attempt's own log dir.
            if [ ! -d "$entry" ]; then
              continue
            fi
            if [ "$(basename "$entry")" = "$own_key" ]; then
              continue
            fi
            mv "$entry" "$rescue_dir/"
          done
        fi

    - name: Upload rescued prior-run logs
      uses: actions/upload-artifact@v4
      with:
        name: e2e-test-logs-rescued-${{ github.run_id }}-${{ github.run_attempt }}
        path: /tmp/boxlite-rescue/
        retention-days: 7
        if-no-files-found: ignore

    - name: Pre-flight runner cleanup (runner internals only — never caches)
      # Prunes actions-runner's own diagnostic logs so they cannot pile up
      # over the runner's lifetime. Build/image caches (~/.boxlite/,
      # ~/.cargo/, target/) are deliberately NOT touched — they are the
      # reason this runner is persistent.
      #
      # Background (run 25726812554): _diag was the canary during the
      # 50→500GB root-volume migration — the first write to hit ENOSPC —
      # but the disk had been filled by build artifacts under
      # _work/.../target/, not by _diag. This step is preventative
      # housekeeping only, not the fix for that incident.
      run: |
        for prefix in Worker Runner; do
          find /opt/actions-runner/_diag -maxdepth 1 -type f -name "${prefix}_*.log" -mtime +1 -delete 2>/dev/null || true
        done

    - name: Checkout code
      uses: actions/checkout@v5
      with:
        submodules: recursive
        clean: false

    - name: Verify KVM access
      run: |
        if test -c /dev/kvm && test -r /dev/kvm -a -w /dev/kvm; then
          echo "/dev/kvm is accessible"
        else
          echo "::error::/dev/kvm not accessible"
          exit 1
        fi

    - name: Install build dependencies (make setup)
      run: |
        # Fresh instance, first run: ~5–10 min (apt + rustup +
        # cargo-nextest; CI=true so prek install is skipped).
        # Later runs reuse the persistent EBS volume and take the
        # idempotent fast path (~10–30 s).
        bash scripts/setup/setup-ubuntu.sh

    - name: Disk-pressure cache eviction
      # Successor to the old hard-fail precheck (run 25726812554 post-mortem):
      # when root usage is >= THRESHOLD_PCT, e2e caches are evicted in place
      # and usage is measured again; the step fails only if the disk is still
      # over threshold. Caches re-warm on the next run. Housekeeping of runner
      # internals lives in the "Pre-flight runner cleanup" step.
      #
      # The parsed df values are validated before any comparison, so a silent
      # df format change surfaces as ::error:: rather than a fake "disk OK".
      env:
        THRESHOLD_PCT: '80'
        MIN_FREE_GB: '20'
      run: |
        set -euo pipefail

        # Refresh FREE_GB / USED_PCT from df; abort unless both are digits only.
        read_df() {
          DF_AVAIL=$(df --output=avail -BG / | tail -n 1)
          DF_PCENT=$(df --output=pcent / | tail -n 1)
          FREE_GB=$(printf '%s' "$DF_AVAIL" | tr -dc '0-9')
          USED_PCT=$(printf '%s' "$DF_PCENT" | tr -dc '0-9')
          if [[ "$FREE_GB" =~ ^[0-9]+$ && "$USED_PCT" =~ ^[0-9]+$ ]]; then
            return 0
          fi
          echo "::error::Could not parse df output (FREE_GB='${FREE_GB}' USED_PCT='${USED_PCT}') — df contract may have changed."
          echo "Raw df --output=avail -BG / tail: ${DF_AVAIL}"
          echo "Raw df --output=pcent / tail: ${DF_PCENT}"
          exit 1
        }

        # True while root usage is at or above the eviction threshold.
        over_threshold() {
          [ "${USED_PCT}" -ge "${THRESHOLD_PCT}" ]
        }

        read_df
        echo "Root volume: ${FREE_GB} GB free (${USED_PCT}% used)"

        if over_threshold; then
          echo "── Disk >=${THRESHOLD_PCT}% used — evicting e2e caches ──"
          for candidate in target/boxlite-test /tmp/boxlite-* "$HOME/.boxlite"; do
            # Unquoted on purpose: lets the shell expand the word again.
            for victim in $candidate; do
              [ -e "$victim" ] || continue
              victim_size=$(du -sh "$victim" 2>/dev/null | cut -f1 || echo "?")
              echo " clearing $victim (was: $victim_size)"
              rm -rf "$victim" 2>/dev/null || true
            done
          done
          read_df
          echo "Post-eviction: ${FREE_GB} GB free (${USED_PCT}% used)"
          if over_threshold; then
            echo "::error::Disk still ${USED_PCT}% used after clearing all e2e caches — manual cleanup needed."
            exit 1
          fi
        fi

        # Absolute floor, checked regardless of the percentage outcome.
        if [ "${FREE_GB}" -lt "${MIN_FREE_GB}" ]; then
          echo "::error::Only ${FREE_GB} GB free on / (floor=${MIN_FREE_GB}) — refusing to run integration tests."
          exit 1
        fi

    - name: Run integration tests
      env:
        GH_RUN_ID: ${{ github.run_id }}
        GH_RUN_ATTEMPT: ${{ github.run_attempt }}
      run: |
        source "$HOME/.cargo/env" 2>/dev/null || true
        export PATH="/usr/local/go/bin:$HOME/go/bin:$PATH"
        LOG_DIR="/var/log/boxlite-ci/${GH_RUN_ID}-${GH_RUN_ATTEMPT}"
        mkdir -p "$LOG_DIR"
        # Output is tee'd to the persistent volume so it survives the death
        # of the runner Worker; the next run's Stage step rescues and
        # uploads "$LOG_DIR". pipefail keeps make's exit status from being
        # masked by tee.
        set -o pipefail
        make test:integration 2>&1 | tee "${LOG_DIR}/integration.log"

    - name: Upload test logs on failure or cancellation
      # `failure()` by itself misses cancelled jobs (concurrency
      # cancellation, host reboot, timeout — all reported as `cancelled()`).
      # Logs are wanted for every non-success outcome.
      if: failure() || cancelled()
      uses: actions/upload-artifact@v4
      with:
        name: e2e-test-logs-${{ github.run_id }}-${{ github.run_attempt }}
        path: |
          target/nextest/
          /tmp/boxlite-*/
          !/tmp/boxlite-rescue/
          /var/log/boxlite-ci/${{ github.run_id }}-${{ github.run_attempt }}/
        retention-days: 7
        if-no-files-found: ignore

    - name: Debug via SSH on failure
      if: failure() && inputs.debug
      uses: mxschmitt/action-tmate@v3
      with:
        limit-access-to-actor: true
      # Caps how long the debug session can hold the job open.
      timeout-minutes: 30
# =========================================================================
# Job 3: Stop the instance (always runs)
# =========================================================================
# Runs even when earlier jobs failed or were cancelled, provided start-runner
# published an instance id. The instance is stopped, never terminated.
stop-runner:
  name: Stop E2E Runner
  needs: [start-runner, e2e-tests]
  runs-on: ubuntu-latest
  permissions:
    id-token: write
    contents: read
  if: always() && needs.start-runner.outputs.instance-id != ''
  steps:
    - name: Configure AWS credentials (OIDC)
      uses: aws-actions/configure-aws-credentials@v4
      with:
        role-to-assume: ${{ env.AWS_ROLE_ARN }}
        aws-region: ${{ env.AWS_REGION }}
        role-session-name: e2e-stop-${{ github.run_id }}

    - name: Stop EC2 instance
      # Skipped only when tests failed AND the run was dispatched with debug.
      if: ${{ !(needs.e2e-tests.result == 'failure' && inputs.debug) }}
      run: |
        echo "Stopping instance ${{ needs.start-runner.outputs.instance-id }}..."
        # A failed stop is reported as a warning and does not fail the job.
        if aws ec2 stop-instances --instance-ids "${{ needs.start-runner.outputs.instance-id }}"; then
          echo "Instance stopping"
        else
          echo "::warning::Failed to stop instance"
        fi

    - name: Skip stop (debug mode — tests failed with SSH active)
      # NOTE(review): this job only starts after e2e-tests (including its
      # tmate step) has finished, and nothing in this workflow stops the
      # instance afterwards. Presumably an on-instance idle timer performs
      # the auto-stop mentioned below — confirm.
      if: ${{ needs.e2e-tests.result == 'failure' && inputs.debug }}
      run: echo "::warning::Instance left running for SSH debug session. Will auto-stop after 30 min idle."