Skip to content

ci(deploy-app-services): touch OtelCollector + SshGateway Dockerfiles… #15

ci(deploy-app-services): touch OtelCollector + SshGateway Dockerfiles…

ci(deploy-app-services): touch OtelCollector + SshGateway Dockerfiles… #15

Workflow file for this run

# Build + deploy the boxlite-runner binary to the Tokyo EC2 host (PR #724).
#
# Self-contained: chains build-c.yml (libboxlite.a from THIS checkout's
# Rust source) → build-runner-binary.yml (Go binary linking the .a) →
# a deploy job that SCPs the binary to the EC2 over SSH and swaps it
# in place.
#
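# Triggers:
# - workflow_call:     reused from .github/workflows/e2e-cloud.yml so the
#                      e2e job depends on a fresh runner being live
#                      before pytest.
# - workflow_dispatch: standalone trigger for verifying the deploy
#                      pipeline (build + deploy) without going through
#                      the API rebuild + pytest chain.
# - push:              fires on commits that touch runner sources or these
#                      workflow files, so the pipeline can be verified
#                      while this workflow file is still on a feature
#                      branch (see the `push:` block below).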
#
# Deploy mechanism: SSH+SCP via EC2 Instance Connect.
# main's runner deploy uses SSM Run Command (see sst.config.ts:746 +
# scripts/deploy/runner-update-binary.sh). The Tokyo e2e-ci EC2 has had
# its IAM instance profile drift (the original RunnerProfile was deleted
# from IAM, leaving the agent with no valid STS identity → ConnectionLost).
# Until that stack is reconciled via `sst deploy --stage e2e-ci`, e2e-ci
# uses SSH+SCP as a side-channel that bypasses the agent:
# 1. ec2-instance-connect:SendSSHPublicKey pushes a 60s ephemeral key
# to the ubuntu account (no pre-shared keypair, no GHA secret).
# 2. SG inbound 22 is temporarily opened from the job runner's egress
# IP (resolved at runtime), and unconditionally revoked at job end.
# 3. scp the tarball, ssh to stop / extract / start boxlite-runner.
# OIDC perms needed: ec2:DescribeInstances,
# ec2:AuthorizeSecurityGroupIngress, ec2:RevokeSecurityGroupIngress,
# ec2-instance-connect:SendSSHPublicKey. No IAM mutation.
name: Deploy Runner Binary

on:
  # Reusable entry point.
  #
  # Inside a called workflow the `github` context belongs to the CALLER, so
  # `github.event_name` is the caller's event (e.g. `push`) and is never
  # `workflow_call`. `force_build` is how the `changes` job tells a reusable
  # invocation apart from a direct push: it defaults to true here and is
  # empty for every other trigger. A caller may pass `force_build: false`
  # to opt in to the push-style path filtering instead.
  workflow_call:
    inputs:
      force_build:
        description: 'Build + deploy unconditionally instead of applying the push path filter. Defaults to true so a plain `uses:` call always gets a fresh runner.'
        type: boolean
        required: false
        default: true
  workflow_dispatch:
    inputs:
      runner_artifact_run_id:
        description: 'Optional: GHA run ID to pull the runner-linux-amd64 artifact from (e.g. a previous deploy-runner or e2e-cloud run). When set, skips build_c_sdk + build_runner and goes straight to deploy — useful for verifying the deploy path against a known-good binary.'
        type: string
        required: false
        default: ''
  # `push:` trigger so we can verify the deploy pipeline standalone
  # WHILE this workflow file is still on a feature branch
  # (workflow_dispatch only works for workflow files that already
  # exist on the default branch; this PR isn't merged yet).
  #
  # Trigger-level paths filter is broad (intentionally includes the
  # workflow files themselves, so commits to this file fire CI for
  # verification). The internal `changes` job below narrows to actual
  # source changes — workflow-only commits fire the workflow but the
  # build + deploy jobs skip.
  push:
    paths:
      - 'apps/runner/**'
      - 'apps/daemon/**'
      - 'apps/common-go/**'
      - 'apps/api-client-go/**'
      - 'apps/libs/computer-use/**'
      - 'sdks/go/**'
      - 'src/boxlite/**'
      - 'src/api-client/**'
      - 'src/shared/**'
      - 'src/deps/**'
      - 'scripts/build/**'
      - 'sdks/c/src/exec/**'
      - '.github/workflows/deploy-runner.yml'
      - '.github/workflows/build-c.yml'
      - '.github/workflows/build-runner-binary.yml'

permissions:
  contents: read

jobs:
  # ── Detect real source changes so workflow-only commits don't build ──
  # Outputs:
  #   should_build  — 'true' when build_c_sdk + build_runner must run.
  #   should_deploy — 'true' when the deploy job must run.
  #   prev_run_id   — run ID to reuse an artifact from ('' = use this run's).
  #
  # Direct push: dorny/paths-filter diffs HEAD against the previously pushed
  # commit and build + deploy only run when runner sources changed.
  #
  # workflow_dispatch and reusable (workflow_call) invocations: the diff is
  # meaningless, so build + deploy are forced. A reusable call cannot be
  # recognised by `github.event_name` (it reports the caller's event), so
  # it is recognised by `inputs.force_build`, which only workflow_call
  # populates. The caller (e2e-cloud) is responsible for its own changes
  # detection before calling, and workflow_dispatch is an explicit
  # "I want a fresh build + deploy" signal.
  changes:
    name: Detect runner source changes
    runs-on: ubuntu-latest
    outputs:
      should_build: ${{ steps.decide.outputs.should_build }}
      should_deploy: ${{ steps.decide.outputs.should_deploy }}
      prev_run_id: ${{ steps.decide.outputs.prev_run_id }}
    steps:
      - uses: actions/checkout@v5
      - id: filter
        # Only a direct push needs the diff. `inputs.force_build` is empty
        # on a direct push and true (by default) for a reusable call, even
        # when the calling workflow was itself triggered by a push.
        if: github.event_name == 'push' && !inputs.force_build
        uses: dorny/paths-filter@v3
        with:
          # `base: github.event.before` makes the diff "what this push
          # introduced" (HEAD vs the previously pushed commit). Default
          # `base` is the default branch's HEAD, which means a feature
          # branch with N prior commits would see ALL its commits in the
          # diff every push.
          base: ${{ github.event.before }}
          filters: |
            runner_source:
              - 'apps/runner/**'
              - 'apps/daemon/**'
              - 'apps/common-go/**'
              - 'apps/api-client-go/**'
              - 'apps/libs/computer-use/**'
              - 'sdks/go/**'
              - 'src/boxlite/**'
              - 'src/api-client/**'
              - 'src/shared/**'
              - 'src/deps/**'
              - 'scripts/build/**'
              - 'sdks/c/src/exec/**'
      - id: decide
        env:
          # Empty when the filter step was skipped.
          PUSH_CHANGED: ${{ steps.filter.outputs.runner_source }}
          PREV_RUN_ID: ${{ inputs.runner_artifact_run_id }}
          # 'true' for a reusable call (unless the caller opted out),
          # empty for push / workflow_dispatch.
          FORCE_BUILD: ${{ inputs.force_build }}
        run: |
          # Resolve prev-run-id, in order of precedence:
          #   1. workflow_dispatch input `runner_artifact_run_id`
          #   2. commit-message tag `[runner-from: <run_id>]` (push-event
          #      workaround: workflow_dispatch only works after the
          #      workflow file lands on the default branch, so we offer
          #      a commit-message override for testing pre-merge)
          PREV=""
          if [ -n "${PREV_RUN_ID:-}" ]; then
            PREV="$PREV_RUN_ID"
            echo "Using workflow_dispatch input runner_artifact_run_id=$PREV"
          else
            COMMIT_MSG=$(git log -1 --pretty=%B 2>/dev/null || true)
            if [[ "$COMMIT_MSG" =~ \[runner-from:\ ([0-9]+)\] ]]; then
              PREV="${BASH_REMATCH[1]}"
              echo "Using commit-message tag [runner-from: $PREV]"
            fi
          fi

          # A reusable artifact short-circuits everything else.
          if [ -n "$PREV" ]; then
            echo "Reusing artifact from run $PREV — SKIP build, RUN deploy."
            echo "should_build=false" >> "$GITHUB_OUTPUT"
            echo "should_deploy=true" >> "$GITHUB_OUTPUT"
            echo "prev_run_id=$PREV" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # GITHUB_EVENT_NAME is the runner-provided equivalent of
          # github.event_name; in a reusable call it is the CALLER's event,
          # which is why FORCE_BUILD is checked first.
          if [ "${FORCE_BUILD:-}" = 'true' ]; then
            echo "Reusable-workflow call (caller event: ${GITHUB_EVENT_NAME}) — force build + deploy."
            echo "should_build=true" >> "$GITHUB_OUTPUT"
            echo "should_deploy=true" >> "$GITHUB_OUTPUT"
          elif [ "${GITHUB_EVENT_NAME}" != 'push' ]; then
            echo "Non-push event (${GITHUB_EVENT_NAME}) — force build + deploy."
            echo "should_build=true" >> "$GITHUB_OUTPUT"
            echo "should_deploy=true" >> "$GITHUB_OUTPUT"
          elif [ "${PUSH_CHANGED:-false}" = 'true' ]; then
            echo "Push touched runner_source paths — build + deploy."
            echo "should_build=true" >> "$GITHUB_OUTPUT"
            echo "should_deploy=true" >> "$GITHUB_OUTPUT"
          else
            echo "Push only touched workflow files — SKIP build + deploy."
            echo "should_build=false" >> "$GITHUB_OUTPUT"
            echo "should_deploy=false" >> "$GITHUB_OUTPUT"
          fi
# ── C SDK: compile libboxlite.a from the Rust sources in this checkout ──
# The Tokyo runner is amd64, so `target_filter` pins build-c.yml's
# platform matrix to linux-x64-gnu; the macOS and linux-arm64 entries
# are skipped.
build_c_sdk:
  name: Build C SDK (linux-x64-gnu)
  uses: ./.github/workflows/build-c.yml
  with:
    target_filter: linux-x64-gnu
  needs:
    - changes
  # Runs only when the `changes` job asked for a build.
  if: ${{ needs.changes.outputs.should_build == 'true' }}
  permissions:
    # build-c.yml's `upload-to-release` job declares contents: write.
    # That job's own `if:` skips it for workflow_call, but the caller
    # must still grant at least as much as the called workflow requests.
    contents: write
# ── Runner binary: Go build linked against the freshly built libboxlite.a ──
build_runner:
  name: Build runner binary
  uses: ./.github/workflows/build-runner-binary.yml
  with:
    # Link against the library build_c_sdk produced in this run.
    libboxlite_source: build
  needs:
    - changes
    - build_c_sdk
  # Runs only when the `changes` job asked for a build.
  if: ${{ needs.changes.outputs.should_build == 'true' }}
  permissions:
    contents: write
# ── Deploy: SSH+SCP via EC2 Instance Connect ──────────────────────
#
# Mechanism (see header comment): push a 60s ephemeral SSH pubkey via
# ec2-instance-connect, temporarily open the runner SG inbound 22 from
# this job runner's egress IP, scp the binary, ssh to swap it, and
# unconditionally revoke the SG rule on exit (success or failure).
#
# Runs when `changes` set should_deploy=true and no upstream job failed
# or was cancelled; a SKIPPED build (artifact reused from a prior run)
# does not block the deploy.
deploy:
  name: Deploy runner to Tokyo EC2 (SSH+SCP)
  needs: [changes, build_runner]
  if: |
    !failure() && !cancelled()
    && needs.changes.outputs.should_deploy == 'true'
  runs-on: ubuntu-latest
  timeout-minutes: 15
  permissions:
    # OIDC token for aws-actions/configure-aws-credentials.
    id-token: write
    contents: read
    # Needed by the "from prior run" artifact download: once a job lists
    # explicit permissions every unlisted scope is `none`, and
    # download-artifact needs actions: read to fetch another run's artifact.
    actions: read
  env:
    AWS_REGION: ${{ vars.AWS_E2E_CLOUD_REGION }}
    AWS_ROLE_ARN: ${{ vars.AWS_E2E_CLOUD_ROLE_ARN }}
  steps:
    - name: Configure AWS credentials (OIDC)
      uses: aws-actions/configure-aws-credentials@v4
      with:
        role-to-assume: ${{ env.AWS_ROLE_ARN }}
        aws-region: ${{ env.AWS_REGION }}
        role-session-name: deploy-runner-${{ github.run_id }}

    # Looks up the first running instance tagged Name=boxlite-runner and
    # exports its instance ID, public IP and first security group ID.
    - name: Resolve Tokyo runner instance / IP / SG
      id: ec2
      run: |
        set -euo pipefail
        read -r ID IP SG <<<"$(aws ec2 describe-instances \
          --filters "Name=tag:Name,Values=boxlite-runner" \
                    "Name=instance-state-name,Values=running" \
          --query 'Reservations[0].Instances[0].[InstanceId,PublicIpAddress,SecurityGroups[0].GroupId]' \
          --output text)"
        [ -n "$ID" ] && [ "$ID" != "None" ] || { echo "::error::No running boxlite-runner instance"; exit 1; }
        [ -n "$IP" ] && [ "$IP" != "None" ] || { echo "::error::Runner has no public IP — SSH path requires it"; exit 1; }
        [ -n "$SG" ] && [ "$SG" != "None" ] || { echo "::error::Could not resolve runner SG"; exit 1; }
        echo "runner_id=$ID" >> "$GITHUB_OUTPUT"
        echo "runner_ip=$IP" >> "$GITHUB_OUTPUT"
        echo "runner_sg=$SG" >> "$GITHUB_OUTPUT"
        echo "::notice::Runner $ID @ $IP (SG $SG)"

    # Exactly one of the two download steps runs, selected by prev_run_id.
    - name: Download runner binary artifact (this run)
      if: needs.changes.outputs.prev_run_id == ''
      uses: actions/download-artifact@v4
      with:
        name: runner-linux-amd64
        path: /tmp/runner-artifact/
    - name: Download runner binary artifact (from prior run)
      if: needs.changes.outputs.prev_run_id != ''
      uses: actions/download-artifact@v4
      with:
        name: runner-linux-amd64
        path: /tmp/runner-artifact/
        run-id: ${{ needs.changes.outputs.prev_run_id }}
        github-token: ${{ secrets.GITHUB_TOKEN }}
        repository: ${{ github.repository }}

    - name: Generate ephemeral SSH keypair + push via Instance Connect
      id: keypush
      env:
        INSTANCE_ID: ${{ steps.ec2.outputs.runner_id }}
      run: |
        set -euo pipefail
        KEY=/tmp/deploy-runner-ed25519
        ssh-keygen -t ed25519 -N '' -f "$KEY" -C "gha-deploy-runner-${GITHUB_RUN_ID}" >/dev/null
        # Each push is only valid for 60s. This first push fails fast
        # (before the SG is opened) if Instance Connect is not permitted;
        # the transfer step re-pushes the key right before each connection.
        aws ec2-instance-connect send-ssh-public-key \
          --instance-id "$INSTANCE_ID" \
          --instance-os-user ubuntu \
          --ssh-public-key "file://${KEY}.pub" \
          --query 'Success' --output text
        echo "key_path=$KEY" >> "$GITHUB_OUTPUT"

    - name: Open SG 22 inbound from this runner IP (temporary)
      id: sgopen
      env:
        SG: ${{ steps.ec2.outputs.runner_sg }}
      run: |
        set -euo pipefail
        MY_IP=$(curl -fsSL --max-time 10 https://api.ipify.org)
        [ -n "$MY_IP" ] || { echo "::error::Could not resolve own egress IP"; exit 1; }
        echo "Runner egress IP: $MY_IP"
        RULE_ID=$(aws ec2 authorize-security-group-ingress \
          --group-id "$SG" \
          --ip-permissions "IpProtocol=tcp,FromPort=22,ToPort=22,IpRanges=[{CidrIp=${MY_IP}/32,Description=gha-deploy-runner-${GITHUB_RUN_ID}}]" \
          --query 'SecurityGroupRules[0].SecurityGroupRuleId' --output text)
        [ -n "$RULE_ID" ] && [ "$RULE_ID" != "None" ] || { echo "::error::Failed to add SG rule"; exit 1; }
        # The revoke step keys off this output.
        echo "rule_id=$RULE_ID" >> "$GITHUB_OUTPUT"
        echo "::notice::SG rule $RULE_ID added (22/tcp from $MY_IP/32)"

    - name: SCP artifact + SSH-driven binary swap (verify PID + SHA256 changed)
      env:
        IP: ${{ steps.ec2.outputs.runner_ip }}
        KEY: ${{ steps.keypush.outputs.key_path }}
        INSTANCE_ID: ${{ steps.ec2.outputs.runner_id }}
      run: |
        set -euo pipefail

        # Resolve the tarball with a glob array rather than `ls | head`:
        # under `set -e -o pipefail` a failing `ls` would abort the script
        # before the error message below could be printed.
        shopt -s nullglob
        ARCHIVES=(/tmp/runner-artifact/boxlite-runner-*-linux-amd64.tar.gz)
        shopt -u nullglob
        [ "${#ARCHIVES[@]}" -gt 0 ] || { echo "::error::No runner artifact found"; exit 1; }
        ARCHIVE="${ARCHIVES[0]}"

        # Compute expected binary sha256 from the artifact tarball — the
        # remote `sha256sum /usr/local/bin/boxlite-runner` after swap must
        # match this, otherwise the file on disk isn't what we shipped.
        # First try the member named exactly `boxlite-runner`; if that
        # fails, fall back to the first member whose path ends with it.
        EXPECTED_SHA=$(tar -xzOf "$ARCHIVE" boxlite-runner 2>/dev/null | sha256sum | awk '{print $1}') \
          || EXPECTED_SHA=$(tar -tzf "$ARCHIVE" | grep -E 'boxlite-runner$' | head -1 \
               | xargs -I{} sh -c "tar -xzOf '$ARCHIVE' {} | sha256sum | awk '{print \$1}'")
        [ -n "$EXPECTED_SHA" ] || { echo "::error::Could not compute expected sha256 from $ARCHIVE"; exit 1; }
        echo "::notice::expected sha256: $EXPECTED_SHA"

        # Host key checking is disabled and nothing is written to known_hosts.
        SSH_OPTS=(-i "$KEY" -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=15)

        # Re-arms the 60s Instance Connect window for our key. Called
        # immediately before EVERY new SSH connection, because the time
        # spent opening the SG and uploading the tarball can exceed the
        # window of an earlier push.
        push_key() {
          aws ec2-instance-connect send-ssh-public-key \
            --instance-id "$INSTANCE_ID" \
            --instance-os-user ubuntu \
            --ssh-public-key "file://${KEY}.pub" \
            --query 'Success' --output text
        }

        # SCP the tarball
        push_key
        scp "${SSH_OPTS[@]}" "$ARCHIVE" "ubuntu@${IP}:/tmp/boxlite-runner.tar.gz"

        # In-place swap + restart + verification. The remote script
        # captures MainPID + ActiveEnterTimestampMonotonic BEFORE the
        # swap, then asserts after restart: (a) new MainPID != old MainPID
        # (proves a real process replacement, not the old one still
        # running), (b) ActiveEnterTimestampMonotonic strictly
        # advanced (proves systemd marked it active after our start),
        # and (c) /usr/local/bin/boxlite-runner sha256 matches
        # EXPECTED_SHA passed from the GHA runner.
        push_key
        ssh "${SSH_OPTS[@]}" "ubuntu@${IP}" "EXPECTED_SHA='$EXPECTED_SHA' bash -s" <<'REMOTE'
        set -euxo pipefail
        BEFORE_PID=$(systemctl show -p MainPID --value boxlite-runner 2>/dev/null || echo 0)
        BEFORE_TS=$(systemctl show -p ActiveEnterTimestampMonotonic --value boxlite-runner 2>/dev/null || echo 0)
        echo "BEFORE: MainPID=$BEFORE_PID ActiveEnterTimestampMonotonic=$BEFORE_TS"
        sudo systemctl stop boxlite-runner
        sudo tar xzf /tmp/boxlite-runner.tar.gz -C /usr/local/bin/
        sudo chmod +x /usr/local/bin/boxlite-runner
        sudo systemctl start boxlite-runner
        sleep 5
        # On a failed start, dump the last 50 journal lines before bailing.
        sudo systemctl is-active --quiet boxlite-runner || { sudo journalctl -u boxlite-runner -n 50 --no-pager; exit 1; }
        AFTER_PID=$(systemctl show -p MainPID --value boxlite-runner)
        AFTER_TS=$(systemctl show -p ActiveEnterTimestampMonotonic --value boxlite-runner)
        INSTALLED_SHA=$(sudo sha256sum /usr/local/bin/boxlite-runner | awk '{print $1}')
        echo "AFTER:  MainPID=$AFTER_PID ActiveEnterTimestampMonotonic=$AFTER_TS"
        echo "AFTER:  sha256=$INSTALLED_SHA"
        echo "EXPECT: sha256=$EXPECTED_SHA"
        # (a) PID must have changed (proves real restart)
        if [ "$AFTER_PID" = "$BEFORE_PID" ] && [ "$BEFORE_PID" != "0" ]; then
          echo "::error::MainPID did not change ($AFTER_PID) — service did not actually restart"
          exit 1
        fi
        # (b) systemd's monotonic timestamp for last active-enter must have advanced
        if [ "$AFTER_TS" -le "$BEFORE_TS" ]; then
          echo "::error::ActiveEnterTimestampMonotonic did not advance ($BEFORE_TS -> $AFTER_TS)"
          exit 1
        fi
        # (c) installed binary sha256 must match the artifact we uploaded
        if [ "$INSTALLED_SHA" != "$EXPECTED_SHA" ]; then
          echo "::error::Installed binary sha256 mismatch — got $INSTALLED_SHA expected $EXPECTED_SHA"
          exit 1
        fi
        echo "Verified: PID swap + monotonic ts advance + sha256 match"
        # Informational only — a missing --version flag must not fail the deploy.
        /usr/local/bin/boxlite-runner --version 2>&1 || true
        REMOTE
        echo "::notice::Runner binary swap succeeded"

    # Runs even when earlier steps failed, as long as a rule was recorded.
    - name: Revoke SG 22 inbound rule (always)
      if: always() && steps.sgopen.outputs.rule_id != ''
      env:
        SG: ${{ steps.ec2.outputs.runner_sg }}
        RULE_ID: ${{ steps.sgopen.outputs.rule_id }}
      run: |
        aws ec2 revoke-security-group-ingress \
          --group-id "$SG" \
          --security-group-rule-ids "$RULE_ID" \
          --query 'Return' --output text
        echo "::notice::SG rule $RULE_ID revoked"
# 2026-06-11T15:04:57Z — boundary removed; re-verify SSH+SCP CI path standalone