# Skip to content

# Merge pull request #344 from diggerhq/fix/event-forwarder-reclaim-del… #102

# Merge pull request #344 from diggerhq/fix/event-forwarder-reclaim-del…

# Merge pull request #344 from diggerhq/fix/event-forwarder-reclaim-del… #102

name: Build Worker Image

on:
  push:
    branches:
      - main
      - autoscaling-etc
    # Trigger only on paths that change what ends up inside the worker AMI.
    # The transitive import surface of cmd/worker (checked with
    # `go list -deps ./cmd/worker/`) reaches almost every internal package,
    # and a missing trigger means new worker code silently never ships.
    #
    # The rootfs is cached by content address
    # (deploy/packer/worker-ami.pkr.hcl): unchanged rootfs inputs reuse the
    # same cached ext4, so goldenVersion stays stable even when the AMI is
    # rebuilt for a worker-binary-only change. The filter below therefore
    # exists only to avoid spending CI minutes on pushes that cannot affect
    # the worker binary (e.g. cmd/server/**, web/**, docs/**).
    #
    # GitHub accepts only one of `paths` / `paths-ignore` per event, so
    # control-plane-only directories that would otherwise match
    # `internal/**` are excluded with `!` negation patterns inside `paths`.
    paths:
      - 'cmd/worker/**'
      - 'cmd/agent/**'
      - 'internal/**'
      - '!internal/api/**'
      - '!internal/billing/**'
      - 'pkg/**'
      - 'proto/agent/**'
      - 'proto/worker/**'
      - 'deploy/firecracker/rootfs/**'
      - 'deploy/azure/setup-azure-host.sh'
      - 'deploy/ec2/build-rootfs-docker.sh'
      - 'deploy/packer/**'
      # The Packer provisioner (deploy/packer/worker-ami.pkr.hcl §4.5) bakes
      # the Vector configs, install.sh and the KV populator into the AMI, so
      # a change to any of them only reaches prod workers via a new AMI.
      - 'deploy/vector/**'
      - 'scripts/claude-agent-wrapper/**'
      - 'go.mod'
      - 'go.sum'
      - '.github/workflows/build-worker-ami.yml'
  workflow_dispatch:
env:
  # Taken from repository secrets; there is no fallback value.
  AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
  AZURE_RESOURCE_GROUP: ${{ secrets.AZURE_RESOURCE_GROUP }}
  # Taken from repository variables; the literal after `||` is used when
  # the variable is unset or empty.
  AZURE_LOCATION: ${{ vars.AZURE_LOCATION || 'eastus2' }}
  AZURE_GALLERY_NAME: ${{ vars.AZURE_GALLERY_NAME || 'opensandbox_gallery' }}
jobs:
  # Builds the worker and agent binaries, bakes them into an Azure gallery
  # image with Packer, publishes the image ID / versions to Key Vault, and
  # prunes old worker images.
  build-image:
    name: Build Worker Image (Azure)
    runs-on: ubuntu-latest
    permissions:
      # NOTE(review): presumably required so the azure/login steps can use
      # OIDC (they are given no client secret) — confirm.
      id-token: write
      contents: read
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history required so AGENT_VERSION below can find the most
          # recent commit that touched cmd/agent / internal/agent /
          # proto/agent — without it, actions/checkout does a shallow clone
          # (depth 1) and git log only sees HEAD.
          fetch-depth: 0

      - uses: actions/setup-go@v5
        with:
          go-version: '1.23'

      - name: Build binaries (amd64)
        run: |
          VERSION=$(git rev-parse --short HEAD)
          echo "VERSION=$VERSION" >> "$GITHUB_ENV"
          # Agent version bumps only when actual agent code changes. Using the
          # repo-level HEAD SHA would mark every AMI build as a new agent, causing
          # spurious in-place upgrades on every rolling replace — a known source
          # of fragility during wake (keepalive misses during 12MB transfer poison
          # the virtio-serial gRPC connection).
          AGENT_VERSION=$(git log -1 --pretty=format:%h -- cmd/agent internal/agent proto/agent)
          if [ -z "$AGENT_VERSION" ]; then
            AGENT_VERSION=$VERSION
          fi
          # Export it so the Packer build step passes the same agent version
          # that is compiled into the binaries below.
          echo "AGENT_VERSION=$AGENT_VERSION" >> "$GITHUB_ENV"
          echo "AGENT_VERSION=$AGENT_VERSION"
          CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \
            -ldflags "-X main.WorkerVersion=$VERSION -X main.AgentVersion=$AGENT_VERSION" \
            -o bin/opensandbox-worker ./cmd/worker/
          CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \
            -ldflags "-X main.Version=$AGENT_VERSION" \
            -o bin/osb-agent ./cmd/agent/

      - name: Package rootfs context
        run: |
          tar czf /tmp/packer-rootfs-ctx.tar.gz \
            deploy/firecracker/rootfs/ \
            deploy/ec2/build-rootfs-docker.sh \
            scripts/claude-agent-wrapper/

      - name: Package Vector configs
        # Bundled separately so Packer's file provisioner has a known
        # tarball path (it can't reliably upload an arbitrary directory).
        # install.sh on the builder extracts and installs Vector + the
        # KV-token populator into the AMI.
        run: |
          tar czf /tmp/packer-vector-ctx.tar.gz -C deploy vector

      - name: Azure Login
        uses: azure/login@v2
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

      - name: Setup Packer
        # Pinned to a release tag rather than the moving `main` branch so an
        # upstream change cannot silently alter this build.
        uses: hashicorp/setup-packer@v3.1.0

      - name: Packer init
        run: packer init deploy/packer/worker-ami.pkr.hcl

      - name: Read previous golden-version from Key Vault
        id: prev-golden
        env:
          # Passed through the environment instead of being interpolated
          # into the script text.
          KEY_VAULT_NAME: ${{ secrets.AZURE_KEY_VAULT_NAME }}
        run: |
          # Previous successful build wrote its goldenVersion here; the
          # NEW AMI bakes that base in so customers pinned to the last
          # golden skip a runtime blob download on first fork.
          # `az` may inject ANSI color codes even with -o tsv, so extract
          # just the 16-char hex hash from the output.
          # A failed lookup (e.g. the secret does not exist yet) yields an
          # empty value rather than failing the step.
          RAW=$(az keyvault secret show \
            --vault-name "$KEY_VAULT_NAME" \
            --name "golden-version" \
            --query value -o tsv 2>/dev/null || echo "")
          PREV=$(echo "$RAW" | grep -oE '[0-9a-f]{16}' | head -1)
          echo "prev=$PREV" >> "$GITHUB_OUTPUT"
          echo "Previous golden version: ${PREV:-<none>}"

      - name: Build image
        env:
          # Use run number as patch version for gallery (must be integer)
          PATCH: ${{ github.run_number }}
          PREV_GOLDEN_VERSION: ${{ steps.prev-golden.outputs.prev }}
          # Secrets reach the script as environment variables instead of
          # being pasted into the shell source.
          BASE_ARCHIVE_ACCOUNT: ${{ secrets.AZURE_STORAGE_ACCOUNT }}
          BASE_ARCHIVE_KEY: ${{ secrets.AZURE_STORAGE_KEY }}
          TIGRIS_ENDPOINT: ${{ secrets.TIGRIS_ENDPOINT }}
          TIGRIS_ACCESS_KEY_ID: ${{ secrets.TIGRIS_ACCESS_KEY_ID }}
          TIGRIS_SECRET_ACCESS_KEY: ${{ secrets.TIGRIS_SECRET_ACCESS_KEY }}
          TIGRIS_GOLDENS_BUCKET: ${{ secrets.TIGRIS_GOLDENS_BUCKET }}
        run: |
          # The default `bash -e` shell does not enable pipefail, so without
          # this the pipeline below would report tee's exit status and a
          # failed `packer build` would not stop the step.
          set -o pipefail
          # VERSION and AGENT_VERSION were exported by the build step.
          packer build \
            -var "worker_version=$VERSION" \
            -var "agent_version=$AGENT_VERSION" \
            -var "subscription_id=$AZURE_SUBSCRIPTION_ID" \
            -var "resource_group=$AZURE_RESOURCE_GROUP" \
            -var "location=$AZURE_LOCATION" \
            -var "gallery_name=$AZURE_GALLERY_NAME" \
            -var "image_version_patch=$PATCH" \
            -var "base_archive_account=$BASE_ARCHIVE_ACCOUNT" \
            -var "base_archive_key=$BASE_ARCHIVE_KEY" \
            -var "prev_golden_version=$PREV_GOLDEN_VERSION" \
            -var "tigris_endpoint=$TIGRIS_ENDPOINT" \
            -var "tigris_access_key_id=$TIGRIS_ACCESS_KEY_ID" \
            -var "tigris_secret_access_key=$TIGRIS_SECRET_ACCESS_KEY" \
            -var "tigris_goldens_bucket=$TIGRIS_GOLDENS_BUCKET" \
            deploy/packer/worker-ami.pkr.hcl | tee /tmp/packer-output.txt
          # Use the gallery image ID (NVMe-compatible for v7 VMs)
          GALLERY_IMAGE_ID="/subscriptions/${AZURE_SUBSCRIPTION_ID}/resourceGroups/${AZURE_RESOURCE_GROUP}/providers/Microsoft.Compute/galleries/${AZURE_GALLERY_NAME}/images/osb-worker-v7/versions/1.0.${PATCH}"
          # Verify it exists
          if ! az sig image-version show \
            --resource-group "$AZURE_RESOURCE_GROUP" \
            --gallery-name "$AZURE_GALLERY_NAME" \
            --gallery-image-definition osb-worker-v7 \
            --gallery-image-version "1.0.${PATCH}" -o none 2>/dev/null; then
            echo "ERROR: Gallery image version 1.0.${PATCH} not found after build"
            cat /tmp/packer-output.txt
            exit 1
          fi
          echo "IMAGE_ID=$GALLERY_IMAGE_ID" >> "$GITHUB_ENV"
          echo "Built gallery image: $GALLERY_IMAGE_ID (version=1.0.${PATCH})"

      # NOTE(review): presumably refreshes Azure credentials that may have
      # expired during the Packer build — confirm.
      - name: Azure Re-Login
        uses: azure/login@v2
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

      - name: Store image ID in Key Vault
        env:
          KEY_VAULT_NAME: ${{ secrets.AZURE_KEY_VAULT_NAME }}
        run: |
          az keyvault secret set \
            --vault-name "$KEY_VAULT_NAME" \
            --name "worker-image-id" \
            --value "$IMAGE_ID"
          az keyvault secret set \
            --vault-name "$KEY_VAULT_NAME" \
            --name "worker-image-version" \
            --value "$VERSION"
          # Record this build's goldenVersion so the NEXT AMI build can
          # retain it as "previous golden" in /opt/opensandbox/images/bases/.
          # It is the last field of the last matching line of Packer's
          # output, with any carriage return stripped.
          NEW_GOLDEN=$(grep -a 'Base image golden version:' /tmp/packer-output.txt | tail -1 | awk '{print $NF}' | tr -d '\r')
          if [ -n "$NEW_GOLDEN" ]; then
            az keyvault secret set \
              --vault-name "$KEY_VAULT_NAME" \
              --name "golden-version" \
              --value "$NEW_GOLDEN"
            echo "Recorded new golden-version: $NEW_GOLDEN"
          else
            echo "WARN: could not extract goldenVersion from packer output — next build won't retain this base"
          fi
          echo "Published image ID to Key Vault"

      - name: Cleanup old images
        run: |
          # List all worker images, keep the 5 most recent
          # (ordering is by image name, descending).
          IMAGES=$(az image list \
            --resource-group "$AZURE_RESOURCE_GROUP" \
            --query "[?tags.\"opensandbox-role\"=='worker'] | sort_by(@, &name) | reverse(@)" \
            --output json)
          STALE=$(echo "$IMAGES" | jq -r '.[5:] | .[].name')
          if [ -z "$STALE" ]; then
            echo "No old images to clean up"
            exit 0
          fi
          for name in $STALE; do
            echo "Deleting old image: $name"
            # Best-effort: a failed delete must not fail the build.
            az image delete --resource-group "$AZURE_RESOURCE_GROUP" --name "$name" || true
          done

      - name: Summary
        run: |
          {
            echo "## Worker Image Build Complete"
            echo "- **Image:** $IMAGE_ID"
            echo "- **Version:** $VERSION"
            echo "- **Location:** $AZURE_LOCATION"
          } >> "$GITHUB_STEP_SUMMARY"