Merge pull request #344 from diggerhq/fix/event-forwarder-reclaim-del… #102
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Builds the worker and agent binaries, bakes them into an Azure gallery | |
| # image with Packer, and publishes the resulting image ID and versions | |
| # to Azure Key Vault. | |
| name: Build Worker Image | |
| on: | |
| push: | |
| branches: [main, autoscaling-etc] | |
| # Paths that actually change what's inside the worker AMI. | |
| # cmd/worker's transitive import surface (verified via `go list -deps | |
| # ./cmd/worker/`) covers almost every internal package we have. Missing | |
| # triggers here cause silent bugs where new worker code never gets | |
| # shipped. | |
| # | |
| # Rootfs content-addressed caching (deploy/packer/worker-ami.pkr.hcl) | |
| # means builds from unchanged rootfs inputs reuse the same cached ext4, | |
| # so goldenVersion stays stable even when we rebuild the AMI for a | |
| # worker-binary-only change. The narrowing here is just about avoiding | |
| # wasted CI minutes on pushes that don't affect the worker binary | |
| # (e.g. cmd/server/**, web/**, docs/**). | |
| # GitHub only permits one of `paths` / `paths-ignore` per event — we use | |
| # `paths` with `!` negation patterns to exclude control-plane-only dirs | |
| # that would otherwise match `internal/**`. | |
| paths: | |
| - 'cmd/worker/**' | |
| - 'cmd/agent/**' | |
| - 'internal/**' | |
| - '!internal/api/**' | |
| - '!internal/billing/**' | |
| - 'pkg/**' | |
| - 'proto/agent/**' | |
| - 'proto/worker/**' | |
| - 'deploy/firecracker/rootfs/**' | |
| - 'deploy/azure/setup-azure-host.sh' | |
| - 'deploy/ec2/build-rootfs-docker.sh' | |
| - 'deploy/packer/**' | |
| # Vector configs + install.sh + KV populator are all baked into the | |
| # AMI by the Packer provisioner (deploy/packer/worker-ami.pkr.hcl §4.5). | |
| # A change here needs to roll through a new AMI to reach prod workers. | |
| - 'deploy/vector/**' | |
| - 'scripts/claude-agent-wrapper/**' | |
| - 'go.mod' | |
| - 'go.sum' | |
| - '.github/workflows/build-worker-ami.yml' | |
| # Also runnable by hand, without any path filtering. | |
| workflow_dispatch: | |
| # Azure settings shared by every step. The resource group and | |
| # subscription come from secrets; location and gallery name come from | |
| # repository variables and fall back to the literal defaults below. | |
| env: | |
| AZURE_RESOURCE_GROUP: ${{ secrets.AZURE_RESOURCE_GROUP }} | |
| AZURE_LOCATION: ${{ vars.AZURE_LOCATION || 'eastus2' }} | |
| AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} | |
| AZURE_GALLERY_NAME: ${{ vars.AZURE_GALLERY_NAME || 'opensandbox_gallery' }} | |
| jobs: | |
| # Single job: compile the binaries, package the build contexts, run | |
| # Packer, then record the result in Key Vault. | |
| build-image: | |
| name: Build Worker Image (Azure) | |
| runs-on: ubuntu-latest | |
| # id-token: write lets the job request an OIDC token. | |
| # NOTE(review): presumably this is what the azure/login steps | |
| # authenticate with, since they pass no client secret — confirm. | |
| permissions: | |
| id-token: write | |
| contents: read | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| # Full history required so AGENT_VERSION below can find the most | |
| # recent commit that touched cmd/agent / internal/agent / proto/agent | |
| # — without it, actions/checkout does a shallow clone (depth 1) and | |
| # git log only sees HEAD. | |
| fetch-depth: 0 | |
| # Go toolchain used by the "Build binaries" step. | |
| - uses: actions/setup-go@v5 | |
| with: | |
| go-version: '1.23' | |
| - name: Build binaries (amd64) | |
| run: | | |
| # Compiles the worker and agent as static linux/amd64 binaries, | |
| # stamps them with version strings via -ldflags, and exports VERSION | |
| # and AGENT_VERSION to later steps through $GITHUB_ENV. | |
| VERSION=$(git rev-parse --short HEAD) | |
| echo "VERSION=$VERSION" >> "$GITHUB_ENV" | |
| # Agent version bumps only when actual agent code changes. Using the | |
| # repo-level HEAD SHA would mark every AMI build as a new agent, causing | |
| # spurious in-place upgrades on every rolling replace — a known source | |
| # of fragility during wake (keepalive misses during 12MB transfer poison | |
| # the virtio-serial gRPC connection). | |
| AGENT_VERSION=$(git log -1 --pretty=format:%h -- cmd/agent internal/agent proto/agent) | |
| # No commit has ever touched the agent paths: fall back to HEAD. | |
| if [ -z "$AGENT_VERSION" ]; then | |
| AGENT_VERSION=$VERSION | |
| fi | |
| echo "AGENT_VERSION=$AGENT_VERSION" | |
| # Shell variables do not survive past this step; write the value to | |
| # $GITHUB_ENV (as is done for VERSION) so later steps can read it. | |
| echo "AGENT_VERSION=$AGENT_VERSION" >> "$GITHUB_ENV" | |
| CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \ | |
| -ldflags "-X main.WorkerVersion=$VERSION -X main.AgentVersion=$AGENT_VERSION" \ | |
| -o bin/opensandbox-worker ./cmd/worker/ | |
| CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build \ | |
| -ldflags "-X main.Version=$AGENT_VERSION" \ | |
| -o bin/osb-agent ./cmd/agent/ | |
| - name: Package rootfs context | |
| run: | | |
| # Bundle the rootfs build inputs into one gzip tarball under /tmp. | |
| tar --create --gzip --file=/tmp/packer-rootfs-ctx.tar.gz \ | |
| deploy/firecracker/rootfs/ \ | |
| deploy/ec2/build-rootfs-docker.sh \ | |
| scripts/claude-agent-wrapper/ | |
| - name: Package Vector configs | |
| # Shipped as its own tarball so that Packer's file provisioner gets a | |
| # fixed path to upload (uploading an arbitrary directory is not | |
| # reliable). On the builder, install.sh unpacks it and installs | |
| # Vector plus the KV-token populator into the AMI. | |
| run: | | |
| tar --create --gzip --file=/tmp/packer-vector-ctx.tar.gz --directory=deploy vector | |
| # Authenticate the az CLI for the Key Vault read and Packer build below. | |
| - name: Azure Login | |
| uses: azure/login@v2 | |
| with: | |
| client-id: ${{ secrets.AZURE_CLIENT_ID }} | |
| tenant-id: ${{ secrets.AZURE_TENANT_ID }} | |
| subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} | |
| # NOTE(review): `@main` is a moving ref, unlike the version-pinned | |
| # actions above; consider pinning to a release tag or commit SHA. | |
| - name: Setup Packer | |
| uses: hashicorp/setup-packer@main | |
| # Run `packer init` against the worker image template. | |
| - name: Packer init | |
| run: packer init deploy/packer/worker-ami.pkr.hcl | |
| - name: Read previous golden-version from Key Vault | |
| id: prev-golden | |
| run: | | |
| # The last successful build stored its goldenVersion in the | |
| # "golden-version" secret. The new AMI bakes that base in, so | |
| # customers still pinned to the previous golden avoid a runtime blob | |
| # download on their first fork. | |
| # A failed lookup is tolerated and simply yields an empty value. | |
| KV_OUTPUT=$(az keyvault secret show \ | |
| --vault-name "${{ secrets.AZURE_KEY_VAULT_NAME }}" \ | |
| --name "golden-version" \ | |
| --query value -o tsv 2>/dev/null || true) | |
| # `az` can emit ANSI color codes even with -o tsv, so keep only the | |
| # first 16-character hex hash found in the output. | |
| PREV_GOLDEN=$(grep -oE '[0-9a-f]{16}' <<<"$KV_OUTPUT" | head -1) | |
| echo "prev=$PREV_GOLDEN" >> "$GITHUB_OUTPUT" | |
| echo "Previous golden version: ${PREV_GOLDEN:-<none>}" | |
| - name: Build image | |
| run: | | |
| # Runs `packer build`, confirms the gallery image version exists, | |
| # and exports its resource ID as IMAGE_ID for later steps. | |
| # | |
| # With no explicit `shell:`, GitHub runs this script as `bash -e` | |
| # without pipefail, so the exit status of `packer build | tee` would | |
| # be tee's and a failed build would go unnoticed at that point. | |
| set -o pipefail | |
| # Use run number as patch version for gallery (must be integer) | |
| PATCH=${{ github.run_number }} | |
| # agent_version: use the agent-specific version when AGENT_VERSION is | |
| # present in the environment; otherwise fall back to $VERSION. | |
| # NOTE(review): confirm the Packer template expects the same value | |
| # that was compiled into the agent binary. | |
| packer build \ | |
| -var "worker_version=$VERSION" \ | |
| -var "agent_version=${AGENT_VERSION:-$VERSION}" \ | |
| -var "subscription_id=$AZURE_SUBSCRIPTION_ID" \ | |
| -var "resource_group=$AZURE_RESOURCE_GROUP" \ | |
| -var "location=$AZURE_LOCATION" \ | |
| -var "gallery_name=$AZURE_GALLERY_NAME" \ | |
| -var "image_version_patch=$PATCH" \ | |
| -var "base_archive_account=${{ secrets.AZURE_STORAGE_ACCOUNT }}" \ | |
| -var "base_archive_key=${{ secrets.AZURE_STORAGE_KEY }}" \ | |
| -var "prev_golden_version=${{ steps.prev-golden.outputs.prev }}" \ | |
| -var "tigris_endpoint=${{ secrets.TIGRIS_ENDPOINT }}" \ | |
| -var "tigris_access_key_id=${{ secrets.TIGRIS_ACCESS_KEY_ID }}" \ | |
| -var "tigris_secret_access_key=${{ secrets.TIGRIS_SECRET_ACCESS_KEY }}" \ | |
| -var "tigris_goldens_bucket=${{ secrets.TIGRIS_GOLDENS_BUCKET }}" \ | |
| deploy/packer/worker-ami.pkr.hcl | tee /tmp/packer-output.txt | |
| # Use the gallery image ID (NVMe-compatible for v7 VMs) | |
| GALLERY_IMAGE_ID="/subscriptions/${AZURE_SUBSCRIPTION_ID}/resourceGroups/${AZURE_RESOURCE_GROUP}/providers/Microsoft.Compute/galleries/${AZURE_GALLERY_NAME}/images/osb-worker-v7/versions/1.0.${PATCH}" | |
| # Verify it exists | |
| if ! az sig image-version show \ | |
| --resource-group "$AZURE_RESOURCE_GROUP" \ | |
| --gallery-name "$AZURE_GALLERY_NAME" \ | |
| --gallery-image-definition osb-worker-v7 \ | |
| --gallery-image-version "1.0.${PATCH}" -o none 2>/dev/null; then | |
| echo "ERROR: Gallery image version 1.0.${PATCH} not found after build" | |
| cat /tmp/packer-output.txt | |
| exit 1 | |
| fi | |
| echo "IMAGE_ID=$GALLERY_IMAGE_ID" >> "$GITHUB_ENV" | |
| echo "Built gallery image: $GALLERY_IMAGE_ID (version=1.0.${PATCH})" | |
| # Same login as the "Azure Login" step, repeated after the image build. | |
| # NOTE(review): presumably this refreshes credentials that may have | |
| # expired during a long Packer run, before the Key Vault writes — confirm. | |
| - name: Azure Re-Login | |
| uses: azure/login@v2 | |
| with: | |
| client-id: ${{ secrets.AZURE_CLIENT_ID }} | |
| tenant-id: ${{ secrets.AZURE_TENANT_ID }} | |
| subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} | |
| - name: Store image ID in Key Vault | |
| run: | | |
| # Publishes the image ID, the worker version and (when it can be | |
| # parsed from the Packer log) the goldenVersion as Key Vault secrets. | |
| VAULT="${{ secrets.AZURE_KEY_VAULT_NAME }}" | |
| # kv_set NAME VALUE: write one secret into the vault. | |
| kv_set() { | |
| az keyvault secret set --vault-name "$VAULT" --name "$1" --value "$2" | |
| } | |
| kv_set "worker-image-id" "$IMAGE_ID" | |
| kv_set "worker-image-version" "$VERSION" | |
| # Save this build's goldenVersion so that the NEXT AMI build can keep | |
| # it as the "previous golden" under /opt/opensandbox/images/bases/. | |
| # Takes the last field of the last matching log line, minus any CR. | |
| NEW_GOLDEN=$(grep -a 'Base image golden version:' /tmp/packer-output.txt \ | |
| | tail -1 \ | |
| | awk '{print $NF}' \ | |
| | tr -d '\r') | |
| if [ -z "$NEW_GOLDEN" ]; then | |
| echo "WARN: could not extract goldenVersion from packer output — next build won't retain this base" | |
| else | |
| kv_set "golden-version" "$NEW_GOLDEN" | |
| echo "Recorded new golden-version: $NEW_GOLDEN" | |
| fi | |
| echo "Published image ID to Key Vault" | |
| - name: Cleanup old images | |
| run: | | |
| # Fetch every image tagged opensandbox-role=worker, ordered by name | |
| # descending; everything past the first five entries is deleted. | |
| ALL_IMAGES=$(az image list \ | |
| --resource-group "$AZURE_RESOURCE_GROUP" \ | |
| --query "[?tags.\"opensandbox-role\"=='worker'] | sort_by(@, &name) | reverse(@)" \ | |
| --output json) | |
| OLD_NAMES=$(jq -r '.[5:][].name' <<<"$ALL_IMAGES") | |
| if [ -n "$OLD_NAMES" ]; then | |
| # Deletion is best-effort: one failed delete does not fail the step. | |
| for image_name in $OLD_NAMES; do | |
| echo "Deleting old image: $image_name" | |
| az image delete --resource-group "$AZURE_RESOURCE_GROUP" --name "$image_name" || true | |
| done | |
| else | |
| echo "No old images to clean up" | |
| fi | |
| - name: Summary | |
| run: | | |
| # Append a short Markdown report to the run's step summary. | |
| { | |
| echo "## Worker Image Build Complete" | |
| echo "- **Image:** $IMAGE_ID" | |
| echo "- **Version:** $VERSION" | |
| echo "- **Location:** $AZURE_LOCATION" | |
| } >> "$GITHUB_STEP_SUMMARY" | |