Skip to content

test(e2e): conftest helper + no-skip strategy for SDK entry / path_ve… #54

test(e2e): conftest helper + no-skip strategy for SDK entry / path_ve…

test(e2e): conftest helper + no-skip strategy for SDK entry / path_ve… #54

Workflow file for this run

# Build + deploy the Api Docker image to the Tokyo ECS service (PR #724).
#
# Mirrors deploy-runner.yml's pattern, except that the build is no longer
# conditional on source changes: every run builds and deploys unless the
# redeploy-current path is requested. That path lets us exercise the
# ECS register-task-definition + UpdateService chain without rebuilding
# — useful for isolating IAM/PassRole issues from build issues.
#
# Triggers:
# - workflow_call: reused from .github/workflows/e2e-cloud.yml so the
# e2e job depends on the new API being live before
# pytest.
# - workflow_dispatch: standalone trigger; pass `redeploy_current=true`
# to swap the current image's tag-equivalent into
# a new task def and force-new-deployment (zero
# image change — but it exercises PassRole).
# - push: paths-filter on api source + this workflow file,
# so a commit landing on the PR branch fires CI
# automatically. The internal `changes` job does not
# narrow build vs deploy per path: every push builds and
# deploys unless the commit message carries `[api-redeploy]`.
#
# Deploy mechanism (unchanged from main):
# 1. ECR login + buildx build of apps/api/Dockerfile.source.
# 2. Push image tagged `api-<GITHUB_SHA>` to the sst-asset ECR repository.
# 3. ecs:RegisterTaskDefinition — clone the live Api TD, swap the
# container image, strip readonly fields. Triggers an IAM
# PassRole check on the caller for the task / execution role
# (this is the step that may fail under BoxLiteDeveloperPermissions-
# Boundary, whose NotAction includes iam:*).
# 4. ecs:UpdateService --force-new-deployment + wait services-stable.
# 5. Assert PRIMARY deployment's taskDefinition == NEW_TD_ARN (catches
# DeploymentCircuitBreaker auto-rollback).
# 6. Wait for at least one healthy ALB target.
#
# OIDC role perms used (already present on boxlite-e2e-cloud-github-
# actions in the existing inline policy):
# ecr:GetAuthorizationToken + ecr:* on repository/sst-asset
# ecs:Describe*/List* (cluster-wide)
# ecs:RegisterTaskDefinition + ecs:DeregisterTaskDefinition
# ecs:UpdateService on cluster boxlite-e2e-ci-*/Api
# iam:PassRole on role/boxlite-e2e-ci-* with PassedToService=ecs-tasks
# elasticloadbalancing:Describe*
name: Deploy API

# Three entry points. Each one starts the same `changes` -> `deploy` job
# chain defined under `jobs`.
on:
  # Reusable-workflow entry point: a caller workflow may pass the flag through.
  workflow_call:
    inputs:
      redeploy_current:
        type: boolean
        default: false
        required: false
        description: 'Skip build, re-register the current TD with no image change (exercises PassRole + UpdateService only).'

  # Manual entry point from the Actions UI / API, same flag.
  workflow_dispatch:
    inputs:
      redeploy_current:
        type: boolean
        default: false
        required: false
        description: 'Skip build, re-register the current TD with no image change.'

  # Automatic entry point: any push that touches API sources, the libraries
  # listed below, or this workflow file itself.
  push:
    paths:
      - '.github/workflows/deploy-api.yml'
      - 'apps/api/**'
      - 'apps/api-client-go/**'
      - 'apps/common-go/**'
      - 'apps/libs/**'
# Every Api deploy targets the one shared Tokyo ECS service, so runs are
# strictly serialized. Concurrent deploys race for task-definition revision
# numbers and one run's UpdateService rolls back the other (see #724
# OtelCollector race). A newer run queues behind the in-flight one instead
# of cancelling it, because a half-applied ECS rolling update is worse than
# waiting.
concurrency:
  cancel-in-progress: false
  group: deploy-api-shared
# Workflow-wide default scope for GITHUB_TOKEN: read-only repository
# contents. Jobs that need more declare their own `permissions` block.
permissions:
  contents: read
# Environment shared by every job and step. Region and role ARN are read
# from configuration variables; the remaining values are fixed names for
# the boxlite-e2e-ci stack.
env:
  STACK_PREFIX: boxlite-e2e-ci
  # SST auto-generates cluster names like boxlite-e2e-ci-ClusterCluster-xxx
  ECS_CLUSTER_PATTERN: boxlite-e2e-ci-ClusterCluster-
  ECR_REPO: sst-asset
  AWS_REGION: ${{ vars.AWS_E2E_CLOUD_REGION }}
  AWS_ROLE_ARN: ${{ vars.AWS_E2E_CLOUD_ROLE_ARN }}
jobs:
  # ── Decide the run mode (build + deploy, or redeploy-current) ───────
  # Emits three string outputs that the deploy job consumes:
  #   should_build      'true' unless redeploy-current was requested
  #   should_deploy     always 'true'
  #   redeploy_current  'true' when the live image is re-registered as-is
  # The paths-filter result is informational only: every run that is not
  # a redeploy-current run builds and deploys.
  changes:
    name: Detect API source changes
    runs-on: ubuntu-latest
    outputs:
      should_build: ${{ steps.decide.outputs.should_build }}
      should_deploy: ${{ steps.decide.outputs.should_deploy }}
      redeploy_current: ${{ steps.decide.outputs.redeploy_current }}
    steps:
      - uses: actions/checkout@v5
      - id: filter
        if: github.event_name == 'push'
        # The filter result only selects a log line in `decide`. If the
        # filter itself fails (for example because `github.event.before`
        # is unreachable after a force-push) the job must still succeed,
        # otherwise the deploy job would be skipped.
        continue-on-error: true
        uses: dorny/paths-filter@v3
        with:
          base: ${{ github.event.before }}
          filters: |
            api_source:
              - 'apps/api/**'
              - 'apps/libs/**'
              - 'apps/common-go/**'
              - 'apps/api-client-go/**'
      - id: decide
        env:
          # Expressions are passed through env rather than interpolated
          # into the script text.
          EVENT_NAME: ${{ github.event_name }}
          FILTER_OUTCOME: ${{ steps.filter.outcome }}
          PUSH_CHANGED: ${{ steps.filter.outputs.api_source }}
          INPUT_REDEPLOY: ${{ inputs.redeploy_current }}
        run: |
          set -euo pipefail
          # Resolve redeploy-current mode, in order of precedence:
          #   1. workflow_call/workflow_dispatch input `redeploy_current=true`
          #   2. commit-message tag `[api-redeploy]`
          REDEPLOY=false
          if [ "${INPUT_REDEPLOY:-false}" = 'true' ]; then
            REDEPLOY=true
            echo "Using workflow input redeploy_current=true"
          else
            # Best effort: an unreadable log simply means "no tag".
            COMMIT_MSG=$(git log -1 --pretty=%B 2>/dev/null || true)
            if [[ "$COMMIT_MSG" == *"[api-redeploy]"* ]]; then
              REDEPLOY=true
              echo "Using commit-message tag [api-redeploy]"
            fi
          fi
          if [ "$REDEPLOY" = 'true' ]; then
            echo "Re-register current TD without image change — SKIP build, RUN deploy."
            echo "should_build=false" >> "$GITHUB_OUTPUT"
            echo "should_deploy=true" >> "$GITHUB_OUTPUT"
            echo "redeploy_current=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          echo "redeploy_current=false" >> "$GITHUB_OUTPUT"
          # NEVER fall through to a no-build branch — a force-push rebase
          # produces an empty diff vs. github.event.before even when the
          # actually-deployed Api image is stale, and the previous "workflow
          # only — skip" decision then left Tokyo on the older image while
          # the head commit advertised the fix. Always build + deploy on
          # any push (or non-push event). The redeploy_current short-circuit
          # above still applies when explicitly requested.
          # The branches below differ only in the message they log.
          if [ "$EVENT_NAME" != 'push' ]; then
            echo "Non-push event ($EVENT_NAME) — build + deploy."
          elif [ "${FILTER_OUTCOME:-}" = 'failure' ]; then
            echo "::warning::paths-filter failed — build + deploy anyway."
          elif [ "${PUSH_CHANGED:-false}" = 'true' ]; then
            echo "Push touched api_source paths — build + deploy."
          else
            echo "Push didn't touch api_source paths — build + deploy anyway (avoid stale Tokyo on rebase)."
          fi
          echo "should_build=true" >> "$GITHUB_OUTPUT"
          echo "should_deploy=true" >> "$GITHUB_OUTPUT"
# ── Deploy: ECR push (optional) + ECS register-TD + UpdateService ─
  # Runs after `changes`. Builds and pushes the image when should_build is
  # 'true', otherwise reuses the image of the live task definition, then
  # registers a patched copy of that task definition, rolls the Api service
  # onto it and verifies the rollout (PRIMARY deployment + ALB health).
  deploy:
    name: Deploy API to Tokyo ECS
    needs: changes
    # Run only when nothing upstream failed or was cancelled and `changes`
    # asked for a deploy.
    if: |
      !failure() && !cancelled()
      && needs.changes.outputs.should_deploy == 'true'
    runs-on: ubuntu-latest
    timeout-minutes: 30
    permissions:
      # id-token: write is required to request the OIDC token that
      # configure-aws-credentials exchanges for the AWS role.
      id-token: write
      contents: read
    steps:
      - uses: actions/checkout@v5
      - name: Configure AWS credentials (OIDC)
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ env.AWS_ROLE_ARN }}
          aws-region: ${{ env.AWS_REGION }}
          role-session-name: deploy-api-${{ github.run_id }}
      - name: Resolve cluster + target group
        id: resources
        # Outputs: cluster (ECS cluster name), storage_bucket (live S3
        # bucket name), tg_arn (ALB target group ARN).
        run: |
          set -euo pipefail
          # Fail the step unless exactly one resource of the given kind matched.
          assert_one() {
            local kind="$1" count="$2"
            if [ "$count" -ne 1 ]; then
              echo "::error::Expected exactly 1 $kind, found $count"
              exit 1
            fi
          }
          CLUSTER_COUNT=$(aws ecs list-clusters \
            --query "length(clusterArns[?contains(@, '${ECS_CLUSTER_PATTERN}')])" --output text)
          assert_one "ECS cluster matching ${ECS_CLUSTER_PATTERN}" "$CLUSTER_COUNT"
          # Keep only the cluster name (last path segment of the ARN).
          CLUSTER=$(aws ecs list-clusters \
            --query "clusterArns[?contains(@, '${ECS_CLUSTER_PATTERN}')]|[0]" \
            --output text | awk -F/ '{print $NF}')
          echo "cluster=$CLUSTER" >> "$GITHUB_OUTPUT"
          # Resolve the LIVE storagebucket. TD env vars may reference a
          # stale name from a previous stack instantiation (the original
          # bucket was deleted while the TD held the old name in
          # SST_RESOURCE_Storage / S3_DEFAULT_BUCKET). Find what actually
          # exists in S3 and patch the TD env to match — the application
          # boots from these env vars (configuration.ts), not from SST
          # state. Single-bucket assertion catches dangling orphans.
          BUCKET_COUNT=$(aws s3api list-buckets \
            --query "length(Buckets[?starts_with(Name,'${STACK_PREFIX}-storagebucket-')])" --output text)
          assert_one "${STACK_PREFIX}-storagebucket-* S3 bucket" "$BUCKET_COUNT"
          STORAGE_BUCKET=$(aws s3api list-buckets \
            --query "Buckets[?starts_with(Name,'${STACK_PREFIX}-storagebucket-')]|[0].Name" \
            --output text)
          echo "storage_bucket=$STORAGE_BUCKET" >> "$GITHUB_OUTPUT"
          echo "::notice::storagebucket=$STORAGE_BUCKET (will patch into TD env)"
          LB_COUNT=$(aws elbv2 describe-load-balancers \
            --query "length(LoadBalancers[?starts_with(LoadBalancerName,'ApiLoadBalancer-')])" --output text)
          assert_one "ApiLoadBalancer-*" "$LB_COUNT"
          LB_ARN=$(aws elbv2 describe-load-balancers \
            --query "LoadBalancers[?starts_with(LoadBalancerName,'ApiLoadBalancer-')]|[0].LoadBalancerArn" \
            --output text)
          # Uses the first target group attached to the load balancer.
          TG_ARN=$(aws elbv2 describe-target-groups --load-balancer-arn "$LB_ARN" \
            --query "TargetGroups[0].TargetGroupArn" --output text)
          # `--output text` prints the literal "None" for a null result;
          # refuse to hand that to the health check later on.
          if [ -z "$TG_ARN" ] || [ "$TG_ARN" = 'None' ]; then
            echo "::error::No target group found for load balancer $LB_ARN"
            exit 1
          fi
          echo "tg_arn=$TG_ARN" >> "$GITHUB_OUTPUT"
          echo "::notice::cluster=$CLUSTER tg=$TG_ARN"
      - name: ECR login
        if: needs.changes.outputs.should_build == 'true'
        id: ecr_login
        uses: aws-actions/amazon-ecr-login@v2
      - name: Build & push API image (apps/api/Dockerfile.source)
        id: build_api
        if: needs.changes.outputs.should_build == 'true'
        env:
          # Expressions are passed through env rather than interpolated
          # into the script text.
          REGISTRY: ${{ steps.ecr_login.outputs.registry }}
          IMAGE_TAG: api-${{ github.sha }}
        run: |
          set -euo pipefail
          IMAGE="${REGISTRY}/${ECR_REPO}:${IMAGE_TAG}"
          docker buildx build --platform linux/amd64 --load \
            -f apps/api/Dockerfile.source -t "$IMAGE" .
          docker push "$IMAGE"
          echo "image=$IMAGE" >> "$GITHUB_OUTPUT"
      - name: Resolve image for redeploy-current path
        id: resolve_image
        if: needs.changes.outputs.redeploy_current == 'true'
        env:
          CLUSTER: ${{ steps.resources.outputs.cluster }}
        run: |
          set -euo pipefail
          # Reuse whatever image the service's current task definition
          # points at (first container definition).
          OLD_TD_ARN=$(aws ecs describe-services --cluster "$CLUSTER" --services Api \
            --query 'services[0].taskDefinition' --output text)
          CURRENT_IMAGE=$(aws ecs describe-task-definition --task-definition "$OLD_TD_ARN" \
            --query 'taskDefinition.containerDefinitions[0].image' --output text)
          echo "Using current image (no rebuild): $CURRENT_IMAGE"
          echo "image=$CURRENT_IMAGE" >> "$GITHUB_OUTPUT"
      - name: Register new task definition + UpdateService + wait stable
        env:
          CLUSTER: ${{ steps.resources.outputs.cluster }}
          # Freshly built image if the build ran, else the current image.
          IMAGE: ${{ steps.build_api.outputs.image || steps.resolve_image.outputs.image }}
          TG_ARN: ${{ steps.resources.outputs.tg_arn }}
          STORAGE_BUCKET: ${{ steps.resources.outputs.storage_bucket }}
        run: |
          set -euo pipefail
          [ -n "$IMAGE" ] || { echo "::error::No image resolved (build skipped, redeploy_current also off)"; exit 1; }

          # Best-effort dump of why recent Api tasks stopped. Never fails
          # the step by itself; callers decide the exit code.
          dump_stopped_tasks() {
            aws ecs list-tasks --cluster "$CLUSTER" --service-name Api \
              --desired-status STOPPED --max-results 5 \
              --query 'taskArns' --output text \
              | tr '\t' '\n' | head -3 \
              | while read -r TASK; do
                  [ -n "$TASK" ] || continue
                  echo "--- stopped task $(basename "$TASK") ---"
                  aws ecs describe-tasks --cluster "$CLUSTER" --tasks "$TASK" \
                    --query 'tasks[0].[stoppedReason,containers[0].exitCode,containers[0].reason]' \
                    --output text || true
                done || true
          }

          OLD_TD_ARN=$(aws ecs describe-services --cluster "$CLUSTER" --services Api \
            --query 'services[0].taskDefinition' --output text)
          echo "Old TD: $OLD_TD_ARN"

          # Plaintext env (e.g. DB_PASSWORD) lives in TD — do NOT cat the
          # file. It goes to a private temp file that is removed on exit.
          TD_FILE=$(mktemp)
          trap 'rm -f "$TD_FILE"' EXIT

          # Clone the old TD and patch it. The jq pipeline:
          #   - swaps the container image to the fresh build (or the current
          #     image for the redeploy-current path). Only container
          #     definition [0] is touched — assumes that is the Api
          #     container; confirm if sidecars are ever added.
          #   - drops the stale static-IAM env vars left over from before
          #     #732 ("vend box S3 credentials from the ECS task role, not a
          #     static IAM user"). If S3_ACCESS_KEY is set in the env,
          #     configuration.ts:87 honors it and short-circuits the
          #     task-role fallback — but the IAM user backing that key no
          #     longer exists, so the app crashes on InvalidAccessKeyId at
          #     first S3 call. Stripping S3_ACCESS_KEY / S3_SECRET_KEY forces
          #     the SDK default credential chain (= task role).
          #   - patches S3_DEFAULT_BUCKET + SST_RESOURCE_Storage (when
          #     present) to the LIVE storage bucket name. The original bucket
          #     may have been deleted out-of-band while the TD held the dead
          #     name; the application boots from these env vars
          #     (VolumeManager.testConnection NoSuchBuckets on container
          #     startup otherwise) — see configuration.ts.
          #   - treats a missing/null environment list as empty instead of
          #     aborting in `map`.
          #   - strips the read-only fields RegisterTaskDefinition rejects.
          aws ecs describe-task-definition --task-definition "$OLD_TD_ARN" \
            --query 'taskDefinition' --output json \
            | jq --arg img "$IMAGE" --arg bucket "$STORAGE_BUCKET" '
                .containerDefinitions[0].image = $img
                | .containerDefinitions[0].environment |= (
                    (. // [])
                    | map(select(.name != "S3_ACCESS_KEY" and .name != "S3_SECRET_KEY"))
                    | map(
                        if .name == "S3_DEFAULT_BUCKET" then .value = $bucket
                        elif .name == "SST_RESOURCE_Storage" then
                          .value = (.value | fromjson | .name = $bucket | tojson)
                        else . end
                      )
                  )
                | del(.taskDefinitionArn, .revision, .status, .requiresAttributes,
                      .compatibilities, .registeredAt, .registeredBy)' \
            > "$TD_FILE"
          NEW_TD_ARN=$(aws ecs register-task-definition \
            --cli-input-json "file://$TD_FILE" \
            --query 'taskDefinition.taskDefinitionArn' --output text)
          echo "::notice::New TD: $NEW_TD_ARN"
          aws ecs update-service --cluster "$CLUSTER" --service Api \
            --task-definition "$NEW_TD_ARN" --force-new-deployment >/dev/null
          # A failed waiter would otherwise abort under `set -e` with no
          # hint about why tasks are not starting.
          if ! aws ecs wait services-stable --cluster "$CLUSTER" --services Api; then
            echo "::error::Api service did not become stable after UpdateService"
            dump_stopped_tasks
            exit 1
          fi
          echo "Service stable — verifying PRIMARY is NEW_TD..."
          # A stable service is not proof of success: a
          # DeploymentCircuitBreaker auto-rollback also ends "stable", but
          # with the old TD as PRIMARY.
          PRIMARY_TD=$(aws ecs describe-services --cluster "$CLUSTER" --services Api \
            --query 'services[0].deployments[?status==`PRIMARY`]|[0].taskDefinition' \
            --output text)
          if [ "$PRIMARY_TD" != "$NEW_TD_ARN" ]; then
            echo "::error::ECS rolled back. PRIMARY=$PRIMARY_TD expected $NEW_TD_ARN"
            dump_stopped_tasks
            exit 1
          fi
          echo "::notice::PRIMARY deployment confirmed on $NEW_TD_ARN"
          # Poll the target group: 18 attempts, 10s apart (about 180s).
          for i in $(seq 1 18); do
            HEALTHY=$(aws elbv2 describe-target-health \
              --target-group-arn "$TG_ARN" \
              --query "length(TargetHealthDescriptions[?TargetHealth.State=='healthy'])" \
              --output text)
            if [ "$HEALTHY" -ge 1 ]; then
              echo "::notice::ALB target group: $HEALTHY healthy target(s)"
              exit 0
            fi
            echo "ALB healthy=0 — retry $i/18 in 10s"
            sleep 10
          done
          echo "::error::ALB target group never reported a healthy target within 180s"
          exit 1
# 2026-06-11T15:06:13Z — boundary removed; re-verify ECS PassRole standalone
# 2026-06-11T15:25:51Z — retest after recreating Api{Task,Execution}Role
# 2026-06-11T16:05:50Z — retest with bucket env patch [api-redeploy]