test(e2e): conftest helper + no-skip strategy for SDK entry / path_ve… #54
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Build + deploy the Api Docker image to the Tokyo ECS service (PR #724). | |
| # | |
| # Modeled on deploy-runner.yml. Every run builds + deploys unless the | |
| # redeploy-current path is requested (see the `decide` step); that path | |
| # lets us exercise the ECS register-task-definition + UpdateService chain | |
| # without rebuilding — useful for isolating IAM/PassRole issues from | |
| # build issues. | |
| # | |
| # Triggers: | |
| # - workflow_call: reused from .github/workflows/e2e-cloud.yml so the | |
| # e2e job depends on the new API being live before | |
| # pytest. | |
| # - workflow_dispatch: standalone trigger; pass `redeploy_current=true` | |
| # to swap the current image's tag-equivalent into | |
| # a new task def and force-new-deployment (zero | |
| # image change — but it exercises PassRole). | |
| # - push: paths-filter on api source + this workflow file, | |
| # so a commit landing on the PR branch fires CI | |
| # automatically. The internal `changes` job only logs | |
| # whether api source paths changed; it always builds + | |
| # deploys unless redeploy-current is requested. | |
| # | |
| # Deploy mechanism (unchanged from main): | |
| # 1. ECR login + buildx build of apps/api/Dockerfile.source. | |
| # 2. Push image tagged `api-<GITHUB_SHA>`. | |
| # 3. ecs:RegisterTaskDefinition — clone the live Api TD, swap the | |
| # container image, strip readonly fields. Triggers an IAM | |
| # PassRole check on the caller for the task / execution role | |
| # (this is the step that may fail under BoxLiteDeveloperPermissions- | |
| # Boundary, whose NotAction includes iam:*). | |
| # 4. ecs:UpdateService --force-new-deployment + wait services-stable. | |
| # 5. Assert PRIMARY deployment's taskDefinition == NEW_TD_ARN (catches | |
| # DeploymentCircuitBreaker auto-rollback). | |
| # 6. Wait for at least one healthy ALB target. | |
| # | |
| # OIDC role perms used (already present on boxlite-e2e-cloud-github- | |
| # actions in the existing inline policy): | |
| # ecr:GetAuthorizationToken + ecr:* on repository/sst-asset | |
| # ecs:Describe*/List* (cluster-wide) | |
| # ecs:RegisterTaskDefinition + ecs:DeregisterTaskDefinition | |
| # ecs:UpdateService on cluster boxlite-e2e-ci-*/Api | |
| # iam:PassRole on role/boxlite-e2e-ci-* with PassedToService=ecs-tasks | |
| # elasticloadbalancing:Describe* | |
| name: Deploy API | |
| on: | |
| # workflow_call and workflow_dispatch expose the same optional boolean | |
| # `redeploy_current`; the `decide` step in the `changes` job reads it via | |
| # `inputs.redeploy_current`. | |
| workflow_call: | |
| inputs: | |
| redeploy_current: | |
| description: 'Skip build, re-register the current TD with no image change (exercises PassRole + UpdateService only).' | |
| type: boolean | |
| required: false | |
| default: false | |
| workflow_dispatch: | |
| inputs: | |
| redeploy_current: | |
| description: 'Skip build, re-register the current TD with no image change.' | |
| type: boolean | |
| required: false | |
| default: false | |
| push: | |
| # Same four source globs as the `api_source` filter in the `changes` job, | |
| # plus this workflow file itself; keep the two lists in sync. | |
| paths: | |
| - 'apps/api/**' | |
| - 'apps/libs/**' | |
| - 'apps/common-go/**' | |
| - 'apps/api-client-go/**' | |
| - '.github/workflows/deploy-api.yml' | |
| # Serialize against the shared Tokyo ECS service. Concurrent Api | |
| # deploys race for task-definition revision numbers and one's | |
| # UpdateService rolls back the other (see #724 OtelCollector race). | |
| # cancel-in-progress: false because a half-applied ECS rolling update | |
| # is worse than waiting. | |
| concurrency: | |
| group: deploy-api-shared | |
| cancel-in-progress: false | |
| # Workflow-level default is read-only repo contents; the `deploy` job | |
| # declares its own permissions block that adds id-token: write. | |
| permissions: | |
| contents: read | |
| # Shared by every job. Region and role ARN come from `vars.*` configuration | |
| # variables; the remaining values are name prefixes/patterns the `deploy` | |
| # job uses to look up the ECS cluster, the storage bucket, and the ECR | |
| # repository. | |
| env: | |
| AWS_REGION: ${{ vars.AWS_E2E_CLOUD_REGION }} | |
| AWS_ROLE_ARN: ${{ vars.AWS_E2E_CLOUD_ROLE_ARN }} | |
| STACK_PREFIX: boxlite-e2e-ci | |
| # SST auto-generates cluster names like boxlite-e2e-ci-ClusterCluster-xxx | |
| ECS_CLUSTER_PATTERN: boxlite-e2e-ci-ClusterCluster- | |
| ECR_REPO: sst-asset | |
| jobs: | |
| # ── Decide build/deploy mode: always build + deploy unless redeploy-current ── | |
| changes: | |
| name: Detect API source changes | |
| runs-on: ubuntu-latest | |
| # All three outputs are written by the `decide` step and consumed by the | |
| # `deploy` job through `needs.changes.outputs.*`. | |
| outputs: | |
| should_build: ${{ steps.decide.outputs.should_build }} | |
| should_deploy: ${{ steps.decide.outputs.should_deploy }} | |
| redeploy_current: ${{ steps.decide.outputs.redeploy_current }} | |
| steps: | |
| # Checkout so the `decide` step can read the head commit message via `git log`. | |
| - uses: actions/checkout@v5 | |
| # Runs on push events only. Its `api_source` output feeds PUSH_CHANGED | |
| # below, which currently only selects the log message — both outcomes | |
| # end in build + deploy. | |
| - id: filter | |
| if: github.event_name == 'push' | |
| uses: dorny/paths-filter@v3 | |
| with: | |
| base: ${{ github.event.before }} | |
| filters: | | |
| api_source: | |
| - 'apps/api/**' | |
| - 'apps/libs/**' | |
| - 'apps/common-go/**' | |
| - 'apps/api-client-go/**' | |
| - id: decide | |
| env: | |
| PUSH_CHANGED: ${{ steps.filter.outputs.api_source }} | |
| INPUT_REDEPLOY: ${{ inputs.redeploy_current }} | |
| run: | | |
| # Resolve redeploy-current mode, in order of precedence: | |
| # 1. workflow_call/workflow_dispatch input `redeploy_current=true` | |
| # 2. commit-message tag `[api-redeploy]` | |
| REDEPLOY=false | |
| if [ "${INPUT_REDEPLOY:-false}" = 'true' ]; then | |
| REDEPLOY=true | |
| echo "Using workflow input redeploy_current=true" | |
| else | |
| # A failing `git log` is tolerated (|| true) and simply yields an | |
| # empty message, i.e. no tag match. | |
| COMMIT_MSG=$(git log -1 --pretty=%B 2>/dev/null || true) | |
| if [[ "$COMMIT_MSG" == *"[api-redeploy]"* ]]; then | |
| REDEPLOY=true | |
| echo "Using commit-message tag [api-redeploy]" | |
| fi | |
| fi | |
| # Redeploy-current short-circuit: emit all three outputs and stop. | |
| if [ "$REDEPLOY" = 'true' ]; then | |
| echo "Re-register current TD without image change — SKIP build, RUN deploy." | |
| echo "should_build=false" >> "$GITHUB_OUTPUT" | |
| echo "should_deploy=true" >> "$GITHUB_OUTPUT" | |
| echo "redeploy_current=true" >> "$GITHUB_OUTPUT" | |
| exit 0 | |
| fi | |
| echo "redeploy_current=false" >> "$GITHUB_OUTPUT" | |
| # NEVER fall through to a no-build branch — a force-push rebase | |
| # produces an empty diff vs. github.event.before even when the | |
| # actually-deployed Api image is stale, and the previous "workflow | |
| # only — skip" decision then left Tokyo on the older image while | |
| # the head commit advertised the fix. Always build + deploy on | |
| # any push (or non-push event). The redeploy_current short-circuit | |
| # above still applies when explicitly requested. | |
| # The three branches below differ only in the message they log. | |
| if [ "${{ github.event_name }}" != 'push' ]; then | |
| echo "Non-push event (${{ github.event_name }}) — build + deploy." | |
| elif [ "${PUSH_CHANGED:-false}" = 'true' ]; then | |
| echo "Push touched api_source paths — build + deploy." | |
| else | |
| echo "Push didn't touch api_source paths — build + deploy anyway (avoid stale Tokyo on rebase)." | |
| fi | |
| echo "should_build=true" >> "$GITHUB_OUTPUT" | |
| echo "should_deploy=true" >> "$GITHUB_OUTPUT" | |
| # ── Deploy: ECR push (optional) + ECS register-TD + UpdateService ─ | |
| deploy: | |
| name: Deploy API to Tokyo ECS | |
| needs: changes | |
| # Run only if nothing upstream failed or was cancelled and the `changes` | |
| # job set should_deploy=true. | |
| if: | | |
| !failure() && !cancelled() | |
| && needs.changes.outputs.should_deploy == 'true' | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 30 | |
| # id-token: write lets this job request the OIDC token consumed by the | |
| # "Configure AWS credentials (OIDC)" step below. | |
| permissions: | |
| id-token: write | |
| contents: read | |
| steps: | |
| - uses: actions/checkout@v5 | |
| # Assume the role named by the workflow-level AWS_ROLE_ARN in AWS_REGION; | |
| # the session name embeds the run id. | |
| - name: Configure AWS credentials (OIDC) | |
| uses: aws-actions/configure-aws-credentials@v4 | |
| with: | |
| role-to-assume: ${{ env.AWS_ROLE_ARN }} | |
| aws-region: ${{ env.AWS_REGION }} | |
| role-session-name: deploy-api-${{ github.run_id }} | |
| # Looks up the ECS cluster name, the live storage bucket, and the ALB | |
| # target group ARN, and exposes them as step outputs `cluster`, | |
| # `storage_bucket`, and `tg_arn`. | |
| - name: Resolve cluster + target group | |
| id: resources | |
| run: | | |
| set -euo pipefail | |
| # assert_one KIND COUNT — fail the step unless COUNT is exactly 1. | |
| assert_one() { | |
| local kind="$1" count="$2" | |
| if [ "$count" -ne 1 ]; then | |
| echo "::error::Expected exactly 1 $kind, found $count" | |
| exit 1 | |
| fi | |
| } | |
| CLUSTER_COUNT=$(aws ecs list-clusters \ | |
| --query "length(clusterArns[?contains(@, '${ECS_CLUSTER_PATTERN}')])" --output text) | |
| assert_one "ECS cluster matching ${ECS_CLUSTER_PATTERN}" "$CLUSTER_COUNT" | |
| # awk keeps only the last '/'-separated segment of the cluster ARN. | |
| CLUSTER=$(aws ecs list-clusters \ | |
| --query "clusterArns[?contains(@, '${ECS_CLUSTER_PATTERN}')]|[0]" \ | |
| --output text | awk -F/ '{print $NF}') | |
| echo "cluster=$CLUSTER" >> "$GITHUB_OUTPUT" | |
| # Resolve the LIVE storage bucket. TD env vars may reference a | |
| # stale name from a previous stack instantiation (the original | |
| # bucket was deleted while the TD held the old name in | |
| # SST_RESOURCE_Storage / S3_DEFAULT_BUCKET). Find what actually | |
| # exists in S3 and patch the TD env to match — the application | |
| # boots from these env vars (configuration.ts), not from SST | |
| # state. Single-bucket assertion catches dangling orphans. | |
| BUCKET_COUNT=$(aws s3api list-buckets \ | |
| --query "length(Buckets[?starts_with(Name,'${STACK_PREFIX}-storagebucket-')])" --output text) | |
| assert_one "${STACK_PREFIX}-storagebucket-* S3 bucket" "$BUCKET_COUNT" | |
| STORAGE_BUCKET=$(aws s3api list-buckets \ | |
| --query "Buckets[?starts_with(Name,'${STACK_PREFIX}-storagebucket-')]|[0].Name" \ | |
| --output text) | |
| echo "storage_bucket=$STORAGE_BUCKET" >> "$GITHUB_OUTPUT" | |
| echo "::notice::storagebucket=$STORAGE_BUCKET (will patch into TD env)" | |
| LB_COUNT=$(aws elbv2 describe-load-balancers \ | |
| --query "length(LoadBalancers[?starts_with(LoadBalancerName,'ApiLoadBalancer-')])" --output text) | |
| assert_one "ApiLoadBalancer-*" "$LB_COUNT" | |
| # Takes the FIRST target group attached to that load balancer; the | |
| # number of target groups is not asserted. | |
| TG_ARN=$(aws elbv2 describe-load-balancers \ | |
| --query "LoadBalancers[?starts_with(LoadBalancerName,'ApiLoadBalancer-')]|[0].LoadBalancerArn" \ | |
| --output text \ | |
| | xargs -I{} aws elbv2 describe-target-groups --load-balancer-arn {} \ | |
| --query "TargetGroups[0].TargetGroupArn" --output text) | |
| echo "tg_arn=$TG_ARN" >> "$GITHUB_OUTPUT" | |
| echo "::notice::cluster=$CLUSTER tg=$TG_ARN" | |
| # Both steps below are skipped on the redeploy-current path | |
| # (should_build=false). | |
| - name: ECR login | |
| if: needs.changes.outputs.should_build == 'true' | |
| id: ecr_login | |
| uses: aws-actions/amazon-ecr-login@v2 | |
| # Builds a linux/amd64 image from the repo root, tags it | |
| # <registry>/<ECR_REPO>:api-<commit sha>, pushes it, and exposes the full | |
| # reference as step output `image`. | |
| - name: Build & push API image (apps/api/Dockerfile.source) | |
| id: build_api | |
| if: needs.changes.outputs.should_build == 'true' | |
| run: | | |
| set -euo pipefail | |
| REGISTRY="${{ steps.ecr_login.outputs.registry }}" | |
| IMAGE="${REGISTRY}/${ECR_REPO}:api-${{ github.sha }}" | |
| # --load places the built image in the local Docker image store so the | |
| # separate `docker push` below can find it. | |
| docker buildx build --platform linux/amd64 --load \ | |
| -f apps/api/Dockerfile.source -t "$IMAGE" . | |
| docker push "$IMAGE" | |
| echo "image=$IMAGE" >> "$GITHUB_OUTPUT" | |
| # Redeploy-current path only: reuse the image of the FIRST container in | |
| # the task definition the Api service currently points at, and expose it | |
| # as step output `image`. | |
| - name: Resolve image for redeploy-current path | |
| id: resolve_image | |
| if: needs.changes.outputs.redeploy_current == 'true' | |
| env: | |
| CLUSTER: ${{ steps.resources.outputs.cluster }} | |
| run: | | |
| set -euo pipefail | |
| OLD_TD_ARN=$(aws ecs describe-services --cluster "$CLUSTER" --services Api \ | |
| --query 'services[0].taskDefinition' --output text) | |
| CURRENT_IMAGE=$(aws ecs describe-task-definition --task-definition "$OLD_TD_ARN" \ | |
| --query 'taskDefinition.containerDefinitions[0].image' --output text) | |
| echo "Using current image (no rebuild): $CURRENT_IMAGE" | |
| echo "image=$CURRENT_IMAGE" >> "$GITHUB_OUTPUT" | |
| # Clones the service's current task definition with a patched image/env, | |
| # registers it, rolls the Api service onto it, then verifies the rollout | |
| # (PRIMARY deployment on the new TD, at least one healthy ALB target). | |
| - name: Register new task definition + UpdateService + wait stable | |
| env: | |
| CLUSTER: ${{ steps.resources.outputs.cluster }} | |
| # Freshly built image if the build step ran, otherwise the image | |
| # resolved on the redeploy-current path. | |
| IMAGE: ${{ steps.build_api.outputs.image || steps.resolve_image.outputs.image }} | |
| TG_ARN: ${{ steps.resources.outputs.tg_arn }} | |
| STORAGE_BUCKET: ${{ steps.resources.outputs.storage_bucket }} | |
| run: | | |
| set -euo pipefail | |
| # Fail fast if neither image-producing step emitted an image. | |
| [ -n "$IMAGE" ] || { echo "::error::No image resolved (build skipped, redeploy_current also off)"; exit 1; } | |
| OLD_TD_ARN=$(aws ecs describe-services --cluster "$CLUSTER" --services Api \ | |
| --query 'services[0].taskDefinition' --output text) | |
| echo "Old TD: $OLD_TD_ARN" | |
| # Clone old TD, swap image, strip readonly fields, drop stale | |
| # static-IAM env vars left over from before #732 ("vend box S3 | |
| # credentials from the ECS task role, not a static IAM user"). | |
| # If S3_ACCESS_KEY is set in the env, configuration.ts:87 honors | |
| # it and short-circuits the task-role fallback — but the IAM | |
| # user backing that key no longer exists, so the app crashes | |
| # on InvalidAccessKeyId at first S3 call. Stripping these env | |
| # vars forces the SDK default credential chain (= task role). | |
| # Plaintext env (e.g. DB_PASSWORD) lives in TD — do NOT cat the file. | |
| # The jq pipeline: | |
| # - swap container image to the fresh build (or current image | |
| # for redeploy-current path) | |
| # - drop pre-#732 static-IAM env vars (S3_ACCESS_KEY / S3_SECRET_KEY) | |
| # so the SDK falls through to task-role credentials | |
| # - patch S3_DEFAULT_BUCKET + SST_RESOURCE_Storage to the LIVE | |
| # storage bucket name. The original bucket may have been | |
| # deleted out-of-band while the TD held the dead name; the | |
| # application boots from these env vars (otherwise | |
| # VolumeManager.testConnection hits NoSuchBucket on container | |
| # startup) — see configuration.ts. | |
| # All edits target containerDefinitions[0] only; SST_RESOURCE_Storage is | |
| # parsed as JSON and only its `name` field is replaced. | |
| aws ecs describe-task-definition --task-definition "$OLD_TD_ARN" \ | |
| --query 'taskDefinition' --output json \ | |
| | jq --arg img "$IMAGE" --arg bucket "$STORAGE_BUCKET" ' | |
| .containerDefinitions[0].image = $img | |
| | .containerDefinitions[0].environment |= ( | |
| map(select(.name != "S3_ACCESS_KEY" and .name != "S3_SECRET_KEY")) | |
| | map( | |
| if .name == "S3_DEFAULT_BUCKET" then .value = $bucket | |
| elif .name == "SST_RESOURCE_Storage" then | |
| .value = (.value | fromjson | .name = $bucket | tojson) | |
| else . end | |
| ) | |
| ) | |
| | del(.taskDefinitionArn, .revision, .status, .requiresAttributes, | |
| .compatibilities, .registeredAt, .registeredBy)' \ | |
| > /tmp/new-td.json | |
| # Register the patched definition as a new revision and capture its ARN. | |
| NEW_TD_ARN=$(aws ecs register-task-definition \ | |
| --cli-input-json file:///tmp/new-td.json \ | |
| --query 'taskDefinition.taskDefinitionArn' --output text) | |
| echo "::notice::New TD: $NEW_TD_ARN" | |
| # Point the service at the new revision and block until ECS reports it stable. | |
| aws ecs update-service --cluster "$CLUSTER" --service Api \ | |
| --task-definition "$NEW_TD_ARN" --force-new-deployment >/dev/null | |
| aws ecs wait services-stable --cluster "$CLUSTER" --services Api | |
| echo "Service stable — verifying PRIMARY is NEW_TD..." | |
| # "Stable" alone is not proof of success: compare the PRIMARY | |
| # deployment's task definition with the one just registered to detect | |
| # a rollback to the previous revision. | |
| PRIMARY_TD=$(aws ecs describe-services --cluster "$CLUSTER" --services Api \ | |
| --query 'services[0].deployments[?status==`PRIMARY`]|[0].taskDefinition' \ | |
| --output text) | |
| if [ "$PRIMARY_TD" != "$NEW_TD_ARN" ]; then | |
| echo "::error::ECS rolled back. PRIMARY=$PRIMARY_TD expected $NEW_TD_ARN" | |
| # Triage aid: print stoppedReason / exit code / container reason for | |
| # up to 3 stopped tasks; describe-tasks failures are ignored (|| true). | |
| aws ecs list-tasks --cluster "$CLUSTER" --service-name Api \ | |
| --desired-status STOPPED --max-results 5 \ | |
| --query 'taskArns' --output text \ | |
| | tr '\t' '\n' | head -3 \ | |
| | while read -r TASK; do | |
| [ -n "$TASK" ] || continue | |
| echo "--- stopped task $(basename "$TASK") ---" | |
| aws ecs describe-tasks --cluster "$CLUSTER" --tasks "$TASK" \ | |
| --query 'tasks[0].[stoppedReason,containers[0].exitCode,containers[0].reason]' \ | |
| --output text || true | |
| done | |
| exit 1 | |
| fi | |
| echo "::notice::PRIMARY deployment confirmed on $NEW_TD_ARN" | |
| # Poll the target group up to 18 times, 10s apart, and succeed as soon | |
| # as at least one target reports healthy. | |
| for i in $(seq 1 18); do | |
| HEALTHY=$(aws elbv2 describe-target-health \ | |
| --target-group-arn "$TG_ARN" \ | |
| --query "length(TargetHealthDescriptions[?TargetHealth.State=='healthy'])" \ | |
| --output text) | |
| if [ "$HEALTHY" -ge 1 ]; then | |
| echo "::notice::ALB target group: $HEALTHY healthy target(s)" | |
| exit 0 | |
| fi | |
| echo "ALB healthy=0 — retry $i/18 in 10s" | |
| sleep 10 | |
| done | |
| echo "::error::ALB target group never reported a healthy target within 180s" | |
| exit 1 | |
| # 2026-06-11T15:06:13Z — boundary removed; re-verify ECS PassRole standalone | |
| # 2026-06-11T15:25:51Z — retest after recreating Api{Task,Execution}Role | |
| # 2026-06-11T16:05:50Z — retest with bucket env patch [api-redeploy] |