|
| 1 | +# Build + deploy the Api Docker image to the Tokyo ECS service (PR #724). |
| 2 | +# |
| 3 | +# Mirrors deploy-runner.yml's pattern: build is conditional on actual |
| 4 | +# source changes, and a redeploy-current path lets us exercise the |
| 5 | +# ECS register-task-definition + UpdateService chain without rebuilding |
| 6 | +# — useful for isolating IAM/PassRole issues from build issues. |
| 7 | +# |
| 8 | +# Triggers: |
| 9 | +# - workflow_call: reused from .github/workflows/e2e-cloud.yml so the |
| 10 | +# e2e job depends on the new API being live before |
| 11 | +# pytest. |
| 12 | +# - workflow_dispatch: standalone trigger; pass `redeploy_current=true` |
| 13 | +# to re-register the live task definition with its |
| 14 | +# current image and force a new deployment (zero |
| 15 | +# image change — but it exercises PassRole). |
| 16 | +# - push: paths-filter on api source + this workflow file, |
| 17 | +# so a commit landing on the PR branch fires CI |
| 18 | +# automatically. Internal `changes` job narrows |
| 19 | +# build vs deploy per paths; `[api-redeploy]` in the HEAD commit message forces redeploy-current. |
| 20 | +# |
| 21 | +# Deploy mechanism (unchanged from main): |
| 22 | +# 1. ECR login + buildx build of apps/api/Dockerfile.source. |
| 23 | +# 2. Push image to the sst-asset ECR repo, tagged api-<GITHUB_SHA>. |
| 24 | +# 3. ecs:RegisterTaskDefinition — clone the live Api TD, swap the |
| 25 | +# container image, strip readonly fields. Triggers an IAM |
| 26 | +# PassRole check on the caller for the task / execution role |
| 27 | +# (this is the step that may fail under BoxLiteDeveloperPermissions- |
| 28 | +# Boundary, whose NotAction includes iam:*). |
| 29 | +# 4. ecs:UpdateService --force-new-deployment + wait services-stable. |
| 30 | +# 5. Assert PRIMARY deployment's taskDefinition == NEW_TD_ARN (catches |
| 31 | +# DeploymentCircuitBreaker auto-rollback). |
| 32 | +# 6. Wait for at least one healthy ALB target. |
| 33 | +# |
| 34 | +# OIDC role perms used (already present on boxlite-e2e-cloud-github- |
| 35 | +# actions in the existing inline policy): |
| 36 | +# ecr:GetAuthorizationToken + ecr:* on repository/sst-asset |
| 37 | +# ecs:Describe*/List* (cluster-wide) |
| 38 | +# ecs:RegisterTaskDefinition + ecs:DeregisterTaskDefinition |
| 39 | +# ecs:UpdateService on cluster boxlite-e2e-ci-*/Api |
| 40 | +# iam:PassRole on role/boxlite-e2e-ci-* with PassedToService=ecs-tasks |
| 41 | +# elasticloadbalancing:Describe* |
| 42 | +name: Deploy API |
| 43 | + |
| 44 | +on: |
| 45 | + workflow_call: |
| 46 | + inputs: |
| 47 | + redeploy_current: |
| 48 | + description: 'Skip build, re-register the current TD with no image change (exercises PassRole + UpdateService only).' |
| 49 | + type: boolean |
| 50 | + required: false |
| 51 | + default: false |
| 52 | + workflow_dispatch: |
| 53 | + inputs: |
| 54 | + redeploy_current: |
| 55 | + description: 'Skip build, re-register the current TD with no image change.' |
| 56 | + type: boolean |
| 57 | + required: false |
| 58 | + default: false |
| 59 | + push: |
| 60 | + paths: |
| 61 | + - 'apps/api/**' |
| 62 | + - 'apps/libs/**' |
| 63 | + - 'apps/common-go/**' |
| 64 | + - 'apps/api-client-go/**' |
| 65 | + - '.github/workflows/deploy-api.yml' |
| 66 | + |
| 67 | +permissions: |
| 68 | + contents: read |
| 69 | + |
| 70 | +env: |
| 71 | + AWS_REGION: ${{ vars.AWS_E2E_CLOUD_REGION }} |
| 72 | + AWS_ROLE_ARN: ${{ vars.AWS_E2E_CLOUD_ROLE_ARN }} |
| 73 | + STACK_PREFIX: boxlite-e2e-ci |
| 74 | + ECR_REPO: sst-asset |
| 75 | + |
| 76 | +jobs: |
| 77 | + # ── Detect real source changes so workflow-only commits don't build ── |
| 78 | + changes: |
| 79 | + name: Detect API source changes |
| 80 | + runs-on: ubuntu-latest |
| 81 | + outputs: |
| 82 | + should_build: ${{ steps.decide.outputs.should_build }} |
| 83 | + should_deploy: ${{ steps.decide.outputs.should_deploy }} |
| 84 | + redeploy_current: ${{ steps.decide.outputs.redeploy_current }} |
| 85 | + steps: |
| 86 | + - uses: actions/checkout@v5 |
| 87 | + - id: filter |
| 88 | + if: github.event_name == 'push' |
| 89 | + uses: dorny/paths-filter@v3 |
| 90 | + with: |
| 91 | + base: ${{ github.event.before }} |
| 92 | + filters: | |
| 93 | + api_source: |
| 94 | + - 'apps/api/**' |
| 95 | + - 'apps/libs/**' |
| 96 | + - 'apps/common-go/**' |
| 97 | + - 'apps/api-client-go/**' |
| 98 | + - id: decide |
| 99 | + env: |
| 100 | + PUSH_CHANGED: ${{ steps.filter.outputs.api_source }} |
| 101 | + INPUT_REDEPLOY: ${{ inputs.redeploy_current }} |
| 102 | + run: | |
| 103 | + # Resolve redeploy-current mode, in order of precedence: |
| 104 | + # 1. workflow_call/workflow_dispatch input `redeploy_current=true` |
| 105 | + # 2. commit-message tag `[api-redeploy]` |
| 106 | + REDEPLOY=false |
| 107 | + if [ "${INPUT_REDEPLOY:-false}" = 'true' ]; then |
| 108 | + REDEPLOY=true |
| 109 | + echo "Using workflow input redeploy_current=true" |
| 110 | + else |
| 111 | + COMMIT_MSG=$(git log -1 --pretty=%B 2>/dev/null || true) |
| 112 | + if [[ "$COMMIT_MSG" == *"[api-redeploy]"* ]]; then |
| 113 | + REDEPLOY=true |
| 114 | + echo "Using commit-message tag [api-redeploy]" |
| 115 | + fi |
| 116 | + fi |
| 117 | +
|
| 118 | + if [ "$REDEPLOY" = 'true' ]; then |
| 119 | + echo "Re-register current TD without image change — SKIP build, RUN deploy." |
| 120 | + echo "should_build=false" >> "$GITHUB_OUTPUT" |
| 121 | + echo "should_deploy=true" >> "$GITHUB_OUTPUT" |
| 122 | + echo "redeploy_current=true" >> "$GITHUB_OUTPUT" |
| 123 | + exit 0 |
| 124 | + fi |
| 125 | + echo "redeploy_current=false" >> "$GITHUB_OUTPUT" |
| 126 | +
|
| 127 | + if [ "${{ github.event_name }}" != 'push' ]; then |
| 128 | + echo "Non-push event (${{ github.event_name }}) — build + deploy." |
| 129 | + echo "should_build=true" >> "$GITHUB_OUTPUT" |
| 130 | + echo "should_deploy=true" >> "$GITHUB_OUTPUT" |
| 131 | + elif [ "${PUSH_CHANGED:-false}" = 'true' ]; then |
| 132 | + echo "Push touched api_source paths — build + deploy." |
| 133 | + echo "should_build=true" >> "$GITHUB_OUTPUT" |
| 134 | + echo "should_deploy=true" >> "$GITHUB_OUTPUT" |
| 135 | + else |
| 136 | + echo "Push did not touch api_source paths — SKIP build + deploy." |
| 137 | + echo "should_build=false" >> "$GITHUB_OUTPUT" |
| 138 | + echo "should_deploy=false" >> "$GITHUB_OUTPUT" |
| 139 | + fi |
| 140 | +
|
| 141 | + # ── Deploy: ECR push (optional) + ECS register-TD + UpdateService ─ |
| 142 | + deploy: |
| 143 | + name: Deploy API to Tokyo ECS |
| 144 | + needs: changes |
| 145 | + if: | |
| 146 | + !failure() && !cancelled() |
| 147 | + && needs.changes.outputs.should_deploy == 'true' |
| 148 | + runs-on: ubuntu-latest |
| 149 | + timeout-minutes: 30 |
| 150 | + permissions: |
| 151 | + id-token: write |
| 152 | + contents: read |
| 153 | + steps: |
| 154 | + - uses: actions/checkout@v5 |
| 155 | + |
| 156 | + - name: Configure AWS credentials (OIDC) |
| 157 | + uses: aws-actions/configure-aws-credentials@v4 |
| 158 | + with: |
| 159 | + role-to-assume: ${{ env.AWS_ROLE_ARN }} |
| 160 | + aws-region: ${{ env.AWS_REGION }} |
| 161 | + role-session-name: deploy-api-${{ github.run_id }} |
| 162 | + |
| 163 | + - name: Resolve cluster + target group |
| 164 | + id: resources |
| 165 | + run: | |
| 166 | + set -euo pipefail |
| 167 | +
|
| 168 | + assert_one() { |
| 169 | + local kind="$1" count="$2" |
| 170 | + if [ "$count" -ne 1 ]; then |
| 171 | + echo "::error::Expected exactly 1 $kind, found $count" |
| 172 | + exit 1 |
| 173 | + fi |
| 174 | + } |
| 175 | +
|
| 176 | + CLUSTER_COUNT=$(aws ecs list-clusters \ |
| 177 | + --query "length(clusterArns[?contains(@, '${STACK_PREFIX}-cluster')])" --output text) |
| 178 | + assert_one "ECS cluster matching ${STACK_PREFIX}-cluster" "$CLUSTER_COUNT" |
| 179 | + CLUSTER=$(aws ecs list-clusters \ |
| 180 | + --query "clusterArns[?contains(@, '${STACK_PREFIX}-cluster')]|[0]" \ |
| 181 | + --output text | awk -F/ '{print $NF}') |
| 182 | + echo "cluster=$CLUSTER" >> "$GITHUB_OUTPUT" |
| 183 | +
|
| 184 | + LB_COUNT=$(aws elbv2 describe-load-balancers \ |
| 185 | + --query "length(LoadBalancers[?starts_with(LoadBalancerName,'ApiLoadBalancer-')])" --output text) |
| 186 | + assert_one "ApiLoadBalancer-*" "$LB_COUNT" |
| 187 | + # --output text renders a null query result as the literal "None"; fail here rather than after the deploy. |
| 188 | + LB_ARN=$(aws elbv2 describe-load-balancers --output text --query "LoadBalancers[?starts_with(LoadBalancerName,'ApiLoadBalancer-')]|[0].LoadBalancerArn") |
| 189 | + TG_ARN=$(aws elbv2 describe-target-groups --load-balancer-arn "$LB_ARN" \ |
| 190 | + --query "TargetGroups[0].TargetGroupArn" --output text) |
| 191 | + if [ -z "$TG_ARN" ] || [ "$TG_ARN" = 'None' ]; then echo "::error::No target group attached to $LB_ARN"; exit 1; fi |
| 192 | + echo "tg_arn=$TG_ARN" >> "$GITHUB_OUTPUT" |
| 193 | + echo "::notice::cluster=$CLUSTER tg=$TG_ARN" |
| 194 | +
|
| 195 | + - name: ECR login |
| 196 | + if: needs.changes.outputs.should_build == 'true' |
| 197 | + id: ecr_login |
| 198 | + uses: aws-actions/amazon-ecr-login@v2 |
| 199 | + |
| 200 | + - name: Build & push API image (apps/api/Dockerfile.source) |
| 201 | + id: build_api |
| 202 | + if: needs.changes.outputs.should_build == 'true' |
| 203 | + run: | |
| 204 | + set -euo pipefail |
| 205 | + REGISTRY="${{ steps.ecr_login.outputs.registry }}" |
| 206 | + IMAGE="${REGISTRY}/${ECR_REPO}:api-${{ github.sha }}" |
| 207 | + docker buildx build --platform linux/amd64 --load \ |
| 208 | + -f apps/api/Dockerfile.source -t "$IMAGE" . |
| 209 | + docker push "$IMAGE" |
| 210 | + echo "image=$IMAGE" >> "$GITHUB_OUTPUT" |
| 211 | +
|
| 212 | + - name: Resolve image for redeploy-current path |
| 213 | + id: resolve_image |
| 214 | + if: needs.changes.outputs.redeploy_current == 'true' |
| 215 | + env: |
| 216 | + CLUSTER: ${{ steps.resources.outputs.cluster }} |
| 217 | + run: | |
| 218 | + set -euo pipefail |
| 219 | + OLD_TD_ARN=$(aws ecs describe-services --cluster "$CLUSTER" --services Api \ |
| 220 | + --query 'services[0].taskDefinition' --output text) |
| 221 | + CURRENT_IMAGE=$(aws ecs describe-task-definition --task-definition "$OLD_TD_ARN" \ |
| 222 | + --query 'taskDefinition.containerDefinitions[0].image' --output text) |
| 223 | + echo "Using current image (no rebuild): $CURRENT_IMAGE" |
| 224 | + echo "image=$CURRENT_IMAGE" >> "$GITHUB_OUTPUT" |
| 225 | +
|
| 226 | + - name: Register new task definition + UpdateService + wait stable |
| 227 | + env: |
| 228 | + CLUSTER: ${{ steps.resources.outputs.cluster }} |
| 229 | + IMAGE: ${{ steps.build_api.outputs.image || steps.resolve_image.outputs.image }} |
| 230 | + TG_ARN: ${{ steps.resources.outputs.tg_arn }} |
| 231 | + run: | |
| 232 | + set -euo pipefail |
| 233 | + [ -n "$IMAGE" ] || { echo "::error::No image resolved (build skipped, redeploy_current also off)"; exit 1; } |
| 234 | +
|
| 235 | + OLD_TD_ARN=$(aws ecs describe-services --cluster "$CLUSTER" --services Api \ |
| 236 | + --query 'services[0].taskDefinition' --output text) |
| 237 | + echo "Old TD: $OLD_TD_ARN" |
| 238 | +
|
| 239 | + # Clone old TD, swap image, strip readonly fields. Plaintext |
| 240 | + # env (e.g. DB_PASSWORD) lives here — do NOT cat the file. |
| 241 | + aws ecs describe-task-definition --task-definition "$OLD_TD_ARN" \ |
| 242 | + --query 'taskDefinition' --output json \ |
| 243 | + | jq --arg img "$IMAGE" ' |
| 244 | + .containerDefinitions[0].image = $img |
| 245 | + | del(.taskDefinitionArn, .revision, .status, .requiresAttributes, |
| 246 | + .compatibilities, .registeredAt, .registeredBy)' \ |
| 247 | + > /tmp/new-td.json |
| 248 | +
|
| 249 | + NEW_TD_ARN=$(aws ecs register-task-definition \ |
| 250 | + --cli-input-json file:///tmp/new-td.json \ |
| 251 | + --query 'taskDefinition.taskDefinitionArn' --output text) |
| 252 | + echo "::notice::New TD: $NEW_TD_ARN" |
| 253 | +
|
| 254 | + aws ecs update-service --cluster "$CLUSTER" --service Api \ |
| 255 | + --task-definition "$NEW_TD_ARN" --force-new-deployment >/dev/null |
| 256 | + aws ecs wait services-stable --cluster "$CLUSTER" --services Api |
| 257 | + echo "Service stable — verifying PRIMARY is NEW_TD..." |
| 258 | +
|
| 259 | + PRIMARY_TD=$(aws ecs describe-services --cluster "$CLUSTER" --services Api \ |
| 260 | + --query 'services[0].deployments[?status==`PRIMARY`]|[0].taskDefinition' \ |
| 261 | + --output text) |
| 262 | + if [ "$PRIMARY_TD" != "$NEW_TD_ARN" ]; then |
| 263 | + echo "::error::ECS rolled back. PRIMARY=$PRIMARY_TD expected $NEW_TD_ARN" |
| 264 | + aws ecs list-tasks --cluster "$CLUSTER" --service-name Api \ |
| 265 | + --desired-status STOPPED --max-results 5 \ |
| 266 | + --query 'taskArns' --output text \ |
| 267 | + | tr '\t' '\n' | head -3 \ |
| 268 | + | while read -r TASK; do |
| 269 | + [ -n "$TASK" ] || continue |
| 270 | + echo "--- stopped task $(basename "$TASK") ---" |
| 271 | + aws ecs describe-tasks --cluster "$CLUSTER" --tasks "$TASK" \ |
| 272 | + --query 'tasks[0].[stoppedReason,containers[0].exitCode,containers[0].reason]' \ |
| 273 | + --output text || true |
| 274 | + done |
| 275 | + exit 1 |
| 276 | + fi |
| 277 | + echo "::notice::PRIMARY deployment confirmed on $NEW_TD_ARN" |
| 278 | +
|
| 279 | + for i in $(seq 1 18); do |
| 280 | + HEALTHY=$(aws elbv2 describe-target-health \ |
| 281 | + --target-group-arn "$TG_ARN" \ |
| 282 | + --query "length(TargetHealthDescriptions[?TargetHealth.State=='healthy'])" \ |
| 283 | + --output text) |
| 284 | + if [ "$HEALTHY" -ge 1 ]; then |
| 285 | + echo "::notice::ALB target group: $HEALTHY healthy target(s)" |
| 286 | + exit 0 |
| 287 | + fi |
| 288 | + echo "ALB healthy=0 — retry $i/18 in 10s" |
| 289 | + sleep 10 |
| 290 | + done |
| 291 | + echo "::error::ALB target group never reported a healthy target within 180s" |
| 292 | + exit 1 |
0 commit comments