diff --git a/.github/workflows/smoke-nuke.yml b/.github/workflows/smoke-nuke.yml
new file mode 100644
index 00000000..dc0589d8
--- /dev/null
+++ b/.github/workflows/smoke-nuke.yml
@@ -0,0 +1,73 @@
+name: Smoke Account Nuke
+
+# One-shot manual cleanup of the smoke account: deletes the ${STACK_NAME}
+# umbrella stack and every ${STACK_NAME}-* stack, all S3Files file systems
+# (with access points + mount targets), AppRegistry NDXTry_* apps, ndx-try-*
+# Connect instances, and ndx-try-*${ACCOUNT_ID}* S3 buckets. Use when
+# iterative pre-deploy cleanup has accumulated too much debris to recover from.
+#
+# Triggered manually via workflow_dispatch. Reuses the smoke-test-deploy
+# role + concurrency group with smoke.yml so we can't run both at once.
+
+on:
+  workflow_dispatch:
+    inputs:
+      confirm:
+        description: 'Type NUKE to confirm destructive cleanup of the smoke account'
+        required: true
+        type: string
+
+permissions:
+  id-token: write
+  contents: read
+  issues: write
+
+concurrency:
+  group: smoke
+  cancel-in-progress: false
+
+env:
+  STACK_NAME: all-demo
+  AWS_REGION: us-east-1
+
+jobs:
+  nuke:
+    runs-on: ubuntu-latest
+    timeout-minutes: 180
+    environment: smoke-test-deploy
+    steps:
+      - name: Validate confirmation
+        # Pass the free-text input through the environment instead of
+        # interpolating it into the script: ${{ }} is expanded before the
+        # shell runs, so a crafted value could otherwise inject commands.
+        env:
+          CONFIRM: ${{ inputs.confirm }}
+        run: |
+          if [ "$CONFIRM" != "NUKE" ]; then
+            echo "::error::confirm input must be exactly 'NUKE' to proceed"
+            exit 1
+          fi
+      - uses: actions/checkout@v6
+      - name: Install yq
+        run: |
+          YQ_VERSION="v4.45.4"
+          sudo wget -qO /usr/local/bin/yq \
+            "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64"
+          sudo chmod +x /usr/local/bin/yq
+      - name: Read smoke-test-account-config.yml
+        id: cfg
+        run: |
+          echo "role_arn=$(yq -r '.smoke_test_deploy_role_arn' docs/smoke-test-account-config.yml)" >> "$GITHUB_OUTPUT"
+          echo "region=$(yq -r '.smoke_test_region' docs/smoke-test-account-config.yml)" >> "$GITHUB_OUTPUT"
+      - uses: aws-actions/configure-aws-credentials@v6
+        with:
+          role-to-assume: ${{ steps.cfg.outputs.role_arn }}
+          role-session-name: smoke-nuke-${{ github.run_id }}
+          aws-region: ${{ steps.cfg.outputs.region }}
+          role-duration-seconds: 21600
+
+      - name: Nuke everything
+        env:
+          STACK_NAME: ${{ env.STACK_NAME }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
+        run: ./scripts/smoke-nuke.sh
diff --git a/scripts/smoke-nuke.sh b/scripts/smoke-nuke.sh
new file mode 100755
index 00000000..5b98a063
--- /dev/null
+++ b/scripts/smoke-nuke.sh
@@ -0,0 +1,292 @@
+#!/usr/bin/env bash
+# Destructive cleanup of the smoke account. Deletes:
+# - ${STACK_NAME} umbrella + every top-level ${STACK_NAME}-* stack (nested
+#   children are removed with their parent)
+# - All AWS::S3Files::FileSystem (with access points + mount targets first)
+# - All NDXTry_* AppRegistry applications
+# - All ndx-try-* Amazon Connect instance aliases
+# - All ndx-try-*${ACCOUNT_ID}* S3 buckets (versioned-safe empty + delete)
+#
+# The helpers below are inlined copies of the ones in
+# smoke-pre-deploy-state.sh, so this script is self-contained; keep the two
+# in sync when changing either. Runs under the smoke-test-deploy role via
+# the GH Actions wrapper workflow.
+#
+# Requires the aws CLI and jq on PATH, and STACK_NAME in the environment.
+
+set -euo pipefail
+
+STACK="${STACK_NAME:?STACK_NAME required}"
+
+# ── helpers (kept in sync with smoke-pre-deploy-state.sh) ─────────────
+
+# wait_for_stable STACK [MAX_WAIT_SECONDS=3600]
+# Polls describe-stacks every 30s until the status is no longer transitional
+# and prints that status on stdout (progress lines go to stderr). Any
+# describe-stacks failure, including "stack is gone", is reported as
+# DOES_NOT_EXIST. On timeout, prints the last status seen and returns 1.
+wait_for_stable() {
+  local stack="$1" max_wait="${2:-3600}" interval=30 elapsed=0 s=""
+  while (( elapsed < max_wait )); do
+    s=$(aws cloudformation describe-stacks --stack-name "$stack" \
+      --query 'Stacks[0].StackStatus' --output text 2>/dev/null || echo "DOES_NOT_EXIST")
+    # REVIEW_IN_PROGRESS is a resting state (a change set was created but
+    # never executed), so polling on it would only burn the whole timeout.
+    if [[ "$s" != *_IN_PROGRESS || "$s" == "REVIEW_IN_PROGRESS" ]]; then
+      echo "$s"; return 0
+    fi
+    echo " $stack still $s (${elapsed}s elapsed, max ${max_wait}s)" >&2
+    sleep "$interval"
+    elapsed=$((elapsed + interval))
+  done
+  echo "$s"; return 1
+}
+
+# force_delete_stack STACK
+# Deletes a stack, escalating when CloudFormation cannot finish on its own:
+#   1. plain delete-stack;
+#   2. retry, retaining only the resources that ended in DELETE_FAILED;
+#   3. retry, retaining every resource still listed on the stack.
+# Retained resources are skipped by CloudFormation, not deleted. Returns 0
+# once the stack is gone and 1 if it survived every attempt.
+force_delete_stack() {
+  local stack="$1" status retain remaining
+  echo "deleting stack: $stack"
+  aws cloudformation delete-stack --stack-name "$stack" 2>/dev/null || true
+  status=$(wait_for_stable "$stack" 3600 || echo TIMEOUT)
+  case "$status" in
+    DELETE_COMPLETE|DOES_NOT_EXIST) return 0 ;;
+  esac
+  retain=$(aws cloudformation list-stack-resources --stack-name "$stack" \
+    --query 'StackResourceSummaries[?ResourceStatus==`DELETE_FAILED`].LogicalResourceId' \
+    --output text 2>/dev/null | tr '\t' ' ')
+  if [ -n "$retain" ]; then
+    echo " retain-retry: $retain"
+    # shellcheck disable=SC2086
+    aws cloudformation delete-stack --stack-name "$stack" \
+      --retain-resources $retain 2>/dev/null || true
+    status=$(wait_for_stable "$stack" 3600 || echo TIMEOUT)
+  fi
+  case "$status" in
+    DELETE_COMPLETE|DOES_NOT_EXIST) return 0 ;;
+  esac
+  remaining=$(aws cloudformation list-stack-resources --stack-name "$stack" \
+    --query 'StackResourceSummaries[].LogicalResourceId' \
+    --output text 2>/dev/null | tr '\t' ' ')
+  if [ -n "$remaining" ]; then
+    echo " force-retain everything: $remaining"
+    # shellcheck disable=SC2086
+    aws cloudformation delete-stack --stack-name "$stack" \
+      --retain-resources $remaining 2>/dev/null || true
+    status=$(wait_for_stable "$stack" 1800 || echo TIMEOUT)
+  fi
+  case "$status" in
+    DELETE_COMPLETE|DOES_NOT_EXIST) return 0 ;;
+  esac
+  echo " WARNING: $stack was not deleted (last status: ${status//$'\n'/ })" >&2
+  return 1
+}
+
+# _purge_bucket_versions BUCKET FIELD
+# Deletes every entry of FIELD (Versions or DeleteMarkers) from BUCKET in
+# batches of up to 1000 keys, looping until a listing comes back empty.
+# Returns 0 when nothing is left or the bucket can no longer be listed, and
+# 1 as soon as a batch fails or reports per-object errors, so an
+# undeletable object cannot turn this into an endless loop.
+_purge_bucket_versions() {
+  local bucket="$1" field="$2" payload out
+  payload=$(mktemp)
+  while :; do
+    # Build the DeleteObjects payload straight from the listing. Quiet mode
+    # makes the response list only failures; the payload travels through a
+    # file because 1000 long keys can exceed the per-argument size limit.
+    if ! aws s3api list-object-versions --bucket "$bucket" --max-items 1000 \
+        --query "{Objects: ${field}[].{Key: Key, VersionId: VersionId}}" \
+        --output json 2>/dev/null \
+        | jq -c '{Objects: (.Objects // []), Quiet: true}' > "$payload" \
+      || ! jq -e '.Objects | length > 0' "$payload" >/dev/null 2>&1; then
+      rm -f "$payload"; return 0
+    fi
+    # Success means exit status 0 and no Errors entries; anything else
+    # (including output that is not JSON) stops the loop.
+    if ! out=$(aws s3api delete-objects --bucket "$bucket" \
+        --delete "file://$payload" --output json 2>&1) \
+      || { [ -n "$out" ] && ! jq -e '(.Errors // []) | length == 0' <<< "$out" >/dev/null 2>&1; }; then
+      [ -z "$out" ] || echo "$out" | sed 's/^/ /'
+      rm -f "$payload"; return 1
+    fi
+  done
+}
+
+# delete_bucket_completely BUCKET
+# Empties BUCKET, including every noncurrent version and delete marker,
+# then removes it. Makes up to three attempts, 5s apart. Returns 0 as soon
+# as head-bucket no longer finds the bucket, 1 if it is still there after
+# the last attempt.
+delete_bucket_completely() {
+  local bucket="$1" attempt rb_err
+  for attempt in 1 2 3; do
+    # Best-effort: a failed purge still falls through to `rb`, whose error
+    # output is reported below.
+    _purge_bucket_versions "$bucket" Versions || true
+    _purge_bucket_versions "$bucket" DeleteMarkers || true
+    echo " attempt $attempt: rb s3://$bucket --force"
+    rb_err=$(aws s3 rb "s3://$bucket" --force 2>&1) || true
+    if ! aws s3api head-bucket --bucket "$bucket" 2>/dev/null; then
+      echo " $bucket deleted"; return 0
+    fi
+    echo " $bucket still present after attempt $attempt; rb stderr: ${rb_err:-}"
+    sleep 5
+  done
+  return 1
+}
+
+# ── phases ────────────────────────────────────────────────────────────
+
+echo "============================================================"
+echo "PHASE 1: delete CFN stacks matching ${STACK}*"
+echo "============================================================"
+# Top-level stacks named STACK or STACK-*: delete each.
+# delete-stack on the umbrella cascades to nested children.
+top_stacks=$(aws cloudformation list-stacks \
+  --stack-status-filter CREATE_COMPLETE UPDATE_COMPLETE UPDATE_ROLLBACK_COMPLETE \
+    DELETE_FAILED UPDATE_ROLLBACK_FAILED CREATE_FAILED UPDATE_FAILED \
+    ROLLBACK_COMPLETE ROLLBACK_FAILED \
+  --query "StackSummaries[?(StackName=='${STACK}' || starts_with(StackName, '${STACK}-')) && ParentId==\`null\`].StackName" \
+  --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true)
+if [ -z "$top_stacks" ]; then
+  echo "(none in terminal states)"
+else
+  echo "Top-level stacks to delete:"
+  echo "$top_stacks"
+  while IFS= read -r s; do
+    [ -z "$s" ] && continue
+    force_delete_stack "$s" || true
+  done <<< "$top_stacks"
+fi
+
+# Wait for any in-progress matches to settle too.
+in_progress=$(aws cloudformation list-stacks \
+  --stack-status-filter CREATE_IN_PROGRESS UPDATE_IN_PROGRESS DELETE_IN_PROGRESS \
+    ROLLBACK_IN_PROGRESS UPDATE_ROLLBACK_IN_PROGRESS UPDATE_COMPLETE_CLEANUP_IN_PROGRESS REVIEW_IN_PROGRESS \
+  --query "StackSummaries[?(StackName=='${STACK}' || starts_with(StackName, '${STACK}-')) && ParentId==\`null\`].StackName" \
+  --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true)
+if [ -n "$in_progress" ]; then
+  echo "Waiting for in-progress matches to settle:"
+  echo "$in_progress"
+  while IFS= read -r s; do
+    [ -z "$s" ] && continue
+    wait_for_stable "$s" 3600 >/dev/null || echo " $s timeout, moving on"
+    # After stable, try delete again
+    force_delete_stack "$s" || true
+  done <<< "$in_progress"
+fi
+
+echo
+echo "============================================================"
+echo "PHASE 2: delete AWS::S3Files::FileSystem instances"
+echo "============================================================"
+# All FS in the account; access points → mount targets → file system.
+fs_list=$(aws s3files list-file-systems --max-results 100 \
+  --query 'fileSystems[].fileSystemId' --output text 2>/dev/null \
+  | tr '\t' '\n' | grep -v '^$' || true)
+if [ -z "$fs_list" ]; then
+  echo "(none)"
+else
+  while IFS= read -r fs; do
+    [ -z "$fs" ] && continue
+    echo "File system: $fs"
+    aps=$(aws s3files list-access-points --file-system-id "$fs" \
+      --query 'accessPoints[].accessPointId' --output text 2>/dev/null \
+      | tr '\t' '\n' | grep -v '^$' || true)
+    if [ -n "$aps" ]; then
+      while IFS= read -r ap; do
+        [ -z "$ap" ] && continue
+        echo " delete access point: $ap"
+        aws s3files delete-access-point --access-point-id "$ap" 2>&1 | sed 's/^/ /' || true
+      done <<< "$aps"
+      sleep 30
+    fi
+    mts=$(aws s3files list-mount-targets --file-system-id "$fs" \
+      --query 'mountTargets[].mountTargetId' --output text 2>/dev/null \
+      | tr '\t' '\n' | grep -v '^$' || true)
+    if [ -n "$mts" ]; then
+      while IFS= read -r mt; do
+        [ -z "$mt" ] && continue
+        echo " delete mount target: $mt"
+        aws s3files delete-mount-target --mount-target-id "$mt" 2>&1 | sed 's/^/ /' || true
+      done <<< "$mts"
+      sleep 60
+    fi
+    echo " delete file system: $fs"
+    aws s3files delete-file-system --file-system-id "$fs" --force-delete 2>&1 | sed 's/^/ /' || true
+  done <<< "$fs_list"
+  sleep 60
+fi
+
+echo
+echo "============================================================"
+echo "PHASE 3: delete NDXTry_* AppRegistry applications"
+echo "============================================================"
+apps=$(aws servicecatalog-appregistry list-applications \
+  --query "applications[?starts_with(name, 'NDXTry_')].name" \
+  --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true)
+if [ -z "$apps" ]; then
+  echo "(none)"
+else
+  while IFS= read -r app; do
+    [ -z "$app" ] && continue
+    echo "delete appregistry: $app"
+    aws servicecatalog-appregistry delete-application --application "$app" 2>&1 | sed 's/^/ /' || true
+  done <<< "$apps"
+  sleep 30
+fi
+
+echo
+echo "============================================================"
+echo "PHASE 4: delete ndx-try-* Amazon Connect instances"
+echo "============================================================"
+# InstanceAlias is optional on a Connect instance; guard it so one
+# alias-less instance cannot make starts_with() fail the whole query.
+instances=$(aws connect list-instances \
+  --query "InstanceSummaryList[?InstanceAlias && starts_with(InstanceAlias, 'ndx-try-')].[Id,InstanceAlias]" \
+  --output text 2>/dev/null | grep -v '^$' || true)
+if [ -z "$instances" ]; then
+  echo "(none)"
+else
+  while IFS=$'\t' read -r inst alias; do
+    [ -z "$inst" ] && continue
+    echo "delete connect: $alias ($inst)"
+    aws connect delete-instance --instance-id "$inst" 2>&1 | sed 's/^/ /' || true
+  done <<< "$instances"
+fi
+
+echo
+echo "============================================================"
+echo "PHASE 5: delete ndx-try-*\${ACCOUNT_ID}* S3 buckets"
+echo "============================================================"
+# Refuse to build the bucket filter from anything but a real account id: an
+# empty value would make contains(Name, '') match every ndx-try-* bucket.
+acct=$(aws sts get-caller-identity --query Account --output text) || acct=""
+if [[ ! "$acct" =~ ^[0-9]{12}$ ]]; then
+  echo "::error::could not resolve the AWS account id; bucket cleanup skipped"
+  exit 1
+fi
+buckets=$(aws s3api list-buckets \
+  --query "Buckets[?starts_with(Name, 'ndx-try-') && contains(Name, '${acct}')].Name" \
+  --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true)
+if [ -z "$buckets" ]; then
+  echo "(none)"
+else
+  echo "Buckets to delete:"
+  echo "$buckets"
+  while IFS= read -r b; do
+    [ -z "$b" ] && continue
+    delete_bucket_completely "$b" || true
+  done <<< "$buckets"
+fi
+
+echo
+echo "============================================================"
+echo "DONE. Smoke account should now be in a clean slate."
+echo "============================================================"
diff --git a/scripts/smoke-pre-deploy-state.sh b/scripts/smoke-pre-deploy-state.sh
index a93382b6..18545935 100755
--- a/scripts/smoke-pre-deploy-state.sh
+++ b/scripts/smoke-pre-deploy-state.sh
@@ -107,7 +107,7 @@ cleanup_orphan() {
 # stacks left over from delete-with-retain (all-demo-PaperlessNgx-*) and any
 # previous recovery stacks (all-demo-recovery-*).
sweep_orphan_stacks() { - local orphans orphan + local orphans orphan in_progress # CRITICAL: filter on ParentId being absent. Stacks whose name starts # with ${STACK}- but ParentId is set are LIVE nested children of the # active umbrella, not orphans. Without this filter the sweep happily @@ -118,13 +118,35 @@ sweep_orphan_stacks() { --stack-status-filter CREATE_COMPLETE UPDATE_COMPLETE UPDATE_ROLLBACK_COMPLETE DELETE_FAILED UPDATE_ROLLBACK_FAILED CREATE_FAILED UPDATE_FAILED ROLLBACK_COMPLETE ROLLBACK_FAILED \ --query "StackSummaries[?starts_with(StackName, '${STACK}-') && ParentId==\`null\`].StackName" \ --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true) - [ -z "$orphans" ] && { echo "No orphan ${STACK}-* stacks to sweep."; return 0; } - echo "Orphan stacks to sweep:" - echo "$orphans" - while IFS= read -r orphan; do - [ -z "$orphan" ] && continue - cleanup_orphan "$orphan" - done <<< "$orphans" + if [ -n "$orphans" ]; then + echo "Orphan stacks to sweep:" + echo "$orphans" + while IFS= read -r orphan; do + [ -z "$orphan" ] && continue + # `|| true` keeps set -e from killing the loop if one orphan can't + # be fully removed; cleanup_orphan opens its own stranded-stack issue. + cleanup_orphan "$orphan" || true + done <<< "$orphans" + else + echo "No orphan ${STACK}-* stacks to sweep." + fi + # Wait for any ${STACK}-* stacks currently in *_IN_PROGRESS (either from + # our just-issued deletes or from leftover server-side work) to settle + # before returning. Otherwise the next deploy step races with their + # AppRegistryAssociation children and hits InvalidRequest. 
+ in_progress=$(aws cloudformation list-stacks \ + --stack-status-filter CREATE_IN_PROGRESS UPDATE_IN_PROGRESS DELETE_IN_PROGRESS ROLLBACK_IN_PROGRESS UPDATE_ROLLBACK_IN_PROGRESS UPDATE_COMPLETE_CLEANUP_IN_PROGRESS REVIEW_IN_PROGRESS \ + --query "StackSummaries[?starts_with(StackName, '${STACK}-') && ParentId==\`null\`].StackName" \ + --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true) + if [ -n "$in_progress" ]; then + echo "Waiting for in-progress orphan ${STACK}-* stacks to finish:" + echo "$in_progress" + while IFS= read -r orphan; do + [ -z "$orphan" ] && continue + wait_for_stable "$orphan" 3600 >/dev/null || \ + echo " $orphan still in progress after 60m; proceeding anyway" + done <<< "$in_progress" + fi } # Delete orphan S3 Files file systems whose bucket name matches ndx-try-*. @@ -149,11 +171,35 @@ sweep_orphan_s3files() { # fs_bucket is an arn like arn:aws:s3:::ndx-try-*-- case "$fs_bucket" in *ndx-try-*${acct}*) + # Delete order: access points → mount targets → file system. + # Each parent refuses delete while children exist: + # ConflictException "has access points" / "has mount targets". 
+ local ap_list ap_id ap_err mt_list mt_id mt_err fs_err + ap_list=$(aws s3files list-access-points --file-system-id "$fs_id" \ + --query 'accessPoints[].accessPointId' --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true) + if [ -n "$ap_list" ]; then + while IFS= read -r ap_id; do + [ -z "$ap_id" ] && continue + echo " deleting access point: $ap_id (fs: $fs_id)" + ap_err=$(aws s3files delete-access-point --access-point-id "$ap_id" 2>&1) || true + [ -n "$ap_err" ] && echo " $ap_err" + done <<< "$ap_list" + sleep 15 + fi + mt_list=$(aws s3files list-mount-targets --file-system-id "$fs_id" \ + --query 'mountTargets[].mountTargetId' --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true) + if [ -n "$mt_list" ]; then + while IFS= read -r mt_id; do + [ -z "$mt_id" ] && continue + echo " deleting mount target: $mt_id (fs: $fs_id)" + mt_err=$(aws s3files delete-mount-target --mount-target-id "$mt_id" 2>&1) || true + [ -n "$mt_err" ] && echo " $mt_err" + done <<< "$mt_list" + sleep 30 + fi echo " deleting file system: $fs_id (bucket: $fs_bucket)" - aws s3files delete-file-system --file-system-id "$fs_id" --force-delete 2>&1 | sed 's/^/ /' || \ - gh issue create --title "smoke: S3 Files filesystem $fs_id couldn't be deleted" \ - --label stranded-stack \ - --body "Pre-deploy sweep on run ${GITHUB_RUN_ID} could not delete file system $fs_id (bucket $fs_bucket)." || true + fs_err=$(aws s3files delete-file-system --file-system-id "$fs_id" --force-delete 2>&1) || true + [ -n "$fs_err" ] && echo " $fs_err" ;; *) echo " skipping unrelated file system: $fs_id (bucket: $fs_bucket)" @@ -233,7 +279,9 @@ sweep_orphan_s3_buckets() { echo "$buckets" while IFS= read -r bucket; do [ -z "$bucket" ] && continue - delete_bucket_completely "$bucket" + # `|| true` keeps set -e from killing the loop if one bucket can't + # be deleted; delete_bucket_completely opens its own stranded-stack issue. 
+ delete_bucket_completely "$bucket" || true done <<< "$buckets" } @@ -279,6 +327,10 @@ sweep_orphan_appregistry() { --label stranded-stack \ --body "Pre-deploy sweep on run ${GITHUB_RUN_ID} could not delete $app." || true done <<< "$apps" + # AppRegistry delete propagates async; the next umbrella deploy raced + # CFN's "already own application" check without this grace period. + echo " waiting 30s for AppRegistry deletes to propagate" + sleep 30 } # Final exit path for "use the canonical stack name". Sweeps orphans (so