From 00521b9e56709ca26c60d1ae91070dee9705e50c Mon Sep 17 00:00:00 2001 From: Chris Nesbitt-Smith Date: Wed, 20 May 2026 11:24:17 +0100 Subject: [PATCH 1/2] fix(smoke): cover S3 Files access points + async-delete propagation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR 321's smoke retry surfaced three more layers of orphan-cleanup gaps: 1. delete-file-system fails with ConflictException "has access points" even after mount targets are gone. AWS::S3Files::AccessPoint must be deleted before mount targets, which must be deleted before the file system. Update the s3files sweep to delete in that order: access points (15s settle) → mount targets (30s settle) → file system. 2. AppRegistry delete-application is async on the server side. The sweep returned success, deploy fired immediately, and CFN's "already own application " check rejected the create. Add a 30s grace period at the end of sweep_orphan_appregistry, matching the s3files pattern. 3. Orphan ${STACK}-* stacks left over from prior runs were sometimes in DELETE_IN_PROGRESS state — invisible to the sweep's status filter (which only listed terminal states). The deploy raced their AppRegistryAssociation children and hit "stack status as DELETE_IN_PROGRESS … is not allowed". Add a second pass at the end of sweep_orphan_stacks that polls list-stacks for in-progress matches (ParentId==null) and wait_for_stable on each. Also bundled in: `|| true` on cleanup_orphan and delete_bucket_completely calls inside their respective sweep loops so a per-item failure no longer kills the whole pre-deploy via set -e. (The stranded-stack issues remain the audit trail.) Plus s3files delete stderr captured to var instead of piped through sed — avoids the pipefail trap that bit us earlier. 
--- scripts/smoke-pre-deploy-state.sh | 78 +++++++++++++++++++++++++------ 1 file changed, 65 insertions(+), 13 deletions(-) diff --git a/scripts/smoke-pre-deploy-state.sh b/scripts/smoke-pre-deploy-state.sh index a93382b6..18545935 100755 --- a/scripts/smoke-pre-deploy-state.sh +++ b/scripts/smoke-pre-deploy-state.sh @@ -107,7 +107,7 @@ cleanup_orphan() { # stacks left over from delete-with-retain (all-demo-PaperlessNgx-*) and any # previous recovery stacks (all-demo-recovery-*). sweep_orphan_stacks() { - local orphans orphan + local orphans orphan in_progress # CRITICAL: filter on ParentId being absent. Stacks whose name starts # with ${STACK}- but ParentId is set are LIVE nested children of the # active umbrella, not orphans. Without this filter the sweep happily @@ -118,13 +118,35 @@ sweep_orphan_stacks() { --stack-status-filter CREATE_COMPLETE UPDATE_COMPLETE UPDATE_ROLLBACK_COMPLETE DELETE_FAILED UPDATE_ROLLBACK_FAILED CREATE_FAILED UPDATE_FAILED ROLLBACK_COMPLETE ROLLBACK_FAILED \ --query "StackSummaries[?starts_with(StackName, '${STACK}-') && ParentId==\`null\`].StackName" \ --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true) - [ -z "$orphans" ] && { echo "No orphan ${STACK}-* stacks to sweep."; return 0; } - echo "Orphan stacks to sweep:" - echo "$orphans" - while IFS= read -r orphan; do - [ -z "$orphan" ] && continue - cleanup_orphan "$orphan" - done <<< "$orphans" + if [ -n "$orphans" ]; then + echo "Orphan stacks to sweep:" + echo "$orphans" + while IFS= read -r orphan; do + [ -z "$orphan" ] && continue + # `|| true` keeps set -e from killing the loop if one orphan can't + # be fully removed; cleanup_orphan opens its own stranded-stack issue. + cleanup_orphan "$orphan" || true + done <<< "$orphans" + else + echo "No orphan ${STACK}-* stacks to sweep." + fi + # Wait for any ${STACK}-* stacks currently in *_IN_PROGRESS (either from + # our just-issued deletes or from leftover server-side work) to settle + # before returning. 
Otherwise the next deploy step races with their + # AppRegistryAssociation children and hits InvalidRequest. + in_progress=$(aws cloudformation list-stacks \ + --stack-status-filter CREATE_IN_PROGRESS UPDATE_IN_PROGRESS DELETE_IN_PROGRESS ROLLBACK_IN_PROGRESS UPDATE_ROLLBACK_IN_PROGRESS UPDATE_COMPLETE_CLEANUP_IN_PROGRESS REVIEW_IN_PROGRESS \ + --query "StackSummaries[?starts_with(StackName, '${STACK}-') && ParentId==\`null\`].StackName" \ + --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true) + if [ -n "$in_progress" ]; then + echo "Waiting for in-progress orphan ${STACK}-* stacks to finish:" + echo "$in_progress" + while IFS= read -r orphan; do + [ -z "$orphan" ] && continue + wait_for_stable "$orphan" 3600 >/dev/null || \ + echo " $orphan still in progress after 60m; proceeding anyway" + done <<< "$in_progress" + fi } # Delete orphan S3 Files file systems whose bucket name matches ndx-try-*. @@ -149,11 +171,35 @@ sweep_orphan_s3files() { # fs_bucket is an arn like arn:aws:s3:::ndx-try-*-- case "$fs_bucket" in *ndx-try-*${acct}*) + # Delete order: access points → mount targets → file system. + # Each parent refuses delete while children exist: + # ConflictException "has access points" / "has mount targets". 
+ local ap_list ap_id ap_err mt_list mt_id mt_err fs_err + ap_list=$(aws s3files list-access-points --file-system-id "$fs_id" \ + --query 'accessPoints[].accessPointId' --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true) + if [ -n "$ap_list" ]; then + while IFS= read -r ap_id; do + [ -z "$ap_id" ] && continue + echo " deleting access point: $ap_id (fs: $fs_id)" + ap_err=$(aws s3files delete-access-point --access-point-id "$ap_id" 2>&1) || true + [ -n "$ap_err" ] && echo " $ap_err" + done <<< "$ap_list" + sleep 15 + fi + mt_list=$(aws s3files list-mount-targets --file-system-id "$fs_id" \ + --query 'mountTargets[].mountTargetId' --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true) + if [ -n "$mt_list" ]; then + while IFS= read -r mt_id; do + [ -z "$mt_id" ] && continue + echo " deleting mount target: $mt_id (fs: $fs_id)" + mt_err=$(aws s3files delete-mount-target --mount-target-id "$mt_id" 2>&1) || true + [ -n "$mt_err" ] && echo " $mt_err" + done <<< "$mt_list" + sleep 30 + fi echo " deleting file system: $fs_id (bucket: $fs_bucket)" - aws s3files delete-file-system --file-system-id "$fs_id" --force-delete 2>&1 | sed 's/^/ /' || \ - gh issue create --title "smoke: S3 Files filesystem $fs_id couldn't be deleted" \ - --label stranded-stack \ - --body "Pre-deploy sweep on run ${GITHUB_RUN_ID} could not delete file system $fs_id (bucket $fs_bucket)." || true + fs_err=$(aws s3files delete-file-system --file-system-id "$fs_id" --force-delete 2>&1) || true + [ -n "$fs_err" ] && echo " $fs_err" ;; *) echo " skipping unrelated file system: $fs_id (bucket: $fs_bucket)" @@ -233,7 +279,9 @@ sweep_orphan_s3_buckets() { echo "$buckets" while IFS= read -r bucket; do [ -z "$bucket" ] && continue - delete_bucket_completely "$bucket" + # `|| true` keeps set -e from killing the loop if one bucket can't + # be deleted; delete_bucket_completely opens its own stranded-stack issue. 
+ delete_bucket_completely "$bucket" || true done <<< "$buckets" } @@ -279,6 +327,10 @@ sweep_orphan_appregistry() { --label stranded-stack \ --body "Pre-deploy sweep on run ${GITHUB_RUN_ID} could not delete $app." || true done <<< "$apps" + # AppRegistry delete propagates async; the next umbrella deploy raced + # CFN's "already own application" check without this grace period. + echo " waiting 30s for AppRegistry deletes to propagate" + sleep 30 } # Final exit path for "use the canonical stack name". Sweeps orphans (so From f9fd34d3a9876656178d02d5e628ba47eb62cdcb Mon Sep 17 00:00:00 2001 From: Chris Nesbitt-Smith Date: Wed, 20 May 2026 14:27:37 +0100 Subject: [PATCH 2/2] chore(smoke): add one-shot smoke-account nuke workflow When iterative pre-deploy cleanup has accumulated more debris than the script can sweep in 90 minutes, manual recovery is needed. Add a workflow-dispatch workflow + script that runs under the existing smoke-test-deploy role and clears every: - ${STACK}* CloudFormation stack (top-level) - AWS::S3Files::FileSystem (with access points + mount targets first) - NDXTry_* AppRegistry applications - ndx-try-* Connect instances - ndx-try-*${ACCOUNT_ID}* S3 buckets Same identity, same concurrency group as smoke.yml, so the two can't run simultaneously. Requires typing NUKE as the confirm input to fire, to avoid accidental destruction of the smoke account. This is the script side of "manual cleanup" we kept reaching for during PR #321's CI iterations. After running this, the next smoke run starts from a true clean slate. 
--- .github/workflows/smoke-nuke.yml | 68 +++++++++ scripts/smoke-nuke.sh | 245 +++++++++++++++++++++++++++++++ 2 files changed, 313 insertions(+) create mode 100644 .github/workflows/smoke-nuke.yml create mode 100755 scripts/smoke-nuke.sh diff --git a/.github/workflows/smoke-nuke.yml b/.github/workflows/smoke-nuke.yml new file mode 100644 index 00000000..dc0589d8 --- /dev/null +++ b/.github/workflows/smoke-nuke.yml @@ -0,0 +1,68 @@ +name: Smoke Account Nuke + +# One-shot manual cleanup of the smoke account: deletes every ndx-try-* +# stack (umbrella + nested), all S3Files file systems (with access points + +# mount targets), AppRegistry NDXTry_* apps, ndx-try-* Connect instances, +# and ndx-try-*${ACCOUNT_ID}* S3 buckets. Use when iterative pre-deploy +# cleanup has accumulated too much debris to recover from. +# +# Triggered manually via workflow_dispatch. Reuses the smoke-test-deploy +# role + concurrency group with smoke.yml so we can't run both at once. + +on: + workflow_dispatch: + inputs: + confirm: + description: 'Type NUKE to confirm destructive cleanup of the smoke account' + required: true + type: string + +permissions: + id-token: write + contents: read + issues: write + +concurrency: + group: smoke + cancel-in-progress: false + +env: + STACK_NAME: all-demo + AWS_REGION: us-east-1 + +jobs: + nuke: + runs-on: ubuntu-latest + timeout-minutes: 180 + environment: smoke-test-deploy + steps: + - name: Validate confirmation + run: | + if [ "${{ inputs.confirm }}" != "NUKE" ]; then + echo "::error::confirm input must be exactly 'NUKE' to proceed" + exit 1 + fi + - uses: actions/checkout@v6 + - name: Install yq + run: | + YQ_VERSION="v4.45.4" + sudo wget -qO /usr/local/bin/yq \ + "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64" + sudo chmod +x /usr/local/bin/yq + - name: Read smoke-test-account-config.yml + id: cfg + run: | + echo "role_arn=$(yq -r '.smoke_test_deploy_role_arn' docs/smoke-test-account-config.yml)" >> 
"$GITHUB_OUTPUT" + echo "region=$(yq -r '.smoke_test_region' docs/smoke-test-account-config.yml)" >> "$GITHUB_OUTPUT" + - uses: aws-actions/configure-aws-credentials@v6 + with: + role-to-assume: ${{ steps.cfg.outputs.role_arn }} + role-session-name: smoke-nuke-${{ github.run_id }} + aws-region: ${{ steps.cfg.outputs.region }} + role-duration-seconds: 21600 + + - name: Nuke everything + env: + STACK_NAME: ${{ env.STACK_NAME }} + GITHUB_RUN_ID: ${{ github.run_id }} + run: ./scripts/smoke-nuke.sh diff --git a/scripts/smoke-nuke.sh b/scripts/smoke-nuke.sh new file mode 100755 index 00000000..5b98a063 --- /dev/null +++ b/scripts/smoke-nuke.sh @@ -0,0 +1,245 @@ +#!/usr/bin/env bash +# Destructive cleanup of the smoke account. Deletes: +# - ${STACK_NAME} umbrella + every ${STACK_NAME}-* stack (top-level or nested) +# - All AWS::S3Files::FileSystem (with access points + mount targets first) +# - All ndx-try-*${ACCOUNT_ID}* S3 buckets (versioned-safe empty + delete) +# - All NDXTry_* AppRegistry applications +# - All ndx-try-* Amazon Connect instance aliases +# +# Re-uses helpers from smoke-pre-deploy-state.sh so the logic stays in one +# place. Runs under the smoke-test-deploy role via the GH Actions wrapper +# workflow. + +set -euo pipefail + +STACK="${STACK_NAME:?STACK_NAME required}" + +# Source the pre-deploy helpers (wait_for_stable, cleanup_orphan, +# delete_bucket_completely, sweep_orphan_*, etc.). The pre-deploy script +# normally runs end-to-end on its own, but we only want the helpers, so +# `return 0` before its main case statement to short-circuit execution. +# Hacky: set a sentinel env var the source script checks, OR just inline +# the helpers we need. The latter is simpler and keeps this script +# self-contained. 
# ── helpers (kept in sync with smoke-pre-deploy-state.sh) ─────────────

#######################################
# Poll a CloudFormation stack until its status no longer ends in _IN_PROGRESS.
# Arguments:
#   $1 - stack name
#   $2 - maximum seconds to wait (default 3600)
# Outputs:
#   stdout: the last observed status. Any describe-stacks failure is reported
#           as DOES_NOT_EXIST (the error text itself is discarded).
#   stderr: one progress line per poll while the stack is still in progress.
# Returns:
#   0 once the stack is stable, 1 if max_wait elapsed first.
#######################################
wait_for_stable() {
  # s starts empty so the final echo is safe under `set -u` when max_wait <= 0.
  local stack="$1" max_wait="${2:-3600}" interval=30 elapsed=0 s=""
  while (( elapsed < max_wait )); do
    s=$(aws cloudformation describe-stacks --stack-name "$stack" \
      --query 'Stacks[0].StackStatus' --output text 2>/dev/null || echo "DOES_NOT_EXIST")
    if [[ "$s" != *_IN_PROGRESS ]]; then
      echo "$s"
      return 0
    fi
    echo " $stack still $s (${elapsed}s elapsed, max ${max_wait}s)" >&2
    sleep "$interval"
    elapsed=$((elapsed + interval))
  done
  echo "$s"
  return 1
}

#######################################
# Delete a stack, escalating through three attempts:
#   1. plain delete-stack
#   2. delete-stack retaining the resources currently in DELETE_FAILED
#   3. delete-stack retaining every resource still listed on the stack
# Arguments:
#   $1 - stack name
# Outputs:
#   stdout: progress messages; stderr: a final message if the stack survives.
# Returns:
#   0 if the stack ends up DELETE_COMPLETE / DOES_NOT_EXIST, 1 otherwise.
#   (Callers in this script append `|| ...`, so a failure never aborts a sweep.)
#######################################
force_delete_stack() {
  local stack="$1" status ids_text
  local -a retain=() remaining=()

  echo "deleting stack: $stack"
  aws cloudformation delete-stack --stack-name "$stack" 2>/dev/null || true
  status=$(wait_for_stable "$stack" 3600) || status="TIMEOUT:${status}"
  case "$status" in
    DELETE_COMPLETE|DOES_NOT_EXIST) return 0 ;;
  esac

  ids_text=$(aws cloudformation list-stack-resources --stack-name "$stack" \
    --query 'StackResourceSummaries[?ResourceStatus==`DELETE_FAILED`].LogicalResourceId' \
    --output text 2>/dev/null) || ids_text=""
  # Text output separates IDs with tabs and may span several lines; split on
  # any whitespace into an array so the IDs can be passed quoted.
  # `read -d ''` returns non-zero at end of input, hence `|| true`.
  IFS=$' \t\n' read -r -d '' -a retain <<< "$ids_text" || true
  if (( ${#retain[@]} > 0 )); then
    echo " retain-retry: ${retain[*]}"
    aws cloudformation delete-stack --stack-name "$stack" \
      --retain-resources "${retain[@]}" 2>/dev/null || true
    status=$(wait_for_stable "$stack" 3600) || status="TIMEOUT:${status}"
  fi
  case "$status" in
    DELETE_COMPLETE|DOES_NOT_EXIST) return 0 ;;
  esac

  ids_text=$(aws cloudformation list-stack-resources --stack-name "$stack" \
    --query 'StackResourceSummaries[].LogicalResourceId' \
    --output text 2>/dev/null) || ids_text=""
  IFS=$' \t\n' read -r -d '' -a remaining <<< "$ids_text" || true
  if (( ${#remaining[@]} > 0 )); then
    echo " force-retain everything: ${remaining[*]}"
    aws cloudformation delete-stack --stack-name "$stack" \
      --retain-resources "${remaining[@]}" 2>/dev/null || true
    # Capture the status instead of letting it leak to stdout as a bare line.
    status=$(wait_for_stable "$stack" 1800) || status="TIMEOUT:${status}"
  fi
  case "$status" in
    DELETE_COMPLETE|DOES_NOT_EXIST) return 0 ;;
  esac

  echo " $stack could not be deleted (last status: $status)" >&2
  return 1
}

#######################################
# Remove every entry of one kind from a bucket's version listing, in batches
# of up to 1000, until the listing is empty.
# Arguments:
#   $1 - bucket name
#   $2 - listing field to drain: Versions or DeleteMarkers
# Returns:
#   0 when nothing of that kind is left (or the listing could not be read,
#   which yields an empty batch), 1 if a delete-objects call failed, reported
#   anything, or the batch cap was hit.
#######################################
_delete_bucket_entries() {
  local bucket="$1" kind="$2" batch payload out rc pass
  # Cap the number of batches so an unexpected listing can never spin forever.
  for (( pass = 0; pass < 1000; pass++ )); do
    batch=$(aws s3api list-object-versions --bucket "$bucket" --max-items 1000 \
      --query "{Objects: ${kind}[].{Key: Key, VersionId: VersionId}}" \
      --output json 2>/dev/null | jq -c '.Objects // []' || echo '[]')
    if [ "$batch" = "[]" ] || [ -z "$batch" ]; then
      return 0
    fi
    # Quiet mode asks S3 to report only the keys that failed, so a clean batch
    # is expected to print nothing and any output is treated as a problem.
    payload=$(jq -n --argjson o "$batch" '{Objects: $o, Quiet: true}')
    rc=0
    out=$(aws s3api delete-objects --bucket "$bucket" --delete "$payload" 2>&1) || rc=$?
    if [ -n "$out" ]; then
      echo "$out" | sed 's/^/ /'
    fi
    # Stop on any reported problem rather than re-listing the same
    # undeletable keys over and over.
    if (( rc != 0 )) || [ -n "$out" ]; then
      return 1
    fi
  done
  return 1
}

#######################################
# Empty a (possibly versioned) bucket and delete it, trying up to three times.
# Arguments:
#   $1 - bucket name
# Outputs:
#   stdout: per-attempt progress and any delete-objects output.
# Returns:
#   0 once head-bucket no longer succeeds, 1 if the bucket is still present
#   after the third attempt.
#######################################
delete_bucket_completely() {
  local bucket="$1" attempt rb_err
  for attempt in 1 2 3; do
    # Object versions first, then delete markers; failures are tolerated
    # because the rb + head-bucket check below decides the outcome.
    _delete_bucket_entries "$bucket" Versions || true
    _delete_bucket_entries "$bucket" DeleteMarkers || true
    echo " attempt $attempt: rb s3://$bucket --force"
    rb_err=$(aws s3 rb "s3://$bucket" --force 2>&1) || true
    # Discard stdout as well as stderr so head-bucket stays out of the log.
    if ! aws s3api head-bucket --bucket "$bucket" >/dev/null 2>&1; then
      echo " $bucket deleted"
      return 0
    fi
    echo " $bucket still present after attempt $attempt; rb stderr: ${rb_err:-}"
    sleep 5
  done
  return 1
}

# ── phases ────────────────────────────────────────────────────────────

echo "============================================================"
echo "PHASE 1: delete CFN stacks matching ${STACK}*"
echo "============================================================"
# Top-level stacks named STACK or STACK-*: delete each.
# delete-stack on the umbrella cascades to nested children.
# Stacks and buckets that survived their delete attempts; reported at the end
# so the final banner does not claim a clean slate when one was not reached.
# Phases 2-4 stay best-effort: their command output is logged but not tracked.
leftovers=()

top_stacks=$(aws cloudformation list-stacks \
  --stack-status-filter CREATE_COMPLETE UPDATE_COMPLETE UPDATE_ROLLBACK_COMPLETE \
    DELETE_FAILED UPDATE_ROLLBACK_FAILED CREATE_FAILED UPDATE_FAILED \
    ROLLBACK_COMPLETE ROLLBACK_FAILED \
  --query "StackSummaries[?(StackName=='${STACK}' || starts_with(StackName, '${STACK}-')) && ParentId==\`null\`].StackName" \
  --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true)
if [ -z "$top_stacks" ]; then
  echo "(none in terminal states)"
else
  echo "Top-level stacks to delete:"
  echo "$top_stacks"
  while IFS= read -r s; do
    [ -z "$s" ] && continue
    force_delete_stack "$s" || leftovers+=("stack $s")
  done <<< "$top_stacks"
fi

# Wait for any in-progress matches to settle too.
in_progress=$(aws cloudformation list-stacks \
  --stack-status-filter CREATE_IN_PROGRESS UPDATE_IN_PROGRESS DELETE_IN_PROGRESS \
    ROLLBACK_IN_PROGRESS UPDATE_ROLLBACK_IN_PROGRESS UPDATE_COMPLETE_CLEANUP_IN_PROGRESS REVIEW_IN_PROGRESS \
  --query "StackSummaries[?(StackName=='${STACK}' || starts_with(StackName, '${STACK}-')) && ParentId==\`null\`].StackName" \
  --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true)
if [ -n "$in_progress" ]; then
  echo "Waiting for in-progress matches to settle:"
  echo "$in_progress"
  while IFS= read -r s; do
    [ -z "$s" ] && continue
    wait_for_stable "$s" 3600 >/dev/null || echo " $s timeout, moving on"
    # After stable, try delete again
    force_delete_stack "$s" || leftovers+=("stack $s")
  done <<< "$in_progress"
fi

echo
echo "============================================================"
echo "PHASE 2: delete AWS::S3Files::FileSystem instances"
echo "============================================================"
# All FS in the account; access points → mount targets → file system.
# The sleeps after each tier give the deletes time to settle before the
# parent resource is deleted.
fs_list=$(aws s3files list-file-systems --max-results 100 \
  --query 'fileSystems[].fileSystemId' --output text 2>/dev/null \
  | tr '\t' '\n' | grep -v '^$' || true)
if [ -z "$fs_list" ]; then
  echo "(none)"
else
  while IFS= read -r fs; do
    [ -z "$fs" ] && continue
    echo "File system: $fs"
    aps=$(aws s3files list-access-points --file-system-id "$fs" \
      --query 'accessPoints[].accessPointId' --output text 2>/dev/null \
      | tr '\t' '\n' | grep -v '^$' || true)
    if [ -n "$aps" ]; then
      while IFS= read -r ap; do
        [ -z "$ap" ] && continue
        echo " delete access point: $ap"
        # `|| true`: under pipefail a failed delete would otherwise abort the run.
        aws s3files delete-access-point --access-point-id "$ap" 2>&1 | sed 's/^/ /' || true
      done <<< "$aps"
      sleep 30
    fi
    mts=$(aws s3files list-mount-targets --file-system-id "$fs" \
      --query 'mountTargets[].mountTargetId' --output text 2>/dev/null \
      | tr '\t' '\n' | grep -v '^$' || true)
    if [ -n "$mts" ]; then
      while IFS= read -r mt; do
        [ -z "$mt" ] && continue
        echo " delete mount target: $mt"
        aws s3files delete-mount-target --mount-target-id "$mt" 2>&1 | sed 's/^/ /' || true
      done <<< "$mts"
      sleep 60
    fi
    echo " delete file system: $fs"
    aws s3files delete-file-system --file-system-id "$fs" --force-delete 2>&1 | sed 's/^/ /' || true
  done <<< "$fs_list"
  sleep 60
fi

echo
echo "============================================================"
echo "PHASE 3: delete NDXTry_* AppRegistry applications"
echo "============================================================"
apps=$(aws servicecatalog-appregistry list-applications \
  --query "applications[?starts_with(name, 'NDXTry_')].name" \
  --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true)
if [ -z "$apps" ]; then
  echo "(none)"
else
  while IFS= read -r app; do
    [ -z "$app" ] && continue
    echo "delete appregistry: $app"
    aws servicecatalog-appregistry delete-application --application "$app" 2>&1 | sed 's/^/ /' || true
  done <<< "$apps"
  sleep 30
fi

echo
echo "============================================================"
echo "PHASE 4: delete ndx-try-* Amazon Connect instances"
echo "============================================================"
# Text output of [Id,InstanceAlias] is one tab-separated pair per line.
instances=$(aws connect list-instances \
  --query "InstanceSummaryList[?starts_with(InstanceAlias, 'ndx-try-')].[Id,InstanceAlias]" \
  --output text 2>/dev/null | grep -v '^$' || true)
if [ -z "$instances" ]; then
  echo "(none)"
else
  while IFS=$'\t' read -r inst alias; do
    [ -z "$inst" ] && continue
    echo "delete connect: $alias ($inst)"
    aws connect delete-instance --instance-id "$inst" 2>&1 | sed 's/^/ /' || true
  done <<< "$instances"
fi

echo
echo "============================================================"
echo "PHASE 5: delete ndx-try-*\${ACCOUNT_ID}* S3 buckets"
echo "============================================================"
acct=$(aws sts get-caller-identity --query Account --output text 2>/dev/null) || acct=""
# An empty account ID would turn contains(Name, '') into "match everything",
# so refuse to continue unless the lookup returned a 12-digit ID. The script
# still exits non-zero on a failed lookup, but now says why.
if [[ ! "$acct" =~ ^[0-9]{12}$ ]]; then
  echo "could not determine the AWS account ID (got '${acct}'); refusing to select buckets" >&2
  exit 1
fi
buckets=$(aws s3api list-buckets \
  --query "Buckets[?starts_with(Name, 'ndx-try-') && contains(Name, '${acct}')].Name" \
  --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true)
if [ -z "$buckets" ]; then
  echo "(none)"
else
  echo "Buckets to delete:"
  echo "$buckets"
  while IFS= read -r b; do
    [ -z "$b" ] && continue
    delete_bucket_completely "$b" || leftovers+=("bucket $b")
  done <<< "$buckets"
fi

echo
echo "============================================================"
if (( ${#leftovers[@]} > 0 )); then
  # Still exits 0: the run stays best-effort, but the log now says what is left.
  echo "DONE, but ${#leftovers[@]} item(s) could not be deleted:"
  printf ' - %s\n' "${leftovers[@]}"
else
  echo "DONE. Smoke account should now be in a clean slate."
fi
echo "============================================================"