diff --git a/.github/workflows/smoke.yml b/.github/workflows/smoke.yml index 0ca43bd1..f82f5477 100644 --- a/.github/workflows/smoke.yml +++ b/.github/workflows/smoke.yml @@ -18,6 +18,16 @@ on: push: branches: [main] + paths: + - 'cloudformation/scenarios/**' + - 'tests/smoke/**' + - 'scripts/smoke*.sh' + - 'scripts/check-quarantines.mjs' + - 'playwright.config.ts' + - 'docs/smoke-test-account-config.yml' + - 'docs/smoke-test-account-setup.md' + - '.github/workflows/smoke.yml' + - 'package.json' workflow_dispatch: diff --git a/scripts/smoke-pre-deploy-state.sh b/scripts/smoke-pre-deploy-state.sh index 2bc3b25f..c9acf995 100755 --- a/scripts/smoke-pre-deploy-state.sh +++ b/scripts/smoke-pre-deploy-state.sh @@ -19,9 +19,281 @@ set -euo pipefail # instead (PowerUserAccess on the deploy role now allows logs:DeleteLogGroup). STACK="${STACK_NAME}" -STATUS=$(aws cloudformation describe-stacks --stack-name "$STACK" \ - --query 'Stacks[0].StackStatus' --output text 2>/dev/null || echo "DOES_NOT_EXIST") -echo "Current $STACK status: $STATUS" +MAX_ITER=8 + +# Block until the stack reaches a terminal (non-IN_PROGRESS) state or the +# wait budget expires. Echoes the final status. Returns 0 on stable, 1 on +# timeout. We poll instead of using `aws cloudformation wait` because the +# all-demo umbrella's rollbacks routinely run past the CLI waiter's +# 60-poll * 30s = 30-minute budget. +wait_for_stable() { + local stack="$1" + local max_wait="${2:-3600}" + local interval=30 + local elapsed=0 + local s + while (( elapsed < max_wait )); do + s=$(aws cloudformation describe-stacks --stack-name "$stack" \ + --query 'Stacks[0].StackStatus' --output text 2>/dev/null || echo "DOES_NOT_EXIST") + if [[ "$s" != *_IN_PROGRESS ]]; then + echo "$s" + return 0 + fi + echo " $stack still $s (${elapsed}s elapsed, max ${max_wait}s)" >&2 + sleep "$interval" + elapsed=$((elapsed + interval)) + done + echo "$s" + return 1 +} + +# Force-cleanup one orphan stack. First plain delete, wait. If timed out +# or DELETE_FAILED, retry with --retain-resources on whatever's stuck. +# If a second pass also can't complete, retain ALL remaining resources +# so the stack at least leaves CFN's tracking — debris on the account is +# acceptable to unblock the umbrella; we log a stranded-stack issue so +# humans can sweep later. +cleanup_orphan() { + local orphan="$1" + local status retain remaining + echo " deleting orphan: $orphan" + aws cloudformation delete-stack --stack-name "$orphan" 2>/dev/null || true + status=$(wait_for_stable "$orphan" 3600 || echo "TIMEOUT") + case "$status" in + DELETE_COMPLETE|DOES_NOT_EXIST) + return 0 + ;; + esac + + retain=$(aws cloudformation list-stack-resources --stack-name "$orphan" \ + --query 'StackResourceSummaries[?ResourceStatus==`DELETE_FAILED`].LogicalResourceId' \ + --output text 2>/dev/null | tr '\t' ' ') + if [ -n "$retain" ]; then + echo " retrying orphan delete retaining: $retain" + # shellcheck disable=SC2086 + aws cloudformation delete-stack --stack-name "$orphan" \ + --retain-resources $retain 2>/dev/null || true + status=$(wait_for_stable "$orphan" 3600 || echo "TIMEOUT") + fi + + case "$status" in + DELETE_COMPLETE|DOES_NOT_EXIST) + gh issue create --title "smoke: $orphan retained resources" \ + --label stranded-stack \ + --body "Retained on orphan delete: $retain. Run ${GITHUB_RUN_ID}." || true + return 0 + ;; + esac + + # Last resort: retain literally every remaining resource so the stack + # disappears from CFN's tracking. Account debris is the lesser evil + # vs. the umbrella's child stacks colliding on globally-unique names. + remaining=$(aws cloudformation list-stack-resources --stack-name "$orphan" \ + --query 'StackResourceSummaries[].LogicalResourceId' \ + --output text 2>/dev/null | tr '\t' ' ') + if [ -n "$remaining" ]; then + echo " forcing orphan delete by retaining everything: $remaining" + # shellcheck disable=SC2086 + aws cloudformation delete-stack --stack-name "$orphan" \ + --retain-resources $remaining 2>/dev/null || true + wait_for_stable "$orphan" 1800 || true + gh issue create --title "smoke: $orphan force-retained all resources" \ + --label stranded-stack \ + --body "All resources retained on force-delete: $remaining. Run ${GITHUB_RUN_ID}." || true + fi +} + +# Sweep CFN stacks whose name starts with "${STACK}-" — both retained nested +# stacks left over from delete-with-retain (all-demo-PaperlessNgx-*) and any +# previous recovery stacks (all-demo-recovery-*). +sweep_orphan_stacks() { + local orphans orphan + orphans=$(aws cloudformation list-stacks \ + --stack-status-filter CREATE_COMPLETE UPDATE_COMPLETE UPDATE_ROLLBACK_COMPLETE DELETE_FAILED UPDATE_ROLLBACK_FAILED CREATE_FAILED UPDATE_FAILED ROLLBACK_COMPLETE ROLLBACK_FAILED \ + --query "StackSummaries[?starts_with(StackName, \`${STACK}-\`)].StackName" \ + --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true) + [ -z "$orphans" ] && { echo "No orphan ${STACK}-* stacks to sweep."; return 0; } + echo "Orphan stacks to sweep:" + echo "$orphans" + while IFS= read -r orphan; do + [ -z "$orphan" ] && continue + cleanup_orphan "$orphan" + done <<< "$orphans" +} + +# Delete orphan S3 Files file systems whose bucket name matches ndx-try-*. +# AWS::S3Files::FileSystem (Paperless-ngx StorageFileSystem etc.) attaches +# to an S3 bucket; the bucket then refuses delete with +# BucketHasS3FileSystemAttached until the file system is removed. Must +# run before sweep_orphan_s3_buckets so the buckets actually disappear. +sweep_orphan_s3files() { + local acct fs_list fs_id fs_bucket + acct=$(aws sts get-caller-identity --query Account --output text 2>/dev/null) || return 0 + # Output format: fileSystemIdbucket-arn per line. + fs_list=$(aws s3files list-file-systems --max-results 100 \ + --query "fileSystems[].[fileSystemId,bucket]" \ + --output text 2>&1) || { + echo " list-file-systems failed (CLI may be too old in this runner): $fs_list" + return 0 + } + [ -z "$fs_list" ] && { echo "No S3 Files file systems to sweep."; return 0; } + echo "S3 Files file systems to sweep (filtered by bucket name):" + while IFS=$'\t' read -r fs_id fs_bucket; do + [ -z "$fs_id" ] && continue + # fs_bucket is an arn like arn:aws:s3:::ndx-try-*-- + case "$fs_bucket" in + *ndx-try-*${acct}*) + echo " deleting file system: $fs_id (bucket: $fs_bucket)" + aws s3files delete-file-system --file-system-id "$fs_id" --force-delete 2>&1 | sed 's/^/ /' || \ + gh issue create --title "smoke: S3 Files filesystem $fs_id couldn't be deleted" \ + --label stranded-stack \ + --body "Pre-deploy sweep on run ${GITHUB_RUN_ID} could not delete file system $fs_id (bucket $fs_bucket)." || true + ;; + *) + echo " skipping unrelated file system: $fs_id (bucket: $fs_bucket)" + ;; + esac + done <<< "$fs_list" + # File-system delete is async; the bucket stays "attached" until the + # delete completes server-side. Wait briefly so the subsequent bucket + # rb has a chance of succeeding. + echo " waiting 60s for S3 Files deletes to release buckets" + sleep 60 +} + +# Empty and delete a single S3 bucket. Iterates because (a) list-object- +# versions paginates at 1000 entries, (b) `aws s3 rb --force` doesn't +# touch noncurrent versions or delete-markers on versioned buckets, and +# (c) we need to verify the bucket actually disappeared rather than rely +# on a swallowed exit code. stderr is captured + surfaced so a permissions +# problem doesn't look like "no buckets found". +delete_bucket_completely() { + local bucket="$1" attempt versions markers rb_err del_payload del_err + for attempt in 1 2 3; do + # Versions[] -- noncurrent object versions. Must be JSON object form + # (`--delete '{"Objects":[...]}'`); CLI shorthand `Objects=[{...}]` + # rejects JSON-quoted keys and dies with "Expected: '=', received: '"'". + versions=$(aws s3api list-object-versions --bucket "$bucket" --max-items 1000 \ + --query '{Objects: Versions[].{Key: Key, VersionId: VersionId}}' \ + --output json 2>/dev/null | jq -c '.Objects // []' || echo '[]') + if [ "$versions" != "[]" ] && [ -n "$versions" ]; then + del_payload=$(jq -n --argjson o "$versions" '{Objects: $o}') + del_err=$(aws s3api delete-objects --bucket "$bucket" --delete "$del_payload" 2>&1) || true + [ -n "$del_err" ] && echo "$del_err" | sed 's/^/ /' + fi + # DeleteMarkers[] -- tombstones on versioned buckets + markers=$(aws s3api list-object-versions --bucket "$bucket" --max-items 1000 \ + --query '{Objects: DeleteMarkers[].{Key: Key, VersionId: VersionId}}' \ + --output json 2>/dev/null | jq -c '.Objects // []' || echo '[]') + if [ "$markers" != "[]" ] && [ -n "$markers" ]; then + del_payload=$(jq -n --argjson o "$markers" '{Objects: $o}') + del_err=$(aws s3api delete-objects --bucket "$bucket" --delete "$del_payload" 2>&1) || true + [ -n "$del_err" ] && echo "$del_err" | sed 's/^/ /' + fi + echo " attempt $attempt: rb s3://$bucket --force" + rb_err=$(aws s3 rb "s3://$bucket" --force 2>&1) || true + if ! aws s3api head-bucket --bucket "$bucket" 2>/dev/null; then + echo " $bucket deleted" + return 0 + fi + echo " $bucket still present after attempt $attempt; rb stderr: ${rb_err:-}" + sleep 5 + done + gh issue create --title "smoke: bucket $bucket couldn't be deleted" \ + --label stranded-stack \ + --body "Pre-deploy sweep on run ${GITHUB_RUN_ID} could not delete $bucket. Last rb stderr: $rb_err" || true + return 1 +} + +# Empty and delete S3 buckets matching ndx-try-*${ACCOUNT_ID}*. Force-retain +# orphan stack cleanups leave their non-stack-owned children behind, and +# scenario templates use deterministic bucket names with the account id in +# the suffix (e.g. ndx-try-planning-docs--). The next umbrella +# create then trips on AlreadyExists. Only call when the umbrella is truly +# absent from CFN (status=DOES_NOT_EXIST) so we can't accidentally delete a +# bucket that an in-flight stack still owns. +sweep_orphan_s3_buckets() { + local acct buckets bucket + acct=$(aws sts get-caller-identity --query Account --output text 2>/dev/null) || return 0 + # JMESPath backtick-literals parse their contents as JSON, so a bare + # 12-digit account id becomes a number — and `contains(Name, )` + # never matches the string bucket names. Single-quoted JMESPath strings + # avoid the parse and behave intuitively. + buckets=$(aws s3api list-buckets \ + --query "Buckets[?starts_with(Name, 'ndx-try-') && contains(Name, '${acct}')].Name" \ + --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true) + [ -z "$buckets" ] && { echo "No ndx-try-*${acct}* buckets to sweep."; return 0; } + echo "S3 buckets to sweep:" + echo "$buckets" + while IFS= read -r bucket; do + [ -z "$bucket" ] && continue + delete_bucket_completely "$bucket" + done <<< "$buckets" +} + +# Delete Amazon Connect instances whose InstanceAlias starts with ndx-try-. +# AICC creates one and its alias is account-globally-unique, so a leftover +# from a previous run blocks the next create with "Instance alias is +# already used". delete-instance is async but the alias is freed +# immediately on the API call. +sweep_orphan_connect() { + local instances inst alias + instances=$(aws connect list-instances \ + --query "InstanceSummaryList[?starts_with(InstanceAlias, 'ndx-try-')].[Id,InstanceAlias]" \ + --output text 2>/dev/null | grep -v '^$' || true) + [ -z "$instances" ] && { echo "No ndx-try-* Connect instances to sweep."; return 0; } + echo "Connect instances to sweep:" + echo "$instances" + while IFS=$'\t' read -r inst alias; do + [ -z "$inst" ] && continue + echo " deleting connect instance: $alias ($inst)" + aws connect delete-instance --instance-id "$inst" 2>&1 | sed 's/^/ /' || \ + gh issue create --title "smoke: Connect instance $alias couldn't be deleted" \ + --label stranded-stack \ + --body "Pre-deploy sweep on run ${GITHUB_RUN_ID} could not delete Connect instance $alias ($inst)." || true + done <<< "$instances" +} + +# Delete ServiceCatalog AppRegistry applications matching NDXTry_*. Same +# orphan story as buckets: scenario templates create AppRegistry apps with +# deterministic names; survivors block the next create. +sweep_orphan_appregistry() { + local apps app + apps=$(aws servicecatalog-appregistry list-applications \ + --query "applications[?starts_with(name, 'NDXTry_')].name" \ + --output text 2>/dev/null | tr '\t' '\n' | grep -v '^$' || true) + [ -z "$apps" ] && { echo "No NDXTry_* AppRegistry apps to sweep."; return 0; } + echo "AppRegistry apps to sweep:" + echo "$apps" + while IFS= read -r app; do + [ -z "$app" ] && continue + echo " deleting appregistry: $app" + aws servicecatalog-appregistry delete-application --application "$app" 2>/dev/null || \ + gh issue create --title "smoke: appregistry $app couldn't be deleted" \ + --label stranded-stack \ + --body "Pre-deploy sweep on run ${GITHUB_RUN_ID} could not delete $app." || true + done <<< "$apps" +} + +# Final exit path for "use the canonical stack name". Sweeps orphans (so +# any nested-stack children left over from prior runs don't collide on +# globally-unique resource names) and writes the GH Actions output. +# Resource-level sweeps (S3 / AppRegistry) only fire when the umbrella is +# truly gone — running them while CFN is mid-create would yank live state +# out from under the stack. +use_canonical() { + sweep_orphan_stacks + if [ "$(aws cloudformation describe-stacks --stack-name "$STACK" \ + --query 'Stacks[0].StackStatus' --output text 2>/dev/null || echo DOES_NOT_EXIST)" = "DOES_NOT_EXIST" ]; then + # Order matters: file systems first (they hold buckets), then buckets. + sweep_orphan_s3files + sweep_orphan_s3_buckets + sweep_orphan_appregistry + sweep_orphan_connect + fi + echo "stack_name=$STACK" >> "$GITHUB_OUTPUT" + exit 0 +} emit_recovery() { local reason="$1" @@ -30,60 +302,152 @@ emit_recovery() { gh issue create --title "stranded-stack: $STACK ($reason)" \ --label stranded-stack \ --body "Run ${GITHUB_RUN_ID} proceeded against recovery name $recovery. Manual cleanup needed." || true + exit 0 } -case "$STATUS" in - DOES_NOT_EXIST|CREATE_COMPLETE|UPDATE_COMPLETE|UPDATE_ROLLBACK_COMPLETE) - echo "stack_name=$STACK" >> "$GITHUB_OUTPUT" - ;; - CREATE_FAILED|UPDATE_FAILED) - # Fix-forward: CFN's `update-stack` (which `aws cloudformation deploy` - # uses) accepts both *_FAILED states and replaces failed resources without - # touching the healthy ones. Reaching these states means at least one - # leaf resource failed outright but the umbrella rollback couldn't run to - # completion — we let the next deploy retry the leaves. - echo "Fix-forwarding from $STATUS" - echo "stack_name=$STACK" >> "$GITHUB_OUTPUT" - ;; - ROLLBACK_COMPLETE) - # CFN refuses updates on ROLLBACK_COMPLETE (initial CREATE rolled back). - # Delete + recreate is the only option. - aws cloudformation delete-stack --stack-name "$STACK" - if aws cloudformation wait stack-delete-complete --stack-name "$STACK"; then - echo "stack_name=$STACK" >> "$GITHUB_OUTPUT" - else - emit_recovery "delete from ROLLBACK_COMPLETE failed" - fi - ;; - *_IN_PROGRESS) - case "$STATUS" in - UPDATE_IN_PROGRESS|UPDATE_COMPLETE_CLEANUP_IN_PROGRESS|UPDATE_ROLLBACK_IN_PROGRESS) - aws cloudformation cancel-update-stack --stack-name "$STACK" 2>/dev/null || true - ;; - esac - sleep 60 - STATUS_NOW=$(aws cloudformation describe-stacks --stack-name "$STACK" \ - --query 'Stacks[0].StackStatus' --output text 2>/dev/null || echo "DOES_NOT_EXIST") - if [[ "$STATUS_NOW" =~ _IN_PROGRESS$ ]]; then - emit_recovery "stuck in $STATUS_NOW" - else - echo "stack_name=$STACK" >> "$GITHUB_OUTPUT" - fi - ;; - UPDATE_ROLLBACK_FAILED) - aws cloudformation continue-update-rollback --stack-name "$STACK" || true - echo "stack_name=$STACK" >> "$GITHUB_OUTPUT" - ;; - DELETE_FAILED) - aws cloudformation delete-stack --stack-name "$STACK" - if aws cloudformation wait stack-delete-complete --stack-name "$STACK"; then - echo "stack_name=$STACK" >> "$GITHUB_OUTPUT" - else - emit_recovery "DELETE_FAILED" - fi - ;; - *) - echo "Unhandled status $STATUS; proceeding with default name" - echo "stack_name=$STACK" >> "$GITHUB_OUTPUT" - ;; -esac +# Re-evaluating loop: any time a state-handling branch performs a wait or +# CFN-mutating call, we `continue` and re-read the stack's status. This +# lets transitions like ROLLBACK_IN_PROGRESS → ROLLBACK_COMPLETE be +# handled by the appropriate branch (delete + recreate) on the next +# iteration, instead of falling through to use_canonical with a status +# that isn't actually deployable. +for ITER in $(seq 1 $MAX_ITER); do + STATUS=$(aws cloudformation describe-stacks --stack-name "$STACK" \ + --query 'Stacks[0].StackStatus' --output text 2>/dev/null || echo "DOES_NOT_EXIST") + echo "[iter ${ITER}/${MAX_ITER}] $STACK status: $STATUS" + + case "$STATUS" in + DOES_NOT_EXIST|CREATE_COMPLETE|UPDATE_COMPLETE|UPDATE_ROLLBACK_COMPLETE) + use_canonical + ;; + CREATE_FAILED|UPDATE_FAILED) + # Fix-forward: CFN's `update-stack` (which `aws cloudformation deploy` + # uses) accepts both *_FAILED states and replaces failed resources + # without touching the healthy ones. + echo "Fix-forwarding from $STATUS" + use_canonical + ;; + ROLLBACK_COMPLETE) + # CFN refuses updates on ROLLBACK_COMPLETE (initial CREATE rolled back). + # Delete + recreate is the only option. + echo "Deleting from ROLLBACK_COMPLETE" + aws cloudformation delete-stack --stack-name "$STACK" || true + wait_for_stable "$STACK" 3600 || emit_recovery "delete from ROLLBACK_COMPLETE timed out" + continue + ;; + *_IN_PROGRESS) + case "$STATUS" in + UPDATE_IN_PROGRESS) + # cancel-update-stack only works on UPDATE_IN_PROGRESS; CFN refuses + # on the cleanup or rollback variants. + aws cloudformation cancel-update-stack --stack-name "$STACK" 2>/dev/null || true + ;; + esac + wait_for_stable "$STACK" 3600 || emit_recovery "stuck in $STATUS after 60m wait" + continue + ;; + UPDATE_ROLLBACK_FAILED) + # continue-update-rollback is async; on the first attempt CFN retries + # the same failing leaves. If they fail again we re-issue with + # --resources-to-skip on those leaves. If THAT still ends in + # UPDATE_ROLLBACK_FAILED, fall through to delete-stack so the + # umbrella's globally-unique child resources (AppRegistryApplication + # etc.) get freed for the next create. + echo "Attempting continue-update-rollback" + aws cloudformation continue-update-rollback --stack-name "$STACK" 2>/dev/null || true + STATUS_NOW=$(wait_for_stable "$STACK" 3600) || \ + emit_recovery "continue-update-rollback still running after 60m" + if [ "$STATUS_NOW" = "UPDATE_ROLLBACK_FAILED" ]; then + SKIP=$(aws cloudformation list-stack-resources --stack-name "$STACK" \ + --query 'StackResourceSummaries[?ResourceStatus==`UPDATE_FAILED`].LogicalResourceId' \ + --output text | tr '\t' ' ') + if [ -n "$SKIP" ]; then + echo "Retrying continue-update-rollback skipping: $SKIP" + # shellcheck disable=SC2086 + aws cloudformation continue-update-rollback --stack-name "$STACK" \ + --resources-to-skip $SKIP 2>/dev/null || true + STATUS_NOW=$(wait_for_stable "$STACK" 3600) || \ + emit_recovery "rollback retry-with-skip still running after 60m" + fi + fi + if [ "$STATUS_NOW" = "UPDATE_ROLLBACK_FAILED" ]; then + echo "Rollback unrecoverable; deleting $STACK" + aws cloudformation delete-stack --stack-name "$STACK" 2>/dev/null || true + STATUS_NOW=$(wait_for_stable "$STACK" 3600) || \ + emit_recovery "delete-stack still running after 60m" + if [ "$STATUS_NOW" = "DELETE_FAILED" ]; then + RETAIN=$(aws cloudformation list-stack-resources --stack-name "$STACK" \ + --query 'StackResourceSummaries[?ResourceStatus==`DELETE_FAILED`].LogicalResourceId' \ + --output text | tr '\t' ' ') + if [ -n "$RETAIN" ]; then + echo "Retrying delete-stack retaining: $RETAIN" + # shellcheck disable=SC2086 + aws cloudformation delete-stack --stack-name "$STACK" \ + --retain-resources $RETAIN 2>/dev/null || true + wait_for_stable "$STACK" 3600 || \ + emit_recovery "delete-stack-with-retain still running after 60m" + gh issue create --title "smoke: retained resources after $STACK delete" \ + --label stranded-stack \ + --body "Retained on delete: $RETAIN. Run ${GITHUB_RUN_ID}." || true + fi + fi + fi + continue + ;; + DELETE_FAILED) + aws cloudformation delete-stack --stack-name "$STACK" 2>/dev/null || true + STATUS_NOW=$(wait_for_stable "$STACK" 3600) || \ + emit_recovery "delete from DELETE_FAILED still running after 60m" + if [ "$STATUS_NOW" = "DELETE_FAILED" ]; then + RETAIN=$(aws cloudformation list-stack-resources --stack-name "$STACK" \ + --query 'StackResourceSummaries[?ResourceStatus==`DELETE_FAILED`].LogicalResourceId' \ + --output text | tr '\t' ' ') + if [ -n "$RETAIN" ]; then + echo "Retrying delete-stack from DELETE_FAILED retaining: $RETAIN" + # shellcheck disable=SC2086 + aws cloudformation delete-stack --stack-name "$STACK" \ + --retain-resources $RETAIN 2>/dev/null || true + wait_for_stable "$STACK" 3600 || \ + emit_recovery "DELETE_FAILED-with-retain still running after 60m" + gh issue create --title "smoke: retained resources after $STACK delete (DELETE_FAILED)" \ + --label stranded-stack \ + --body "Retained on delete: $RETAIN. Run ${GITHUB_RUN_ID}." || true + fi + fi + continue + ;; + ROLLBACK_FAILED) + # ROLLBACK_FAILED comes from a failed initial CREATE rollback. Unlike + # UPDATE_ROLLBACK_FAILED, CFN does not accept continue-update-rollback + # here — the only recovery is delete-stack. If delete itself ends in + # DELETE_FAILED, retain the stuck leaves and proceed. + echo "Deleting from ROLLBACK_FAILED" + aws cloudformation delete-stack --stack-name "$STACK" 2>/dev/null || true + STATUS_NOW=$(wait_for_stable "$STACK" 3600) || \ + emit_recovery "delete from ROLLBACK_FAILED still running after 60m" + if [ "$STATUS_NOW" = "DELETE_FAILED" ]; then + RETAIN=$(aws cloudformation list-stack-resources --stack-name "$STACK" \ + --query 'StackResourceSummaries[?ResourceStatus==`DELETE_FAILED`].LogicalResourceId' \ + --output text | tr '\t' ' ') + if [ -n "$RETAIN" ]; then + echo "Retrying delete-stack from ROLLBACK_FAILED retaining: $RETAIN" + # shellcheck disable=SC2086 + aws cloudformation delete-stack --stack-name "$STACK" \ + --retain-resources $RETAIN 2>/dev/null || true + wait_for_stable "$STACK" 3600 || \ + emit_recovery "ROLLBACK_FAILED retain-delete still running after 60m" + gh issue create --title "smoke: retained resources after $STACK delete (ROLLBACK_FAILED)" \ + --label stranded-stack \ + --body "Retained on delete: $RETAIN. Run ${GITHUB_RUN_ID}." || true + fi + fi + continue + ;; + *) + echo "Unhandled status $STATUS; proceeding with canonical name" + use_canonical + ;; + esac +done + +emit_recovery "exhausted ${MAX_ITER} iterations of state reconciliation"