Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
18cf8ef
Add three-layer defense against orphaned test resources
Nov 5, 2025
b222487
Fix shellcheck errors in cleanup workflow
Nov 5, 2025
95c608c
Fix shellcheck errors in cleanup-test-resources script
Nov 5, 2025
b5bb6eb
TICKET-942 Remove scheduled orphaned detection workflow
Nov 5, 2025
05f934c
Address PR review feedback
Nov 14, 2025
9aad608
Add scheduled workflow to scan for orphaned test resources
Nov 14, 2025
c57578a
Add retry logic to teardown functions
Nov 14, 2025
d745061
Address PR review comments
Nov 25, 2025
0438773
Merge branch 'main' into fix/improve-test-cleanup-process
sean-navapbc Nov 25, 2025
c553d1c
Add runCommandWithRetry helper function to template_infra_test.go
Nov 26, 2025
68401b4
Temporarily trigger scan workflow on feature branch for testing
Nov 26, 2025
203df2f
Rename workflows to template-only- prefix for consistency
Nov 26, 2025
d9d84d5
Remove unnecessary chmod commands from workflows
Nov 26, 2025
322efb1
Rename cleanup script to template-only- prefix
Nov 26, 2025
240f872
Fix shellcheck warning SC2129 - group redirects
Nov 26, 2025
44c06b8
Merge branch 'main' into fix/improve-test-cleanup-process
sean-navapbc Dec 1, 2025
edc0de4
Update .github/workflows/template-only-cleanup-orphaned-infra-test-re…
sean-navapbc Dec 1, 2025
c65abc7
Rename script and update workflow display names
Dec 1, 2025
41d64d7
Continue cleanup even when state bucket is missing
Dec 1, 2025
882a63c
Suppress broken pipe errors in scan workflow
Dec 1, 2025
383f93c
Fix broken pipe errors by redirecting entire pipeline stderr
Dec 1, 2025
9a11699
Add ECS task definition cleanup to cleanup script
Dec 1, 2025
4529637
Fix cleanup script to handle all orphaned resource types
Dec 8, 2025
5a83472
Remove unnecessary private zone filter from Route53 cleanup
Dec 8, 2025
964db22
Improve task definition cleanup logic
Dec 8, 2025
f3af99c
Fix broken pipe errors in scan workflow
Dec 8, 2025
bee83f4
Fix all broken pipe errors in scan workflow
Dec 9, 2025
a5d77ae
Fix shellcheck lint warning in cleanup script
Dec 9, 2025
eda7577
Clean up task definitions in destroy-app-service script
Dec 9, 2025
29ab11d
Remove notify block from scan workflow
Dec 9, 2025
743d112
Fix broken pipe errors by disabling pipefail around head commands
Dec 9, 2025
1f56642
Merge branch 'main' into fix/improve-test-cleanup-process
sean-navapbc Dec 11, 2025
daa0f2e
Fix broken pipe errors and add inactive task definition cleanup
Dec 11, 2025
b6dd99c
Fix broken pipe by trapping SIGPIPE signal
Dec 11, 2025
aea90f5
Suppress broken pipe error messages with stderr redirect
Dec 11, 2025
2dc6051
Fix broken pipe error and show full scan output
Dec 16, 2025
765325e
Fix broken pipe errors by using temp file instead of pipes
Dec 16, 2025
c318ace
Fix inactive task definition lookup to check tags instead of family p…
Dec 16, 2025
005881a
Remove temporary push trigger for testing
Dec 22, 2025
b46f459
Remove unused runCommandWithRetry helper function
Jan 5, 2026
355e27d
Remove unused AGE_HOURS variable from cleanup script
Jan 5, 2026
b9cbff5
Improve task definition cleanup comments and use pushd/popd
Jan 6, 2026
5192884
Get project config from terraform instead of pwd/aws config
Jan 6, 2026
4c768fe
Remove unused --age-hours argument from scan workflow
Jan 6, 2026
e451aa4
Remove unused --age-hours parameter from cleanup workflow
Jan 6, 2026
9acd97c
Fix GitHub issue link for task definition deletion
Jan 8, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
name: Template Cleanup Orphaned Infra Test Resources

on:
  # Manual trigger only - cleanup is intentionally not automatic
  # to avoid masking underlying test issues that should be fixed
  workflow_dispatch:
    inputs:
      project_name:
        description: 'Specific project to clean up (e.g., plt-tst-act-12345). Leave empty to find all projects.'
        required: false
        type: string
      age_hours:
        description: 'Only delete resources older than this many hours'
        required: false
        default: '6'
        type: string
      dry_run:
        description: 'Dry run - list resources without deleting them'
        required: false
        default: true
        type: boolean

jobs:
  cleanup:
    name: Cleanup Test Resources
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: us-east-1
          aws-access-key-id: ${{ secrets.TESTER_AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.TESTER_AWS_SECRET_ACCESS_KEY }}

      - name: Run cleanup script
        # Workflow inputs are handed to the script through environment
        # variables instead of being interpolated into the shell source, so a
        # crafted input value cannot inject shell commands into this step.
        env:
          INPUT_DRY_RUN: ${{ inputs.dry_run }}
          INPUT_AGE_HOURS: ${{ inputs.age_hours }}
          INPUT_PROJECT_NAME: ${{ inputs.project_name }}
        run: |
          chmod +x ./template-only-bin/cleanup-test-resources

          # Build the argument list as an array so each value stays one word
          args=()
          if [ "${INPUT_DRY_RUN}" = "true" ]; then
            args+=(--dry-run)
          fi

          args+=(--age-hours "${INPUT_AGE_HOURS}")

          # The project name is optional; without it the script discovers
          # every matching test project on its own
          if [ -n "${INPUT_PROJECT_NAME}" ]; then
            args+=("${INPUT_PROJECT_NAME}")
          fi

          ./template-only-bin/cleanup-test-resources "${args[@]}"
56 changes: 56 additions & 0 deletions .github/workflows/template-scan-orphaned-infra-test-resources.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
name: Template Scan Orphaned Infra Test Resources

on:
  workflow_dispatch:
  schedule:
    # Run every day at 08:00 UTC (4:00am ET, 1:00am PT)
    - cron: "0 8 * * *"

jobs:
  scan:
    name: Scan for orphaned test resources
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: us-east-1
          aws-access-key-id: ${{ secrets.TESTER_AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.TESTER_AWS_SECRET_ACCESS_KEY }}

      - name: Scan for orphaned resources
        id: scan
        run: |
          chmod +x ./template-only-bin/cleanup-test-resources

          # Run in dry-run mode to see what would be deleted. The exit status
          # is captured instead of letting `bash -e` abort the step, so the
          # script's output is always printed, even when the script fails.
          status=0
          output=$(./template-only-bin/cleanup-test-resources --dry-run --age-hours 6 2>&1) || status=$?
          echo "$output"

          if [ "${status}" -ne 0 ]; then
            echo "Scan script failed with exit code ${status}" >&2
            exit "${status}"
          fi

          # Check if any resources were found (look for "Found X resources" in output).
          # Here-strings are used instead of `echo ... | grep` so that no
          # writer is left holding a pipe that grep has already closed.
          if grep -q "Found [1-9][0-9]* resources" <<<"$output"; then
            # Extract resource count and project names for notification.
            # `grep -m 20` caps the result at 20 lines without piping to head.
            resource_info=$(grep -m 20 -E "(Found [0-9]+ resources|Cleaning up project:|plt-tst-act-)" <<<"$output" || true)
            {
              echo "found=true"
              echo "resource_info<<EOF"
              echo "$resource_info"
              echo "EOF"
            } >> "$GITHUB_OUTPUT"
            # Fail the job on purpose so the notify job below fires
            exit 1
          else
            echo "found=false" >> "$GITHUB_OUTPUT"
            echo "No orphaned resources found"
          fi

  notify:
    name: " "
    needs: scan
    if: failure()
    uses: ./.github/workflows/send-system-notification.yml
    with:
      channel: "workflow-failures"
      # NOTE(review): confirm the cleanup workflow link below matches the
      # actual filename of the cleanup workflow in .github/workflows
      message: "🧹 [Orphaned test resources detected in template-infra](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})\n\nRun the [cleanup workflow](https://github.com/${{ github.repository }}/actions/workflows/cleanup-orphaned-test-resources.yml) to remove them."
    secrets: inherit
10 changes: 10 additions & 0 deletions infra/test/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ package test
import (
"fmt"
"testing"
"time"

"github.com/gruntwork-io/terratest/modules/retry"
"github.com/gruntwork-io/terratest/modules/shell"
"github.com/gruntwork-io/terratest/modules/terraform"
)

Expand All @@ -18,3 +21,10 @@ import (
func TerraformInit(t *testing.T, terraformOptions *terraform.Options, backendConfig string) {
	// Build the single -backend-config flag first, then hand it to "terraform init".
	backendFlag := fmt.Sprintf("-backend-config=%s", backendConfig)
	terraform.RunTerraformCommand(t, terraformOptions, "init", backendFlag)
}

// runCommandWithRetry executes the given shell command through
// retry.DoWithRetry, passing along the description, the maximum number of
// retries and the pause between retries. The command's output is not
// captured; only its error decides whether an attempt succeeded.
func runCommandWithRetry(t *testing.T, description string, maxRetries int, sleepBetweenRetries time.Duration, command shell.Command) {
	// A single attempt: run the command and report only its error.
	attempt := func() (string, error) {
		err := shell.RunCommandE(t, command)
		return "", err
	}

	retry.DoWithRetry(t, description, maxRetries, sleepBetweenRetries, attempt)
}
225 changes: 225 additions & 0 deletions template-only-bin/cleanup-test-resources
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
#!/usr/bin/env bash
# -----------------------------------------------------------------------------
# Cleanup orphaned test resources from failed CI runs
#
# This script finds and destroys resources from test runs that failed to
# clean up after themselves. It targets resources whose "project" tag matches
# the pattern "plt-tst-act-*". An age threshold can be passed in, but it is
# currently only printed in the banner and is not used to filter resources.
#
# Usage:
# cleanup-test-resources [--dry-run] [--age-hours HOURS] [PROJECT_NAME]
#
# Arguments:
# PROJECT_NAME (optional) - Specific project to clean up (e.g., plt-tst-act-12345)
# If not provided, finds all matching projects
#
# Options:
# --dry-run - List resources that would be deleted without deleting them
# --age-hours N - Age threshold in hours (default: 6). The value is accepted
#                 and echoed, but resources are not yet filtered by age
#
# Examples:
# # Dry run to see what would be deleted
# cleanup-test-resources --dry-run
#
# # Clean up a specific project
# cleanup-test-resources plt-tst-act-12345
#
# # Clean up all projects older than 12 hours
# cleanup-test-resources --age-hours 12
# -----------------------------------------------------------------------------
set -euo pipefail

# Default values (command-line arguments override the first three)
DRY_RUN=false
AGE_HOURS=6
PROJECT_NAME=""
AWS_ACCOUNT_ID="533267424629"
AWS_REGION="us-east-1"

#######################################
# Parse command-line arguments into the script's global settings.
# Globals:   DRY_RUN, AGE_HOURS, PROJECT_NAME (written)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   an error message on stderr when an argument is invalid
# Returns:   0 on success; exits the script with status 1 on invalid input
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --dry-run)
        DRY_RUN=true
        shift
        ;;
      --age-hours)
        # Check for the value explicitly: under `set -u` a bare "$2" would
        # abort with an opaque "unbound variable" error instead.
        if [[ $# -lt 2 ]]; then
          echo "Missing value for --age-hours" >&2
          exit 1
        fi
        if [[ ! "$2" =~ ^[0-9]+$ ]]; then
          echo "Invalid value for --age-hours: $2 (expected a non-negative integer)" >&2
          exit 1
        fi
        AGE_HOURS="$2"
        shift 2
        ;;
      plt-tst-act-*)
        # Any positional argument with the test-project prefix is taken as
        # the single project to clean up (the last one given wins)
        PROJECT_NAME="$1"
        shift
        ;;
      *)
        echo "Unknown argument: $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

# Print the effective settings so every run's log shows what it was asked to do
echo "=== Cleanup Orphaned Test Resources ==="
echo "Dry run: ${DRY_RUN}"
echo "Age threshold: ${AGE_HOURS} hours"
echo "Region: ${AWS_REGION}"
echo ""

# Find projects to clean up
if [ -n "${PROJECT_NAME}" ]; then
  PROJECTS="${PROJECT_NAME}"
  echo "Cleaning up specific project: ${PROJECT_NAME}"
else
  echo "Finding all test projects..."
  # The API call and the filtering are done in two steps. A failed AWS call
  # (bad credentials, throttling, ...) must not be mistaken for "no test
  # projects exist", which would make a broken scan look clean.
  # shellcheck disable=SC2016 # Backticks are JMESPath syntax, not bash
  if ! tagged_projects=$(aws resourcegroupstaggingapi get-resources \
    --region "${AWS_REGION}" \
    --tag-filters Key=project \
    --query 'ResourceTagMappingList[].Tags[?Key==`project`].Value' \
    --output text); then
    echo "Failed to list tagged resources; cannot determine test projects." >&2
    exit 1
  fi

  # One project name per line, test projects only, de-duplicated. grep exits
  # non-zero when nothing matches, which is a valid outcome here.
  PROJECTS=$(tr '\t' '\n' <<<"${tagged_projects}" | grep '^plt-tst-act-' | sort -u || true)

  if [ -z "${PROJECTS}" ]; then
    echo "No test projects found."
    exit 0
  fi

  echo "Found projects:"
  echo "${PROJECTS}"
fi

echo ""

#######################################
# Find and, unless DRY_RUN is true, delete AWS resources tagged with the
# given project.
#
# Resources are discovered through the "project" tag. In dry-run mode the
# matching ARNs are only listed. Otherwise Route 53 hosted zones, ECS
# services and clusters, load balancers and target groups carrying the tag
# are deleted, in that order. Individual delete failures are reported and
# skipped so the remaining resources are still attempted.
#
# Globals:   AWS_ACCOUNT_ID, AWS_REGION, DRY_RUN (read)
# Arguments: $1 - project name (value of the "project" tag)
# Outputs:   progress messages on stdout, listing errors on stderr
# Returns:   0 when done or nothing to do, 1 when resources cannot be listed
#######################################
cleanup_project() {
  local project=${1}
  echo "=== Cleaning up project: ${project} ==="

  # Check if terraform state bucket exists. Its absence is only reported:
  # tagged resources can outlive the bucket, and returning here would leave
  # exactly those orphans undetected.
  local bucket_name="${project}-${AWS_ACCOUNT_ID}-${AWS_REGION}-tf"

  if ! aws s3api head-bucket --bucket "${bucket_name}" 2>/dev/null; then
    echo "No terraform state bucket found for ${project}; continuing with tag-based cleanup"
  fi

  # List all resources for this project
  echo "Finding resources..."
  local resources
  if ! resources=$(aws resourcegroupstaggingapi get-resources \
    --region "${AWS_REGION}" \
    --tag-filters Key=project,Values="${project}" \
    --query 'ResourceTagMappingList[].ResourceARN' \
    --output text); then
    # A failed lookup is not the same as "nothing to clean up"
    echo "Failed to list resources for project ${project}" >&2
    return 1
  fi

  if [ -z "${resources}" ]; then
    echo "No resources found for project ${project}"
    return 0
  fi

  # Count whitespace-separated ARNs; strip the padding some wc versions add
  local resource_count
  resource_count=$(wc -w <<<"${resources}" | tr -d '[:space:]')
  echo "Found ${resource_count} resources"

  if [ "${DRY_RUN}" = true ]; then
    echo "Would delete the following resources:"
    tr '\t' '\n' <<<"${resources}"
    return 0
  fi

  echo "Deleting resources..."

  # Delete Route 53 hosted zones (the main blocker)
  echo "Cleaning up Route 53 hosted zones..."
  local zones
  zones=$(aws route53 list-hosted-zones \
    --query "HostedZones[?Config.PrivateZone==\`false\`].Id" \
    --output text || echo "")

  local zone_id zone_tags
  for zone_id in ${zones}; do
    # Extract just the ID
    zone_id="${zone_id#/hostedzone/}"

    # Check if this zone belongs to our project
    zone_tags=$(aws route53 list-tags-for-resource \
      --resource-type hostedzone \
      --resource-id "${zone_id}" \
      --query "ResourceTagSet.Tags[?Key=='project'].Value" \
      --output text || echo "")

    if [ "${zone_tags}" = "${project}" ]; then
      echo "Deleting hosted zone: ${zone_id}"
      aws route53 delete-hosted-zone --id "${zone_id}" || echo "Failed to delete zone ${zone_id}"
    fi
  done

  # Delete other resources using AWS CLI
  # Note: Some resources need to be deleted in specific order due to dependencies

  # Delete ECS services first
  echo "Cleaning up ECS services and clusters..."
  local clusters
  clusters=$(aws ecs list-clusters --region "${AWS_REGION}" --query 'clusterArns[]' --output text || echo "")

  local cluster_arn cluster_tags cluster_name services service
  for cluster_arn in ${clusters}; do
    cluster_tags=$(aws ecs list-tags-for-resource \
      --region "${AWS_REGION}" \
      --resource-arn "${cluster_arn}" \
      --query "tags[?key=='project'].value" \
      --output text || echo "")

    if [ "${cluster_tags}" = "${project}" ]; then
      # The cluster name is the last path segment of its ARN
      cluster_name="${cluster_arn##*/}"
      echo "Deleting ECS cluster: ${cluster_name}"

      # Delete services in cluster first
      services=$(aws ecs list-services --cluster "${cluster_name}" --region "${AWS_REGION}" --query 'serviceArns[]' --output text || echo "")
      for service in ${services}; do
        aws ecs delete-service --cluster "${cluster_name}" --service "${service}" --force --region "${AWS_REGION}" || echo "Failed to delete service"
      done

      # Then delete cluster
      aws ecs delete-cluster --cluster "${cluster_name}" --region "${AWS_REGION}" || echo "Failed to delete cluster"
    fi
  done

  # Delete load balancers
  echo "Cleaning up load balancers..."
  local lbs
  lbs=$(aws elbv2 describe-load-balancers --region "${AWS_REGION}" --query 'LoadBalancers[].LoadBalancerArn' --output text || echo "")

  local lb_arn lb_tags
  for lb_arn in ${lbs}; do
    lb_tags=$(aws elbv2 describe-tags --region "${AWS_REGION}" --resource-arns "${lb_arn}" --query "TagDescriptions[0].Tags[?Key=='project'].Value" --output text || echo "")

    if [ "${lb_tags}" = "${project}" ]; then
      echo "Deleting load balancer: ${lb_arn}"
      aws elbv2 delete-load-balancer --load-balancer-arn "${lb_arn}" --region "${AWS_REGION}" || echo "Failed to delete LB"
    fi
  done

  # Wait a bit for LB deletion
  sleep 5

  # Delete target groups
  echo "Cleaning up target groups..."
  local tgs
  tgs=$(aws elbv2 describe-target-groups --region "${AWS_REGION}" --query 'TargetGroups[].TargetGroupArn' --output text || echo "")

  local tg_arn tg_tags
  for tg_arn in ${tgs}; do
    tg_tags=$(aws elbv2 describe-tags --region "${AWS_REGION}" --resource-arns "${tg_arn}" --query "TagDescriptions[0].Tags[?Key=='project'].Value" --output text 2>/dev/null || echo "")

    if [ "${tg_tags}" = "${project}" ]; then
      echo "Deleting target group: ${tg_arn}"
      aws elbv2 delete-target-group --target-group-arn "${tg_arn}" --region "${AWS_REGION}" || echo "Failed to delete TG"
    fi
  done

  echo "Cleanup complete for project: ${project}"
  echo ""
}

# Walk the whitespace-separated project list and clean up each entry in
# turn; a failure for one project is reported and the loop moves on.
for test_project in ${PROJECTS}; do
  if ! cleanup_project "${test_project}"; then
    echo "Failed to clean up ${test_project}"
  fi
done

printf '%s\n' "=== Cleanup complete ==="
Loading
Loading