Skip to content

Tests - Daily Cleanup - AWS Kubernetes EKS Single Region (IRSA) #153

Tests - Daily Cleanup - AWS Kubernetes EKS Single Region (IRSA)

Tests - Daily Cleanup - AWS Kubernetes EKS Single Region (IRSA) #153

---
name: Tests - Daily Cleanup - AWS Kubernetes EKS Single Region (IRSA)

# Triggers:
#   - workflow_dispatch: manual run with a configurable maximum cluster age.
#   - pull_request: only when this workflow, the tool versions, the EKS
#     single-region sources (golden test files excluded) or the composite
#     actions used below are modified.
#   - schedule: daily run.
on:
  workflow_dispatch:
    inputs:
      max_age_hours_cluster:
        description: Maximum age of clusters in hours
        required: true
        default: '12'
  pull_request:
    paths:
      - .github/workflows/aws_eks_single_region_daily_cleanup.yml
      - .tool-versions
      - aws/kubernetes/eks-single-region*/**
      # A leading '!' negates the pattern, so it must be quoted.
      - '!aws/kubernetes/eks-single-region/terraform/*/test/golden/**'
      - '!aws/kubernetes/eks-single-region-irsa/terraform/*/test/golden/**'
      - .github/actions/aws-generic-terraform-cleanup/**
      - .github/actions/aws-configure-cli/**
  schedule:
    # At 04:00 every day.
    - cron: '0 4 * * *'
# Limit this workflow to a single execution per ref (the group key is built
# from the workflow name and the git ref).
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  # We don't cancel the previous run, so that it can finish and leave the
  # clusters in a proper state.
  cancel-in-progress: false
env:
  # 'true' for scheduled runs, or when the PR head branch name contains
  # 'schedules/'; 'false' otherwise. Read by the Slack notification step.
  IS_SCHEDULE: ${{ (contains(github.head_ref, 'schedules/') || github.event_name == 'schedule') && 'true' || 'false' }}
  # Manual input when provided, otherwise falls back to 12 hours.
  MAX_AGE_HOURS_CLUSTER: ${{ github.event.inputs.max_age_hours_cluster || '12' }}
  # Please keep these variables in sync with aws_kubernetes_eks_single_region_tests.yml
  AWS_PROFILE: infex
  # Bucket holding the Terraform state, and the region of that bucket.
  S3_BACKEND_BUCKET: tests-ra-aws-rosa-hcp-tf-state-eu-central-1
  S3_BUCKET_REGION: eu-central-1
  # Region passed to the AWS CLI configuration; note that it differs from the
  # state bucket region above.
  AWS_REGION: eu-west-2
jobs:
  # Decides whether the cleanup should run at all. The decision is delegated
  # to the repository-local internal-triage-skip action and exposed as the
  # `should_skip` job output.
  triage:
    runs-on: ubuntu-latest
    outputs:
      should_skip: ${{ steps.skip_check.outputs.should_skip }}
    steps:
      # The checkout is required before a local (./.github/actions/...) action
      # can be referenced.
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
      - name: Check labels
        id: skip_check
        uses: ./.github/actions/internal-triage-skip

  # Deletes the clusters whose Terraform state is stored under
  # aws/kubernetes/<scenario>/<target branch>/ in the state bucket,
  # once per matrix scenario.
  cleanup-clusters:
    needs:
      - triage
    if: needs.triage.outputs.should_skip == 'false'
    strategy:
      # A failure in one scenario must not abort the cleanup of the other one.
      fail-fast: false
      matrix:
        scenario:
          - name: eks-single-region
          - name: eks-single-region-irsa
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
        with:
          ref: ${{ github.ref }}
          fetch-depth: 0
      - name: Install asdf tools with cache
        uses: camunda/infraex-common-config/./.github/actions/asdf-install-tooling@193a21e1e56c9a65517a822224ac3b4ffa4d6ae4 # 1.5.9
      - name: Use repo .tool-version as global version
        run: cp .tool-versions ~/.tool-versions
      # Reads the branch name from the .target-branch file at the repository
      # root and exposes it as the TARGET_BRANCH step output.
      - name: Set current target branch
        id: target-branch
        run: |
          set -euo pipefail
          TARGET_BRANCH=$(cat .target-branch)
          echo "TARGET_BRANCH=$TARGET_BRANCH" | tee -a "$GITHUB_OUTPUT"
      - name: Configure AWS CLI
        uses: ./.github/actions/aws-configure-cli
        with:
          vault-addr: ${{ secrets.VAULT_ADDR }}
          vault-role-id: ${{ secrets.VAULT_ROLE_ID }}
          vault-secret-id: ${{ secrets.VAULT_SECRET_ID }}
          aws-profile: ${{ env.AWS_PROFILE }}
          aws-region: ${{ env.AWS_REGION }}
      # Builds the per-scenario key prefix and exposes it as the
      # S3_BACKEND_BUCKET_PREFIX step output.
      - name: Export S3_BACKEND_BUCKET based on matrix
        id: s3_prefix
        run: |
          set -euo pipefail
          echo "S3_BACKEND_BUCKET_PREFIX=aws/kubernetes/${{ matrix.scenario.name }}/" | tee -a "$GITHUB_OUTPUT"
      - name: Delete clusters
        id: delete_clusters
        continue-on-error: true
        timeout-minutes: 125
        uses: ./.github/actions/aws-generic-terraform-cleanup
        with:
          tf-bucket: ${{ env.S3_BACKEND_BUCKET }}
          tf-bucket-region: ${{ env.S3_BUCKET_REGION }}
          max-age-hours: ${{ env.MAX_AGE_HOURS_CLUSTER }}
          tf-bucket-key-prefix: ${{ steps.s3_prefix.outputs.S3_BACKEND_BUCKET_PREFIX }}${{ steps.target-branch.outputs.TARGET_BRANCH }}/
          modules-order: vpn,cluster
      # The previous step has continue-on-error set to true.
      # This means that the job is not marked as failed, but the step is.
      # We can't use the `if: failure()` condition here, as the overall job is succeeding.
      # Instead, we check the outcome of the previous step and, if it failed, we retry the deletion.
      # If the retry fails, then the report-failure job is triggered as usual.
      # There are cases where the deletion of resources fails due to dependencies.
      - name: Retry delete clusters
        id: retry_delete_clusters
        if: steps.delete_clusters.outcome == 'failure'
        timeout-minutes: 125
        uses: ./.github/actions/aws-generic-terraform-cleanup
        with:
          tf-bucket: ${{ env.S3_BACKEND_BUCKET }}
          tf-bucket-region: ${{ env.S3_BUCKET_REGION }}
          # The previous step alters the age and resets it to 0, so the retry
          # uses 0 instead of MAX_AGE_HOURS_CLUSTER.
          # NOTE(review): presumably the first attempt touches the state
          # objects the age is computed from - confirm in the cleanup action.
          max-age-hours: '0'
          tf-bucket-key-prefix: ${{ steps.s3_prefix.outputs.S3_BACKEND_BUCKET_PREFIX }}${{ steps.target-branch.outputs.TARGET_BRANCH }}/
          modules-order: vpn,cluster

  # Runs only when an upstream job failed; the Slack notification itself is
  # limited to scheduled runs.
  report-failure:
    name: Report failures
    if: failure()
    runs-on: ubuntu-latest
    needs:
      - cleanup-clusters
    steps:
      # The schedule check lives on the step because the `env` context is not
      # available in a job-level `if`.
      - name: Notify in Slack in case of failure
        id: slack-notification
        if: ${{ env.IS_SCHEDULE == 'true' }}
        uses: camunda/infraex-common-config/.github/actions/report-failure-on-slack@193a21e1e56c9a65517a822224ac3b4ffa4d6ae4 # 1.5.9
        with:
          vault_addr: ${{ secrets.VAULT_ADDR }}
          vault_role_id: ${{ secrets.VAULT_ROLE_ID }}
          vault_secret_id: ${{ secrets.VAULT_SECRET_ID }}