DR backup freshness monitor #34
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: DR backup freshness monitor | |
| # Off-platform monitor: runs in GitHub (not GCP) so it survives a full GCP | |
| # compromise. If the nightly backup pipeline dies silently, this catches it. | |
| # | |
| # Required secrets: | |
| # AWS_BACKUP_READONLY_ACCESS_KEY_ID | |
| # AWS_BACKUP_READONLY_SECRET_ACCESS_KEY | |
| # DR_MONITOR_DISCORD_WEBHOOK | |
| # | |
| # Setup: see docs/DISASTER-RECOVERY.md § "Backup freshness monitor". | |
| on: | |
| schedule: | |
| # Daily at 14:00 UTC (~9h after the 1am ET backup window). | |
| - cron: "0 14 * * *" | |
| # Manual trigger with no inputs, so the check can also be run on demand. | |
| workflow_dispatch: {} | |
| # Limit the workflow token to read-only access to repository contents. | |
| permissions: | |
| contents: read | |
| jobs: | |
| check-freshness: | |
| runs-on: ubuntu-latest | |
| # Upper bound on job duration, so a hung listing or webhook call cannot run on. | |
| timeout-minutes: 5 | |
| # Job-level environment shared by every step. The check script creates its S3 | |
| # client without explicit credentials, so it relies on the AWS_* variables set | |
| # here; BUCKET is read by the check script and DISCORD_WEBHOOK by the alert steps. | |
| env: | |
| AWS_ACCESS_KEY_ID: ${{ secrets.AWS_BACKUP_READONLY_ACCESS_KEY_ID }} | |
| AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_BACKUP_READONLY_SECRET_ACCESS_KEY }} | |
| AWS_DEFAULT_REGION: us-east-1 | |
| BUCKET: nomadkaraoke-backup | |
| DISCORD_WEBHOOK: ${{ secrets.DR_MONITOR_DISCORD_WEBHOOK }} | |
| steps: | |
- name: Verify required secrets are set
  run: |
    # Check every required variable, emit one ::error:: annotation per empty
    # one, then fail the step if any was empty.
    # "${!name}" is bash indirect expansion: the value of the variable whose
    # name is held in $name.
    status=0
    for name in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY DISCORD_WEBHOOK; do
      if [ -n "${!name}" ]; then
        continue
      fi
      echo "::error::Required secret $name is empty"
      status=1
    done
    if [ "$status" != "0" ]; then
      exit 1
    fi
- name: Install boto3
  # Install through python3 itself: the freshness check is run with
  # `python3`, and a bare `pip` on PATH is not guaranteed to belong to the
  # same interpreter.
  run: python3 -m pip install --quiet boto3
- name: Check freshness of each tracked prefix
  id: check
  shell: bash
  run: |
    python3 - <<'PY'
    """Report the age of the newest object under each tracked backup prefix.

    Appends a Markdown report to the step summary, writes ok_count,
    stale_count and (only when something is stale) STALE_LIST to the step
    outputs, and exits 1 if any prefix is stale or has no objects.
    """
    import datetime
    import os
    import sys

    import boto3

    BUCKET = os.environ["BUCKET"]
    SECONDS_PER_HOUR = 3600

    # (label, prefix, max_age_hours)
    TRACKED = (
        ("Firestore", "firestore/", 36),
        ("GCS job files", "gcs/job-files/", 36),
        ("Encrypted secrets", "secrets/", 36),
        ("BigQuery weekly", "bigquery/daily-refresh/", 192),
    )

    # Use a paginator rather than `aws s3api list-objects-v2 --query ...`:
    # the JMESPath --query is applied per page, so "newest object" silently
    # comes out wrong on prefixes with >1000 objects (e.g. firestore/ has ~22k).
    pages = boto3.client("s3").get_paginator("list_objects_v2")
    checked_at = datetime.datetime.now(datetime.timezone.utc)


    def newest_last_modified(prefix):
        """Return the latest LastModified under prefix, or None if it is empty."""
        return max(
            (
                entry["LastModified"]
                for page in pages.paginate(Bucket=BUCKET, Prefix=prefix)
                for entry in page.get("Contents", [])
            ),
            default=None,
        )


    fresh, overdue = [], []
    for label, prefix, limit_h in TRACKED:
        newest = newest_last_modified(prefix)
        if newest is None:
            # An empty prefix is reported as stale, not skipped.
            overdue.append(f"{label}: NO OBJECTS in s3://{BUCKET}/{prefix}")
            continue
        age_s = (checked_at - newest).total_seconds()
        whole_hours = int(age_s // SECONDS_PER_HOUR)
        report = f"{label}: {whole_hours}h old (limit {limit_h}h) — last {newest.isoformat()}"
        if age_s > limit_h * SECONDS_PER_HOUR:
            overdue.append(report)
        else:
            fresh.append(report)

    # Markdown report shown on the workflow run page.
    markdown = ["## DR backup freshness", ""]
    if overdue:
        markdown += ["### Stale", *(f"- {item}" for item in overdue), ""]
    markdown += ["### Fresh", *(f"- {item}" for item in fresh)]
    with open(os.environ["GITHUB_STEP_SUMMARY"], "a") as fh:
        fh.write("\n".join(markdown) + "\n")

    # Step outputs. The alert steps test whether stale_count is empty to tell
    # "backups are stale" apart from "this script never got this far".
    # STALE_LIST uses the multi-line `NAME<<EOF ... EOF` output form.
    outputs = [f"ok_count={len(fresh)}", f"stale_count={len(overdue)}"]
    if overdue:
        outputs += ["STALE_LIST<<EOF", *overdue, "EOF"]
    with open(os.environ["GITHUB_OUTPUT"], "a") as fh:
        fh.write("\n".join(outputs) + "\n")

    for item in fresh:
        print("OK:", item)
    for item in overdue:
        print("STALE:", item, file=sys.stderr)
    if overdue:
        # A non-zero exit fails the step, which is what triggers the alert steps.
        sys.exit(1)
    PY
- name: Alert Discord on stale backup
  # Runs only when an earlier step failed AND the check step got far enough
  # to write stale_count, i.e. the check itself reported stale backups.
  if: failure() && steps.check.outputs.stale_count != ''
  env:
    STALE: ${{ steps.check.outputs.STALE_LIST }}
  run: |
    # Link back to this workflow run.
    run_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
    # Build the JSON with jq so the multi-line stale list is escaped correctly.
    # Embed color 15158332 is 0xE74C3C (red).
    body=$(jq -n \
      --arg title "🚨 DR backup is stale" \
      --arg desc "$STALE" \
      --arg url "$run_url" \
      '{
        embeds: [
          {
            title: $title,
            description: $desc,
            color: 15158332,
            fields: [ {name: "Workflow run", value: $url} ]
          }
        ]
      }')
    # -f makes curl exit non-zero on an HTTP error response, failing the step.
    curl -fsS -X POST \
      -H "Content-Type: application/json" \
      -d "$body" \
      "$DISCORD_WEBHOOK"
- name: Alert Discord on workflow failure (catch-all)
  # Runs when an earlier step failed and the check step never wrote
  # stale_count. That covers every failure that is not a stale-backup
  # result: a missing secret, a failed boto3 install, or an error while
  # listing the bucket.
  if: failure() && steps.check.outputs.stale_count == ''
  run: |
    # If the webhook secret itself is the missing one, curl would fail with an
    # opaque URL error; say so explicitly in the run log instead.
    if [ -z "$DISCORD_WEBHOOK" ]; then
      echo "::error::DISCORD_WEBHOOK is empty; cannot send the failure alert to Discord"
      exit 1
    fi
    # The description lists every cause this step can fire for, not just an
    # S3 listing failure. Embed color 15844367 is 0xF1C40F (yellow).
    payload=$(jq -n \
      --arg title "⚠️ DR freshness monitor failed to run" \
      --arg desc "The monitor failed before it finished checking backup freshness. Possible causes: a required secret is missing, the boto3 install failed, or the S3 backup bucket could not be listed (credentials, network, or bucket policy). Open the workflow run to see which step failed." \
      --arg url "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
      '{
        embeds: [{
          title: $title,
          description: $desc,
          color: 15844367,
          fields: [{name: "Workflow run", value: $url}]
        }]
      }')
    curl -fsS -X POST -H "Content-Type: application/json" \
      -d "$payload" "$DISCORD_WEBHOOK"