# site healthcheck #372
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# healthcheck.example.yml — OPS-01 external healthcheck (PLACEHOLDER TEMPLATE)
#
# This is a TEMPLATE committed to the public self-hosting repo. The LIVE workflow is
# materialized in the SOURCE repo (alongside deploy.yml) as:
#   .github/workflows/healthcheck.yml
# Copy this file there, then create the GH Actions secrets it references. NO real
# domains / recipients / sender / Brevo key live in this file — secrets references only.
# See docs/site-healthcheck-runbook.md for the materialize + secrets + sequencing steps.
#
# Why off-jarvis GH cron: it runs OUTSIDE jarvis, so it catches the failure mode that
# matters — jarvis or the tunnel being down — which an on-jarvis cron would be blind to.
# Why Brevo transactional API: a single api-key header + one curl POST, smallest secret surface.
name: site healthcheck
on:
  schedule:
    # GH enforces a 5-minute MINIMUM; ~15 min is reliable + low-noise. Expect 5-30 min jitter.
    - cron: '*/15 * * * *'
  workflow_dispatch: {}
# No checkout needed → minimal token surface (public-repo Actions hardening).
permissions:
  contents: none
jobs:
  check:
    runs-on: ubuntu-latest
    steps:
      - name: Probe targets and alert on failure
        env:
          # All real values come from GH Actions secrets in the SOURCE repo — never committed.
          TARGETS: ${{ secrets.HEALTHCHECK_TARGETS }}  # space-separated real domains
          BREVO_KEY: ${{ secrets.BREVO_API_KEY }}  # Brevo transactional api-key
          ALERT_TO: ${{ secrets.ALERT_RECIPIENT }}  # operator inbox
          ALERT_FROM: ${{ secrets.ALERT_SENDER }}  # DKIM/DMARC-authenticated Brevo sender
        run: |
          set -u
          fail=""
          probed=0
          for url in $TARGETS; do
            probed=$((probed + 1))
            # `|| true`: GH Actions runs `run:` steps with bash errexit enabled (`bash -e {0}`
            # by default; `-eo pipefail` only when `shell: bash` is set explicitly), so a
            # DNS/connection failure (curl exit 6/7/28 — exactly the "jarvis/tunnel down" case)
            # would otherwise ABORT the step before we can alert. curl still writes
            # %{http_code} (000 on no HTTP response); we treat anything != 200 (including 000)
            # as DOWN and send the alert.
            code=$(curl -sS -o /dev/null -w '%{http_code}' \
              --max-time 20 --retry 2 --retry-delay 5 "https://$url/" || true)
            [ -n "$code" ] || code="000"
            [ "$code" = "200" ] || fail="$fail $url(=$code)"
          done
          # An unset/blank HEALTHCHECK_TARGETS secret expands to an empty string: `set -u` does
          # not trip and the loop above runs zero times. Without this guard the run would
          # report "all up" while checking nothing, so fail loudly instead.
          if [ "$probed" -eq 0 ]; then
            echo "::error::HEALTHCHECK_TARGETS is empty or unset; no targets were probed"
            exit 1
          fi
          if [ -z "$fail" ]; then
            echo "all up"
            exit 0
          fi
          echo "DOWN:$fail"
          # On failure, POST a Brevo transactional email. The api-key is a header, never echoed.
          # Capture + echo the Brevo HTTP status + body (messageId on success; never the key) so a
          # rejected send (e.g. unverified sender, bad key) is visible in the run log instead of a
          # silent no-mail. `--max-time` bounds the call so a stalled API connection cannot hang
          # the step; `|| true` keeps the intentional `exit 1` (red run) as the last word.
          # NOTE(review): the JSON body is assembled by plain string interpolation — this assumes
          # the target/recipient/sender values contain no `"` or `\`; confirm for your secrets.
          resp=$(curl -sS --max-time 30 -w '\nHTTP:%{http_code}' -X POST https://api.brevo.com/v3/smtp/email \
            -H "api-key: $BREVO_KEY" \
            -H 'content-type: application/json' \
            -d "{\"sender\":{\"email\":\"$ALERT_FROM\"},\"to\":[{\"email\":\"$ALERT_TO\"}],\"subject\":\"[jarvis] site DOWN:$fail\",\"textContent\":\"Healthcheck first-failed-observation for:$fail\"}" || true)
          echo "brevo-response: $resp"
          exit 1