# --- GitHub web-UI text captured along with this file (not part of the workflow) ---
# Skip to content
# site healthcheck
# site healthcheck #382
# Workflow file for this run

# healthcheck.example.yml — OPS-01 external healthcheck (PLACEHOLDER TEMPLATE)
#
# This is a TEMPLATE committed to the public self-hosting repo. The LIVE workflow is
# materialized in the SOURCE repo (alongside deploy.yml) as:
# .github/workflows/healthcheck.yml
# Copy this file there, then create the GH Actions secrets it references. NO real
# domains / recipients / sender / Brevo key live in this file — secrets references only.
# See docs/site-healthcheck-runbook.md for the materialize + secrets + sequencing steps.
#
# Why off-jarvis GH cron: it runs OUTSIDE jarvis, so it catches the failure mode that
# matters — jarvis or the tunnel being down — which an on-jarvis cron would be blind to.
# Why Brevo transactional API: a single api-key header + one curl POST, smallest secret surface.
name: site healthcheck
# Triggers: a 15-minute cron plus a manual "Run workflow" button. Both must be nested
# under `on:` for GitHub to treat them as triggers.
on:
  schedule:
    # GH enforces a 5-minute MINIMUM; ~15 min is reliable + low-noise. Expect 5-30 min jitter.
    - cron: '*/15 * * * *'
  workflow_dispatch: {}
# No checkout needed → minimal token surface (public-repo Actions hardening).
permissions:
  contents: none
jobs:
  # Single job: probe every target over HTTPS, and if any is not HTTP 200, send one
  # Brevo transactional email and finish red (exit 1).
  check:
    runs-on: ubuntu-latest
    steps:
      - name: Probe targets and alert on failure
        env:
          # All real values come from GH Actions secrets in the SOURCE repo — never committed.
          TARGETS: ${{ secrets.HEALTHCHECK_TARGETS }}  # space-separated real domains
          BREVO_KEY: ${{ secrets.BREVO_API_KEY }}  # Brevo transactional api-key
          ALERT_TO: ${{ secrets.ALERT_RECIPIENT }}  # operator inbox
          ALERT_FROM: ${{ secrets.ALERT_SENDER }}  # DKIM/DMARC-authenticated Brevo sender
        run: |
          set -u
          fail=""
          probed=0
          for url in $TARGETS; do
            probed=$((probed + 1))
            # `|| true`: with no explicit `shell:`, GH Actions runs this script under `bash -e`,
            # so a DNS/connection failure (curl exit 6/7/28 — exactly the "jarvis/tunnel down"
            # case) would otherwise ABORT the step before we can alert. curl still writes
            # %{http_code} (000 on no HTTP response); we treat anything != 200 (including 000)
            # as DOWN and send the alert.
            code=$(curl -sS -o /dev/null -w '%{http_code}' \
              --max-time 20 --retry 2 --retry-delay 5 "https://$url/" || true)
            [ -n "$code" ] || code="000"
            [ "$code" = "200" ] || fail="$fail $url(=$code)"
          done
          # A missing secret expands to an empty string, so the loop above would run zero times
          # and the step would report "all up" without probing anything. Fail loudly instead.
          if [ "$probed" -eq 0 ]; then
            echo "::error::HEALTHCHECK_TARGETS is empty or unset - nothing was probed"
            exit 1
          fi
          if [ -z "$fail" ]; then
            echo "all up"
            exit 0
          fi
          echo "DOWN:$fail"
          # On failure, POST a Brevo transactional email. The api-key is a header, never echoed.
          # Capture + echo the Brevo HTTP status + body (messageId on success; never the key) so a
          # rejected send (e.g. unverified sender, bad key) is visible in the run log instead of a
          # silent no-mail. `|| true` keeps the intentional `exit 1` (red run) as the last word.
          # `--max-time 30` bounds the POST so a stalled connection cannot hang the step.
          # The JSON is built by plain interpolation: this assumes the sender, recipient and
          # target values contain no `"` or `\` characters.
          resp=$(curl -sS --max-time 30 -w '\nHTTP:%{http_code}' -X POST https://api.brevo.com/v3/smtp/email \
            -H "api-key: $BREVO_KEY" \
            -H 'content-type: application/json' \
            -d "{\"sender\":{\"email\":\"$ALERT_FROM\"},\"to\":[{\"email\":\"$ALERT_TO\"}],\"subject\":\"[jarvis] site DOWN:$fail\",\"textContent\":\"Healthcheck first-failed-observation for:$fail\"}" || true)
          echo "brevo-response: $resp"
          exit 1