Phase 18: Webhook Payload + State-Filter + Coalescing #140

Workflow file for this run

# .github/workflows/ci.yml
# Single source of truth for Phase 1 CI. Every `run:` step invokes `just <recipe>`
# exclusively (D-10 / FOUND-12). No inline `cargo` / `docker` / `rustup` / `sqlx` /
# `npm` / `npx` commands.
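# For orientation only: the `just <recipe>` targets referenced below live in the
# repo's justfile and are not reproduced here. A recipe of that kind has roughly
# the following shape (the body is an illustrative assumption, not copied from
# the justfile):
#   fmt-check:
#       cargo fmt --all -- --check
# i.e. each CI step stays a one-line `just <recipe>` call and the actual tool
# invocation is owned by the justfile.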
name: ci
on:
  pull_request:
  push:
    branches: [main]
concurrency:
  group: ci-${{ github.ref }}
  cancel-in-progress: true
# Top-level permissions: read-only by default. `packages: write` is scoped
# per-job to the `image` job only (T-01-13).
permissions:
  contents: read
jobs:
  lint:
    name: lint (fmt + clippy + openssl-sys guard)
    runs-on: ubuntu-latest
    timeout-minutes: 15
    permissions:
      contents: read
    steps:
      - uses: actions/checkout@v4
      - uses: dtolnay/rust-toolchain@stable
        with:
          components: rustfmt, clippy
      - uses: Swatinem/rust-cache@v2
      - uses: extractions/setup-just@v2
      - run: just fmt-check
      - run: just clippy
      # `just openssl-check` depends on `just install-targets` and loops over
      # native + amd64-musl + arm64-musl in a single run. One lint-job invocation
      # covers every target CI ships (Pitfall 14 -- 01-RESEARCH.md S14).
      - run: just openssl-check
      # Phase 13 OBS-05 structural parity guard — the p50/p95 percentiles are
      # computed in Rust via src/web/stats.rs::percentile, never via SQL-native
      # percentile_cont / percentile_disc / median() (even on Postgres). This
      # step permanently prevents any future PR from introducing a SQL-native
      # percentile call into src/. See the justfile recipe for the grep pattern
      # + rationale.
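      # The exact pattern is owned by the justfile recipe and is not repeated
      # here; as a purely illustrative sketch (not the real recipe), a guard of
      # this shape boils down to failing the job whenever a match appears under
      # src/:
      #   ! grep -rEn 'percentile_cont|percentile_disc|median\(' src/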
      - run: just grep-no-percentile-cont
      # Phase 15 / FOUND-16. cargo-deny supply-chain check (advisories +
      # licenses + duplicate-versions). Non-blocking on rc.1 per D-09 — the
      # step is marked continue-on-error: true so a transient advisory or a
      # transitive duplicate-version finding cannot turn CI red during the
      # v1.2 cycle. It is promoted to blocking (a one-line removal of
      # continue-on-error) before the final v1.2.0 ships in Phase 24. Pairs
      # with deny.toml's `bans.multiple-versions = "warn"` for two-layer
      # non-blocking (D-10).
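      # For context, the deny.toml half of that two-layer setup is a single
      # bans-table entry, sketched here (the real deny.toml may carry more
      # configuration than this):
      #   [bans]
      #   multiple-versions = "warn"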
      - uses: taiki-e/install-action@v2
        with:
          tool: cargo-deny
      - run: just deny
        continue-on-error: true
  # Test matrix: arch dimension only.
  # Both SQLite and Postgres backends are exercised in every cell via
  # testcontainers-modules::postgres (integration tests in tests/schema_parity.rs
  # and tests/db_pool_postgres.rs boot a real Postgres container). A per-cell
  # `db` dimension would be cosmetic -- see ci.yml design notes in
  # .planning/phases/01-foundation-security-posture-persistence-base/01-RESEARCH.md S10.
  test:
    name: test ${{ matrix.arch }}
    runs-on: ubuntu-latest
    timeout-minutes: 30
    permissions:
      contents: read
    strategy:
      fail-fast: false
      matrix:
        arch: [amd64, arm64]
    env:
      SQLX_OFFLINE: "true"
    steps:
      - uses: actions/checkout@v4
      - uses: dtolnay/rust-toolchain@stable
      - uses: Swatinem/rust-cache@v2
        with:
          key: ${{ matrix.arch }}
      - uses: extractions/setup-just@v2
      - uses: taiki-e/install-action@v2
        with:
          tool: nextest,cargo-zigbuild
      - name: Install cross-compile targets (arm64 cells only)
        if: matrix.arch == 'arm64'
        run: just install-targets
      # Pre-pull testcontainers images from Google's public Docker Hub mirror
      # and retag locally. Avoids the anonymous Docker Hub pull rate limit
      # (100/6h per IP) that intermittently breaks the test matrix, since both
      # amd64 and arm64 cells run on ubuntu-latest and pull the same images.
      # mirror.gcr.io has much higher anonymous rate limits and is transparent
      # to testcontainers-rs because bollard finds the retagged image locally
      # before attempting an upstream pull.
      - name: Pre-pull testcontainers images via mirror.gcr.io
        run: |
          set -euo pipefail
          for image in postgres:11-alpine alpine:latest; do
            echo "::group::Pre-pull ${image}"
            docker pull "mirror.gcr.io/library/${image}"
            docker tag "mirror.gcr.io/library/${image}" "${image}"
            echo "::endgroup::"
          done
      - run: just nextest
      - run: just schema-diff
  image:
    name: multi-arch docker image
    runs-on: ubuntu-latest
    needs: [lint, test]
    timeout-minutes: 45
    # Per-job permissions: scoped `packages: write` for GHCR push (T-01-13).
    permissions:
      contents: read
      packages: write
    steps:
      - uses: actions/checkout@v4
      - uses: docker/setup-qemu-action@v3
      - uses: docker/setup-buildx-action@v3
      - uses: extractions/setup-just@v2
      # PR path: build both platforms, load, no push.
      - name: just image (PR -- build only)
        if: github.event_name == 'pull_request'
        run: just image
      # main path: the :main floating tag is owned by main-build.yml (OPS-10).
      # ci.yml's image job stays PR-only — it verifies the multi-arch BUILD
      # works without pushing. On push-to-main this job becomes a no-op because
      # the only remaining step (`just image (PR -- build only)` above) is gated
      # on `github.event_name == 'pull_request'`. DO NOT re-add a main-push step
      # here — that is what caused the pre-Phase-12.1 :latest divergence.
  compose-smoke:
    name: quickstart compose smoke (${{ matrix.compose }})
    runs-on: ubuntu-latest
    timeout-minutes: 20
    permissions:
      contents: read
    # Phase 8 D-18..D-22: matrix over both quickstart compose files. Each axis
    # boots the full stack, triggers Run Now on every example job via the API,
    # and asserts all four reach status=success within 120s. Extends the Phase 6
    # gap-closure compose-smoke job rather than adding a second job (D-22 —
    # single compose up/down cycle per axis).
    strategy:
      fail-fast: false
      matrix:
        compose:
          - docker-compose.yml
          - docker-compose.secure.yml
    steps:
      - uses: actions/checkout@v4
      - name: Install jq
        run: |
          if ! command -v jq >/dev/null 2>&1; then
            sudo apt-get update -qq
            sudo apt-get install -y --no-install-recommends jq
          fi
          jq --version
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build local cronduit:ci image from PR checkout
        uses: docker/build-push-action@v6
        with:
          context: .
          file: Dockerfile
          platforms: linux/amd64
          push: false
          load: true
          tags: cronduit:ci
          cache-from: type=gha,scope=cronduit-ci-smoke
          cache-to: type=gha,mode=max,scope=cronduit-ci-smoke
      - name: Point compose at locally-built cronduit:ci image
        env:
          COMPOSE_FILE: ${{ matrix.compose }}
        run: |
          # Point the smoke test at the just-built cronduit:ci image so it
          # exercises PR code, not stale ghcr :latest. Two paths:
          #   (a) Compose file uses ${CRONDUIT_IMAGE:-...} — set CRONDUIT_IMAGE
          #       env var; docker compose substitution does the rest.
          #   (b) Compose file uses literal image: ... — sed-rewrite in place.
          # Never committed either way.
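          # For illustration, the compose fragments those two paths expect look
          # roughly like this (the default value in (a) is an assumption, not
          # copied from the compose files):
          #   (a) image: ${CRONDUIT_IMAGE:-ghcr.io/simplicityguy/cronduit:latest}
          #   (b) image: ghcr.io/simplicityguy/cronduit:latest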
          if grep -qE '\$\{CRONDUIT_IMAGE(:-[^}]*)?\}' "examples/${COMPOSE_FILE}"; then
            echo "Compose file uses CRONDUIT_IMAGE env var — exporting CRONDUIT_IMAGE=cronduit:ci"
            echo "CRONDUIT_IMAGE=cronduit:ci" >> "$GITHUB_ENV"
          else
            sed -i "s|ghcr.io/simplicityguy/cronduit:latest|cronduit:ci|g" "examples/${COMPOSE_FILE}"
            count=$(grep -c 'image: cronduit:ci' "examples/${COMPOSE_FILE}" || true)
            if [ "$count" -ne 1 ]; then
              echo "ERROR: expected exactly 1 'image: cronduit:ci' line, found ${count}"
              cat "examples/${COMPOSE_FILE}"
              exit 1
            fi
          fi
      - name: Derive DOCKER_GID from runner's docker.sock
        # Explicit derivation makes the default-compose axis robust against
        # ubuntu-latest image bumps that change the docker group GID. The
        # secure-compose axis doesn't need this (its socket-proxy runs as
        # root in its own container), but exporting DOCKER_GID unconditionally
        # is a no-op there since nothing reads it.
        run: |
          DG=$(stat -c %g /var/run/docker.sock)
          echo "DOCKER_GID=${DG}" >> "$GITHUB_ENV"
          echo "derived DOCKER_GID=${DG}"
      - name: docker compose up -d
        working-directory: examples
        env:
          COMPOSE_FILE: ${{ matrix.compose }}
        run: docker compose -f "${COMPOSE_FILE}" up -d
      - name: Wait for /health (max 30s)
        env:
          COMPOSE_FILE: ${{ matrix.compose }}
        run: |
          set -eu
          for i in $(seq 1 30); do
            if curl -sSf http://localhost:8080/health >/tmp/health.json 2>/dev/null; then
              echo "health responded after ${i}s"
              cat /tmp/health.json
              exit 0
            fi
            sleep 1
          done
          echo "ERROR: /health never responded after 30s"
          docker compose -f "examples/${COMPOSE_FILE}" logs
          exit 1
      - name: Assert /health body contains status:ok
        run: |
          set -eu
          body=$(curl -sSf http://localhost:8080/health) || {
            echo "ERROR: /health curl failed with exit $?"
            exit 1
          }
          echo "health body: $body"
          echo "$body" | grep -q '"status":"ok"' || {
            echo "ERROR: /health body missing status:ok"
            exit 1
          }
      - name: Assert dashboard lists all four quickstart jobs
        run: |
          set -eu
          dash=$(curl -sSf http://localhost:8080/)
          for job in echo-timestamp http-healthcheck disk-usage hello-world; do
            echo "$dash" | grep -q "$job" || {
              echo "ERROR: dashboard missing job: $job"
              echo "---- full dashboard body ----"
              echo "$dash"
              exit 1
            }
          done
          echo "all four jobs present on dashboard"
      - name: Trigger Run Now on every example job and assert success within 120s
        env:
          COMPOSE_FILE: ${{ matrix.compose }}
        run: |
          set -eu
          BASE="http://localhost:8080"
          JOBS="echo-timestamp http-healthcheck disk-usage hello-world"
          BUDGET_SECS=120
          POLL_INTERVAL=2
          COOKIE_JAR=$(mktemp)
          # Prime the CSRF cookie by GETting the dashboard. The
          # ensure_csrf_cookie middleware sets the `cronduit_csrf` cookie on
          # first response; subsequent POSTs must echo that value in both the
          # cookie header and the `csrf_token` form field (validate_csrf
          # requires byte-equal non-empty strings).
          curl -sSf -c "${COOKIE_JAR}" -o /dev/null "${BASE}/"
          CSRF_TOKEN=$(awk '$6 == "cronduit_csrf" { print $7 }' "${COOKIE_JAR}")
          if [ -z "${CSRF_TOKEN}" ]; then
            echo "ERROR: cronduit_csrf cookie not set after GET /"
            cat "${COOKIE_JAR}"
            exit 1
          fi
          echo "primed CSRF cookie (len=${#CSRF_TOKEN})"
          # Fetch all jobs once and map name -> id (the Run Now API takes id, not name).
          jobs_json=$(curl -sSf "${BASE}/api/jobs")
          echo "---- /api/jobs body ----"
          echo "$jobs_json" | jq '.'
          for name in $JOBS; do
            id=$(echo "$jobs_json" | jq -r --arg n "$name" '.[] | select(.name == $n) | .id')
            if [ -z "$id" ] || [ "$id" = "null" ]; then
              echo "ERROR: could not resolve job id for '$name' from /api/jobs"
              exit 1
            fi
            echo "triggering Run Now for ${name} (id=${id})"
            curl -sSf -b "${COOKIE_JAR}" \
              -H "Content-Type: application/x-www-form-urlencoded" \
              --data-urlencode "csrf_token=${CSRF_TOKEN}" \
              -X POST "${BASE}/api/jobs/${id}/run" -o /dev/null || {
                echo "ERROR: POST /api/jobs/${id}/run failed for ${name}"
                exit 1
              }
          done
          echo "polling run history for every job (per-job budget ${BUDGET_SECS}s, interval ${POLL_INTERVAL}s)"
          for name in $JOBS; do
            deadline=$(( $(date +%s) + BUDGET_SECS )) # per-job budget: avoids first slow job starving later ones
            id=$(echo "$jobs_json" | jq -r --arg n "$name" '.[] | select(.name == $n) | .id')
            while :; do
              now=$(date +%s)
              if [ "$now" -ge "$deadline" ]; then
                echo "ERROR: job '${name}' did not reach status=success within ${BUDGET_SECS}s"
                echo "---- latest runs for ${name} ----"
                curl -sSf "${BASE}/api/jobs/${id}/runs?limit=5" | jq '.'
                exit 1
              fi
              latest=$(curl -sSf "${BASE}/api/jobs/${id}/runs?limit=1" | jq -r '.[0].status // empty')
              case "$latest" in
                success)
                  echo "job '${name}' reached status=success"
                  break
                  ;;
                failed|timeout|cancelled)
                  echo "ERROR: job '${name}' reached terminal status=${latest} (expected success)"
                  curl -sSf "${BASE}/api/jobs/${id}/runs?limit=5" | jq '.'
                  exit 1
                  ;;
                running|scheduled|"")
                  sleep "$POLL_INTERVAL"
                  ;;
                *)
                  echo "WARN: unknown status='${latest}' for ${name}, continuing poll"
                  sleep "$POLL_INTERVAL"
                  ;;
              esac
            done
          done
          echo "all four example jobs reached status=success within ${BUDGET_SECS}s"
      - name: Dump diagnostics on failure
        if: failure()
        env:
          COMPOSE_FILE: ${{ matrix.compose }}
        run: |
          echo "::group::cronduit logs (tail 200)"
          docker compose -f "examples/${COMPOSE_FILE}" logs cronduit --tail=200 || true
          echo "::endgroup::"
          echo "::group::dockerproxy logs (tail 50, secure axis only)"
          docker compose -f "examples/${COMPOSE_FILE}" logs dockerproxy --tail=50 2>/dev/null || echo "(no dockerproxy service in this axis)"
          echo "::endgroup::"
          echo "::group::run history tail per job"
          for name in echo-timestamp http-healthcheck disk-usage hello-world; do
            echo "---- ${name} ----"
            jobs_json=$(curl -sSf "http://localhost:8080/api/jobs" 2>/dev/null || echo "[]")
            id=$(echo "$jobs_json" | jq -r --arg n "$name" '.[] | select(.name == $n) | .id' 2>/dev/null || true)
            if [ -n "$id" ] && [ "$id" != "null" ]; then
              curl -sSf "http://localhost:8080/api/jobs/${id}/runs?limit=5" 2>/dev/null | jq '.' || echo "(unreachable)"
            else
              echo "(id not found)"
            fi
          done
          echo "::endgroup::"
          echo "::group::cronduit_docker_reachable gauge"
          curl -sSf http://localhost:8080/metrics 2>/dev/null | grep cronduit_docker_reachable || echo "(gauge unavailable)"
          echo "::endgroup::"
      - name: Tear down compose stack
        if: always()
        working-directory: examples
        env:
          COMPOSE_FILE: ${{ matrix.compose }}
        run: docker compose -f "${COMPOSE_FILE}" down -v