Refresh data #44
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Data refresh for warehouse-duckdb (labordata.bunkum.us), blue/green style. | |
| # | |
| # Builds the source data, converts it to DuckDB | |
| # (`python -m datasette_duckdb.convert`), and deploys it to a fresh volume + | |
| # staging machine on warehouse-duckdb, then promotes. | |
| # | |
| # Splits cleanly with deploy.yml: image lifecycle lives in deploy.yml, data | |
| # lifecycle lives here. This workflow reuses the CURRENT machine's image for | |
| # the staging machine and does not build. | |
| # | |
| # Each run: | |
| # 1. `make` builds the .db files from the same upstream sources | |
| # 2. Converts each <name>.db -> <name>.duckdb (rm'ing the .db as it goes to | |
| # keep the runner under its disk cap) | |
| # 3. Uploads the .duckdb files to R2 under the duckdb/ prefix | |
| # 4. Discovers the role=current machine + its image | |
| # 5. Creates a fresh volume + a role=staging machine on that same image | |
| # (no public ports) | |
| # 6. SFTPs pull-from-r2-direct.sh and populates /data from R2 | |
| # 7. Restarts to attach the files, warms the cold schema scan, smoke-tests | |
| # 8. Promotes: adds services to staging, warms again (port update restarts), | |
| # cordons + destroys the old machine and its volume | |
| # | |
| # Bootstrap is one-time manual: create the first machine | |
| # via `flyctl deploy -c fly.toml --remote-only` after creating the app | |
| # and allocating IPs. From then on deploy.yml rolls images and this | |
| # workflow refreshes data. | |
| # | |
| # Known gaps (tracked in the datasette-duckdb work): | |
| # * No FTS yet — the converter doesn't build FTS indexes, so search boxes | |
| # are absent until that lands. | |
| # * No --crossdb / union_names canned queries (sqlite-specific). | |
| name: Refresh data | |
| on: | |
| schedule: | |
| # Daily at 07:30 UTC, after upstream sources have settled. | |
| - cron: "30 7 * * *" | |
| workflow_dispatch: | |
| inputs: | |
| promote: | |
| description: "Promote staging to current after smoke (destroys current machine + volume)" | |
| type: boolean | |
| default: true | |
| concurrency: | |
| # Shared with deploy.yml (warehouse-duckdb-deploy) so refreshes serialize | |
| # against each other and against image deploys. | |
| group: warehouse-duckdb-deploy | |
| cancel-in-progress: false | |
| permissions: | |
| contents: read | |
| env: | |
| FLY_APP: warehouse-duckdb | |
| FLY_REGION: iad | |
| R2_BUCKET: labordata-warehouse-staging | |
| R2_PUBLIC_BASE: ${{ secrets.R2_PUBLIC_BASE }} | |
| DUCKDB_PREFIX: duckdb | |
| STAGING_VOL_GB: 10 | |
| # Schedule + default-dispatch promote; workflow_dispatch can set | |
| # promote=false to dry-run. | |
| PROMOTE: ${{ (github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.promote == true)) && 'true' || 'false' }} | |
| jobs: | |
| refresh: | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 90 | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - uses: actions/setup-python@v5 | |
| with: | |
| python-version: "3.12" | |
| - name: Free disk space | |
| run: | | |
| sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc \ | |
| /opt/hostedtoolcache/CodeQL || true | |
| df -h | |
| - name: Install build dependencies | |
| run: | | |
| # sqlite-utils / csvs-to-sqlite drive the Makefile; labor-union-parser | |
| # builds the crosswalk. | |
| pip install -r requirements.txt | |
| pip install labor-union-parser | |
| # Plugin (converter + duckdb driver) from the deploy branch. | |
| pip install "datasette-duckdb @ git+https://github.com/fgregg/datasette@duckdb-deploy#subdirectory=datasette-duckdb" | |
| # requirements.txt already installed the no_limit_csv datasette fork, | |
| # so pip treats the plugin's unpinned `datasette` dep as already | |
| # satisfied and does NOT pull duckdb-deploy. Force the deploy | |
| # datasette in last so the backends module + the `inspect -c` flag | |
| # (used below to build inspect-data.json) are both available. | |
| pip install --force-reinstall --no-deps "datasette @ git+https://github.com/fgregg/datasette@duckdb-deploy" | |
| # Fail fast if anything is wrong. | |
| python -c "import datasette.backends, datasette_duckdb.convert; print('OK')" | |
| - name: Build all databases | |
| run: make | |
| - name: Convert databases to DuckDB | |
| run: | | |
| # Convert one at a time and delete each .db immediately so peak disk | |
| # stays bounded (the .db set is ~10 GB; keeping both forms would blow | |
| # the runner). Fail loud if any conversion errors. | |
| set -eu | |
| for db in *.db; do | |
| name="${db%.db}" | |
| echo "::group::Converting $db -> $name.duckdb" | |
| python -m datasette_duckdb.convert "$db" "$name.duckdb" | |
| rm -f "$db" | |
| echo "::endgroup::" | |
| done | |
| ls -la *.duckdb | |
| df -h | |
| - name: Build inspect-data.json | |
| run: | | |
| # Precompute per-table counts so datasette's startup scan | |
| # short-circuits to the cache (Database.cached_table_counts keyed by | |
| # db name). Without this, cold /-/databases.json over the real | |
| # warehouse data takes ~245s on shared-cpu-1x. | |
| # | |
| # Build a temp config mirroring serve.sh's plugin mount, but | |
| # with LOCAL paths so the CLI can open the files here. The inspect | |
| # JSON is matched at serve-time by db name (not file path), so the | |
| # "file" field being a CI path is fine — runtime config points at | |
| # /data/<name>.duckdb under the same names. | |
| python3 - <<'PY' | |
| import glob, os, yaml | |
| dbs = { | |
| os.path.splitext(os.path.basename(p))[0]: os.path.abspath(p) | |
| for p in sorted(glob.glob("*.duckdb")) | |
| } | |
| yaml.safe_dump( | |
| {"plugins": {"datasette-duckdb": {"databases": dbs}}}, | |
| open("/tmp/inspect-config.yml", "w"), | |
| ) | |
| print("mounting %d database(s) for inspect" % len(dbs)) | |
| PY | |
| datasette inspect -c /tmp/inspect-config.yml --inspect-file inspect-data.json | |
| echo "wrote inspect-data.json ($(wc -c < inspect-data.json) bytes)" | |
| - name: Build internal.db (frozen schema catalog) | |
| run: | | |
| # Datasette's schema catalog (tables/columns/FKs -> internal.db) powers | |
| # every db/table landing page. It used to be built LAZILY on the first | |
| # serve-boot of a fresh volume — but that introspection runs on the | |
| # first request, racing live traffic + memory pressure, and on the | |
| # largest db (cats, 282 tables) populate_schema_tables threw partway. | |
| # _refresh_schemas writes the catalog_databases row BEFORE populating | |
| # tables, and schema_version() is a static 0, so the half-built entry | |
| # froze cats at 0 listed tables forever (data still queryable, just | |
| # unlisted). See app.py _refresh_schemas. | |
| # | |
| # Fix: build internal.db HERE, offline, with full resources and zero | |
| # traffic, then ship it on the volume. serve.sh reuses an | |
| # existing /data/internal.db (0 == 0 -> populate skipped on boot), so a | |
| # pre-validated catalog is adopted with no live introspection at all. | |
| # | |
| # Reuses /tmp/inspect-config.yml (same db mounts as inspect-data.json). | |
| # The catalog content is identical under shared- or separate-instance | |
| # mode; catalog_databases.path is written but never read back (reuse is | |
| # keyed on database_name + schema_version), so the CI-local paths here | |
| # are harmless. | |
| rm -f internal.db | |
| datasette serve -c /tmp/inspect-config.yml \ | |
| --inspect-file inspect-data.json \ | |
| --internal internal.db \ | |
| --port 8099 >/tmp/ds-internal.log 2>&1 & | |
| DSPID=$! | |
| # The first request to a db view drives + awaits the full populate of | |
| # every db (refresh_schemas is called from the view handlers). Poll | |
| # /-/databases.json: a connection-refused returns fast (not bound yet), | |
| # a bound-but-still-introspecting request blocks up to --max-time. | |
| ok=0 | |
| for i in $(seq 1 120); do | |
| if curl -sf --max-time 600 http://127.0.0.1:8099/-/databases.json \ | |
| -o /tmp/dbs.json; then ok=1; break; fi | |
| if ! kill -0 $DSPID 2>/dev/null; then | |
| echo "datasette exited during introspection:"; cat /tmp/ds-internal.log; exit 1 | |
| fi | |
| sleep 2 | |
| done | |
| if [ "$ok" != 1 ]; then | |
| echo "datasette never served /-/databases.json"; cat /tmp/ds-internal.log | |
| kill $DSPID 2>/dev/null; exit 1 | |
| fi | |
| # Belt-and-suspenders: touch each db page so its catalog is forced. | |
| for name in $(ls *.duckdb | sed 's/\.duckdb$//'); do | |
| curl -sf --max-time 180 "http://127.0.0.1:8099/$name.json" -o /dev/null \ | |
| || echo "warn: $name.json did not return 200" | |
| done | |
| # Shut datasette down and wait for it to release the internal.db lock. | |
| kill $DSPID 2>/dev/null || true | |
| for i in $(seq 1 30); do kill -0 $DSPID 2>/dev/null || break; sleep 1; done | |
| wait $DSPID 2>/dev/null || true | |
| # datasette's internal db is SQLite in WAL mode: the catalog we just | |
| # built lives partly in internal.db-wal, NOT yet in the main file. We | |
| # ship only the single internal.db (not the -wal/-shm sidecars), so we | |
| # MUST fold the WAL into it first — otherwise the shipped catalog is | |
| # empty and every db lists 0 tables on boot (the 0==0 skip then freezes | |
| # that empty state). Checkpoint BEFORE the gate so the gate validates | |
| # exactly the single file that gets uploaded. | |
| python3 - <<'PY' | |
| import sqlite3 | |
| con = sqlite3.connect("internal.db") | |
| con.execute("PRAGMA wal_checkpoint(TRUNCATE)") | |
| con.execute("PRAGMA journal_mode=DELETE") # leave a single self-contained file | |
| con.close() | |
| PY | |
| rm -f internal.db-wal internal.db-shm | |
| # Completeness gate: a db with rows but 0 cataloged tables is the exact | |
| # cats failure — fail the build rather than ship a broken landing page. | |
| # Runs on the post-checkpoint single file, so it reflects what ships. | |
| python3 - <<'PY' | |
| import glob, os, sqlite3, sys | |
| con = sqlite3.connect("internal.db") | |
| counts = dict(con.execute( | |
| "select database_name, count(*) from catalog_tables group by database_name")) | |
| bad = [] | |
| for f in sorted(glob.glob("*.duckdb")): | |
| name = os.path.splitext(os.path.basename(f))[0] | |
| n = counts.get(name, 0) | |
| print(f" {name}: {n} tables") | |
| if n == 0: | |
| bad.append(name) | |
| if bad: | |
| sys.exit("FAIL: internal.db has 0 cataloged tables for: " + ", ".join(bad)) | |
| print("internal.db complete: %d tables across %d dbs" | |
| % (sum(counts.values()), len(counts))) | |
| PY | |
| echo "wrote internal.db ($(wc -c < internal.db) bytes)" | |
| - name: Upload DuckDB databases to R2 | |
| env: | |
| AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }} | |
| AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }} | |
| AWS_ENDPOINT_URL_S3: ${{ secrets.R2_ENDPOINT }} | |
| AWS_DEFAULT_REGION: auto | |
| run: | | |
| # duckdb/ prefix keeps these separate from any legacy .db files. | |
| aws s3 sync . "s3://$R2_BUCKET/$DUCKDB_PREFIX/" \ | |
| --exclude "*" --include "*.duckdb" --include "inspect-data.json" \ | |
| --include "internal.db" \ | |
| --no-progress | |
| - name: Install flyctl | |
| uses: superfly/flyctl-actions/setup-flyctl@ed8efb33836e8b2096c7fd3ba1c8afe303ebbff1 # v1.6 | |
| - name: Discover current machine + volume + image | |
| id: cur | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| # Prefer role=current; fall back to the first machine. Image is | |
| # captured so the staging machine boots from the SAME rootfs as | |
| # current — image lifecycle lives in deploy.yml now, this | |
| # workflow no longer builds. Fails if no machine exists (bootstrap | |
| # is a one-time manual step). | |
| JSON=$(flyctl machine list --app "$FLY_APP" --json) | |
| ROW=$(echo "$JSON" | jq ' | |
| [.[] | select(.config.metadata.role == "current")][0] // .[0] | |
| ') | |
| if [ -z "$ROW" ] || [ "$ROW" = "null" ]; then | |
| echo "No machine on $FLY_APP — bootstrap one manually first" >&2 | |
| echo "$JSON" | jq '[.[]|{id,role:.config.metadata.role,state}]' >&2 | |
| exit 1 | |
| fi | |
| MID=$(echo "$ROW" | jq -r '.id') | |
| VOL=$(echo "$ROW" | jq -r '.config.mounts[0].volume // empty') | |
| IMG=$(echo "$ROW" | jq -r '.config.image') | |
| if [ -z "$IMG" ] || [ "$IMG" = "null" ]; then | |
| echo "Current machine has no image set — corrupted state" >&2 | |
| exit 1 | |
| fi | |
| echo "machine=$MID" >> $GITHUB_OUTPUT | |
| echo "volume=$VOL" >> $GITHUB_OUTPUT | |
| echo "image=$IMG" >> $GITHUB_OUTPUT | |
| echo "Current: machine=$MID volume=$VOL image=$IMG" | |
| echo "Promote? $PROMOTE" | |
| - name: Create staging volume | |
| id: vol | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| # On this Fly setup the shared-CPU "machine capacity hold" is acquired | |
| # at VOLUME creation, so `flyctl volumes create` is what hits | |
| # "failed_precondition: ... insufficient CPUs available" when iad is | |
| # momentarily full (2026-06-22 run 27921322176). Retry it with the | |
| # same policy as the machine step below. | |
| # | |
| # Also: capture the create output to a file and check the exit code | |
| # explicitly. The previous `VOL_ID=$(flyctl ... | jq)` form let the | |
| # pipe mask a failed create — jq returned empty from the error text, | |
| # the pipeline exit was jq's (0), so the step went green with an empty | |
| # volume id and the machine step later failed with the opaque | |
| # "not enough unattached volumes for ''". | |
| NAME="dbs_stage_${GITHUB_RUN_ID}" | |
| VOL_ID="" | |
| for attempt in 1 2 3 4 5; do | |
| set +e | |
| flyctl volumes create "$NAME" \ | |
| --app "$FLY_APP" \ | |
| --size "$STAGING_VOL_GB" \ | |
| --region "$FLY_REGION" \ | |
| --yes \ | |
| --json > /tmp/vol_create.log 2>&1 | |
| rc=$? | |
| set -e | |
| cat /tmp/vol_create.log | |
| if [ "$rc" -eq 0 ]; then | |
| VOL_ID=$(jq -r '.id // empty' /tmp/vol_create.log) | |
| fi | |
| [ -n "$VOL_ID" ] && break | |
| if grep -qiE 'insufficient (cpus|capacity)|capacity hold failed|failed_precondition|no capacity available' /tmp/vol_create.log; then | |
| echo "::warning::staging volume create hit Fly capacity (attempt $attempt/5, rc=$rc); retrying after backoff" | |
| sleep $((attempt * 30)) | |
| continue | |
| fi | |
| echo "flyctl volumes create failed with a non-capacity error (rc=$rc); not retrying" >&2 | |
| exit 1 | |
| done | |
| if [ -z "$VOL_ID" ]; then | |
| echo "Could not create staging volume after 5 attempts (Fly capacity?)" >&2 | |
| exit 1 | |
| fi | |
| echo "id=$VOL_ID" >> $GITHUB_OUTPUT | |
| echo "Created staging volume: $NAME ($VOL_ID)" | |
| - name: Create staging machine | |
| id: mach | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| # Boot from the CURRENT machine's image so this workflow doesn't | |
| # have to build — image lifecycle is owned by deploy.yml. | |
| # No public ports until promotion. `flyctl machine run` has no | |
| # --json; parse stdout for "Machine ID: XXX". | |
| # | |
| # Retry on transient Fly capacity errors ("machine capacity hold | |
| # failed: insufficient CPUs", failed_precondition): the region's | |
| # shared-CPU pool is occasionally momentarily full, which otherwise | |
| # fails the whole daily refresh (2026-06-20 run 27868174575). A | |
| # capacity hold fails BEFORE any machine is created, so retrying is | |
| # safe and can't orphan a machine. Non-capacity errors fail fast so | |
| # we don't mask a real misconfiguration behind retries. | |
| MID="" | |
| for attempt in 1 2 3 4 5; do | |
| set +e | |
| flyctl machine run "${{ steps.cur.outputs.image }}" \ | |
| --app "$FLY_APP" \ | |
| --region "$FLY_REGION" \ | |
| --volume "${{ steps.vol.outputs.id }}:/data" \ | |
| --metadata role=staging \ | |
| --metadata refresh_run="$GITHUB_RUN_ID" \ | |
| --vm-cpu-kind shared \ | |
| --vm-cpus 1 \ | |
| --vm-memory 2048 > /tmp/machine_run.log 2>&1 | |
| rc=$? | |
| set -e | |
| cat /tmp/machine_run.log | |
| MID=$(grep -oE 'Machine ID: [a-f0-9]+' /tmp/machine_run.log | head -1 | awk '{print $3}') | |
| [ -n "$MID" ] && break | |
| if grep -qiE 'insufficient (cpus|capacity)|capacity hold failed|failed_precondition|no capacity available' /tmp/machine_run.log; then | |
| echo "::warning::staging machine create hit Fly capacity (attempt $attempt/5, rc=$rc); retrying after backoff" | |
| sleep $((attempt * 30)) | |
| continue | |
| fi | |
| echo "flyctl machine run failed with a non-capacity error (rc=$rc); not retrying" >&2 | |
| exit 1 | |
| done | |
| if [ -z "$MID" ]; then | |
| echo "Could not create staging machine after 5 attempts (Fly capacity?)" >&2 | |
| exit 1 | |
| fi | |
| echo "id=$MID" >> $GITHUB_OUTPUT | |
| echo "Created staging machine: $MID" | |
| sleep 15 | |
| - name: Populate staging /data from R2 | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| # The .duckdb files are still on the runner (we only rm'd the .db | |
| # forms). Enumerate them for the pull list; the public base points at | |
| # the duckdb/ prefix we synced to. pull-from-r2-direct.sh cds /data | |
| # and writes each basename, so it's reused unchanged. | |
| ASSETS="$(ls *.duckdb | tr '\n' ' ') inspect-data.json internal.db" | |
| echo "Pulling: $ASSETS" | |
| flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \ | |
| -C "rm -f /tmp/pull-from-r2-direct.sh" | |
| echo "put scripts/pull-from-r2-direct.sh /tmp/pull-from-r2-direct.sh" \ | |
| | flyctl ssh sftp shell --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" | |
| flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \ | |
| -C "sh /tmp/pull-from-r2-direct.sh $R2_PUBLIC_BASE/$DUCKDB_PREFIX $ASSETS" | |
| - name: Restart staging to attach fresh data | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| MID="${{ steps.mach.outputs.id }}" | |
| flyctl machine restart "$MID" --app "$FLY_APP" | |
| for i in $(seq 1 90); do | |
| state=$(flyctl machine list --app "$FLY_APP" --json \ | |
| | jq -r --arg id "$MID" '.[] | select(.id == $id) | .state') | |
| echo " attempt $i: state=$state" | |
| if [ "$state" = "started" ]; then break; fi | |
| sleep 2 | |
| done | |
| if [ "$state" != "started" ]; then | |
| echo "Machine never reached started state" >&2 | |
| exit 1 | |
| fi | |
| - name: Wait for SSH (hallpass) on staging | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| MID="${{ steps.mach.outputs.id }}" | |
| for i in $(seq 1 30); do | |
| if flyctl ssh console --app "$FLY_APP" --machine "$MID" \ | |
| -C "true" >/dev/null 2>&1; then | |
| echo " ssh ready on attempt $i" | |
| break | |
| fi | |
| echo " attempt $i: ssh not ready" | |
| sleep 2 | |
| done | |
| flyctl ssh console --app "$FLY_APP" --machine "$MID" -C "true" | |
| - name: Wait for datasette to bind :8080 on staging | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| # Wait-for-SSH above only checks Fly's hallpass — datasette itself | |
| # takes ~2 min to start over 13 attached duckdb dbs on shared-cpu-1x. | |
| # Without an explicit wait, smoke-test.sh's built-in 120s poll | |
| # races against the boot and tail-fails. | |
| flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \ | |
| -C "rm -f /tmp/wait-for-datasette.sh" | |
| echo "put scripts/wait-for-datasette.sh /tmp/wait-for-datasette.sh" \ | |
| | flyctl ssh sftp shell --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" | |
| flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \ | |
| -C "sh /tmp/wait-for-datasette.sh" | |
| - name: Smoke test datasette on staging | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| # Reused unchanged: datasette names a db by its filename stem, so | |
| # <name>.duckdb attaches as "<name>" — the same set smoke-test.sh | |
| # expects. | |
| flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \ | |
| -C "rm -f /tmp/smoke-test.sh" | |
| echo "put scripts/smoke-test.sh /tmp/smoke-test.sh" \ | |
| | flyctl ssh sftp shell --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" | |
| flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \ | |
| -C "sh /tmp/smoke-test.sh" | |
| - name: Dump staging diagnostics (on failure) | |
| # On failure, capture the staging machine's datasette logs + memory / | |
| # process state into the CI output so the next time something breaks | |
| # it's diagnosable without reproducing. (Teardown destroys the machine | |
| # immediately after, so without this we lose the evidence.) | |
| if: failure() | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| MID="${{ steps.mach.outputs.id }}" | |
| echo "=== machine status ===" | |
| flyctl machine status "$MID" --app "$FLY_APP" 2>&1 | head -40 || true | |
| echo "=== datasette logs (tail) ===" | |
| flyctl logs --app "$FLY_APP" --machine "$MID" --no-tail 2>&1 | tail -150 || true | |
| echo "=== in-container probe ===" | |
| flyctl ssh console --app "$FLY_APP" --machine "$MID" \ | |
| -C "sh -c 'grep -E \"MemTotal|MemAvailable\" /proc/meminfo; echo ---DATA---; ls -la /data; echo ---PROCS---; ps aux 2>/dev/null | grep -E \"datasette|uvicorn\" | grep -v grep'" 2>&1 | head -50 || true | |
| # ─── Promote path (only when explicitly requested) ────────────── | |
| - name: Promote staging to current | |
| if: env.PROMOTE == 'true' | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| set -eu | |
| NEW="${{ steps.mach.outputs.id }}" | |
| OLD="${{ steps.cur.outputs.machine }}" | |
| # 1. Put services on the staging machine so Fly's proxy routes to it. | |
| # machine update restarts the container, so the warm cache from | |
| # the smoke step is dropped. inspect-data.json on the volume | |
| # means the first /-/databases.json is fast, but datasette | |
| # still takes ~2 min to bind :8080 over 13 dbs — the next step | |
| # waits for it so we don't cordon the old machine before the | |
| # new one is serving (which would leak 502s to visitors). | |
| flyctl machine update "$NEW" --app "$FLY_APP" --yes \ | |
| --port "80:8080/tcp:http" \ | |
| --port "443:8080/tcp:tls:http" | |
| - name: Wait for datasette to bind :8080 on promoted machine | |
| if: env.PROMOTE == 'true' | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| MID="${{ steps.mach.outputs.id }}" | |
| # The promote `machine update` above restarts the container to apply | |
| # the public ports, so SSH (hallpass) isn't immediately back. Poll | |
| # until the machine is reachable before sftp'ing the wait script — | |
| # otherwise the next `ssh console` races the restart and dies with | |
| # "--machine not found/started" (2026-06-23 nightly run 28020701935). | |
| # Mirrors the "Wait for SSH (hallpass) on staging" step. | |
| for i in $(seq 1 45); do | |
| if flyctl ssh console --app "$FLY_APP" --machine "$MID" -C "true" >/dev/null 2>&1; then | |
| echo " ssh ready on attempt $i" | |
| break | |
| fi | |
| echo " attempt $i: machine not ssh-ready yet" | |
| sleep 2 | |
| done | |
| flyctl ssh console --app "$FLY_APP" --machine "$MID" \ | |
| -C "rm -f /tmp/wait-for-datasette.sh" | |
| echo "put scripts/wait-for-datasette.sh /tmp/wait-for-datasette.sh" \ | |
| | flyctl ssh sftp shell --app "$FLY_APP" --machine "$MID" | |
| flyctl ssh console --app "$FLY_APP" --machine "$MID" \ | |
| -C "sh /tmp/wait-for-datasette.sh" | |
| - name: Cordon + finalize promote metadata | |
| if: env.PROMOTE == 'true' | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| set -eu | |
| NEW="${{ steps.mach.outputs.id }}" | |
| OLD="${{ steps.cur.outputs.machine }}" | |
| # 2/3. Cordon the old machine and let connections drain before | |
| # swapping roles. | |
| flyctl machine cordon "$OLD" --app "$FLY_APP" | |
| sleep 45 | |
| # 4. Promote metadata so the next run finds this machine. | |
| flyctl machine update "$NEW" --app "$FLY_APP" --yes \ | |
| --metadata role=current | |
| - name: Destroy old current | |
| if: env.PROMOTE == 'true' | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| OLD="${{ steps.cur.outputs.machine }}" | |
| OLDVOL="${{ steps.cur.outputs.volume }}" | |
| flyctl machine destroy "$OLD" --app "$FLY_APP" --force | |
| if [ -n "$OLDVOL" ] && [ "$OLDVOL" != "null" ]; then | |
| flyctl volumes destroy "$OLDVOL" -y | |
| fi | |
| # The edge caches data-derived pages (the homepage's database list, | |
| # table row counts, etc.) and CSV/JSON exports with a long | |
| # cdn-cache-control, so a data refresh must purge or the old counts/pages | |
| # stick at the edge until expiry. labordata.bunkum.us is now served | |
| # through this app (DuckDB cutover), so this purge is live (it was a no-op | |
| # while the site was pre-cutover on warehouse-duckdb.fly.dev). | |
| - name: Purge Cloudflare cache | |
| if: env.PROMOTE == 'true' | |
| env: | |
| CF_ZONE_ID: ${{ secrets.CF_ZONE_ID }} | |
| CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }} | |
| run: | | |
| response=$(curl -fsS -X POST \ | |
| "https://api.cloudflare.com/client/v4/zones/${CF_ZONE_ID}/purge_cache" \ | |
| -H "Authorization: Bearer ${CF_API_TOKEN}" \ | |
| -H "Content-Type: application/json" \ | |
| --data '{"purge_everything":true}') | |
| echo "$response" | |
| echo "$response" | grep -q '"success":true' || { echo "Purge failed"; exit 1; } | |
| # ─── Teardown of staging if not promoted (dry-run + failure path) ─ | |
| - name: Tear down staging | |
| if: always() && (env.PROMOTE != 'true' || failure()) | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| if [ -n "${{ steps.mach.outputs.id }}" ]; then | |
| flyctl machine destroy "${{ steps.mach.outputs.id }}" \ | |
| --app "$FLY_APP" --force || true | |
| fi | |
| if [ -n "${{ steps.vol.outputs.id }}" ]; then | |
| flyctl volumes destroy "${{ steps.vol.outputs.id }}" -y || true | |
| fi |