Skip to content

Trigger redeploy to pick up table.py count-cache fix #15

Trigger redeploy to pick up table.py count-cache fix

Trigger redeploy to pick up table.py count-cache fix #15

# DuckDB data refresh, blue/green style. Parallel track to refresh-data.yml.
#
# Ships the *same* source data as the SQLite build, but converted to DuckDB
# (`python -m datasette_duckdb.convert`) and deployed to a SEPARATE Fly app
# (warehouse-duckdb) with its own machine + volume. The SQLite production app
# (warehouse / labordata.bunkum.us) is never touched by this workflow.
#
# Splits cleanly with deploy-duckdb.yml the same way refresh-data.yml splits
# with deploy.yml on the SQLite track: image lifecycle lives in deploy, data
# lifecycle lives here. This workflow reuses the CURRENT machine's image for
# the staging machine and does not build.
#
# Each run:
# 1. `make` builds the .db files from the same upstream sources
# 2. Converts each <name>.db -> <name>.duckdb (rm'ing the .db as it goes to
# keep the runner under its disk cap)
# 3. Uploads the .duckdb files to R2 under the duckdb/ prefix
# 4. Discovers the role=current machine + its image
# 5. Creates a fresh volume + a role=staging machine on that same image
# (no public ports)
# 6. SFTPs pull-from-r2-direct.sh and populates /data from R2
# 7. Restarts to attach the files, warms the cold schema scan, smoke-tests
# 8. Promotes: adds services to staging, warms again (port update restarts),
# cordons + destroys the old machine and its volume
#
# Bootstrap is one-time manual (like the SQLite app): create the first machine
# via `flyctl deploy -c fly.duckdb.toml --remote-only` after creating the app
# and allocating IPs. From then on deploy-duckdb.yml rolls images and this
# workflow refreshes data.
#
# Known gaps vs. the SQLite site (tracked in the datasette-duckdb work):
# * No FTS yet — the converter doesn't build FTS indexes, so search boxes
# are absent until that lands.
# * No --crossdb / union_names canned queries (sqlite-specific).
name: Refresh data (DuckDB)
on:
# TEMPORARY: fire on every push to the feature branch so we can iterate
# the full refresh pipeline before merging. Remove before merging.
push:
branches:
- duckdb-parallel-build
schedule:
# 30 min after the SQLite refresh (0 7) so upstream sources are settled
# and the two runners don't contend.
- cron: "30 7 * * *"
workflow_dispatch:
inputs:
promote:
description: "Promote staging to current after smoke (destroys current machine + volume)"
type: boolean
default: true
concurrency:
# Own group: serialize DuckDB refreshes against each other, but don't block
# on (or get blocked by) the SQLite warehouse-deploy group.
group: warehouse-duckdb-deploy
cancel-in-progress: false
permissions:
contents: read
env:
FLY_APP: warehouse-duckdb
FLY_REGION: iad
R2_BUCKET: labordata-warehouse-staging
R2_PUBLIC_BASE: ${{ secrets.R2_PUBLIC_BASE }}
DUCKDB_PREFIX: duckdb
STAGING_VOL_GB: 10
# TEMPORARY: push events promote too, so a commit-triggered refresh leaves
# a live URL instead of a dry-run that tears itself down. Revert with the
# push: trigger above.
PROMOTE: ${{ (github.event_name == 'schedule' || github.event_name == 'push' || (github.event_name == 'workflow_dispatch' && inputs.promote == true)) && 'true' || 'false' }}
jobs:
refresh:
runs-on: ubuntu-latest
timeout-minutes: 90
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Free disk space
run: |
sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc \
/opt/hostedtoolcache/CodeQL || true
df -h
- name: Install build dependencies
run: |
# sqlite-utils / csvs-to-sqlite drive the Makefile; labor-union-parser
# builds the crosswalk.
pip install -r requirements.txt
pip install labor-union-parser
# Plugin (converter + duckdb driver) from the deploy branch.
pip install "datasette-duckdb @ git+https://github.com/fgregg/datasette@duckdb-deploy#subdirectory=datasette-duckdb"
# requirements.txt already installed the no_limit_csv datasette fork,
# so pip treats the plugin's unpinned `datasette` dep as already
# satisfied and does NOT pull duckdb-deploy. Force the deploy
# datasette in last so the backends module + the `inspect -c` flag
# (used below to build inspect-data.json) are both available.
pip install --force-reinstall --no-deps "datasette @ git+https://github.com/fgregg/datasette@duckdb-deploy"
# Fail fast if anything is wrong.
python -c "import datasette.backends, datasette_duckdb.convert; print('OK')"
- name: Build all databases
run: make
- name: Convert databases to DuckDB
run: |
# Convert one at a time and delete each .db immediately so peak disk
# stays bounded (the .db set is ~10 GB; keeping both forms would blow
# the runner). Fail loud if any conversion errors.
set -eu
for db in *.db; do
name="${db%.db}"
echo "::group::Converting $db -> $name.duckdb"
python -m datasette_duckdb.convert "$db" "$name.duckdb"
rm -f "$db"
echo "::endgroup::"
done
ls -la *.duckdb
df -h
- name: Build inspect-data.json
run: |
# Precompute per-table counts so datasette's startup scan
# short-circuits to the cache (Database.cached_table_counts keyed by
# db name). Without this, cold /-/databases.json over the real
# warehouse data takes ~245s on shared-cpu-1x.
#
# Build a temp config mirroring serve-duckdb.sh's plugin mount, but
# with LOCAL paths so the CLI can open the files here. The inspect
# JSON is matched at serve-time by db name (not file path), so the
# "file" field being a CI path is fine — runtime config points at
# /data/<name>.duckdb under the same names.
python3 - <<'PY'
import glob, os, yaml
dbs = {
os.path.splitext(os.path.basename(p))[0]: os.path.abspath(p)
for p in sorted(glob.glob("*.duckdb"))
}
yaml.safe_dump(
{"plugins": {"datasette-duckdb": {"databases": dbs}}},
open("/tmp/inspect-config.yml", "w"),
)
print("mounting %d database(s) for inspect" % len(dbs))
PY
datasette inspect -c /tmp/inspect-config.yml --inspect-file inspect-data.json
echo "wrote inspect-data.json ($(wc -c < inspect-data.json) bytes)"
- name: Upload DuckDB databases to R2
env:
AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
AWS_ENDPOINT_URL_S3: ${{ secrets.R2_ENDPOINT }}
AWS_DEFAULT_REGION: auto
run: |
# Separate key namespace from the SQLite .db files in the same bucket.
aws s3 sync . "s3://$R2_BUCKET/$DUCKDB_PREFIX/" \
--exclude "*" --include "*.duckdb" --include "inspect-data.json" \
--no-progress
- name: Install flyctl
uses: superfly/flyctl-actions/setup-flyctl@ed8efb33836e8b2096c7fd3ba1c8afe303ebbff1 # v1.6
- name: Discover current machine + volume + image
id: cur
env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
run: |
# Prefer role=current; fall back to the first machine. Image is
# captured so the staging machine boots from the SAME rootfs as
# current — image lifecycle lives in deploy-duckdb.yml now, this
# workflow no longer builds. Fails if no machine exists (bootstrap
# is a one-time manual step, like the SQLite app).
JSON=$(flyctl machine list --app "$FLY_APP" --json)
ROW=$(echo "$JSON" | jq '
[.[] | select(.config.metadata.role == "current")][0] // .[0]
')
if [ -z "$ROW" ] || [ "$ROW" = "null" ]; then
echo "No machine on $FLY_APP — bootstrap one manually first" >&2
echo "$JSON" | jq '[.[]|{id,role:.config.metadata.role,state}]' >&2
exit 1
fi
MID=$(echo "$ROW" | jq -r '.id')
VOL=$(echo "$ROW" | jq -r '.config.mounts[0].volume // empty')
IMG=$(echo "$ROW" | jq -r '.config.image')
if [ -z "$IMG" ] || [ "$IMG" = "null" ]; then
echo "Current machine has no image set — corrupted state" >&2
exit 1
fi
echo "machine=$MID" >> $GITHUB_OUTPUT
echo "volume=$VOL" >> $GITHUB_OUTPUT
echo "image=$IMG" >> $GITHUB_OUTPUT
echo "Current: machine=$MID volume=$VOL image=$IMG"
echo "Promote? $PROMOTE"
- name: Create staging volume
id: vol
env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
run: |
NAME="dbs_stage_${GITHUB_RUN_ID}"
VOL_ID=$(flyctl volumes create "$NAME" \
--app "$FLY_APP" \
--size "$STAGING_VOL_GB" \
--region "$FLY_REGION" \
--yes \
--json \
| jq -r '.id')
echo "id=$VOL_ID" >> $GITHUB_OUTPUT
echo "Created staging volume: $NAME ($VOL_ID)"
- name: Create staging machine
id: mach
env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
run: |
# Boot from the CURRENT machine's image so this workflow doesn't
# have to build — image lifecycle is owned by deploy-duckdb.yml.
# No public ports until promotion. `flyctl machine run` has no
# --json; parse stdout for "Machine ID: XXX". We `tee` the output
# (rather than OUT=$(...), which under `set -e` exits before we
# can echo it). Retry isn't needed here the way it was when we
# built+ran in the same step — the current image has been in the
# registry for a while.
flyctl machine run "${{ steps.cur.outputs.image }}" \
--app "$FLY_APP" \
--region "$FLY_REGION" \
--volume "${{ steps.vol.outputs.id }}:/data" \
--metadata role=staging \
--metadata refresh_run="$GITHUB_RUN_ID" \
--vm-cpu-kind shared \
--vm-cpus 1 \
--vm-memory 2048 2>&1 | tee /tmp/machine_run.log
MID=$(grep -oE 'Machine ID: [a-f0-9]+' /tmp/machine_run.log | head -1 | awk '{print $3}')
if [ -z "$MID" ]; then
echo "Could not parse machine ID from flyctl output" >&2
exit 1
fi
echo "id=$MID" >> $GITHUB_OUTPUT
echo "Created staging machine: $MID"
sleep 15
- name: Populate staging /data from R2
env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
run: |
# The .duckdb files are still on the runner (we only rm'd the .db
# forms). Enumerate them for the pull list; the public base points at
# the duckdb/ prefix we synced to. pull-from-r2-direct.sh cds /data
# and writes each basename, so it's reused unchanged.
ASSETS="$(ls *.duckdb | tr '\n' ' ') inspect-data.json"
echo "Pulling: $ASSETS"
flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \
-C "rm -f /tmp/pull-from-r2-direct.sh"
echo "put scripts/pull-from-r2-direct.sh /tmp/pull-from-r2-direct.sh" \
| flyctl ssh sftp shell --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}"
flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \
-C "sh /tmp/pull-from-r2-direct.sh $R2_PUBLIC_BASE/$DUCKDB_PREFIX $ASSETS"
- name: Restart staging to attach fresh data
env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
run: |
MID="${{ steps.mach.outputs.id }}"
flyctl machine restart "$MID" --app "$FLY_APP"
for i in $(seq 1 90); do
state=$(flyctl machine list --app "$FLY_APP" --json \
| jq -r --arg id "$MID" '.[] | select(.id == $id) | .state')
echo " attempt $i: state=$state"
if [ "$state" = "started" ]; then break; fi
sleep 2
done
if [ "$state" != "started" ]; then
echo "Machine never reached started state" >&2
exit 1
fi
- name: Wait for SSH (hallpass) on staging
env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
run: |
MID="${{ steps.mach.outputs.id }}"
for i in $(seq 1 30); do
if flyctl ssh console --app "$FLY_APP" --machine "$MID" \
-C "true" >/dev/null 2>&1; then
echo " ssh ready on attempt $i"
break
fi
echo " attempt $i: ssh not ready"
sleep 2
done
flyctl ssh console --app "$FLY_APP" --machine "$MID" -C "true"
- name: Wait for datasette to bind :8080 on staging
env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
run: |
# Wait-for-SSH above only checks Fly's hallpass — datasette itself
# takes ~2 min to start over 13 attached duckdb dbs on shared-cpu-1x.
# Without an explicit wait, smoke-test.sh's built-in 120s poll
# races against the boot and tail-fails.
flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \
-C "rm -f /tmp/wait-for-datasette.sh"
echo "put scripts/wait-for-datasette.sh /tmp/wait-for-datasette.sh" \
| flyctl ssh sftp shell --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}"
flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \
-C "sh /tmp/wait-for-datasette.sh"
- name: Smoke test datasette on staging
env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
run: |
# Reused unchanged: datasette names a db by its filename stem, so
# <name>.duckdb attaches as "<name>" — the same set smoke-test.sh
# expects.
flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \
-C "rm -f /tmp/smoke-test.sh"
echo "put scripts/smoke-test.sh /tmp/smoke-test.sh" \
| flyctl ssh sftp shell --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}"
flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \
-C "sh /tmp/smoke-test.sh"
- name: Dump staging diagnostics (on failure)
# On failure, capture the staging machine's datasette logs + memory /
# process state into the CI output so the next time something breaks
# it's diagnosable without reproducing. (Teardown destroys the machine
# immediately after, so without this we lose the evidence.)
if: failure()
env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
run: |
MID="${{ steps.mach.outputs.id }}"
echo "=== machine status ==="
flyctl machine status "$MID" --app "$FLY_APP" 2>&1 | head -40 || true
echo "=== datasette logs (tail) ==="
flyctl logs --app "$FLY_APP" --machine "$MID" --no-tail 2>&1 | tail -150 || true
echo "=== in-container probe ==="
flyctl ssh console --app "$FLY_APP" --machine "$MID" \
-C "sh -c 'grep -E \"MemTotal|MemAvailable\" /proc/meminfo; echo ---DATA---; ls -la /data; echo ---PROCS---; ps aux 2>/dev/null | grep -E \"datasette|uvicorn\" | grep -v grep'" 2>&1 | head -50 || true
# ─── Promote path (only when explicitly requested) ──────────────
- name: Promote staging to current
if: env.PROMOTE == 'true'
env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
run: |
set -eu
NEW="${{ steps.mach.outputs.id }}"
OLD="${{ steps.cur.outputs.machine }}"
# 1. Put services on the staging machine so Fly's proxy routes to it.
# machine update restarts the container, so the warm cache from
# the smoke step is dropped. inspect-data.json on the volume
# means the first /-/databases.json is fast, but datasette
# still takes ~2 min to bind :8080 over 13 dbs — the next step
# waits for it so we don't cordon the old machine before the
# new one is serving (which would leak 502s to visitors).
flyctl machine update "$NEW" --app "$FLY_APP" --yes \
--port "80:8080/tcp:http" \
--port "443:8080/tcp:tls:http"
- name: Wait for datasette to bind :8080 on promoted machine
if: env.PROMOTE == 'true'
env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
run: |
flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \
-C "rm -f /tmp/wait-for-datasette.sh"
echo "put scripts/wait-for-datasette.sh /tmp/wait-for-datasette.sh" \
| flyctl ssh sftp shell --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}"
flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \
-C "sh /tmp/wait-for-datasette.sh"
- name: Cordon + finalize promote metadata
if: env.PROMOTE == 'true'
env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
run: |
set -eu
NEW="${{ steps.mach.outputs.id }}"
OLD="${{ steps.cur.outputs.machine }}"
# 2/3. Cordon the old machine and let connections drain before
# swapping roles.
flyctl machine cordon "$OLD" --app "$FLY_APP"
sleep 45
# 4. Promote metadata so the next run finds this machine.
flyctl machine update "$NEW" --app "$FLY_APP" --yes \
--metadata role=current
- name: Destroy old current
if: env.PROMOTE == 'true'
env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
run: |
OLD="${{ steps.cur.outputs.machine }}"
OLDVOL="${{ steps.cur.outputs.volume }}"
flyctl machine destroy "$OLD" --app "$FLY_APP" --force
if [ -n "$OLDVOL" ] && [ "$OLDVOL" != "null" ]; then
flyctl volumes destroy "$OLDVOL" -y
fi
# ─── Teardown of staging if not promoted (dry-run + failure path) ─
- name: Tear down staging
if: always() && (env.PROMOTE != 'true' || failure())
env:
FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
run: |
if [ -n "${{ steps.mach.outputs.id }}" ]; then
flyctl machine destroy "${{ steps.mach.outputs.id }}" \
--app "$FLY_APP" --force || true
fi
if [ -n "${{ steps.vol.outputs.id }}" ]; then
flyctl volumes destroy "${{ steps.vol.outputs.id }}" -y || true
fi