Refresh data (DuckDB) #31
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # DuckDB data refresh, blue/green style. Parallel track to refresh-data.yml. | |
| # | |
| # Ships the *same* source data as the SQLite build, but converted to DuckDB | |
| # (`python -m datasette_duckdb.convert`) and deployed to a SEPARATE Fly app | |
| # (warehouse-duckdb) with its own machine + volume. The SQLite production app | |
| # (warehouse / labordata.bunkum.us) is never touched by this workflow. | |
| # | |
| # Splits cleanly with deploy-duckdb.yml the same way refresh-data.yml splits | |
| # with deploy.yml on the SQLite track: image lifecycle lives in deploy, data | |
| # lifecycle lives here. This workflow reuses the CURRENT machine's image for | |
| # the staging machine and does not build. | |
| # | |
| # Each run: | |
| # 1. `make` builds the .db files from the same upstream sources | |
| # 2. Converts each <name>.db -> <name>.duckdb (rm'ing the .db as it goes to | |
| # keep the runner under its disk cap) | |
| # 3. Uploads the .duckdb files to R2 under the duckdb/ prefix | |
| # 4. Discovers the role=current machine + its image | |
| # 5. Creates a fresh volume + a role=staging machine on that same image | |
| # (no public ports) | |
| # 6. SFTPs pull-from-r2-direct.sh and populates /data from R2 | |
| # 7. Restarts to attach the files, warms the cold schema scan, smoke-tests | |
| # 8. Promotes: adds services to staging, warms again (port update restarts), | |
| # cordons + destroys the old machine and its volume | |
| # | |
| # Bootstrap is one-time manual (like the SQLite app): create the first machine | |
| # via `flyctl deploy -c fly.duckdb.toml --remote-only` after creating the app | |
| # and allocating IPs. From then on deploy-duckdb.yml rolls images and this | |
| # workflow refreshes data. | |
| # | |
| # Known gaps vs. the SQLite site (tracked in the datasette-duckdb work): | |
| # * No FTS yet — the converter doesn't build FTS indexes, so search boxes | |
| # are absent until that lands. | |
| # * No --crossdb / union_names canned queries (sqlite-specific). | |
| name: Refresh data (DuckDB) | |
| on: | |
| # TEMPORARY: fire on every push to the feature branch so we can iterate | |
| # the full refresh pipeline before merging. Remove before merging. | |
| push: | |
| branches: | |
| - duckdb-parallel-build | |
| schedule: | |
| # 30 min after the SQLite refresh (0 7) so upstream sources are settled | |
| # and the two runners don't contend. | |
| - cron: "30 7 * * *" | |
| workflow_dispatch: | |
| inputs: | |
| promote: | |
| description: "Promote staging to current after smoke (destroys current machine + volume)" | |
| type: boolean | |
| default: true | |
| concurrency: | |
| # Own group: serialize DuckDB refreshes against each other, but don't block | |
| # on (or get blocked by) the SQLite warehouse-deploy group. | |
| group: warehouse-duckdb-deploy | |
| cancel-in-progress: false | |
| permissions: | |
| contents: read | |
| env: | |
| FLY_APP: warehouse-duckdb | |
| FLY_REGION: iad | |
| R2_BUCKET: labordata-warehouse-staging | |
| R2_PUBLIC_BASE: ${{ secrets.R2_PUBLIC_BASE }} | |
| DUCKDB_PREFIX: duckdb | |
| STAGING_VOL_GB: 10 | |
| # TEMPORARY: push events promote too, so a commit-triggered refresh leaves | |
| # a live URL instead of a dry-run that tears itself down. Revert with the | |
| # push: trigger above. | |
| PROMOTE: ${{ (github.event_name == 'schedule' || github.event_name == 'push' || (github.event_name == 'workflow_dispatch' && inputs.promote == true)) && 'true' || 'false' }} | |
| jobs: | |
| refresh: | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 90 | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - uses: actions/setup-python@v5 | |
| with: | |
| python-version: "3.12" | |
| - name: Free disk space | |
| run: | | |
| sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc \ | |
| /opt/hostedtoolcache/CodeQL || true | |
| df -h | |
| - name: Install build dependencies | |
| run: | | |
| # sqlite-utils / csvs-to-sqlite drive the Makefile; labor-union-parser | |
| # builds the crosswalk. | |
| pip install -r requirements.txt | |
| pip install labor-union-parser | |
| # Plugin (converter + duckdb driver) from the deploy branch. | |
| pip install "datasette-duckdb @ git+https://github.com/fgregg/datasette@duckdb-deploy#subdirectory=datasette-duckdb" | |
| # requirements.txt already installed the no_limit_csv datasette fork, | |
| # so pip treats the plugin's unpinned `datasette` dep as already | |
| # satisfied and does NOT pull duckdb-deploy. Force the deploy | |
| # datasette in last so the backends module + the `inspect -c` flag | |
| # (used below to build inspect-data.json) are both available. | |
| pip install --force-reinstall --no-deps "datasette @ git+https://github.com/fgregg/datasette@duckdb-deploy" | |
| # Fail fast if anything is wrong. | |
| python -c "import datasette.backends, datasette_duckdb.convert; print('OK')" | |
| - name: Build all databases | |
| run: make | |
| - name: Convert databases to DuckDB | |
| run: | | |
| # Convert one at a time and delete each .db immediately so peak disk | |
| # stays bounded (the .db set is ~10 GB; keeping both forms would blow | |
| # the runner). Fail loud if any conversion errors. | |
| set -eu | |
| for db in *.db; do | |
| name="${db%.db}" | |
| echo "::group::Converting $db -> $name.duckdb" | |
| python -m datasette_duckdb.convert "$db" "$name.duckdb" | |
| rm -f "$db" | |
| echo "::endgroup::" | |
| done | |
| ls -la *.duckdb | |
| df -h | |
| - name: Build inspect-data.json | |
| run: | | |
| # Precompute per-table counts so datasette's startup scan | |
| # short-circuits to the cache (Database.cached_table_counts keyed by | |
| # db name). Without this, cold /-/databases.json over the real | |
| # warehouse data takes ~245s on shared-cpu-1x. | |
| # | |
| # Build a temp config mirroring serve-duckdb.sh's plugin mount, but | |
| # with LOCAL paths so the CLI can open the files here. The inspect | |
| # JSON is matched at serve-time by db name (not file path), so the | |
| # "file" field being a CI path is fine — runtime config points at | |
| # /data/<name>.duckdb under the same names. | |
| python3 - <<'PY' | |
| import glob, os, yaml | |
| dbs = { | |
| os.path.splitext(os.path.basename(p))[0]: os.path.abspath(p) | |
| for p in sorted(glob.glob("*.duckdb")) | |
| } | |
| yaml.safe_dump( | |
| {"plugins": {"datasette-duckdb": {"databases": dbs}}}, | |
| open("/tmp/inspect-config.yml", "w"), | |
| ) | |
| print("mounting %d database(s) for inspect" % len(dbs)) | |
| PY | |
| datasette inspect -c /tmp/inspect-config.yml --inspect-file inspect-data.json | |
| echo "wrote inspect-data.json ($(wc -c < inspect-data.json) bytes)" | |
| - name: Upload DuckDB databases to R2 | |
| env: | |
| AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }} | |
| AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }} | |
| AWS_ENDPOINT_URL_S3: ${{ secrets.R2_ENDPOINT }} | |
| AWS_DEFAULT_REGION: auto | |
| run: | | |
| # Separate key namespace from the SQLite .db files in the same bucket. | |
| aws s3 sync . "s3://$R2_BUCKET/$DUCKDB_PREFIX/" \ | |
| --exclude "*" --include "*.duckdb" --include "inspect-data.json" \ | |
| --no-progress | |
| - name: Install flyctl | |
| uses: superfly/flyctl-actions/setup-flyctl@ed8efb33836e8b2096c7fd3ba1c8afe303ebbff1 # v1.6 | |
| - name: Discover current machine + volume + image | |
| id: cur | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| # Prefer role=current; fall back to the first machine. Image is | |
| # captured so the staging machine boots from the SAME rootfs as | |
| # current — image lifecycle lives in deploy-duckdb.yml now, this | |
| # workflow no longer builds. Fails if no machine exists (bootstrap | |
| # is a one-time manual step, like the SQLite app). | |
| JSON=$(flyctl machine list --app "$FLY_APP" --json) | |
| ROW=$(echo "$JSON" | jq ' | |
| [.[] | select(.config.metadata.role == "current")][0] // .[0] | |
| ') | |
| if [ -z "$ROW" ] || [ "$ROW" = "null" ]; then | |
| echo "No machine on $FLY_APP — bootstrap one manually first" >&2 | |
| echo "$JSON" | jq '[.[]|{id,role:.config.metadata.role,state}]' >&2 | |
| exit 1 | |
| fi | |
| MID=$(echo "$ROW" | jq -r '.id') | |
| VOL=$(echo "$ROW" | jq -r '.config.mounts[0].volume // empty') | |
| IMG=$(echo "$ROW" | jq -r '.config.image') | |
| if [ -z "$IMG" ] || [ "$IMG" = "null" ]; then | |
| echo "Current machine has no image set — corrupted state" >&2 | |
| exit 1 | |
| fi | |
| echo "machine=$MID" >> $GITHUB_OUTPUT | |
| echo "volume=$VOL" >> $GITHUB_OUTPUT | |
| echo "image=$IMG" >> $GITHUB_OUTPUT | |
| echo "Current: machine=$MID volume=$VOL image=$IMG" | |
| echo "Promote? $PROMOTE" | |
| - name: Create staging volume | |
| id: vol | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| NAME="dbs_stage_${GITHUB_RUN_ID}" | |
| VOL_ID=$(flyctl volumes create "$NAME" \ | |
| --app "$FLY_APP" \ | |
| --size "$STAGING_VOL_GB" \ | |
| --region "$FLY_REGION" \ | |
| --yes \ | |
| --json \ | |
| | jq -r '.id') | |
| echo "id=$VOL_ID" >> $GITHUB_OUTPUT | |
| echo "Created staging volume: $NAME ($VOL_ID)" | |
| - name: Create staging machine | |
| id: mach | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| # Boot from the CURRENT machine's image so this workflow doesn't | |
| # have to build — image lifecycle is owned by deploy-duckdb.yml. | |
| # No public ports until promotion. `flyctl machine run` has no | |
| # --json; parse stdout for "Machine ID: XXX". We `tee` the output | |
| # (rather than OUT=$(...), which under `set -e` exits before we | |
| # can echo it). Retry isn't needed here the way it was when we | |
| # built+ran in the same step — the current image has been in the | |
| # registry for a while. | |
| flyctl machine run "${{ steps.cur.outputs.image }}" \ | |
| --app "$FLY_APP" \ | |
| --region "$FLY_REGION" \ | |
| --volume "${{ steps.vol.outputs.id }}:/data" \ | |
| --metadata role=staging \ | |
| --metadata refresh_run="$GITHUB_RUN_ID" \ | |
| --vm-cpu-kind shared \ | |
| --vm-cpus 1 \ | |
| --vm-memory 2048 2>&1 | tee /tmp/machine_run.log | |
| MID=$(grep -oE 'Machine ID: [a-f0-9]+' /tmp/machine_run.log | head -1 | awk '{print $3}') | |
| if [ -z "$MID" ]; then | |
| echo "Could not parse machine ID from flyctl output" >&2 | |
| exit 1 | |
| fi | |
| echo "id=$MID" >> $GITHUB_OUTPUT | |
| echo "Created staging machine: $MID" | |
| sleep 15 | |
| - name: Populate staging /data from R2 | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| # The .duckdb files are still on the runner (we only rm'd the .db | |
| # forms). Enumerate them for the pull list; the public base points at | |
| # the duckdb/ prefix we synced to. pull-from-r2-direct.sh cds /data | |
| # and writes each basename, so it's reused unchanged. | |
| ASSETS="$(ls *.duckdb | tr '\n' ' ') inspect-data.json" | |
| echo "Pulling: $ASSETS" | |
| flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \ | |
| -C "rm -f /tmp/pull-from-r2-direct.sh" | |
| echo "put scripts/pull-from-r2-direct.sh /tmp/pull-from-r2-direct.sh" \ | |
| | flyctl ssh sftp shell --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" | |
| flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \ | |
| -C "sh /tmp/pull-from-r2-direct.sh $R2_PUBLIC_BASE/$DUCKDB_PREFIX $ASSETS" | |
| - name: Restart staging to attach fresh data | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| MID="${{ steps.mach.outputs.id }}" | |
| flyctl machine restart "$MID" --app "$FLY_APP" | |
| for i in $(seq 1 90); do | |
| state=$(flyctl machine list --app "$FLY_APP" --json \ | |
| | jq -r --arg id "$MID" '.[] | select(.id == $id) | .state') | |
| echo " attempt $i: state=$state" | |
| if [ "$state" = "started" ]; then break; fi | |
| sleep 2 | |
| done | |
| if [ "$state" != "started" ]; then | |
| echo "Machine never reached started state" >&2 | |
| exit 1 | |
| fi | |
| - name: Wait for SSH (hallpass) on staging | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| MID="${{ steps.mach.outputs.id }}" | |
| for i in $(seq 1 30); do | |
| if flyctl ssh console --app "$FLY_APP" --machine "$MID" \ | |
| -C "true" >/dev/null 2>&1; then | |
| echo " ssh ready on attempt $i" | |
| break | |
| fi | |
| echo " attempt $i: ssh not ready" | |
| sleep 2 | |
| done | |
| flyctl ssh console --app "$FLY_APP" --machine "$MID" -C "true" | |
| - name: Wait for datasette to bind :8080 on staging | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| # Wait-for-SSH above only checks Fly's hallpass — datasette itself | |
| # takes ~2 min to start over 13 attached duckdb dbs on shared-cpu-1x. | |
| # Without an explicit wait, smoke-test.sh's built-in 120s poll | |
| # races against the boot and tail-fails. | |
| flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \ | |
| -C "rm -f /tmp/wait-for-datasette.sh" | |
| echo "put scripts/wait-for-datasette.sh /tmp/wait-for-datasette.sh" \ | |
| | flyctl ssh sftp shell --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" | |
| flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \ | |
| -C "sh /tmp/wait-for-datasette.sh" | |
| - name: Smoke test datasette on staging | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| # Reused unchanged: datasette names a db by its filename stem, so | |
| # <name>.duckdb attaches as "<name>" — the same set smoke-test.sh | |
| # expects. | |
| flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \ | |
| -C "rm -f /tmp/smoke-test.sh" | |
| echo "put scripts/smoke-test.sh /tmp/smoke-test.sh" \ | |
| | flyctl ssh sftp shell --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" | |
| flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \ | |
| -C "sh /tmp/smoke-test.sh" | |
| - name: Dump staging diagnostics (on failure) | |
| # On failure, capture the staging machine's datasette logs + memory / | |
| # process state into the CI output so the next time something breaks | |
| # it's diagnosable without reproducing. (Teardown destroys the machine | |
| # immediately after, so without this we lose the evidence.) | |
| if: failure() | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| MID="${{ steps.mach.outputs.id }}" | |
| echo "=== machine status ===" | |
| flyctl machine status "$MID" --app "$FLY_APP" 2>&1 | head -40 || true | |
| echo "=== datasette logs (tail) ===" | |
| flyctl logs --app "$FLY_APP" --machine "$MID" --no-tail 2>&1 | tail -150 || true | |
| echo "=== in-container probe ===" | |
| flyctl ssh console --app "$FLY_APP" --machine "$MID" \ | |
| -C "sh -c 'grep -E \"MemTotal|MemAvailable\" /proc/meminfo; echo ---DATA---; ls -la /data; echo ---PROCS---; ps aux 2>/dev/null | grep -E \"datasette|uvicorn\" | grep -v grep'" 2>&1 | head -50 || true | |
| # ─── Promote path (only when explicitly requested) ────────────── | |
| - name: Promote staging to current | |
| if: env.PROMOTE == 'true' | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| set -eu | |
| NEW="${{ steps.mach.outputs.id }}" | |
| OLD="${{ steps.cur.outputs.machine }}" | |
| # 1. Put services on the staging machine so Fly's proxy routes to it. | |
| # machine update restarts the container, so the warm cache from | |
| # the smoke step is dropped. inspect-data.json on the volume | |
| # means the first /-/databases.json is fast, but datasette | |
| # still takes ~2 min to bind :8080 over 13 dbs — the next step | |
| # waits for it so we don't cordon the old machine before the | |
| # new one is serving (which would leak 502s to visitors). | |
| flyctl machine update "$NEW" --app "$FLY_APP" --yes \ | |
| --port "80:8080/tcp:http" \ | |
| --port "443:8080/tcp:tls:http" | |
| - name: Wait for datasette to bind :8080 on promoted machine | |
| if: env.PROMOTE == 'true' | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \ | |
| -C "rm -f /tmp/wait-for-datasette.sh" | |
| echo "put scripts/wait-for-datasette.sh /tmp/wait-for-datasette.sh" \ | |
| | flyctl ssh sftp shell --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" | |
| flyctl ssh console --app "$FLY_APP" --machine "${{ steps.mach.outputs.id }}" \ | |
| -C "sh /tmp/wait-for-datasette.sh" | |
| - name: Cordon + finalize promote metadata | |
| if: env.PROMOTE == 'true' | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| set -eu | |
| NEW="${{ steps.mach.outputs.id }}" | |
| OLD="${{ steps.cur.outputs.machine }}" | |
| # 2/3. Cordon the old machine and let connections drain before | |
| # swapping roles. | |
| flyctl machine cordon "$OLD" --app "$FLY_APP" | |
| sleep 45 | |
| # 4. Promote metadata so the next run finds this machine. | |
| flyctl machine update "$NEW" --app "$FLY_APP" --yes \ | |
| --metadata role=current | |
| - name: Destroy old current | |
| if: env.PROMOTE == 'true' | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| OLD="${{ steps.cur.outputs.machine }}" | |
| OLDVOL="${{ steps.cur.outputs.volume }}" | |
| flyctl machine destroy "$OLD" --app "$FLY_APP" --force | |
| if [ -n "$OLDVOL" ] && [ "$OLDVOL" != "null" ]; then | |
| flyctl volumes destroy "$OLDVOL" -y | |
| fi | |
| # ─── Teardown of staging if not promoted (dry-run + failure path) ─ | |
| - name: Tear down staging | |
| if: always() && (env.PROMOTE != 'true' || failure()) | |
| env: | |
| FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} | |
| run: | | |
| if [ -n "${{ steps.mach.outputs.id }}" ]; then | |
| flyctl machine destroy "${{ steps.mach.outputs.id }}" \ | |
| --app "$FLY_APP" --force || true | |
| fi | |
| if [ -n "${{ steps.vol.outputs.id }}" ]; then | |
| flyctl volumes destroy "${{ steps.vol.outputs.id }}" -y || true | |
| fi |