Skip to content

Refresh data

Refresh data #9

Workflow file for this run

# Data refresh, blue/green style.
#
# Each run:
# 1. Builds the .db files and pushes them to R2
# 2. Creates a fresh empty 15 GB volume in iad
# 3. Creates a new machine on it, attached to the *current* machine's
# image, with role=staging metadata and no public ports
# 4. SFTPs `pull-from-r2-direct.sh` into /tmp and runs it (populates
# /data directly, no /data/incoming dance)
# 5. Restarts the machine so datasette mmaps the fresh files
# 6. Smoke-tests datasette on its private IP via flyctl ssh
# 7. Promotes: adds services to staging, cordons old, swaps role=current
# metadata, then destroys the old machine + volume
#
# Failures before promotion tear down the staging resources without
# touching production. The drain window between "staging gets services"
# and "old gets destroyed" is ~45s — during it both machines may serve
# traffic, mixing yesterday's data with today's. Acceptable tradeoff for
# our read-only batch workload.
#
# Scheduled runs always promote. Manual dispatch defaults to promote
# but can be flipped to a dry-run (build + populate + smoke + teardown)
# by setting promote=false.
name: Refresh data

on:
  # Nightly run at 07:00 UTC.
  schedule:
    - cron: "0 7 * * *"
  # Manual runs expose a single switch controlling promotion.
  workflow_dispatch:
    inputs:
      promote:
        description: "Promote staging to current after smoke (destroys current machine + volume)"
        type: boolean
        default: true

# All refresh runs share one concurrency group; an in-progress run is never
# cancelled by a newer one.
concurrency:
  group: warehouse-deploy
  cancel-in-progress: false

# This is a public repository. Restrict GITHUB_TOKEN to the single scope
# actions/checkout needs, so a compromised step cannot write to
# issues, pull requests or repository contents.
permissions:
  contents: read

env:
  FLY_APP: warehouse
  FLY_REGION: iad
  R2_BUCKET: labordata-warehouse-staging
  R2_PUBLIC_BASE: ${{ secrets.R2_PUBLIC_BASE }}
  STAGING_VOL_GB: 15
  # 'true' for scheduled runs and for manual runs with promote left on;
  # 'false' otherwise (manual dry-run with promote=false).
  PROMOTE: ${{ (github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.promote == true)) && 'true' || 'false' }}

jobs:
  refresh:
    runs-on: ubuntu-latest
    # Bound the total wall time: without this a hung flyctl ssh or a stalled
    # R2 transfer would run until GitHub's 6-hour default limit.
    timeout-minutes: 60
    steps:
# Check out the repository, then provision the Python interpreter that the
# build and the later inspect-data.json parsing run under.
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
  with:
    # Quoted so YAML keeps the version as a string.
    python-version: '3.12'
- name: Free disk space
  run: |
    # Best-effort removal of preinstalled toolchains; a failure here must
    # not fail the job, hence the trailing `|| true`.
    sudo rm -rf \
      /usr/share/dotnet \
      /usr/local/lib/android \
      /opt/ghc \
      /opt/hostedtoolcache/CodeQL \
      || true
    # Record the remaining free space in the run log.
    df -h
- name: Install build dependencies
  run: |
    # Packages listed in the repository's requirements file.
    pip install -r requirements.txt
    # One extra package installed separately from requirements.txt.
    pip install labor-union-parser
# Run the repository's default make target (the .db build, per the workflow
# header).
- name: Build all databases
  run: make
# Have datasette inspect every .db file in the working directory and store the
# result; a later step reads this file's keys to derive the asset list.
- name: Build inspect-data.json
  run: |
    datasette inspect *.db > inspect-data.json
- name: Upload databases to R2
  env:
    # The aws CLI is pointed at R2 through its S3 endpoint override.
    AWS_DEFAULT_REGION: auto
    AWS_ENDPOINT_URL_S3: ${{ secrets.R2_ENDPOINT }}
    AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
    AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
  run: |
    # Exclude everything first, then re-include only the database files and
    # the inspect file; filter order matters to `aws s3 sync`.
    aws s3 sync . "s3://$R2_BUCKET/nightly/" \
      --no-progress \
      --exclude "*" \
      --include "*.db" \
      --include "inspect-data.json"
- name: Install flyctl
  # Referenced by full commit SHA (the v1.6 release) rather than a moving
  # ref such as `@master`: later steps hand FLY_API_TOKEN to flyctl, so an
  # upstream compromise of a mutable ref would run with that token.
  # Update the SHA deliberately when upgrading.
  uses: superfly/flyctl-actions/setup-flyctl@ed8efb33836e8b2096c7fd3ba1c8afe303ebbff1  # v1.6
- name: Discover current machine + volume
  id: cur
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  run: |
    # Outputs:
    #   image   - image of the current machine (reused for the staging machine)
    #   machine - current machine ID (cordoned + destroyed on promote)
    #   volume  - ID of its first mounted volume, or empty if it has none
    #
    # The default `run` shell is `bash -e` without pipefail; enable it so a
    # failure on the left side of a pipe is not masked by jq.
    set -euo pipefail

    JSON=$(flyctl machine list --app "$FLY_APP" --json)

    # Prefer a machine tagged role=current. If none is tagged, fall back to
    # the first machine that is NOT tagged role=staging: a staging machine
    # left behind by an earlier failed run must never be mistaken for
    # production (it would be used as the image source and later destroyed
    # as "old").
    ROW=$(echo "$JSON" | jq '
      [.[] | select(.config.metadata.role == "current")][0]
      // ([.[] | select(.config.metadata.role != "staging")][0])
    ')

    # `// empty` turns a JSON null into an empty string instead of the
    # literal text "null", so the checks below need only test for emptiness.
    IMG=$(echo "$ROW" | jq -r '.config.image // empty')
    MID=$(echo "$ROW" | jq -r '.id // empty')
    VOL=$(echo "$ROW" | jq -r '.config.mounts[0].volume // empty')

    if [ -z "$IMG" ] || [ -z "$MID" ]; then
      echo "Could not resolve current machine" >&2
      # Public-repo run logs are world-readable. Show only the bare
      # minimum (ids + role) needed to see why selection failed.
      echo "$JSON" | jq '[.[]|{id,role:.config.metadata.role}]' >&2
      exit 1
    fi

    {
      echo "image=$IMG"
      echo "machine=$MID"
      echo "volume=$VOL"
    } >> "$GITHUB_OUTPUT"

    echo "Current: machine=$MID volume=$VOL image=$IMG"
    echo "Promote? $PROMOTE"
- name: Create staging volume
  id: vol
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  run: |
    # Outputs:
    #   id   - ID of the newly created volume
    #   name - its name (dbs_stage_<run id>)
    #
    # The default `run` shell is `bash -e` without pipefail, so without this
    # a failing `flyctl volumes create` would be hidden by jq's exit status
    # and the step would "succeed" with an empty volume ID.
    set -euo pipefail

    # Volume name has to be a valid identifier; GITHUB_RUN_ID is numeric.
    NAME="dbs_stage_${GITHUB_RUN_ID}"

    VOL_ID=$(flyctl volumes create "$NAME" \
      --app "$FLY_APP" \
      --size "$STAGING_VOL_GB" \
      --region "$FLY_REGION" \
      --yes \
      --json \
      | jq -r '.id // empty')

    # Fail here rather than letting an empty ID reach `--volume ":/data"`
    # in the machine-creation step.
    if [ -z "$VOL_ID" ]; then
      echo "Could not parse volume ID from flyctl output" >&2
      exit 1
    fi

    {
      echo "id=$VOL_ID"
      echo "name=$NAME"
    } >> "$GITHUB_OUTPUT"
    echo "Created staging volume: $NAME ($VOL_ID)"
- name: Create staging machine
  id: mach
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
    # Step outputs are passed through the environment instead of being
    # interpolated into the script text.
    IMAGE: ${{ steps.cur.outputs.image }}
    VOLUME_ID: ${{ steps.vol.outputs.id }}
  run: |
    # Output:
    #   id - ID of the staging machine
    #
    # No --port flags — the staging machine has no public services until
    # promotion. It is tagged role=staging plus refresh_run=<run id> so it
    # can be identified later.
    # `flyctl machine run` has no --json flag; parse stdout for the
    # "Machine ID: XXX" line.
    #
    # The exit status is captured instead of letting `bash -e` abort on the
    # assignment: otherwise a failing flyctl would end the step before its
    # output was printed.
    RC=0
    OUT=$(flyctl machine run "$IMAGE" \
      --app "$FLY_APP" \
      --region "$FLY_REGION" \
      --volume "$VOLUME_ID:/data" \
      --metadata role=staging \
      --metadata refresh_run="$GITHUB_RUN_ID" \
      --vm-cpu-kind shared \
      --vm-cpus 1 \
      --vm-memory 2048 2>&1) || RC=$?
    echo "$OUT"

    MID=$(echo "$OUT" | grep -oE 'Machine ID: [a-f0-9]+' | head -1 | awk '{print $3}')

    # If the ID was not in the output (format change, or flyctl failed after
    # the machine was created), look the machine up by the metadata we just
    # tagged it with so that it is not leaked.
    if [ -z "$MID" ]; then
      MID=$(flyctl machine list --app "$FLY_APP" --json \
        | jq -r --arg run "$GITHUB_RUN_ID" '
            [.[] | select(.config.metadata.role == "staging"
                          and .config.metadata.refresh_run == $run)][0].id
            // empty' || true)
    fi

    # Record the ID before any failure exit so the teardown step can
    # destroy a half-created machine.
    if [ -n "$MID" ]; then
      echo "id=$MID" >> "$GITHUB_OUTPUT"
    fi

    if [ "$RC" -ne 0 ]; then
      echo "flyctl machine run failed (exit $RC)" >&2
      exit "$RC"
    fi
    if [ -z "$MID" ]; then
      echo "Could not parse machine ID from flyctl output" >&2
      exit 1
    fi

    echo "Created staging machine: $MID"
    # Give the new machine a moment to boot before the next step connects.
    sleep 15
- name: Populate staging /data from R2
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  run: |
    MID="${{ steps.mach.outputs.id }}"

    # Asset list: one <key>.db per top-level key of inspect-data.json, plus
    # inspect-data.json itself.
    ASSETS=$(python -c "import json; print(' '.join(k+'.db' for k in json.load(open('inspect-data.json')).keys()))")
    ASSETS="$ASSETS inspect-data.json"
    echo "Pulling: $ASSETS"

    # This is the first SSH into a machine that was created only seconds
    # ago. A fixed sleep does not guarantee the SSH daemon is accepting
    # connections yet, so probe with a no-op command first (same pattern as
    # the "Wait for SSH" step after the restart). Cap at ~60s; if SSH never
    # comes up, the first real command below fails the step.
    for i in $(seq 1 30); do
      if flyctl ssh console --app "$FLY_APP" --machine "$MID" \
          -C "true" >/dev/null 2>&1; then
        echo " ssh ready on attempt $i"
        break
      fi
      echo " attempt $i: ssh not ready"
      sleep 2
    done

    # Remove any previous copy, upload the pull script over SFTP, then run
    # it with the public base URL and the asset names.
    flyctl ssh console --app "$FLY_APP" --machine "$MID" \
      -C "rm -f /tmp/pull-from-r2-direct.sh"
    echo "put scripts/pull-from-r2-direct.sh /tmp/pull-from-r2-direct.sh" \
      | flyctl ssh sftp shell --app "$FLY_APP" --machine "$MID"
    flyctl ssh console --app "$FLY_APP" --machine "$MID" \
      -C "sh /tmp/pull-from-r2-direct.sh $R2_PUBLIC_BASE/nightly $ASSETS"
- name: Restart staging to mmap fresh data
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  run: |
    # The entrypoint launched datasette when the machine was created, at
    # which point /data was still empty (serve.sh's "no .db files in /data
    # yet" branch). A restart makes it pick up the files just downloaded.
    MID="${{ steps.mach.outputs.id }}"
    flyctl machine restart "$MID" --app "$FLY_APP"

    # The restart command returns before the machine is back up, so poll
    # its state: up to 90 polls with a 2s pause between them (3 minutes).
    state=""
    attempt=1
    while [ "$attempt" -le 90 ]; do
      state=$(flyctl machine list --app "$FLY_APP" --json \
        | jq -r --arg id "$MID" '.[] | select(.id == $id) | .state')
      echo " attempt $attempt: state=$state"
      if [ "$state" = "started" ]; then
        break
      fi
      sleep 2
      attempt=$((attempt + 1))
    done

    # Falling out of the loop without reaching "started" is a failure.
    [ "$state" = "started" ] || {
      echo "Machine never reached started state" >&2
      exit 1
    }
- name: Wait for SSH (hallpass) on staging
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  run: |
    # Fly reporting `state=started` only says VM init is up; hallpass (the
    # SSH daemon on :22) may not have bound yet. Without this probe the
    # next step can race it and get "connection refused" — observed on run
    # 26282389917, where the state poll succeeded on attempt 1 and left no
    # dwell time. Probe with a no-op command, up to 30 tries 2s apart
    # (~60s).
    MID="${{ steps.mach.outputs.id }}"
    n=0
    until [ "$n" -ge 30 ]; do
      n=$((n + 1))
      if flyctl ssh console --app "$FLY_APP" --machine "$MID" \
          -C "true" >/dev/null 2>&1; then
        echo " ssh ready on attempt $n"
        break
      fi
      echo " attempt $n: ssh not ready"
      sleep 2
    done

    # Final un-silenced probe: if SSH is still down this fails the step
    # and its error output lands in the log.
    flyctl ssh console --app "$FLY_APP" --machine "$MID" -C "true"
- name: Smoke test datasette on staging
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  run: |
    MID="${{ steps.mach.outputs.id }}"

    # Clear any previous copy of the script on the machine.
    flyctl ssh console --app "$FLY_APP" --machine "$MID" \
      -C "rm -f /tmp/smoke-test.sh"

    # Upload the repository's smoke test over SFTP.
    echo "put scripts/smoke-test.sh /tmp/smoke-test.sh" \
      | flyctl ssh sftp shell --app "$FLY_APP" --machine "$MID"

    # Run it on the machine; a non-zero exit fails this step.
    flyctl ssh console --app "$FLY_APP" --machine "$MID" \
      -C "sh /tmp/smoke-test.sh"
# ─── Promote path (scheduled runs; manual runs unless promote=false) ───
- name: Promote staging to current
  if: env.PROMOTE == 'true'
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  run: |
    set -eu
    STAGING_ID="${{ steps.mach.outputs.id }}"
    PREVIOUS_ID="${{ steps.cur.outputs.machine }}"

    # Step 1 — expose the staging machine. Adding services lets Fly's
    # proxy route traffic to it. The ports and handlers mirror the
    # [[services]] block in fly.toml; force_https is not expressed here
    # because Cloudflare's always_use_https setting covers it.
    # `flyctl machine update` stops the machine, applies the change and
    # starts it again, so it comes back with services and with the data
    # that the earlier restart already loaded.
    #
    # Known gap relative to fly.toml: --port carries neither http_checks
    # nor the concurrency block. The smoke test is the only gate before
    # promotion, and the new machine has no ongoing health checks until
    # this is moved to the Machines API with a full services definition.
    flyctl machine update "$STAGING_ID" --app "$FLY_APP" --yes \
      --port "80:8080/tcp:http" \
      --port "443:8080/tcp:tls:http"

    # Step 2 — cordon the previous machine so it accepts no new
    # connections; connections already open drain on their own.
    flyctl machine cordon "$PREVIOUS_ID" --app "$FLY_APP"

    # Step 3 — drain window. Fly's default request timeout and HTTP
    # keep-alive bound are about 30s; wait somewhat longer.
    sleep 45

    # Step 4 — retag the new machine role=current so that future runs
    # discover it as the production machine.
    flyctl machine update "$STAGING_ID" --app "$FLY_APP" --yes \
      --metadata role=current
- name: Destroy old current
  if: env.PROMOTE == 'true'
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  run: |
    set -eu
    # Machine and volume that were serving production when this run began,
    # as recorded by the discover step.
    OLD="${{ steps.cur.outputs.machine }}"
    OLDVOL="${{ steps.cur.outputs.volume }}"

    flyctl machine destroy "$OLD" --app "$FLY_APP" --force

    # The old machine may have had no mounted volume; in that case the
    # discover step yields an empty value or the literal "null".
    if [ -n "$OLDVOL" ] && [ "$OLDVOL" != "null" ]; then
      # Pass --app explicitly like every other flyctl call in this
      # workflow, instead of depending on a fly.toml being present in the
      # working directory.
      flyctl volumes destroy "$OLDVOL" --app "$FLY_APP" -y
    fi
# ─── Teardown of staging if not promoted (dry-run + failure path) ─
- name: Tear down staging
  # Dry-run mode (promote=false): always tear down. Promote mode: runs
  # whenever any earlier step failed, but the script below refuses to
  # destroy the staging machine once it has become the new production
  # machine.
  if: always() && (env.PROMOTE != 'true' || failure())
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
    STAGING_MID: ${{ steps.mach.outputs.id }}
    STAGING_VOL: ${{ steps.vol.outputs.id }}
    OLD_MID: ${{ steps.cur.outputs.machine }}
  run: |
    if [ "$PROMOTE" = "true" ] && [ -n "$STAGING_MID" ]; then
      # failure() is also true when promotion succeeded and only the
      # later "Destroy old current" step failed. By then the staging
      # machine carries role=current and is serving production, so
      # destroying it would take the site down. Check its live metadata
      # before touching anything.
      if JSON=$(flyctl machine list --app "$FLY_APP" --json); then
        ROLE=$(echo "$JSON" | jq -r --arg id "$STAGING_MID" \
          '[.[] | select(.id == $id)][0].config.metadata.role // ""') \
          || ROLE="unknown"
      else
        ROLE="unknown"
      fi

      case "$ROLE" in
        current)
          echo "Machine $STAGING_MID is already role=current; keeping it and its volume." >&2
          exit 0
          ;;
        unknown)
          # Leaking a staging machine is recoverable; destroying
          # production is not.
          echo "Could not determine the role of $STAGING_MID; not destroying it. Clean up manually if it is a stale staging machine." >&2
          exit 0
          ;;
      esac

      # Promotion did not complete. The promote step may already have
      # cordoned the old machine before failing; undo that so production
      # keeps serving once staging is gone. Best effort — expected to be
      # a no-op if the machine was never cordoned.
      if [ -n "$OLD_MID" ]; then
        flyctl machine uncordon "$OLD_MID" --app "$FLY_APP" || true
      fi
    fi

    # Best-effort cleanup: a failure to destroy one resource must not
    # prevent the attempt on the other.
    if [ -n "$STAGING_MID" ]; then
      flyctl machine destroy "$STAGING_MID" --app "$FLY_APP" --force || true
    fi
    if [ -n "$STAGING_VOL" ]; then
      flyctl volumes destroy "$STAGING_VOL" --app "$FLY_APP" -y || true
    fi