# Refresh data #9
# NOTE: GitHub's web viewer flagged this file as containing hidden or
# bidirectional Unicode text that may be interpreted or compiled differently
# than it appears. Review it in an editor that reveals hidden Unicode
# characters.
# Data refresh, blue/green style.
#
# Each run:
#   1. Builds the .db files and pushes them to R2
#   2. Creates a fresh empty 15 GB volume in iad
#   3. Creates a new machine on that volume, running the *current*
#      machine's image, with role=staging metadata and no public ports
#   4. SFTPs `pull-from-r2-direct.sh` into /tmp and runs it (populates
#      /data directly, no /data/incoming dance)
#   5. Restarts the machine so datasette mmaps the fresh files
#   6. Smoke-tests datasette on its private IP via flyctl ssh
#   7. Promotes: adds services to staging, cordons old, swaps role=current
#      metadata, then destroys the old machine + volume
#
# Failures before promotion tear down the staging resources without
# touching production. The drain window between "staging gets services"
# and "old gets destroyed" is ~45s — during it both machines may serve
# traffic, mixing yesterday's data with today's. This is an acceptable
# tradeoff for our read-only batch workload.
#
# Scheduled runs always promote. Manual dispatch defaults to promote
# but can be flipped to a dry-run (build + populate + smoke + teardown)
# by setting promote=false.
# Workflow title.
name: Refresh data

# Triggers: a daily cron run, plus manual dispatch with a single boolean
# input that decides whether the run promotes staging or is a dry-run.
on:
  schedule:
    - cron: '0 7 * * *'
  workflow_dispatch:
    inputs:
      promote:
        description: 'Promote staging to current after smoke (destroys current machine + volume)'
        type: boolean
        default: true
# Every run joins the same concurrency group, and a newly queued run does
# not cancel the one already in progress.
concurrency:
  cancel-in-progress: false
  group: warehouse-deploy
# This is a public repo, and GITHUB_TOKEN defaults to permissive scopes on
# push/schedule events. Restrict it so that a compromised step cannot write
# to issues, PRs, or repository contents. actions/checkout only needs
# `contents: read`.
permissions:
  contents: read
# Workflow-wide settings shared by every step.
env:
  FLY_APP: warehouse
  FLY_REGION: iad
  R2_BUCKET: labordata-warehouse-staging
  R2_PUBLIC_BASE: ${{ secrets.R2_PUBLIC_BASE }}
  # Size (GB) passed to `flyctl volumes create --size`.
  STAGING_VOL_GB: '15'
  # 'true' for scheduled runs and for manual dispatch with promote left
  # at its default; 'false' when a manual dispatch sets promote=false
  # (dry-run). The folded scalar below collapses to a single-line
  # expression.
  PROMOTE: >-
    ${{
    (github.event_name == 'schedule'
    || (github.event_name == 'workflow_dispatch' && inputs.promote == true))
    && 'true' || 'false'
    }}
# Single job: build the databases, stage them on a fresh Fly machine +
# volume, smoke-test, then either promote staging to production or tear
# it down again.
jobs:
  refresh:
    runs-on: ubuntu-latest
    # Cap total wall time. A hung flyctl ssh or R2 stall otherwise burns
    # the 6-hour GH default.
    timeout-minutes: 60
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Free disk space
        run: |
          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc \
            /opt/hostedtoolcache/CodeQL || true
          df -h

      - name: Install build dependencies
        run: |
          pip install -r requirements.txt
          pip install labor-union-parser

      - name: Build all databases
        run: make

      - name: Build inspect-data.json
        run: datasette inspect *.db > inspect-data.json

      - name: Upload databases to R2
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          AWS_ENDPOINT_URL_S3: ${{ secrets.R2_ENDPOINT }}
          AWS_DEFAULT_REGION: auto
        run: |
          aws s3 sync . "s3://$R2_BUCKET/nightly/" \
            --exclude "*" --include "*.db" --include "inspect-data.json" \
            --no-progress

      - name: Install flyctl
        # Pinned to v1.6 commit SHA. `@master` would let an upstream
        # compromise run with our FLY_API_TOKEN. Bump intentionally.
        uses: superfly/flyctl-actions/setup-flyctl@ed8efb33836e8b2096c7fd3ba1c8afe303ebbff1 # v1.6

      - name: Discover current machine + volume
        id: cur
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
        run: |
          # Prefer a machine tagged role=current; fall back to the first
          # machine that is NOT tagged role=staging, so a staging machine
          # left behind by an earlier failed run is never mistaken for
          # production. Captures image (for the staging machine), machine
          # ID (to cordon + destroy on promote), and the volume ID (to
          # destroy on promote).
          JSON=$(flyctl machine list --app "$FLY_APP" --json)
          ROW=$(echo "$JSON" | jq '
            [.[] | select(.config.metadata.role == "current")][0]
            // [.[] | select(.config.metadata.role != "staging")][0]
          ')
          IMG=$(echo "$ROW" | jq -r '.config.image')
          MID=$(echo "$ROW" | jq -r '.id')
          VOL=$(echo "$ROW" | jq -r '.config.mounts[0].volume')
          if [ -z "$IMG" ] || [ "$IMG" = "null" ]; then
            echo "Could not resolve current machine" >&2
            # Public-repo run logs are world-readable. Show only the
            # bare minimum (ids + role) — image hashes and metadata
            # are unnecessary recon for an attacker, more useful to
            # us only when actually debugging via SSH.
            echo "$JSON" | jq '[.[]|{id,role:.config.metadata.role}]' >&2
            exit 1
          fi
          echo "image=$IMG" >> "$GITHUB_OUTPUT"
          echo "machine=$MID" >> "$GITHUB_OUTPUT"
          echo "volume=$VOL" >> "$GITHUB_OUTPUT"
          echo "Current: machine=$MID volume=$VOL image=$IMG"
          echo "Promote? $PROMOTE"

      - name: Create staging volume
        id: vol
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
        run: |
          # Volume name has to be a valid identifier; GITHUB_RUN_ID is numeric.
          NAME="dbs_stage_${GITHUB_RUN_ID}"
          VOL_ID=$(flyctl volumes create "$NAME" \
            --app "$FLY_APP" \
            --size "$STAGING_VOL_GB" \
            --region "$FLY_REGION" \
            --yes \
            --json \
            | jq -r '.id')
          # The pipeline's exit status is jq's, so a failed
          # `flyctl volumes create` would otherwise go unnoticed here and
          # only surface later as a confusing `--volume ":/data"` error.
          if [ -z "$VOL_ID" ] || [ "$VOL_ID" = "null" ]; then
            echo "Could not create staging volume" >&2
            exit 1
          fi
          echo "id=$VOL_ID" >> "$GITHUB_OUTPUT"
          echo "name=$NAME" >> "$GITHUB_OUTPUT"
          echo "Created staging volume: $NAME ($VOL_ID)"

      - name: Create staging machine
        id: mach
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
          # Step outputs are handed over as env vars rather than being
          # interpolated into the script text.
          IMAGE: ${{ steps.cur.outputs.image }}
          STAGING_VOLUME: ${{ steps.vol.outputs.id }}
        run: |
          # No --port flags — staging machine has no public services until
          # promotion. role=staging metadata lets the discover step of
          # future runs skip it; refresh_run lets teardown find it even if
          # the ID cannot be parsed below.
          # `flyctl machine run` has no --json flag; parse stdout for the
          # "Machine ID: XXX" line.
          # On failure, print what flyctl said before exiting; otherwise
          # the captured output would be lost.
          OUT=$(flyctl machine run "$IMAGE" \
            --app "$FLY_APP" \
            --region "$FLY_REGION" \
            --volume "${STAGING_VOLUME}:/data" \
            --metadata role=staging \
            --metadata refresh_run="$GITHUB_RUN_ID" \
            --vm-cpu-kind shared \
            --vm-cpus 1 \
            --vm-memory 2048 2>&1) || { echo "$OUT"; exit 1; }
          echo "$OUT"
          MID=$(echo "$OUT" | grep -oE 'Machine ID: [a-f0-9]+' | head -1 | awk '{print $3}')
          if [ -z "$MID" ]; then
            echo "Could not parse machine ID from flyctl output" >&2
            exit 1
          fi
          echo "id=$MID" >> "$GITHUB_OUTPUT"
          echo "Created staging machine: $MID"
          sleep 15

      - name: Populate staging /data from R2
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
          STAGING_MACHINE: ${{ steps.mach.outputs.id }}
        run: |
          # Asset list = one <key>.db per key in inspect-data.json, plus
          # inspect-data.json itself.
          ASSETS=$(python -c "import json; print(' '.join(k+'.db' for k in json.load(open('inspect-data.json')).keys()))")
          ASSETS="$ASSETS inspect-data.json"
          echo "Pulling: $ASSETS"
          flyctl ssh console --app "$FLY_APP" --machine "$STAGING_MACHINE" \
            -C "rm -f /tmp/pull-from-r2-direct.sh"
          echo "put scripts/pull-from-r2-direct.sh /tmp/pull-from-r2-direct.sh" \
            | flyctl ssh sftp shell --app "$FLY_APP" --machine "$STAGING_MACHINE"
          flyctl ssh console --app "$FLY_APP" --machine "$STAGING_MACHINE" \
            -C "sh /tmp/pull-from-r2-direct.sh $R2_PUBLIC_BASE/nightly $ASSETS"

      - name: Restart staging to mmap fresh data
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
          STAGING_MACHINE: ${{ steps.mach.outputs.id }}
        run: |
          # Datasette was started by the entrypoint at machine creation
          # against an empty /data — the "no .db files in /data yet"
          # branch of serve.sh. Restart so it picks up what we just
          # downloaded.
          MID="$STAGING_MACHINE"
          flyctl machine restart "$MID" --app "$FLY_APP"
          # `flyctl machine restart` returns before the machine has
          # finished restarting; poll until state=started before letting
          # later steps try to ssh in. Cap at 3 minutes.
          for i in $(seq 1 90); do
            state=$(flyctl machine list --app "$FLY_APP" --json \
              | jq -r --arg id "$MID" '.[] | select(.id == $id) | .state')
            echo "  attempt $i: state=$state"
            if [ "$state" = "started" ]; then break; fi
            sleep 2
          done
          if [ "$state" != "started" ]; then
            echo "Machine never reached started state" >&2
            exit 1
          fi

      - name: Wait for SSH (hallpass) on staging
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
          STAGING_MACHINE: ${{ steps.mach.outputs.id }}
        run: |
          # `state=started` from Fly means the VM init is up — it does not
          # guarantee that hallpass (the SSH daemon on :22) has finished
          # binding. Probe with a no-op `flyctl ssh console -C true` until
          # it succeeds; otherwise the very next step races hallpass and
          # gets "connection refused" (seen on run 26282389917 where
          # `state=started` was true on poll attempt 1, leaving no dwell
          # time). Cap at ~60s.
          MID="$STAGING_MACHINE"
          for i in $(seq 1 30); do
            if flyctl ssh console --app "$FLY_APP" --machine "$MID" \
              -C "true" >/dev/null 2>&1; then
              echo "  ssh ready on attempt $i"
              break
            fi
            echo "  attempt $i: ssh not ready"
            sleep 2
          done
          # Final un-silenced probe: fails the step (with flyctl's own
          # error output) if SSH never came up.
          flyctl ssh console --app "$FLY_APP" --machine "$MID" -C "true"

      - name: Smoke test datasette on staging
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
          STAGING_MACHINE: ${{ steps.mach.outputs.id }}
        run: |
          flyctl ssh console --app "$FLY_APP" --machine "$STAGING_MACHINE" \
            -C "rm -f /tmp/smoke-test.sh"
          echo "put scripts/smoke-test.sh /tmp/smoke-test.sh" \
            | flyctl ssh sftp shell --app "$FLY_APP" --machine "$STAGING_MACHINE"
          flyctl ssh console --app "$FLY_APP" --machine "$STAGING_MACHINE" \
            -C "sh /tmp/smoke-test.sh"

      # ─── Promote path (scheduled runs, and manual runs unless promote=false) ───
      - name: Promote staging to current
        # The id lets the teardown step tell "promotion completed" apart
        # from "promotion never happened / failed part-way".
        id: promote
        if: env.PROMOTE == 'true'
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
          STAGING_MACHINE: ${{ steps.mach.outputs.id }}
          OLD_MACHINE: ${{ steps.cur.outputs.machine }}
        run: |
          set -eu
          NEW="$STAGING_MACHINE"
          OLD="$OLD_MACHINE"
          # 1. Put services on the staging machine so Fly's proxy routes
          #    traffic to it. Ports + handlers match fly.toml's
          #    [[services]] block; force_https is handled by Cloudflare's
          #    always_use_https setting so we don't need to express it
          #    here. flyctl machine update auto-stops, applies, restarts;
          #    the new container then comes up with services + warm data
          #    (mmap was done on the previous restart we already did).
          #    Known gap vs. fly.toml: --port doesn't carry over the
          #    http_checks or concurrency block. We rely on the smoke
          #    test for the one-time pre-promotion gate; ongoing health
          #    checking on the new machine is missing until we switch to
          #    the Machines API for the full services blob.
          flyctl machine update "$NEW" --app "$FLY_APP" --yes \
            --port "80:8080/tcp:http" \
            --port "443:8080/tcp:tls:http"
          # 2. Cordon old machine: stops accepting new connections.
          #    Existing connections drain naturally.
          flyctl machine cordon "$OLD" --app "$FLY_APP"
          # 3. Drain window. Fly's default request timeout + HTTP
          #    keep-alive bound is ~30s; give it a bit more.
          sleep 45
          # 4. Promote metadata: future runs find the new machine as
          #    role=current.
          #    NOTE(review): if this update also stop/restarts the machine
          #    (as described for step 1), the only un-cordoned machine
          #    blips here — confirm, and consider folding the metadata
          #    into the step-1 update.
          flyctl machine update "$NEW" --app "$FLY_APP" --yes \
            --metadata role=current

      - name: Destroy old current
        # No status function in the condition, so this only runs when
        # every earlier step (including the promotion) succeeded.
        if: env.PROMOTE == 'true'
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
          OLD_MACHINE: ${{ steps.cur.outputs.machine }}
          OLD_VOLUME: ${{ steps.cur.outputs.volume }}
        run: |
          flyctl machine destroy "$OLD_MACHINE" --app "$FLY_APP" --force
          # The discover step records the literal string "null" when the
          # old machine had no mounted volume.
          if [ -n "$OLD_VOLUME" ] && [ "$OLD_VOLUME" != "null" ]; then
            flyctl volumes destroy "$OLD_VOLUME" --app "$FLY_APP" -y
          fi

      # ─── Teardown of staging if not promoted (dry-run + failure path) ─
      - name: Tear down staging
        # Dry-run mode (promote=false): always tear down. Promote mode:
        # tear down only while the promote step has NOT succeeded. Once it
        # has, staging is the new prod and must be kept even if a later
        # step (destroying the old machine/volume) fails. Keying on the
        # promote step's outcome instead of failure() also covers
        # cancelled runs.
        if: always() && (env.PROMOTE != 'true' || steps.promote.outcome != 'success')
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
          STAGING_MACHINE: ${{ steps.mach.outputs.id }}
          STAGING_VOLUME: ${{ steps.vol.outputs.id }}
          OLD_MACHINE: ${{ steps.cur.outputs.machine }}
          PROMOTE_OUTCOME: ${{ steps.promote.outcome }}
        run: |
          # Promotion started but did not finish: the old machine may
          # already be cordoned. Put it back in rotation (best effort)
          # before removing staging, so production keeps serving.
          case "${PROMOTE_OUTCOME:-}" in
            failure|cancelled)
              if [ -n "${OLD_MACHINE:-}" ]; then
                flyctl machine uncordon "$OLD_MACHINE" --app "$FLY_APP" || true
              fi
              ;;
          esac
          MID="${STAGING_MACHINE:-}"
          # The machine step can fail after the machine was created (e.g.
          # its ID could not be parsed). If a staging volume exists but no
          # machine ID was recorded, look the machine up by this run's
          # refresh_run metadata so it is not leaked.
          if [ -z "$MID" ] && [ -n "${STAGING_VOLUME:-}" ]; then
            MID=$(flyctl machine list --app "$FLY_APP" --json \
              | jq -r --arg run "$GITHUB_RUN_ID" \
                  '[.[] | select(.config.metadata.refresh_run == $run)][0].id // empty') || true
          fi
          # Cleanup is best effort: `|| true` keeps one failed destroy
          # from skipping the other.
          if [ -n "$MID" ]; then
            flyctl machine destroy "$MID" --app "$FLY_APP" --force || true
          fi
          if [ -n "${STAGING_VOLUME:-}" ]; then
            flyctl volumes destroy "$STAGING_VOLUME" --app "$FLY_APP" -y || true
          fi