# .github/workflows/refresh-data.yml — labordata/warehouse (main)
# Data refresh, blue/green style.
#
# Each run:
#   1. Builds the .db files and pushes them to R2
#   2. Creates a fresh empty 15 GB volume in iad
#   3. Creates a new machine on it, attached to the *current* machine's
#      image, with role=staging metadata and no public ports
#   4. SFTPs `pull-from-r2-direct.sh` into /tmp and runs it (populates
#      /data directly, no /data/incoming dance)
#   5. Restarts the machine so datasette mmaps the fresh files
#   6. Smoke-tests datasette on its private IP via flyctl ssh
#   7. Promotes: adds services to staging, cordons old, swaps role=current
#      metadata, then destroys the old machine + volume
#
# Failures before promotion tear down the staging resources without
# touching production. The drain window between "staging gets services"
# and "old gets destroyed" is ~45s — during it both machines may serve
# traffic, mixing yesterday's data with today's. Acceptable tradeoff for
# our read-only batch workload.
#
# Scheduled runs always promote. Manual dispatch defaults to promote
# but can be flipped to a dry-run (build + populate + smoke + teardown)
# by setting promote=false.

# Workflow display name.
name: Refresh data

# Triggers: a daily schedule plus manual dispatch with a `promote` toggle.
on:
  schedule:
    # Minute 0, hour 7, every day (GitHub evaluates schedule crons in UTC).
    - cron: "0 7 * * *"
  workflow_dispatch:
    inputs:
      # Read by the PROMOTE env expression; false turns the run into a
      # dry-run that tears staging down instead of promoting it.
      promote:
        description: "Promote staging to current after smoke (destroys current machine + volume)"
        type: boolean
        default: true

# All runs share one concurrency group, and an in-progress run is never
# cancelled by a newer one, so two refreshes cannot interleave their
# staging / promotion steps.
concurrency:
  group: warehouse-deploy
  cancel-in-progress: false

# Public repo. GITHUB_TOKEN defaults to permissive on push/schedule
# events; lock it down so a compromised step can't write to
# issues/PRs/contents. Only `contents: read` is needed for
# actions/checkout.
permissions:
  contents: read

env:
  # Fly.io app and region passed to the flyctl steps.
  FLY_APP: "warehouse"
  FLY_REGION: "iad"
  # Bucket the build is synced to, and the base URL handed to the pull
  # script that runs on the staging machine.
  R2_BUCKET: "labordata-warehouse-staging"
  R2_PUBLIC_BASE: ${{ secrets.R2_PUBLIC_BASE }}
  # Size in GB of the fresh staging volume.
  STAGING_VOL_GB: "15"
  # 'true' for scheduled runs and for manual dispatch with promote=true;
  # 'false' (dry-run) for manual dispatch with promote=false.
  PROMOTE: ${{ (github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.promote == true)) && 'true' || 'false' }}

jobs:
  # Single job: build, upload, stage, smoke-test and promote / tear down
  # all run as sequential steps on one runner.
  refresh:
    runs-on: ubuntu-latest
    # Cap total wall time. A hung flyctl ssh or R2 stall otherwise burns
    # the 6-hour GH default.
    timeout-minutes: 60
    steps:
      # Check out the repository; later steps read requirements.txt, run
      # `make`, and upload files from scripts/.
      - uses: actions/checkout@v4

      # Python 3.12 for the pip-installed build tooling and the inline
      # `python -c` helper in the populate step.
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Free disk space
        run: |
          # Remove large preinstalled toolchains this job never uses. A
          # failed removal is deliberately non-fatal.
          sudo rm -rf \
            /usr/share/dotnet \
            /usr/local/lib/android \
            /opt/ghc \
            /opt/hostedtoolcache/CodeQL \
            || true
          # Record the remaining space in the run log.
          df -h

      - name: Install build dependencies
        run: |
          # Project requirements first; labor-union-parser is then installed
          # by a separate, unpinned pip call rather than via requirements.txt.
          pip install -r requirements.txt
          pip install labor-union-parser

      # Default make target. The following steps glob `*.db` in the working
      # directory, so the build is expected to leave its databases there.
      - name: Build all databases
        run: make

      # Run `datasette inspect` over every built database and store the
      # result as inspect-data.json; the populate step later derives the
      # list of .db files to pull from this file's keys.
      - name: Build inspect-data.json
        run: datasette inspect *.db > inspect-data.json

      - name: Upload databases to R2
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          AWS_ENDPOINT_URL_S3: ${{ secrets.R2_ENDPOINT }}
          AWS_DEFAULT_REGION: auto
        run: |
          # Sync only the built databases and inspect-data.json: exclude
          # everything, then re-include those two patterns.
          DEST="s3://$R2_BUCKET/"
          aws s3 sync . "$DEST" \
            --exclude "*" \
            --include "*.db" \
            --include "inspect-data.json" \
            --no-progress

      # Provides the `flyctl` CLI that every remaining step in this job calls.
      - name: Install flyctl
        # Pinned to v1.6 commit SHA. `@master` would let an upstream
        # compromise run with our FLY_API_TOKEN. Bump intentionally.
        uses: superfly/flyctl-actions/setup-flyctl@ed8efb33836e8b2096c7fd3ba1c8afe303ebbff1  # v1.6

      - name: Discover current machine + volume
        id: cur
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
        run: |
          set -euo pipefail
          # Prefer a machine tagged role=current. Otherwise fall back to the
          # first machine that is NOT tagged role=staging, so a staging
          # machine leaked by an earlier failed run is never mistaken for
          # production (and later cordoned + destroyed as "old").
          # Captures image (for the staging machine), machine ID (to
          # cordon + destroy on promote), and the volume ID (to destroy on
          # promote).
          JSON=$(flyctl machine list --app "$FLY_APP" --json)
          ROW=$(echo "$JSON" | jq '
            [.[] | select(.config.metadata.role == "current")][0]
            // [.[] | select(.config.metadata.role != "staging")][0]
          ')
          IMG=$(echo "$ROW" | jq -r '.config.image')
          MID=$(echo "$ROW" | jq -r '.id')
          # "null" when the machine has no mounts; the destroy step skips
          # volume removal in that case.
          VOL=$(echo "$ROW" | jq -r '.config.mounts[0].volume')
          # Both the image and the machine ID are required downstream, so
          # refuse to continue if either is missing.
          if [ -z "$IMG" ] || [ "$IMG" = "null" ] \
             || [ -z "$MID" ] || [ "$MID" = "null" ]; then
            echo "Could not resolve current machine" >&2
            # Public-repo run logs are world-readable. Show only the
            # bare minimum (ids + role) — image hashes and metadata
            # are unnecessary recon for an attacker, more useful to
            # us only when actually debugging via SSH.
            echo "$JSON" | jq '[.[]|{id,role:.config.metadata.role}]' >&2
            exit 1
          fi
          echo "image=$IMG" >> "$GITHUB_OUTPUT"
          echo "machine=$MID" >> "$GITHUB_OUTPUT"
          echo "volume=$VOL" >> "$GITHUB_OUTPUT"
          echo "Current:  machine=$MID  volume=$VOL  image=$IMG"
          echo "Promote?  $PROMOTE"

      - name: Create staging volume
        id: vol
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
        run: |
          # The default Actions shell is `bash -e` without pipefail, so a
          # failing `flyctl volumes create` would otherwise be masked by
          # jq's exit status and leave an empty volume id in the outputs.
          set -euo pipefail
          # Volume name has to be a valid identifier; GITHUB_RUN_ID is numeric.
          NAME="dbs_stage_${GITHUB_RUN_ID}"
          VOL_ID=$(flyctl volumes create "$NAME" \
            --app "$FLY_APP" \
            --size "$STAGING_VOL_GB" \
            --region "$FLY_REGION" \
            --yes \
            --json \
            | jq -r '.id')
          # Guard against a response that parsed but carried no id.
          if [ -z "$VOL_ID" ] || [ "$VOL_ID" = "null" ]; then
            echo "Could not parse volume ID from flyctl output" >&2
            exit 1
          fi
          echo "id=$VOL_ID" >> "$GITHUB_OUTPUT"
          echo "name=$NAME" >> "$GITHUB_OUTPUT"
          echo "Created staging volume: $NAME ($VOL_ID)"

      - name: Create staging machine
        id: mach
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
        run: |
          # No --port flags — staging machine has no public services until
          # promotion. role=staging metadata so failure-recovery and the
          # discover step in future runs can find/ignore it.
          # `flyctl machine run` has no --json flag; parse stdout for the
          # "Machine ID: XXX" line.
          #
          # stdout+stderr are captured for parsing, so on failure nothing
          # would reach the log if `bash -e` aborted at the assignment.
          # Wrap it in `if !` and print the captured output ourselves.
          if ! OUT=$(flyctl machine run "${{ steps.cur.outputs.image }}" \
            --app "$FLY_APP" \
            --region "$FLY_REGION" \
            --volume "${{ steps.vol.outputs.id }}:/data" \
            --metadata role=staging \
            --metadata refresh_run="$GITHUB_RUN_ID" \
            --vm-cpu-kind shared \
            --vm-cpus 1 \
            --vm-memory 2048 2>&1); then
            echo "$OUT" >&2
            echo "flyctl machine run failed" >&2
            exit 1
          fi
          echo "$OUT"
          # No pipefail here on purpose: a non-matching grep must fall
          # through to the explicit empty-ID check below.
          MID=$(echo "$OUT" | grep -oE 'Machine ID: [a-f0-9]+' | head -1 | awk '{print $3}')
          if [ -z "$MID" ]; then
            echo "Could not parse machine ID from flyctl output" >&2
            exit 1
          fi
          echo "id=$MID" >> "$GITHUB_OUTPUT"
          echo "Created staging machine: $MID"
          # Fixed dwell before the next step opens its first SSH session.
          sleep 15

      - name: Populate staging /data from R2
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
        run: |
          MID="${{ steps.mach.outputs.id }}"
          # One "<key>.db" per top-level key of inspect-data.json, plus
          # inspect-data.json itself.
          FILES=$(python -c "import json; print(' '.join(k+'.db' for k in json.load(open('inspect-data.json')).keys()))")
          FILES="$FILES inspect-data.json"
          echo "Pulling: $FILES"
          # Remove any previous copy of the script, upload a fresh one over
          # SFTP, then run it on the machine with the base URL and file list.
          flyctl ssh console --app "$FLY_APP" --machine "$MID" \
            -C "rm -f /tmp/pull-from-r2-direct.sh"
          echo "put scripts/pull-from-r2-direct.sh /tmp/pull-from-r2-direct.sh" \
            | flyctl ssh sftp shell --app "$FLY_APP" --machine "$MID"
          flyctl ssh console --app "$FLY_APP" --machine "$MID" \
            -C "sh /tmp/pull-from-r2-direct.sh $R2_PUBLIC_BASE $FILES"

      - name: Restart staging to mmap fresh data
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
        run: |
          # The entrypoint launched datasette at machine creation, when
          # /data was still empty (serve.sh's "no .db files in /data yet"
          # branch). A restart makes it open the files just downloaded.
          MACHINE="${{ steps.mach.outputs.id }}"
          flyctl machine restart "$MACHINE" --app "$FLY_APP"
          # The restart command returns early, so poll the machine state
          # every 2s, up to 90 times (about 3 minutes), before later steps
          # attempt to ssh in.
          attempt=1
          while [ "$attempt" -le 90 ]; do
            state=$(flyctl machine list --app "$FLY_APP" --json \
              | jq -r --arg id "$MACHINE" '.[] | select(.id == $id) | .state')
            echo "  attempt $attempt: state=$state"
            if [ "$state" = "started" ]; then
              break
            fi
            sleep 2
            attempt=$((attempt + 1))
          done
          # Falling out of the loop without a "started" state is a failure.
          if [ "$state" != "started" ]; then
            echo "Machine never reached started state" >&2
            exit 1
          fi

      - name: Wait for SSH (hallpass) on staging
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
        run: |
          # `state=started` from Fly means the VM init is up — it does not
          # guarantee that hallpass (the SSH daemon on :22) has finished
          # binding. Probe with a no-op `flyctl ssh console -C true` until
          # it succeeds; otherwise the very next step races hallpass and
          # gets "connection refused" (seen on run 26282389917 where
          # `state=started` was true on poll attempt 1, leaving no dwell
          # time). Cap at ~60s.
          MID="${{ steps.mach.outputs.id }}"
          # Up to 30 silent probes, 2s apart. Output is discarded so failed
          # attempts do not clutter the log.
          for i in $(seq 1 30); do
            if flyctl ssh console --app "$FLY_APP" --machine "$MID" \
                 -C "true" >/dev/null 2>&1; then
              echo "  ssh ready on attempt $i"
              break
            fi
            echo "  attempt $i: ssh not ready"
            sleep 2
          done
          # Final un-silenced probe: if every attempt above failed, this one
          # fails the step and leaves the real error message in the log.
          flyctl ssh console --app "$FLY_APP" --machine "$MID" -C "true"

      - name: Smoke test datasette on staging
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
        run: |
          MID="${{ steps.mach.outputs.id }}"
          # Remove any previous copy, upload the smoke-test script over
          # SFTP, then run it on the staging machine. A non-zero exit from
          # the script fails this step.
          flyctl ssh console --app "$FLY_APP" --machine "$MID" \
            -C "rm -f /tmp/smoke-test.sh"
          echo "put scripts/smoke-test.sh /tmp/smoke-test.sh" \
            | flyctl ssh sftp shell --app "$FLY_APP" --machine "$MID"
          flyctl ssh console --app "$FLY_APP" --machine "$MID" \
            -C "sh /tmp/smoke-test.sh"

      # ─── Promote path (scheduled runs, or dispatch with promote=true) ───

      - name: Promote staging to current
        if: env.PROMOTE == 'true'
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
        run: |
          # Cut traffic over from the old machine (OLD) to the freshly
          # populated staging machine (NEW). The role=current tag is written
          # last, so it only exists once every earlier sub-step succeeded.
          #
          # NOTE(review): if this step fails after the cordon in (2), the
          # "Tear down staging" step runs (its condition includes failure())
          # and destroys NEW while OLD is still cordoned — confirm whether
          # an uncordon of OLD is needed on that path.
          set -eu
          NEW="${{ steps.mach.outputs.id }}"
          OLD="${{ steps.cur.outputs.machine }}"

          # 1. Put services on the staging machine so Fly's proxy routes
          #    traffic to it. Ports + handlers match fly.toml's
          #    [[services]] block; force_https is handled by Cloudflare's
          #    always_use_https setting so we don't need to express it
          #    here. flyctl machine update auto-stops, applies, restarts;
          #    the new container then comes up with services + warm data
          #    (mmap was done on the previous restart we already did).
          #    Known gap vs. fly.toml: --port doesn't carry over the
          #    http_checks or concurrency block. We rely on the smoke
          #    test for the one-time pre-promotion gate; ongoing health
          #    checking on the new machine is missing until we switch to
          #    the Machines API for the full services blob.
          flyctl machine update "$NEW" --app "$FLY_APP" --yes \
            --port "80:8080/tcp:http" \
            --port "443:8080/tcp:tls:http"

          # 2. Cordon old machine: stops accepting new connections.
          #    Existing connections drain naturally.
          flyctl machine cordon "$OLD" --app "$FLY_APP"

          # 3. Drain window. Fly's default request timeout + HTTP
          #    keep-alive bound is ~30s; give it a bit more.
          sleep 45

          # 4. Promote metadata: future runs find the new machine as
          #    role=current.
          flyctl machine update "$NEW" --app "$FLY_APP" --yes \
            --metadata role=current

      - name: Destroy old current
        if: env.PROMOTE == 'true'
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
        run: |
          # Runs only after the promote step succeeded: remove the previous
          # production machine, then the volume it was mounted on.
          OLD="${{ steps.cur.outputs.machine }}"
          OLDVOL="${{ steps.cur.outputs.volume }}"
          flyctl machine destroy "$OLD" --app "$FLY_APP" --force
          # The discover step writes the literal string "null" when the old
          # machine had no mount; skip volume removal in that case.
          if [ -n "$OLDVOL" ] && [ "$OLDVOL" != "null" ]; then
            # Pass --app explicitly, like every other flyctl call here,
            # instead of relying on a fly.toml in the working directory.
            flyctl volumes destroy "$OLDVOL" --app "$FLY_APP" -y
          fi

      # ─── Teardown of staging if not promoted (dry-run + failure path) ─

      - name: Tear down staging
        # Dry-run mode (promote=false): always tear down. Promote mode:
        # tear down only if something failed before we successfully
        # promoted — by that point staging is the new prod and we keep
        # it.
        if: always() && (env.PROMOTE != 'true' || failure())
        env:
          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
        run: |
          MID="${{ steps.mach.outputs.id }}"
          VOL="${{ steps.vol.outputs.id }}"
          if [ -n "$MID" ]; then
            # failure() is also true when a step AFTER promotion failed
            # (e.g. destroying the old machine or volume). By then the
            # staging machine carries role=current and is serving
            # production, so it must not be destroyed. Check the live
            # metadata rather than trusting the step condition alone.
            # If the lookup itself fails, ROLE stays empty and we fall
            # through to the best-effort teardown.
            ROLE=$(flyctl machine list --app "$FLY_APP" --json \
              | jq -r --arg id "$MID" \
                  '.[] | select(.id == $id) | .config.metadata.role // ""' \
              || true)
            if [ "$ROLE" = "current" ]; then
              echo "Machine $MID is already role=current; leaving it and its volume in place." >&2
              exit 0
            fi
            flyctl machine destroy "$MID" \
              --app "$FLY_APP" --force || true
          fi
          if [ -n "$VOL" ]; then
            # Explicit --app, matching the other flyctl calls; best-effort.
            flyctl volumes destroy "$VOL" --app "$FLY_APP" -y || true
          fi