-
Notifications
You must be signed in to change notification settings - Fork 0
325 lines (296 loc) · 13.2 KB
/
refresh-data.yml
File metadata and controls
325 lines (296 loc) · 13.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
# Data refresh, blue/green style.
#
# Each run:
# 1. Builds the .db files and pushes them to R2
# 2. Creates a fresh empty 15 GB volume in iad
# 3. Creates a new machine on it, running the *current* machine's
# image, with role=staging metadata and no public ports
# 4. SFTPs `pull-from-r2-direct.sh` into /tmp and runs it (populates
# /data directly, no /data/incoming dance)
# 5. Restarts the machine so datasette mmaps the fresh files
# 6. Smoke-tests datasette on its private IP via flyctl ssh
# 7. Promotes: adds services to staging, cordons old, swaps role=current
# metadata, then destroys the old machine + volume
#
# Failures before promotion tear down the staging resources without
# touching production. The drain window between "staging gets services"
# and "old gets destroyed" is ~45s — during it both machines may serve
# traffic, mixing yesterday's data with today's. Acceptable tradeoff for
# our read-only batch workload.
#
# Scheduled runs always promote. Manual dispatch defaults to promote
# but can be flipped to a dry-run (build + populate + smoke + teardown)
# by setting promote=false.
name: Refresh data

# Triggers: one scheduled run per day, plus manual dispatch with a
# `promote` toggle that defaults to on.
on:
  schedule:
    - cron: '0 7 * * *'
  workflow_dispatch:
    inputs:
      promote:
        type: boolean
        default: true
        description: 'Promote staging to current after smoke (destroys current machine + volume)'

# All runs share one concurrency group; an in-progress run is never
# cancelled by a newer one.
concurrency:
  group: warehouse-deploy
  cancel-in-progress: false

# This is a public repo. Restrict GITHUB_TOKEN to read-only repository
# contents (all that actions/checkout needs) so a compromised step
# cannot write to issues, PRs, or contents.
permissions:
  contents: read

env:
  FLY_APP: warehouse
  FLY_REGION: iad
  R2_BUCKET: labordata-warehouse-staging
  R2_PUBLIC_BASE: ${{ secrets.R2_PUBLIC_BASE }}
  STAGING_VOL_GB: "15"
  # 'true' for scheduled runs and for manual dispatches that leave
  # promote enabled; 'false' when a dispatch sets promote=false
  # (dry run).
  PROMOTE: ${{ (github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && inputs.promote == true)) && 'true' || 'false' }}
jobs:
  refresh:
    # Hard cap on wall time: a hung flyctl ssh session or a stalled R2
    # transfer fails the job after an hour instead of running to the
    # 6-hour GitHub default.
    timeout-minutes: 60
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      # Delete preinstalled toolchains, then print the remaining free
      # space. The removal is best-effort (`|| true`).
      - name: Free disk space
        run: |
          sudo rm -rf \
            /usr/share/dotnet \
            /usr/local/lib/android \
            /opt/ghc \
            /opt/hostedtoolcache/CodeQL || true
          df -h

      - name: Install build dependencies
        run: |
          pip install -r requirements.txt
          pip install labor-union-parser

      # The default make target builds the .db files.
      - name: Build all databases
        run: |
          make

      # Later steps read this file to get the list of database names.
      - name: Build inspect-data.json
        run: |
          datasette inspect *.db > inspect-data.json
- name: Upload databases to R2
  env:
    AWS_DEFAULT_REGION: auto
    AWS_ENDPOINT_URL_S3: ${{ secrets.R2_ENDPOINT }}
    AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
    AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
  run: |
    # Sync the working directory to the bucket, but only the built
    # databases and the inspect manifest: everything is excluded
    # first, then those two patterns are re-included.
    aws s3 sync . "s3://$R2_BUCKET/" \
      --no-progress \
      --exclude "*" \
      --include "*.db" \
      --include "inspect-data.json"
- name: Install flyctl
  # Referenced by full commit SHA rather than a moving ref such as
  # `@master`, so an upstream compromise cannot run new code with our
  # FLY_API_TOKEN. The trailing tag comment records the release this
  # SHA is meant to be; bump both together, deliberately.
  uses: superfly/flyctl-actions/setup-flyctl@ed8efb33836e8b2096c7fd3ba1c8afe303ebbff1  # v1.6
- name: Discover current machine + volume
  id: cur
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  run: |
    # Resolve the machine that currently serves production and export
    # three outputs for later steps:
    #   image   - image the staging machine is started from
    #   machine - machine ID to cordon + destroy on promote
    #   volume  - volume ID to destroy on promote ("null" if no mount)
    #
    # Prefer a machine tagged role=current. Otherwise fall back to the
    # first machine that is NOT tagged role=staging: a staging machine
    # left behind by an earlier failed run must never be mistaken for
    # production, because promotion destroys whatever is picked here.
    JSON=$(flyctl machine list --app "$FLY_APP" --json)
    ROW=$(echo "$JSON" | jq '
      [.[] | select(.config.metadata.role == "current")][0]
      // ([.[] | select(.config.metadata.role != "staging")][0])
    ')
    IMG=$(echo "$ROW" | jq -r '.config.image')
    MID=$(echo "$ROW" | jq -r '.id')
    VOL=$(echo "$ROW" | jq -r '.config.mounts[0].volume')
    if [ -z "$IMG" ] || [ "$IMG" = "null" ]; then
      echo "Could not resolve current machine" >&2
      # Run logs of a public repo are world-readable, so the
      # diagnostic dump is limited to machine ids and roles; image
      # hashes and other metadata stay out of the log.
      echo "$JSON" | jq '[.[]|{id,role:.config.metadata.role}]' >&2
      exit 1
    fi
    # Quote the output file path and write all outputs in one
    # redirection.
    {
      echo "image=$IMG"
      echo "machine=$MID"
      echo "volume=$VOL"
    } >> "$GITHUB_OUTPUT"
    echo "Current: machine=$MID volume=$VOL image=$IMG"
    echo "Promote? $PROMOTE"
- name: Create staging volume
  id: vol
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  run: |
    # Create an empty volume for this run and export its id + name.
    #
    # The default step shell is `bash -e` without pipefail, so a
    # failing `flyctl volumes create` would otherwise be masked by
    # jq's exit status and an empty id exported. Enable pipefail and
    # validate the parsed id before publishing it.
    set -euo pipefail
    # Volume name has to be a valid identifier; GITHUB_RUN_ID is numeric.
    NAME="dbs_stage_${GITHUB_RUN_ID}"
    VOL_ID=$(flyctl volumes create "$NAME" \
      --app "$FLY_APP" \
      --size "$STAGING_VOL_GB" \
      --region "$FLY_REGION" \
      --yes \
      --json \
      | jq -r '.id')
    if [ -z "$VOL_ID" ] || [ "$VOL_ID" = "null" ]; then
      echo "Could not parse volume ID from flyctl output" >&2
      exit 1
    fi
    {
      echo "id=$VOL_ID"
      echo "name=$NAME"
    } >> "$GITHUB_OUTPUT"
    echo "Created staging volume: $NAME ($VOL_ID)"
- name: Create staging machine
  id: mach
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
    # Step outputs are passed through the environment instead of being
    # interpolated into the script text.
    IMAGE: ${{ steps.cur.outputs.image }}
    VOLUME_ID: ${{ steps.vol.outputs.id }}
  run: |
    # Start a machine from the current production image with the new
    # staging volume mounted at /data, and export its id.
    #
    # No --port flags: the staging machine has no public services
    # until promotion. role=staging metadata lets failure recovery and
    # future runs identify it.
    #
    # `flyctl machine run` has no --json flag, so stdout+stderr are
    # captured and the "Machine ID: XXX" line is parsed out. The exit
    # status is collected instead of letting `-e` abort on it, so that
    # (a) the captured output is always printed, and (b) a machine id
    # that appeared before the failure is still exported and the
    # teardown step can destroy the machine.
    rc=0
    OUT=$(flyctl machine run "$IMAGE" \
      --app "$FLY_APP" \
      --region "$FLY_REGION" \
      --volume "$VOLUME_ID:/data" \
      --metadata role=staging \
      --metadata refresh_run="$GITHUB_RUN_ID" \
      --vm-cpu-kind shared \
      --vm-cpus 1 \
      --vm-memory 2048 2>&1) || rc=$?
    echo "$OUT"
    MID=$(echo "$OUT" | grep -oE 'Machine ID: [a-f0-9]+' | head -1 | awk '{print $3}' || true)
    if [ -n "$MID" ]; then
      echo "id=$MID" >> "$GITHUB_OUTPUT"
    fi
    if [ "$rc" -ne 0 ]; then
      echo "flyctl machine run failed with exit code $rc" >&2
      exit "$rc"
    fi
    if [ -z "$MID" ]; then
      echo "Could not parse machine ID from flyctl output" >&2
      exit 1
    fi
    echo "Created staging machine: $MID"
    # Fixed pause before later steps start talking to the machine.
    sleep 15
- name: Populate staging /data from R2
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  run: |
    STAGING="${{ steps.mach.outputs.id }}"

    # Files to fetch: one <name>.db per top-level key of
    # inspect-data.json, plus inspect-data.json itself.
    FILES=$(python -c "import json; print(' '.join(k+'.db' for k in json.load(open('inspect-data.json')).keys()))")
    FILES="$FILES inspect-data.json"
    echo "Pulling: $FILES"

    # Remove any previous copy of the script, upload the one from this
    # checkout over SFTP, then run it on the staging machine with the
    # public R2 base URL and the file list as arguments.
    flyctl ssh console --app "$FLY_APP" --machine "$STAGING" \
      -C "rm -f /tmp/pull-from-r2-direct.sh"
    echo "put scripts/pull-from-r2-direct.sh /tmp/pull-from-r2-direct.sh" \
      | flyctl ssh sftp shell --app "$FLY_APP" --machine "$STAGING"
    flyctl ssh console --app "$FLY_APP" --machine "$STAGING" \
      -C "sh /tmp/pull-from-r2-direct.sh $R2_PUBLIC_BASE $FILES"
- name: Restart staging to mmap fresh data
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  run: |
    # The entrypoint started datasette at machine creation, when /data
    # was still empty (serve.sh's "no .db files in /data yet" branch).
    # Restart so it picks up the freshly downloaded files.
    MID="${{ steps.mach.outputs.id }}"
    flyctl machine restart "$MID" --app "$FLY_APP"

    # The restart command returns before the machine is back up, so
    # poll its state: up to 90 attempts with 2s sleeps (3 minutes of
    # sleeping) before giving up.
    state=""
    attempt=1
    while [ "$attempt" -le 90 ]; do
      state=$(flyctl machine list --app "$FLY_APP" --json \
        | jq -r --arg id "$MID" '.[] | select(.id == $id) | .state')
      echo " attempt $attempt: state=$state"
      if [ "$state" = "started" ]; then
        break
      fi
      sleep 2
      attempt=$((attempt + 1))
    done

    if [ "$state" != "started" ]; then
      echo "Machine never reached started state" >&2
      exit 1
    fi
- name: Wait for SSH (hallpass) on staging
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  run: |
    # Fly reporting state=started only means VM init is up; hallpass
    # (the SSH daemon on :22) may not have bound yet. Without this
    # wait the next step can race it and get "connection refused", as
    # seen on run 26282389917 where state=started was already true on
    # the first poll. Probe with a no-op command, up to 30 attempts
    # with 2s sleeps.
    MID="${{ steps.mach.outputs.id }}"
    attempt=1
    while [ "$attempt" -le 30 ]; do
      if flyctl ssh console --app "$FLY_APP" --machine "$MID" \
          -C "true" >/dev/null 2>&1; then
        echo " ssh ready on attempt $attempt"
        break
      fi
      echo " attempt $attempt: ssh not ready"
      sleep 2
      attempt=$((attempt + 1))
    done
    # Final probe with output visible: fails the step, showing the
    # real error, if SSH never came up.
    flyctl ssh console --app "$FLY_APP" --machine "$MID" -C "true"
- name: Smoke test datasette on staging
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  run: |
    # Upload scripts/smoke-test.sh to the staging machine and run it
    # there. A non-zero exit fails this step, which blocks promotion.
    STAGING="${{ steps.mach.outputs.id }}"
    flyctl ssh console --app "$FLY_APP" --machine "$STAGING" \
      -C "rm -f /tmp/smoke-test.sh"
    echo "put scripts/smoke-test.sh /tmp/smoke-test.sh" \
      | flyctl ssh sftp shell --app "$FLY_APP" --machine "$STAGING"
    flyctl ssh console --app "$FLY_APP" --machine "$STAGING" \
      -C "sh /tmp/smoke-test.sh"
# ─── Promote path (skipped only for promote=false dry runs) ─────
- name: Promote staging to current
  if: env.PROMOTE == 'true'
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  run: |
    set -eu
    STAGING_ID="${{ steps.mach.outputs.id }}"
    PREVIOUS_ID="${{ steps.cur.outputs.machine }}"

    # Step 1: give the staging machine public services so Fly's proxy
    # starts routing to it. Ports and handlers mirror the [[services]]
    # block in fly.toml. force_https is not expressed here because
    # Cloudflare's always_use_https setting covers it. Per the
    # original author's note, `flyctl machine update` stops the
    # machine, applies the change, and restarts it.
    #
    # Known gap vs. fly.toml: --port carries neither http_checks nor
    # the concurrency block. The smoke test is the only
    # pre-promotion gate; ongoing health checks on the new machine
    # are missing until this moves to the Machines API with the full
    # services blob.
    flyctl machine update "$STAGING_ID" --app "$FLY_APP" --yes \
      --port "80:8080/tcp:http" \
      --port "443:8080/tcp:tls:http"

    # Step 2: cordon the previous machine so it takes no new
    # connections; existing ones drain on their own.
    flyctl machine cordon "$PREVIOUS_ID" --app "$FLY_APP"

    # Step 3: drain window. Fly's default request timeout + HTTP
    # keep-alive bound is ~30s, so wait somewhat longer.
    sleep 45

    # Step 4: flip the metadata so future runs discover the new
    # machine as role=current.
    flyctl machine update "$STAGING_ID" --app "$FLY_APP" --yes \
      --metadata role=current
- name: Destroy old current
  if: env.PROMOTE == 'true'
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
  run: |
    # Remove the machine that served production before this run, and
    # then its volume. The volume id is "null" or empty when the old
    # machine had no mount, in which case only the machine is removed.
    OLD="${{ steps.cur.outputs.machine }}"
    OLDVOL="${{ steps.cur.outputs.volume }}"
    flyctl machine destroy "$OLD" --app "$FLY_APP" --force
    if [ -n "$OLDVOL" ] && [ "$OLDVOL" != "null" ]; then
      # Pass --app explicitly like the other app-scoped flyctl calls
      # in this workflow, rather than depending on app resolution from
      # the working directory.
      flyctl volumes destroy "$OLDVOL" --app "$FLY_APP" -y
    fi
# ─── Teardown of staging if not promoted (dry-run + failure path) ─
- name: Tear down staging
  # Dry-run mode (promote=false): always tear down. Promote mode: tear
  # down only if something failed before promotion completed.
  #
  # `failure()` alone cannot tell those cases apart: it is also true
  # when a step AFTER promotion fails (e.g. destroying the old machine
  # or volume). By then the staging machine is production, so the
  # script checks the machine's role before destroying anything.
  if: always() && (env.PROMOTE != 'true' || failure())
  env:
    FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
    STAGING_MACHINE: ${{ steps.mach.outputs.id }}
    STAGING_VOLUME: ${{ steps.vol.outputs.id }}
  run: |
    # pipefail so a failed `flyctl machine list` is not hidden by jq.
    set -o pipefail

    if [ "$PROMOTE" = "true" ] && [ -n "$STAGING_MACHINE" ]; then
      # The promote step sets role=current on the staging machine as
      # its last action. If that is already in place, this machine
      # (and its volume) is the live deployment: keep it.
      if ! ROLE=$(flyctl machine list --app "$FLY_APP" --json \
          | jq -r --arg id "$STAGING_MACHINE" \
              '.[] | select(.id == $id) | .config.metadata.role // ""'); then
        # Fail closed: a leftover staging machine is cheaper than
        # destroying production on a guess.
        echo "Could not determine role of $STAGING_MACHINE; leaving staging resources in place" >&2
        exit 0
      fi
      if [ "$ROLE" = "current" ]; then
        echo "Machine $STAGING_MACHINE is already role=current; skipping teardown" >&2
        exit 0
      fi
      # NOTE(review): if the promote step failed part-way, the old
      # machine may still be cordoned; this step does not uncordon it.
    fi

    # Best-effort cleanup: neither destroy is allowed to fail the step.
    if [ -n "$STAGING_MACHINE" ]; then
      flyctl machine destroy "$STAGING_MACHINE" \
        --app "$FLY_APP" --force || true
    fi
    if [ -n "$STAGING_VOLUME" ]; then
      flyctl volumes destroy "$STAGING_VOLUME" --app "$FLY_APP" -y || true
    fi