-
Notifications
You must be signed in to change notification settings - Fork 17
338 lines (315 loc) · 17.3 KB
/
Copy pathcontainer-image-worker-cd.yml
File metadata and controls
338 lines (315 loc) · 17.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
name: Container Image Worker CD (per DuckDB version)

# Runs on every push to main and on manual dispatch from the Actions UI.
on:
  push:
    branches:
      - main
  workflow_dispatch:

# Workflow-level values used by the build and manifest jobs to compose
# image references for both registries.
env:
  ECR_REGISTRY: 795637471508.dkr.ecr.us-east-1.amazonaws.com
  GHCR_REGISTRY: ghcr.io
  IMAGE_NAME: duckgres-worker
# Per-DuckDB-version matrix build for cmd/duckgres-worker.
#
# Each row produces one image (or multi-arch manifest) tagged
# duckgres-worker:<sha>-duckdb<version>. The "default" row is unsuffixed
# and triggers the Charts dispatch (kept stable so the existing duckgres
# release continues to roll out as before). Non-default rows publish
# their suffixed images and stop there — operators flip a tenant's
# `image` config-store column to point at a specific suffixed tag to
# canary that DuckDB version for that tenant.
#
# To add a DuckDB version, add a row under matrix.duckdb. The
# DUCKDB_GO_VERSION / DUCKDB_BINDINGS_VERSION pair maps to the
# duckdb-go module versions; the encoding is
# `<prefix>.<major><minor:02d><patch:02d>.0`, with prefix v2 for
# DUCKDB_GO_VERSION and v0 for DUCKDB_BINDINGS_VERSION, so DuckDB
# 1.5.2 → v2.10502.0 / v0.10502.0 and 1.5.3 → v2.10503.0 / v0.10503.0.
# See scripts/ducklake_version_matrix.sh for the same mapping in test
# code.
#
# Each row must declare every field below — Dockerfile.worker asserts the
# build-args are non-empty (`:?must be set`) so a forgotten key produces a
# loud build failure rather than a silent fallback to the ARG default.
# Exactly one row must set `default: true`; the validate-matrix job
# enforces this invariant against BOTH the build and manifest matrices.
jobs:
  # Gate job: fails the workflow before any image is built when the two
  # hand-maintained `duckdb` matrices (build + manifest) are inconsistent.
  validate-matrix:
    name: Validate matrix invariants
    if: github.repository == 'PostHog/duckgres'
    runs-on: ubuntu-24.04
    # This job only reads the checked-out workflow file.
    permissions:
      contents: read
    steps:
      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8  # v6.0.1
      - name: Exactly one default:true row in build + manifest matrices
        run: |
          set -euo pipefail
          WF=.github/workflows/container-image-worker-cd.yml
          for path in '.jobs.build.strategy.matrix.duckdb' '.jobs.manifest.strategy.matrix.duckdb'; do
            n=$(yq "$path | [.[] | select(.default == true)] | length" "$WF")
            if [ "$n" != "1" ]; then
              echo "ERROR: $path has $n rows with default:true (expected exactly 1)" >&2
              yq "$path" "$WF" >&2
              exit 1
            fi
            echo "✓ $path has exactly 1 default:true row"
          done
      # The manifest job tags its own default row as :<sha> / :latest, so
      # it must pick the same version the build job treats as default
      # (the only row whose build failures fail the workflow).
      - name: Build and manifest matrices agree on the default version
        run: |
          set -euo pipefail
          WF=.github/workflows/container-image-worker-cd.yml
          build_default=$(yq '.jobs.build.strategy.matrix.duckdb[] | select(.default == true) | .version' "$WF")
          manifest_default=$(yq '.jobs.manifest.strategy.matrix.duckdb[] | select(.default == true) | .version' "$WF")
          if [ "$build_default" != "$manifest_default" ]; then
            echo "ERROR: default version mismatch: build=$build_default manifest=$manifest_default" >&2
            exit 1
          fi
          echo "✓ build and manifest both default to $build_default"

  # Builds and pushes one single-arch image per (DuckDB version, platform)
  # cell to ECR and GHCR, then smoke-tests the pushed GHCR image on the
  # runner's native architecture.
  build:
    name: Build worker ${{ matrix.duckdb.version }} ${{ matrix.platform.platform }}
    needs: validate-matrix
    if: github.repository == 'PostHog/duckgres'
    # Don't let a flaky legacy/fallback row block the default version's
    # multi-arch manifest publish. Default-row failures still fail the
    # workflow (the unsuffixed `:<sha>`/`:latest` tags that downstream
    # Charts dispatch picks up must come from a green build).
    continue-on-error: ${{ !matrix.duckdb.default }}
    strategy:
      fail-fast: false
      matrix:
        duckdb:
          - version: "1.5.3"
            go: "v2.10503.0"
            bindings: "v0.10503.0"
            # cred-refresh-write-retry: mid-statement S3 credential
            # recovery on ExpiredToken read/write auth failures
            # (supersedes v1.5.3-cred-refresh, which it is based on).
            # Lets statements outlive their starting STS token as long
            # as the credential-refresh scheduler keeps rotating the
            # ducklake_s3 secret.
            httpfs: "v1.5.3-cred-refresh-write-retry"
            ducklake: "v1.0-posthog.4"
            # Stable repo hosts postgres_scanner for 1.5.3; nightly
            # does not. See scripts/ducklake_version_matrix.sh
            # commit history for the rationale on each row.
            pg_scanner_repo: "https://extensions.duckdb.org"
            default: true
          - version: "1.5.2"
            go: "v2.10502.0"
            bindings: "v0.10502.0"
            httpfs: "v1.5.2-stoi-fix"
            ducklake: "v1.0-posthog.2"
            # Nightly repo: preserves byte-identity with the
            # previously-published 1.5.2 worker image, which
            # bundled the nightly build for the DuckLake
            # metadata-pool reaper fix (PR #447). Stable v1.5.2
            # postgres_scanner exists too but the binary differs;
            # keep this row aligned with what was last shipped so
            # it remains a true rollback target.
            pg_scanner_repo: "http://nightly-extensions.duckdb.org"
            default: false
        platform:
          - platform: linux/arm64
            runner: ubuntu-24.04-arm
            slug: arm64
          - platform: linux/amd64
            runner: ubuntu-24.04
            slug: amd64
    runs-on: ${{ matrix.platform.runner }}
    # id-token: used by configure-aws-credentials for role-to-assume.
    # packages: needed to push to GHCR with GITHUB_TOKEN.
    permissions:
      id-token: write
      contents: read
      packages: write
    steps:
      - name: Check out
        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8  # v6.0.1
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3.12.0
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708  # v5.1.1
        with:
          role-to-assume: ${{ secrets.AWS_ECR_PUBLISH_IAM_ROLE }}
          aws-region: us-east-1
      - name: Login to Amazon ECR
        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076  # v2.0.1
      - name: Login to GHCR
        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772  # v3.4.0
        with:
          registry: ${{ env.GHCR_REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      # Pushes the same single-arch image to both registries under
      # <sha>-duckdb<version>-<arch>; the manifest job stitches the two
      # architectures together afterwards.
      - name: Build and push by digest
        uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8  # v6.19.2
        with:
          context: .
          file: Dockerfile.worker
          push: true
          platforms: ${{ matrix.platform.platform }}
          tags: |
            ${{ env.ECR_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.sha }}-duckdb${{ matrix.duckdb.version }}-${{ matrix.platform.slug }}
            ${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}:${{ github.sha }}-duckdb${{ matrix.duckdb.version }}-${{ matrix.platform.slug }}
          build-args: |
            VERSION=build-${{ github.sha }}
            COMMIT=${{ github.sha }}
            BUILD_TAGS=kubernetes
            DUCKDB_GO_VERSION=${{ matrix.duckdb.go }}
            DUCKDB_BINDINGS_VERSION=${{ matrix.duckdb.bindings }}
            DUCKDB_EXTENSION_VERSION=${{ matrix.duckdb.version }}
            HTTPFS_EXTENSION_TAG=${{ matrix.duckdb.httpfs }}
            DUCKLAKE_EXTENSION_TAG=${{ matrix.duckdb.ducklake }}
            POSTGRES_SCANNER_REPOSITORY=${{ matrix.duckdb.pg_scanner_repo }}
          cache-from: type=gha,scope=worker-${{ matrix.duckdb.version }}-${{ matrix.platform.slug }}
          cache-to: type=gha,mode=max,scope=worker-${{ matrix.duckdb.version }}-${{ matrix.platform.slug }}
      # Smoke test the freshly-pushed image. We pull from GHCR (cheaper
      # than ECR) and run the binary on the runner's native arch, so no
      # qemu is needed. Two assertions:
      #   1. `--version` exits 0 and prints the expected build identity.
      #      Catches stub-binary regressions like the pre-#521 exit-1
      #      stub that shipped to ECR for weeks before being noticed.
      #   2. The binary boots with the same arg shape the K8s pool
      #      hardcodes (`--mode duckdb-service --duckdb-listen :8816`)
      #      and reaches the "Starting DuckDB service" log line within
      #      30s. Catches flag.Parse regressions like the missing
      #      `--mode` flag fixed in #522, and any boot-time linkage
      #      failure that only manifests at runtime.
      # If smoke fails for a default-row cell, the build job fails and
      # the dependent `manifest` job is skipped (default `needs:`
      # behavior), so the unsuffixed multi-arch tag is never produced
      # and downstream Charts dispatch never picks up a broken image.
      # Non-default rows are `continue-on-error`, so a smoke failure
      # there does not block `manifest`.
      - name: Smoke test pushed image
        env:
          IMAGE: ${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}:${{ github.sha }}-duckdb${{ matrix.duckdb.version }}-${{ matrix.platform.slug }}
          EXPECTED_VERSION: build-${{ github.sha }}
        run: |
          set -euo pipefail
          docker pull "$IMAGE"

          echo "::group::--version"
          out=$(docker run --rm "$IMAGE" --version)
          echo "$out"
          if ! grep -qF "duckgres version $EXPECTED_VERSION" <<<"$out"; then
            echo "✗ --version output did not include 'duckgres version $EXPECTED_VERSION'"
            exit 1
          fi
          echo "✓ --version OK"
          echo "::endgroup::"

          # Worker's TCP listen path requires TLS — duckdbservice.Serve
          # always loads certs from cfg.TLSCertFile/KeyFile (default
          # ./certs/server.crt + .key) when listener.Network()=="tcp".
          # In prod the K8s pool mounts these via a Secret. For smoke
          # we generate an ephemeral self-signed pair and bind-mount
          # it; DUCKGRES_CERT / DUCKGRES_KEY env feed configresolve.
          # Without this, the binary boots far enough to log
          # "Starting DuckDB service" then dies in Serve(), which
          # the prior version of this step false-passed under (see
          # PR #528 follow-up).
          echo "::group::generate ephemeral TLS pair"
          CERT_DIR="$(mktemp -d)"
          # Register cleanup as soon as there is something to clean up,
          # so a failing openssl or `docker run` below still removes the
          # temp dir. `docker rm -f` on a container that was never
          # created is swallowed by `|| true`.
          trap 'docker rm -f worker-smoke >/dev/null 2>&1 || true; rm -rf "$CERT_DIR"' EXIT
          # `-nodes` (skip private key encryption) is required: Go's
          # tls.LoadX509KeyPair expects an unencrypted PEM. The cert
          # lives in CI for ~30s, is never published, never reused,
          # and protects nothing real — so the unencrypted key is
          # the desired property here, not a vulnerability.
          # nosemgrep: trailofbits.generic.openssl-insecure-flags.openssl-insecure-flags
          openssl req -x509 -newkey rsa:2048 -nodes \
            -keyout "$CERT_DIR/server.key" \
            -out "$CERT_DIR/server.crt" \
            -days 1 -subj '/CN=worker-smoke' >/dev/null 2>&1
          # mktemp -d defaults to 0700, which the container's
          # non-root duckgres UID can't traverse via the bind
          # mount → "permission denied" loading the cert. 0755 on
          # the dir + 0644 on the files lets any UID read.
          chmod 755 "$CERT_DIR"
          chmod 644 "$CERT_DIR"/server.crt "$CERT_DIR"/server.key
          echo "::endgroup::"

          echo "::group::boot smoke"
          docker run -d --name worker-smoke \
            -v "$CERT_DIR:/etc/worker-smoke-tls:ro" \
            -e DUCKGRES_CERT=/etc/worker-smoke-tls/server.crt \
            -e DUCKGRES_KEY=/etc/worker-smoke-tls/server.key \
            "$IMAGE" \
            --mode duckdb-service \
            --duckdb-listen :8816
          # Four outcomes: ok, error-logged, container-exited, timeout.
          # The level=ERROR substring check defends against the race
          # where the binary logs "Starting DuckDB service" and then
          # crashes inside Serve() before docker ps notices — the
          # previous version of this step false-passed under that
          # exact pattern.
          # `status` stays "timeout" unless an iteration decides
          # otherwise, so the timeout message below cannot fire after
          # an error/exit detected on the last iteration.
          status=timeout
          for i in $(seq 1 30); do
            logs=$(docker logs worker-smoke 2>&1)
            if grep -q "level=ERROR" <<<"$logs"; then
              echo "✗ worker logged level=ERROR before reaching ready state:"
              tail -80 <<<"$logs"
              status=error
              break
            fi
            # Capture first, then grep a here-string: piping into
            # `grep -q` under pipefail can fail the pipeline when grep
            # exits early and the producer is hit by SIGPIPE.
            running=$(docker ps --format '{{.Names}}')
            if ! grep -qx worker-smoke <<<"$running"; then
              echo "✗ worker-smoke exited before listening:"
              tail -80 <<<"$logs"
              status=exited
              break
            fi
            if grep -q "Starting DuckDB service" <<<"$logs"; then
              echo "✓ worker reached 'Starting DuckDB service' after ${i}s"
              tail -20 <<<"$logs"
              status=ok
              break
            fi
            sleep 1
          done
          if [ "$status" = "timeout" ]; then
            echo "✗ worker did not log 'Starting DuckDB service' within 30s"
            docker logs worker-smoke 2>&1 | tail -80
          fi
          echo "::endgroup::"
          [ "$status" = "ok" ]

  # Combines the per-arch images pushed by `build` into one multi-arch
  # tag per DuckDB version, and additionally publishes the default
  # version under the unsuffixed `:<sha>` and `:latest` tags.
  manifest:
    name: Multi-arch manifest worker ${{ matrix.duckdb.version }}
    needs: build
    if: github.repository == 'PostHog/duckgres'
    # Mirror the build job's tolerance: if the 1.5.2 manifest fails
    # (e.g. because that row's build was continue-on-errored), the
    # default 1.5.3 manifest still publishes and Charts dispatch
    # proceeds.
    continue-on-error: ${{ !matrix.duckdb.default }}
    strategy:
      fail-fast: false
      matrix:
        # Must list the same versions, with the same default row, as
        # the build job's `duckdb` matrix.
        duckdb:
          - version: "1.5.3"
            default: true
          - version: "1.5.2"
            default: false
    runs-on: ubuntu-24.04
    permissions:
      id-token: write
      contents: read
      packages: write
    steps:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3.12.0
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708  # v5.1.1
        with:
          role-to-assume: ${{ secrets.AWS_ECR_PUBLISH_IAM_ROLE }}
          aws-region: us-east-1
      - name: Login to Amazon ECR
        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076  # v2.0.1
      - name: Login to GHCR
        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772  # v3.4.0
        with:
          registry: ${{ env.GHCR_REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      # Each registry's multi-arch tag is assembled from that same
      # registry's -arm64 / -amd64 tags.
      - name: Create and push ECR / GHCR manifests for this version
        run: |
          set -euo pipefail
          TAG_BASE="${{ github.sha }}-duckdb${{ matrix.duckdb.version }}"
          docker buildx imagetools create \
            --tag "${{ env.ECR_REGISTRY }}/${{ env.IMAGE_NAME }}:${TAG_BASE}" \
            "${{ env.ECR_REGISTRY }}/${{ env.IMAGE_NAME }}:${TAG_BASE}-arm64" \
            "${{ env.ECR_REGISTRY }}/${{ env.IMAGE_NAME }}:${TAG_BASE}-amd64"
          docker buildx imagetools create \
            --tag "${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}:${TAG_BASE}" \
            "${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}:${TAG_BASE}-arm64" \
            "${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}:${TAG_BASE}-amd64"
      - name: Tag default version as <sha> and latest (default rows only)
        if: matrix.duckdb.default
        run: |
          set -euo pipefail
          TAG_BASE="${{ github.sha }}-duckdb${{ matrix.duckdb.version }}"
          for tag in "${{ github.sha }}" "latest"; do
            # GHCR tags are built from the GHCR per-arch images, the
            # same way the suffixed GHCR manifest is built in the
            # previous step, rather than copied across from ECR.
            docker buildx imagetools create \
              --tag "${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}:${tag}" \
              "${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}:${TAG_BASE}-arm64" \
              "${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}:${TAG_BASE}-amd64"
            docker buildx imagetools create \
              --tag "${{ env.ECR_REGISTRY }}/${{ env.IMAGE_NAME }}:${tag}" \
              "${{ env.ECR_REGISTRY }}/${{ env.IMAGE_NAME }}:${TAG_BASE}-arm64" \
              "${{ env.ECR_REGISTRY }}/${{ env.IMAGE_NAME }}:${TAG_BASE}-amd64"
          done