Skip to content

Fix runtime worker claim race (#565) #59

Fix runtime worker claim race (#565)

Fix runtime worker claim race (#565) #59

name: Container Image Worker CD (per DuckDB version)

# Runs on every push to main and on manual dispatch.
on:
  push:
    branches:
      - main
  workflow_dispatch:

# Registry coordinates shared by every job; images are published to both
# ECR and GHCR under the same IMAGE_NAME.
env:
  ECR_REGISTRY: 795637471508.dkr.ecr.us-east-1.amazonaws.com
  GHCR_REGISTRY: ghcr.io
  IMAGE_NAME: duckgres-worker
# Per-DuckDB-version matrix build for cmd/duckgres-worker.
#
# Each row produces one image (or multi-arch manifest) tagged
# duckgres-worker:<sha>-duckdb<version>. The "default" row is unsuffixed
# and triggers the Charts dispatch (kept stable so the existing duckgres
# release continues to roll out as before). Non-default rows publish
# their suffixed images and stop there — operators flip a tenant's
# `image` config-store column to point at a specific suffixed tag to
# canary that DuckDB version for that tenant.
#
# To add a DuckDB version, add a row under matrix.duckdb in the build
# job (version, go, bindings, httpfs, default) and a matching row in the
# manifest job's matrix. The DUCKDB_GO_VERSION / DUCKDB_BINDINGS_VERSION
# pair maps to the duckdb-go module versions; both encode the DuckDB
# version as `<major><minor:02d><patch:02d>.0` behind a `v2.` (go) or
# `v0.` (bindings) prefix, so DuckDB 1.5.1 → v2.10501.0 / v0.10501.0 and
# 1.5.2 → v2.10502.0 / v0.10502.0. See scripts/ducklake_version_matrix.sh
# for the same mapping in test code.
jobs:
build:
  # Builds and pushes one single-arch worker image per
  # (DuckDB version × platform) matrix cell to ECR and GHCR, tagged
  # <sha>-duckdb<version>-<arch>, then smoke-tests the pushed image.
  name: Build worker ${{ matrix.duckdb.version }} ${{ matrix.platform.platform }}
  if: github.repository == 'PostHog/duckgres'
  strategy:
    fail-fast: false
    matrix:
      duckdb:
        - version: "1.5.2"
          go: "v2.10502.0"
          bindings: "v0.10502.0"
          httpfs: "v1.5.2-stoi-fix"
          default: true
        - version: "1.5.1"
          go: "v2.10501.0"
          bindings: "v0.10501.0"
          httpfs: "v1.5.1-stoi-fix"
          default: false
      # Each platform builds on a runner of the matching architecture.
      platform:
        - platform: linux/arm64
          runner: ubuntu-24.04-arm
          slug: arm64
        - platform: linux/amd64
          runner: ubuntu-24.04
          slug: amd64
  runs-on: ${{ matrix.platform.runner }}
  permissions:
    id-token: write
    contents: read
    packages: write
  steps:
    - name: Check out
      uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8  # v6.0.1
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3.12.0
    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708  # v5.1.1
      with:
        role-to-assume: ${{ secrets.AWS_ECR_PUBLISH_IAM_ROLE }}
        aws-region: us-east-1
    - name: Login to Amazon ECR
      uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076  # v2.0.1
    - name: Login to GHCR
      uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772  # v3.4.0
      with:
        registry: ${{ env.GHCR_REGISTRY }}
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
    - name: Build and push by digest
      uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8  # v6.19.2
      with:
        context: .
        file: Dockerfile.worker
        push: true
        platforms: ${{ matrix.platform.platform }}
        tags: |
          ${{ env.ECR_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.sha }}-duckdb${{ matrix.duckdb.version }}-${{ matrix.platform.slug }}
          ${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}:${{ github.sha }}-duckdb${{ matrix.duckdb.version }}-${{ matrix.platform.slug }}
        build-args: |
          VERSION=build-${{ github.sha }}
          COMMIT=${{ github.sha }}
          BUILD_TAGS=kubernetes
          DUCKDB_GO_VERSION=${{ matrix.duckdb.go }}
          DUCKDB_BINDINGS_VERSION=${{ matrix.duckdb.bindings }}
          DUCKDB_EXTENSION_VERSION=${{ matrix.duckdb.version }}
          HTTPFS_EXTENSION_TAG=${{ matrix.duckdb.httpfs }}
        cache-from: type=gha,scope=worker-${{ matrix.duckdb.version }}-${{ matrix.platform.slug }}
        cache-to: type=gha,mode=max,scope=worker-${{ matrix.duckdb.version }}-${{ matrix.platform.slug }}
    # Smoke test the freshly-pushed image. We pull from GHCR (cheaper
    # than ECR) and run the binary on the runner's native arch, so no
    # qemu is needed. Two assertions:
    #   1. `--version` exits 0 and prints the expected build identity.
    #      Catches stub-binary regressions like the pre-#521 exit-1
    #      stub that shipped to ECR for weeks before being noticed.
    #   2. The binary boots with the same arg shape the K8s pool
    #      hardcodes (`--mode duckdb-service --duckdb-listen :8816`)
    #      and reaches the "Starting DuckDB service" log line within
    #      30s. Catches flag.Parse regressions like the missing
    #      `--mode` flag fixed in #522, and any boot-time linkage
    #      failure that only manifests at runtime.
    # If smoke fails for any matrix cell, the dependent `manifest`
    # job is skipped (default `needs:` behavior), so the unsuffixed
    # multi-arch tag is never produced and downstream Charts dispatch
    # never picks up a broken image.
    - name: Smoke test pushed image
      env:
        IMAGE: ${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}:${{ github.sha }}-duckdb${{ matrix.duckdb.version }}-${{ matrix.platform.slug }}
        EXPECTED_VERSION: build-${{ github.sha }}
      run: |
        set -euo pipefail
        docker pull "$IMAGE"

        echo "::group::--version"
        out=$(docker run --rm "$IMAGE" --version)
        echo "$out"
        if ! grep -qF "duckgres version $EXPECTED_VERSION" <<<"$out"; then
          echo "✗ --version output did not include 'duckgres version $EXPECTED_VERSION'"
          exit 1
        fi
        echo "✓ --version OK"
        echo "::endgroup::"

        # Worker's TCP listen path requires TLS — duckdbservice.Serve
        # always loads certs from cfg.TLSCertFile/KeyFile (default
        # ./certs/server.crt + .key) when listener.Network()=="tcp".
        # In prod the K8s pool mounts these via a Secret. For smoke
        # we generate an ephemeral self-signed pair and bind-mount
        # it; DUCKGRES_CERT / DUCKGRES_KEY env feed configresolve.
        # Without this, the binary boots far enough to log
        # "Starting DuckDB service" then dies in Serve(), which
        # the prior version of this step false-passed under (see
        # PR #528 follow-up).
        echo "::group::generate ephemeral TLS pair"
        CERT_DIR="$(mktemp -d)"
        # Install cleanup as soon as CERT_DIR exists, so a failing
        # openssl or `docker run` (which aborts the script under
        # `set -e`) still removes the temp dir. `docker rm -f` on a
        # container that was never created is harmless thanks to
        # the `|| true`.
        trap 'docker rm -f worker-smoke >/dev/null 2>&1 || true; rm -rf "$CERT_DIR"' EXIT
        # `-nodes` (skip private key encryption) is required: Go's
        # tls.LoadX509KeyPair expects an unencrypted PEM. The cert
        # lives in CI for ~30s, is never published, never reused,
        # and protects nothing real — so the unencrypted key is
        # the desired property here, not a vulnerability.
        # nosemgrep: trailofbits.generic.openssl-insecure-flags.openssl-insecure-flags
        openssl req -x509 -newkey rsa:2048 -nodes \
          -keyout "$CERT_DIR/server.key" \
          -out "$CERT_DIR/server.crt" \
          -days 1 -subj '/CN=worker-smoke' >/dev/null 2>&1
        # mktemp -d defaults to 0700, which the container's
        # non-root duckgres UID can't traverse via the bind
        # mount → "permission denied" loading the cert. 0755 on
        # the dir + 0644 on the files lets any UID read.
        chmod 755 "$CERT_DIR"
        chmod 644 "$CERT_DIR"/server.crt "$CERT_DIR"/server.key
        echo "::endgroup::"

        echo "::group::boot smoke"
        docker run -d --name worker-smoke \
          -v "$CERT_DIR:/etc/worker-smoke-tls:ro" \
          -e DUCKGRES_CERT=/etc/worker-smoke-tls/server.crt \
          -e DUCKGRES_KEY=/etc/worker-smoke-tls/server.key \
          "$IMAGE" \
          --mode duckdb-service \
          --duckdb-listen :8816
        # Four exit paths, each recorded in $status: ok, error
        # (level=ERROR logged), exited (container gone), and
        # timeout (loop ran out after 30 polls).
        # The level=ERROR substring check defends against the race
        # where the binary logs "Starting DuckDB service" and then
        # crashes inside Serve() before docker ps notices — the
        # previous version of this step false-passed under that
        # exact pattern.
        status=timeout
        for i in $(seq 1 30); do
          logs=$(docker logs worker-smoke 2>&1)
          if grep -q "level=ERROR" <<<"$logs"; then
            echo "✗ worker logged level=ERROR before reaching ready state:"
            tail -80 <<<"$logs"
            status=error
            break
          fi
          if ! docker ps --format '{{.Names}}' | grep -qx worker-smoke; then
            echo "✗ worker-smoke exited before listening:"
            tail -80 <<<"$logs"
            status=exited
            break
          fi
          if grep -q "Starting DuckDB service" <<<"$logs"; then
            echo "✓ worker reached 'Starting DuckDB service' after ${i}s"
            tail -20 <<<"$logs"
            status=ok
            break
          fi
          sleep 1
        done
        # Only report a timeout when the loop really ran out; a
        # failure detected on the final poll has already printed
        # its own diagnosis above.
        if [ "$status" = "timeout" ]; then
          echo "✗ worker did not log 'Starting DuckDB service' within 30s"
          docker logs worker-smoke 2>&1 | tail -80
        fi
        echo "::endgroup::"
        [ "$status" = "ok" ]
manifest:
  # Stitches the per-arch images pushed by `build` into one multi-arch
  # manifest per DuckDB version, in both ECR and GHCR. Because of
  # `needs: build`, this job only runs once the build job has succeeded.
  name: Multi-arch manifest worker ${{ matrix.duckdb.version }}
  needs: build
  if: github.repository == 'PostHog/duckgres'
  strategy:
    fail-fast: false
    matrix:
      # Keep these rows in step with the build job's matrix.duckdb rows.
      duckdb:
        - version: "1.5.2"
          default: true
        - version: "1.5.1"
          default: false
  runs-on: ubuntu-24.04
  permissions:
    id-token: write
    contents: read
    packages: write
  steps:
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3.12.0
    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708  # v5.1.1
      with:
        role-to-assume: ${{ secrets.AWS_ECR_PUBLISH_IAM_ROLE }}
        aws-region: us-east-1
    - name: Login to Amazon ECR
      uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076  # v2.0.1
    - name: Login to GHCR
      uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772  # v3.4.0
      with:
        registry: ${{ env.GHCR_REGISTRY }}
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
    - name: Create and push ECR / GHCR manifests for this version
      run: |
        set -euo pipefail
        TAG_BASE="${{ github.sha }}-duckdb${{ matrix.duckdb.version }}"
        ECR_IMAGE="${{ env.ECR_REGISTRY }}/${{ env.IMAGE_NAME }}"
        GHCR_IMAGE="${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}"
        # One multi-arch manifest per registry, assembled from that
        # registry's own -arm64 / -amd64 images.
        for image in "$ECR_IMAGE" "$GHCR_IMAGE"; do
          docker buildx imagetools create \
            --tag "${image}:${TAG_BASE}" \
            "${image}:${TAG_BASE}-arm64" \
            "${image}:${TAG_BASE}-amd64"
        done
    - name: Tag default version as <sha> and latest (default rows only)
      if: matrix.duckdb.default
      run: |
        set -euo pipefail
        TAG_BASE="${{ github.sha }}-duckdb${{ matrix.duckdb.version }}"
        ECR_IMAGE="${{ env.ECR_REGISTRY }}/${{ env.IMAGE_NAME }}"
        GHCR_IMAGE="${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}"
        # Publish the unsuffixed <sha> and `latest` tags in both
        # registries. Each registry's tag is built from its own
        # per-arch images (the build job pushes the same build to
        # both under identical tags), so tagging GHCR does not
        # require a cross-registry copy from ECR.
        for tag in "${{ github.sha }}" "latest"; do
          for image in "$GHCR_IMAGE" "$ECR_IMAGE"; do
            docker buildx imagetools create \
              --tag "${image}:${tag}" \
              "${image}:${TAG_BASE}-arm64" \
              "${image}:${TAG_BASE}-amd64"
          done
        done