Container Image Worker CD (per DuckDB version)

Fix runtime worker claim race (#565) #59

Workflow file for this run

.github/workflows/container-image-worker-cd.yml at 7350e5e

	# Continuous delivery for the duckgres worker container image: one image
	# per supported DuckDB version, published to both Amazon ECR and GHCR.
	name: Container Image Worker CD (per DuckDB version)

	# Runs on every push to `main` and on manual dispatch from the Actions UI.
	on:
	  push:
	    branches:
	      - main
	  workflow_dispatch:

	# Workflow-wide constants shared by all jobs.
	env:
	  # Private ECR registry host; images are pushed as
	  # <ECR_REGISTRY>/<IMAGE_NAME>:<tag>.
	  ECR_REGISTRY: 795637471508.dkr.ecr.us-east-1.amazonaws.com
	  # GHCR registry host; images are pushed as
	  # <GHCR_REGISTRY>/posthog/<IMAGE_NAME>:<tag>.
	  GHCR_REGISTRY: ghcr.io
	  # Repository name of the worker image in both registries.
	  IMAGE_NAME: duckgres-worker

	# Per-DuckDB-version matrix build for cmd/duckgres-worker.
	#
	# Each row produces one multi-arch manifest tagged
	# duckgres-worker:<sha>-duckdb<version>, assembled from per-arch images
	# tagged <sha>-duckdb<version>-<arch>. The "default" row is additionally
	# published under the unsuffixed <sha> and `latest` tags and triggers the
	# Charts dispatch (kept stable so the existing duckgres release continues
	# to roll out as before). Non-default rows publish only their suffixed
	# images and stop there — operators flip a tenant's `image` config-store
	# column to point at a specific suffixed tag to canary that DuckDB
	# version for that tenant.
	#
	# To add a DuckDB version, add a row under matrix.duckdb (in both the
	# `build` and `manifest` jobs). The DUCKDB_GO_VERSION /
	# DUCKDB_BINDINGS_VERSION pair maps to the duckdb-go module versions; the
	# encoding is `v2.<major><minor:02d><patch:02d>.0` for DUCKDB_GO_VERSION
	# and `v0.<major><minor:02d><patch:02d>.0` for DUCKDB_BINDINGS_VERSION,
	# so DuckDB 1.5.1 → v2.10501.0 / v0.10501.0 and 1.5.2 → v2.10502.0 /
	# v0.10502.0. See scripts/ducklake_version_matrix.sh for the same
	# mapping in test code.

	jobs:
	  # Builds, pushes and smoke-tests one single-arch worker image per
	  # (DuckDB version x platform) matrix cell. Each cell pushes the tag
	  # <sha>-duckdb<version>-<arch> to both ECR and GHCR.
	  build:
	    name: Build worker ${{ matrix.duckdb.version }} ${{ matrix.platform.platform }}
	    if: github.repository == 'PostHog/duckgres'
	    strategy:
	      # Let the remaining cells finish even if one cell fails.
	      fail-fast: false
	      matrix:
	        # Keep the version/default pairs in sync with the `manifest` job's
	        # matrix; that job assembles manifests from the tags pushed here.
	        duckdb:
	          - version: "1.5.2"
	            go: "v2.10502.0"
	            bindings: "v0.10502.0"
	            httpfs: "v1.5.2-stoi-fix"
	            default: true
	          - version: "1.5.1"
	            go: "v2.10501.0"
	            bindings: "v0.10501.0"
	            httpfs: "v1.5.1-stoi-fix"
	            default: false
	        # Each platform builds on a runner of the same architecture.
	        platform:
	          - platform: linux/arm64
	            runner: ubuntu-24.04-arm
	            slug: arm64
	          - platform: linux/amd64
	            runner: ubuntu-24.04
	            slug: amd64
	    runs-on: ${{ matrix.platform.runner }}
	    permissions:
	      id-token: write
	      contents: read
	      packages: write

	    steps:
	      - name: Check out
	        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1

	      - name: Set up Docker Buildx
	        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3.12.0

	      - name: Configure AWS credentials
	        uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1
	        with:
	          role-to-assume: ${{ secrets.AWS_ECR_PUBLISH_IAM_ROLE }}
	          aws-region: us-east-1

	      - name: Login to Amazon ECR
	        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1

	      - name: Login to GHCR
	        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
	        with:
	          registry: ${{ env.GHCR_REGISTRY }}
	          username: ${{ github.actor }}
	          password: ${{ secrets.GITHUB_TOKEN }}

	      # Despite the step name, the image is pushed under per-arch *tags*
	      # (not by digest); the `manifest` job references those tags.
	      - name: Build and push by digest
	        uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6.19.2
	        with:
	          context: .
	          file: Dockerfile.worker
	          push: true
	          platforms: ${{ matrix.platform.platform }}
	          tags: |
	            ${{ env.ECR_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.sha }}-duckdb${{ matrix.duckdb.version }}-${{ matrix.platform.slug }}
	            ${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}:${{ github.sha }}-duckdb${{ matrix.duckdb.version }}-${{ matrix.platform.slug }}
	          build-args: |
	            VERSION=build-${{ github.sha }}
	            COMMIT=${{ github.sha }}
	            BUILD_TAGS=kubernetes
	            DUCKDB_GO_VERSION=${{ matrix.duckdb.go }}
	            DUCKDB_BINDINGS_VERSION=${{ matrix.duckdb.bindings }}
	            DUCKDB_EXTENSION_VERSION=${{ matrix.duckdb.version }}
	            HTTPFS_EXTENSION_TAG=${{ matrix.duckdb.httpfs }}
	          # One cache scope per (version, arch) cell so cells do not evict
	          # each other's layers.
	          cache-from: type=gha,scope=worker-${{ matrix.duckdb.version }}-${{ matrix.platform.slug }}
	          cache-to: type=gha,mode=max,scope=worker-${{ matrix.duckdb.version }}-${{ matrix.platform.slug }}

	      # Smoke test the freshly-pushed image. We pull from GHCR (cheaper
	      # than ECR) and run the binary on the runner's native arch, so no
	      # qemu is needed. Two assertions:
	      #   1. `--version` exits 0 and prints the expected build identity.
	      #      Catches stub-binary regressions like the pre-#521 exit-1
	      #      stub that shipped to ECR for weeks before being noticed.
	      #   2. The binary boots with the same arg shape the K8s pool
	      #      hardcodes (`--mode duckdb-service --duckdb-listen :8816`)
	      #      and reaches the "Starting DuckDB service" log line within
	      #      30s. Catches flag.Parse regressions like the missing
	      #      `--mode` flag fixed in #522, and any boot-time linkage
	      #      failure that only manifests at runtime.
	      # If smoke fails for any matrix cell, the dependent `manifest`
	      # job is skipped (default `needs:` behavior), so the unsuffixed
	      # multi-arch tag is never produced and downstream Charts dispatch
	      # never picks up a broken image.
	      - name: Smoke test pushed image
	        env:
	          IMAGE: ${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}:${{ github.sha }}-duckdb${{ matrix.duckdb.version }}-${{ matrix.platform.slug }}
	          EXPECTED_VERSION: build-${{ github.sha }}
	        run: |
	          set -euo pipefail
	          docker pull "$IMAGE"

	          echo "::group::--version"
	          out=$(docker run --rm "$IMAGE" --version)
	          echo "$out"
	          if ! grep -qF "duckgres version $EXPECTED_VERSION" <<<"$out"; then
	            echo "✗ --version output did not include 'duckgres version $EXPECTED_VERSION'"
	            exit 1
	          fi
	          echo "✓ --version OK"
	          echo "::endgroup::"

	          # Worker's TCP listen path requires TLS — duckdbservice.Serve
	          # always loads certs from cfg.TLSCertFile/KeyFile (default
	          # ./certs/server.crt + .key) when listener.Network()=="tcp".
	          # In prod the K8s pool mounts these via a Secret. For smoke
	          # we generate an ephemeral self-signed pair and bind-mount
	          # it; DUCKGRES_CERT / DUCKGRES_KEY env feed configresolve.
	          # Without this, the binary boots far enough to log
	          # "Starting DuckDB service" then dies in Serve(), which
	          # the prior version of this step false-passed under (see
	          # PR #528 follow-up).
	          echo "::group::generate ephemeral TLS pair"
	          CERT_DIR="$(mktemp -d)"
	          # Register cleanup as soon as there is something to clean up, so
	          # a failing openssl / docker run below cannot leave the temp dir
	          # (or a half-created container) behind. Removing a container
	          # that does not exist yet is harmless thanks to `|| true`.
	          trap 'docker rm -f worker-smoke >/dev/null 2>&1 || true; rm -rf "$CERT_DIR"' EXIT
	          # `-nodes` (skip private key encryption) is required: Go's
	          # tls.LoadX509KeyPair expects an unencrypted PEM. The cert
	          # lives in CI for ~30s, is never published, never reused,
	          # and protects nothing real — so the unencrypted key is
	          # the desired property here, not a vulnerability.
	          # nosemgrep: trailofbits.generic.openssl-insecure-flags.openssl-insecure-flags
	          openssl req -x509 -newkey rsa:2048 -nodes \
	            -keyout "$CERT_DIR/server.key" \
	            -out "$CERT_DIR/server.crt" \
	            -days 1 -subj '/CN=worker-smoke' >/dev/null 2>&1
	          # mktemp -d defaults to 0700, which the container's
	          # non-root duckgres UID can't traverse via the bind
	          # mount → "permission denied" loading the cert. 0755 on
	          # the dir + 0644 on the files lets any UID read.
	          chmod 755 "$CERT_DIR"
	          chmod 644 "$CERT_DIR"/server.crt "$CERT_DIR"/server.key
	          echo "::endgroup::"

	          echo "::group::boot smoke"
	          docker run -d --name worker-smoke \
	            -v "$CERT_DIR:/etc/worker-smoke-tls:ro" \
	            -e DUCKGRES_CERT=/etc/worker-smoke-tls/server.crt \
	            -e DUCKGRES_KEY=/etc/worker-smoke-tls/server.key \
	            "$IMAGE" \
	            --mode duckdb-service \
	            --duckdb-listen :8816

	          # Four exit paths: ok, level=ERROR logged, container exited,
	          # 30s timeout. `status` records which one was taken so the
	          # timeout message is printed only for a genuine timeout (and
	          # not when a failure happens to be detected on the last poll).
	          # The level=ERROR substring check defends against the race
	          # where the binary logs "Starting DuckDB service" and then
	          # crashes inside Serve() before docker ps notices — the
	          # previous version of this step false-passed under that
	          # exact pattern.
	          status=timeout
	          for i in $(seq 1 30); do
	            logs=$(docker logs worker-smoke 2>&1)
	            if grep -q "level=ERROR" <<<"$logs"; then
	              echo "✗ worker logged level=ERROR before reaching ready state:"
	              tail -80 <<<"$logs"
	              status=error
	              break
	            fi
	            if ! docker ps --format '{{.Names}}' | grep -qx worker-smoke; then
	              echo "✗ worker-smoke exited before listening:"
	              tail -80 <<<"$logs"
	              status=exited
	              break
	            fi
	            if grep -q "Starting DuckDB service" <<<"$logs"; then
	              echo "✓ worker reached 'Starting DuckDB service' after ${i}s"
	              tail -20 <<<"$logs"
	              status=ok
	              break
	            fi
	            sleep 1
	          done
	          if [ "$status" = "timeout" ]; then
	            echo "✗ worker did not log 'Starting DuckDB service' within 30s"
	            docker logs worker-smoke 2>&1 | tail -80
	          fi
	          echo "::endgroup::"
	          # The step's exit code is the result of this final test.
	          [ "$status" = "ok" ]

	  # Stitches the per-arch tags pushed by `build` into multi-arch manifests,
	  # one matrix row per DuckDB version. Runs only if every `build` cell
	  # succeeded (plain `needs:` with no `if: always()`).
	  manifest:
	    name: Multi-arch manifest worker ${{ matrix.duckdb.version }}
	    needs: build
	    if: github.repository == 'PostHog/duckgres'
	    strategy:
	      fail-fast: false
	      matrix:
	        # Must list the same versions (and the same default row) as the
	        # `build` job's matrix.duckdb.
	        duckdb:
	          - version: "1.5.2"
	            default: true
	          - version: "1.5.1"
	            default: false
	    runs-on: ubuntu-24.04
	    permissions:
	      id-token: write
	      contents: read
	      packages: write

	    steps:
	      - name: Set up Docker Buildx
	        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3.12.0

	      - name: Configure AWS credentials
	        uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1
	        with:
	          role-to-assume: ${{ secrets.AWS_ECR_PUBLISH_IAM_ROLE }}
	          aws-region: us-east-1

	      - name: Login to Amazon ECR
	        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1

	      - name: Login to GHCR
	        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
	        with:
	          registry: ${{ env.GHCR_REGISTRY }}
	          username: ${{ github.actor }}
	          password: ${{ secrets.GITHUB_TOKEN }}

	      # Publishes <sha>-duckdb<version> in each registry, built from that
	      # same registry's -arm64 / -amd64 tags.
	      - name: Create and push ECR / GHCR manifests for this version
	        run: |
	          set -euo pipefail
	          TAG_BASE="${{ github.sha }}-duckdb${{ matrix.duckdb.version }}"
	          docker buildx imagetools create \
	            --tag ${{ env.ECR_REGISTRY }}/${{ env.IMAGE_NAME }}:${TAG_BASE} \
	            ${{ env.ECR_REGISTRY }}/${{ env.IMAGE_NAME }}:${TAG_BASE}-arm64 \
	            ${{ env.ECR_REGISTRY }}/${{ env.IMAGE_NAME }}:${TAG_BASE}-amd64
	          docker buildx imagetools create \
	            --tag ${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}:${TAG_BASE} \
	            ${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}:${TAG_BASE}-arm64 \
	            ${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}:${TAG_BASE}-amd64

	      # Default row only: also publish the unsuffixed <sha> tag and move
	      # `latest`. Each registry's manifest is assembled from that same
	      # registry's per-arch tags, matching the step above, so the GHCR
	      # tags do not depend on a cross-registry copy from ECR.
	      # NOTE(review): there is no ref guard here, so a workflow_dispatch
	      # run would presumably also move `latest` — confirm that is intended.
	      - name: Tag default version as <sha> and latest (default rows only)
	        if: matrix.duckdb.default
	        run: |
	          set -euo pipefail
	          TAG_BASE="${{ github.sha }}-duckdb${{ matrix.duckdb.version }}"
	          for tag in "${{ github.sha }}" "latest"; do
	            docker buildx imagetools create \
	              --tag ${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}:${tag} \
	              ${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}:${TAG_BASE}-arm64 \
	              ${{ env.GHCR_REGISTRY }}/posthog/${{ env.IMAGE_NAME }}:${TAG_BASE}-amd64
	            docker buildx imagetools create \
	              --tag ${{ env.ECR_REGISTRY }}/${{ env.IMAGE_NAME }}:${tag} \
	              ${{ env.ECR_REGISTRY }}/${{ env.IMAGE_NAME }}:${TAG_BASE}-arm64 \
	              ${{ env.ECR_REGISTRY }}/${{ env.IMAGE_NAME }}:${TAG_BASE}-amd64
	          done

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Fix runtime worker claim race (#565) #59

Workflow file

Fix runtime worker claim race (#565) #59

Uh oh!

Workflow file for this run