ci(e2e-mw-dev): cut ~2.5min off the gating check + de-flake one_session_per_worker #230
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Per-PR end-to-end test against the real posthog-mw-dev EKS cluster. | |
| # | |
| # Replaces what kind (tests/k8s/) cannot exercise: real Cilium network | |
| # policies, real Crossplane Duckling provisioning, real cnpg-shard + external | |
| # RDS metadata stores, and the real per-org Lakekeeper operator — the layers | |
| # where this quarter's bugs actually lived (Cilium egress, lakekeeper | |
| # encryption-key drift, cnpg role drift, RBAC delete gaps). | |
| # | |
| # Flow: | |
| # 1. build one arm64-only all-in-one duckgres image (it serves as both the | |
| # control plane and the workers), tagged pr-<N>-<sha>-arm64, pushed to ECR | |
| # (mw-dev is arm64, so one arch is enough for a PR). | |
| # 2. join the org tailnet via OIDC/WIF → reach the private mw-dev EKS API. | |
| # 3. stand up an isolated namespace duckgres-ci-pr-<N>: a throwaway | |
| # config-store Postgres + a control-plane Deployment running the PR image, | |
| # spawning worker pods in the same namespace. | |
| # 4. run the e2e harness as a Job INSIDE the namespace, talking to the CP | |
| # ClusterIP service (no public DNS / NLB needed). Covers the cnpg-shard | |
| # and external metadata backends. | |
| # 5. always tear the namespace down (and deprovision the ci-pr ducklings so | |
| # no S3 / cnpg role / lakekeeper CR leaks on shared infra). | |
| # | |
| # SECURITY — who can run this: | |
| # Same model as the AWS/OIDC job in ci.yml: the gate is the repo setting | |
| # "Require approval for all outside collaborators". Members' PRs run | |
| # automatically; fork PRs from outside collaborators get NO secrets and don't | |
| # run until a maintainer clicks approve-and-run, so they can't reach the | |
| # cluster or assume the IAM role unapproved. (No per-workflow guard job / | |
| # required-reviewer Environment — that would either block external PRs even | |
| # after approval, or force an approval click on every maintainer push.) | |
| # | |
| # Required repo configuration (one-time, see tests/e2e-mw-dev/README.md): | |
| # vars: TS_WIF_CLIENT_ID_MW_DEV, TS_WIF_AUDIENCE_MW_DEV | |
| # secrets: AWS_ECR_PUBLISH_IAM_ROLE (already exists, used by CD), | |
| # MW_DEV_ACCOUNT_ID (mw-dev AWS account id; ARNs built from it) | |
| # IAM: github-duckgres-e2e role in the mw-dev account (posthog-cloud-infra) | |
| # — stripped down, NOT the account-admin terraform-infra role. | |
| # Repo setting: "Require approval for all outside collaborators". | |
| name: e2e-mw-dev | |
| on: | |
| pull_request: | |
| branches: [main] | |
| # Scope: only run when something that could change runtime behavior moves. | |
| # (Docs-only PRs shouldn't spin up a cluster namespace.) | |
| paths: | |
| - "**/*.go" | |
| - "go.mod" | |
| - "go.sum" | |
| - "Dockerfile*" | |
| - "tests/e2e-mw-dev/**" | |
| - ".github/workflows/e2e-mw-dev.yml" | |
| - ".github/workflows/_image-build.yml" | |
| # Manual runs take the same build + e2e path as a PR. | |
| # NOTE(review): github.event.pull_request.number is empty on this trigger, | |
| # so every value derived from it below is blank — confirm manual runs work. | |
| workflow_dispatch: | |
| # Cleanup backstop: every 6h, sweep per-PR namespaces orphaned by runs that | |
| # died hard (cancelled mid-flight, runner OOM) before their always() teardown | |
| # could fire. Only the `e2e-cleanup` job runs on this trigger. | |
| schedule: | |
| - cron: "0 */6 * * *" | |
| # One in-flight run per PR; a new push cancels the old run. The cancelled | |
| # run's e2e-teardown job is skipped as well (its e2e job never finished), so | |
| # the stale namespace is cleaned by the next run's deploy or, failing that, by | |
| # the e2e-cleanup sweep. | |
| # Non-PR events are keyed by event name + ref so a scheduled sweep and a | |
| # manual run on the same branch never share a group and cancel each other. | |
| concurrency: | |
| group: e2e-mw-dev-${{ github.event.pull_request.number || format('{0}-{1}', github.event_name, github.ref) }} | |
| cancel-in-progress: true | |
| # id-token: write is what lets the jobs request the OIDC token used for the | |
| # AWS role assumption and the Tailscale WIF login; contents: read covers | |
| # checkout. | |
| permissions: | |
| id-token: write | |
| contents: read | |
| jobs: | |
| # mw-dev runs ONE all-in-one `duckgres` image for both roles: the control | |
| # plane is `--mode control-plane` on it, and DUCKGRES_K8S_WORKER_IMAGE points | |
| # at the same image (workers run `--mode duckdb-service`). So the e2e flow | |
| # builds that single image — not the separate worker/controlplane CD images — | |
| # to match what actually ships to mw-dev. | |
| build: | |
| # PR / manual only — the scheduled trigger runs nothing but e2e-cleanup. | |
| if: github.event_name != 'schedule' | |
| # Reusable image-build workflow; the e2e job consumes its `image` output | |
| # for both the control-plane and the worker image. | |
| uses: ./.github/workflows/_image-build.yml | |
| with: | |
| dockerfile: Dockerfile | |
| image-name: duckgres | |
| # NOTE(review): pull_request.number is empty on workflow_dispatch, so a | |
| # manual run tags the image pr--<sha>-arm64 — confirm that is intended. | |
| tag: pr-${{ github.event.pull_request.number }}-${{ github.sha }}-arm64 | |
| platform: linux/arm64 | |
| cache-scope: e2e-duckgres-arm64 | |
| # Default DuckDB row (1.5.3) — mirrors the default:true matrix entry in | |
| # container-image-worker-cd.yml. Keep in lock-step on a version bump. | |
| build-args: | | |
| DUCKDB_EXTENSION_VERSION=1.5.3 | |
| HTTPFS_EXTENSION_TAG=v1.5.3-stoi-fix | |
| DUCKLAKE_EXTENSION_TAG=v1.0-posthog.4 | |
| DUCKDB_EXTENSION_REPOSITORY=https://extensions.duckdb.org | |
| POSTGRES_SCANNER_REPOSITORY=https://extensions.duckdb.org | |
| secrets: | |
| ecr-role: ${{ secrets.AWS_ECR_PUBLISH_IAM_ROLE }} | |
| # Gating job: deploys the PR image into an isolated namespace and runs the | |
| # e2e harness against it. Teardown lives in the separate e2e-teardown job. | |
| e2e: | |
| needs: [build] | |
| if: github.event_name != 'schedule' | |
| runs-on: ubuntu-24.04-arm | |
| timeout-minutes: 40 | |
| env: | |
| # NOTE(review): github.event.pull_request.number is empty on | |
| # workflow_dispatch, leaving PR_NUMBER blank and NAMESPACE ending in a bare | |
| # hyphen — confirm run.sh copes with manual runs. | |
| PR_NUMBER: ${{ github.event.pull_request.number }} | |
| NAMESPACE: duckgres-ci-pr-${{ github.event.pull_request.number }} | |
| # Single all-in-one image for both CP and workers (mw-dev parity). | |
| WORKER_IMAGE: ${{ needs.build.outputs.image }} | |
| CONTROLPLANE_IMAGE: ${{ needs.build.outputs.image }} | |
| KUBE_CONTEXT: posthog-mw-dev | |
| CLUSTER_NAME: posthog-mw-dev | |
| EKS_CLUSTER_NAME: posthog-mw-dev | |
| # Single source of truth for the region: the credentials step and | |
| # update-kubeconfig below both read it instead of repeating the literal. | |
| AWS_REGION: us-east-1 | |
| # The per-PR CP assumes the SAME EKS Pod Identity role as the real mw-dev | |
| # control plane, so STS-brokered S3 activation works with no cred | |
| # injection. Built from the account-id secret (no account id committed) + | |
| # the non-sensitive role name. | |
| CP_POD_IDENTITY_ROLE: arn:aws:iam::${{ secrets.MW_DEV_ACCOUNT_ID }}:role/duckgres-control-plane-dev | |
| steps: | |
| - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 | |
| - name: Configure AWS credentials (OIDC) | |
| uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 | |
| with: | |
| # Dedicated, stripped-down e2e role (NOT the account-admin | |
| # terraform-infra role): eks:DescribeCluster + Pod Identity | |
| # association calls + iam:PassRole on the CP role, plus an EKS access | |
| # entry for the kubectl it needs. Defined in posthog-cloud-infra | |
| # (mw-dev account). Account id comes from a repo secret so no AWS | |
| # account id is committed; the role name is not sensitive. | |
| role-to-assume: arn:aws:iam::${{ secrets.MW_DEV_ACCOUNT_ID }}:role/github-duckgres-e2e | |
| aws-region: ${{ env.AWS_REGION }} | |
| # Private EKS API — join the org tailnet so kubectl can reach it. OIDC/WIF, | |
| # no static key. Mirrors PostHog/hogland deploy.yml. The subnet router | |
| # advertises the mw-dev VPC incl. the private API endpoint. | |
| - name: Connect to Tailscale | |
| uses: tailscale/github-action@306e68a486fd2350f2bfc3b19fcd143891a4a2d8 # v4.1.2 | |
| with: | |
| oauth-client-id: ${{ vars.TS_WIF_CLIENT_ID_MW_DEV }} | |
| audience: ${{ vars.TS_WIF_AUDIENCE_MW_DEV }} | |
| tags: tag:github-runner | |
| - name: Install kubectl | |
| uses: azure/setup-kubectl@776406bce94f63e41d621b960d78ee25c8b76ede # v4.0.1 | |
| - name: Update kubeconfig | |
| # --alias so the context is named posthog-mw-dev (matches KUBE_CONTEXT / | |
| # run.sh's explicit --context). Without it the context defaults to the | |
| # full cluster ARN and `kubectl --context posthog-mw-dev` fails with | |
| # "context does not exist". | |
| run: aws eks update-kubeconfig --name "$CLUSTER_NAME" --region "$AWS_REGION" --alias "$KUBE_CONTEXT" | |
| - name: Deploy isolated namespace | |
| run: bash tests/e2e-mw-dev/run.sh deploy | |
| - name: Run e2e harness (in-cluster Job) | |
| run: bash tests/e2e-mw-dev/run.sh test | |
| # Diagnostics are only worth collecting (and only get read) when the | |
| # harness failed — on green runs this step just added ~15s to the gating | |
| # check. failure() also covers a failed deploy step. | |
| - name: Collect diagnostics | |
| if: failure() | |
| run: bash tests/e2e-mw-dev/run.sh diagnostics | |
| # Teardown runs as its OWN job so the gating `e2e` check completes the moment | |
| # the harness verdict is known instead of waiting ~1min for deprovision + | |
| # namespace delete. cmd_teardown recovers the internal secret from the | |
| # in-cluster duckgres-tokens Secret, so it doesn't need the deploy runner's | |
| # disk. Trade-off vs the old in-job always() step: a run cancelled by | |
| # concurrency (new push) also cancels this queued job — but the new push's | |
| # deploy cleans the stale namespace first thing, and the 6h e2e-cleanup sweep | |
| # is the backstop for namespaces with no follow-up push. | |
| e2e-teardown: | |
| needs: [e2e] | |
| # Run on e2e success OR failure (not when e2e was skipped because build | |
| # failed — nothing was deployed). Scheduled trigger never reaches here. | |
| # A cancelled e2e is not in the accepted result list either, so this job is | |
| # skipped for cancelled runs. | |
| if: ${{ always() && github.event_name != 'schedule' && contains(fromJSON('["success","failure"]'), needs.e2e.result) }} | |
| runs-on: ubuntu-24.04-arm | |
| timeout-minutes: 20 | |
| env: | |
| # Same derivation as the e2e job, so teardown targets the namespace that | |
| # job deployed into. | |
| PR_NUMBER: ${{ github.event.pull_request.number }} | |
| NAMESPACE: duckgres-ci-pr-${{ github.event.pull_request.number }} | |
| KUBE_CONTEXT: posthog-mw-dev | |
| CLUSTER_NAME: posthog-mw-dev | |
| EKS_CLUSTER_NAME: posthog-mw-dev | |
| # Single source of truth for the region: the credentials step and | |
| # update-kubeconfig below both read it instead of repeating the literal. | |
| AWS_REGION: us-east-1 | |
| steps: | |
| - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 | |
| - name: Configure AWS credentials (OIDC) | |
| uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 | |
| with: | |
| role-to-assume: arn:aws:iam::${{ secrets.MW_DEV_ACCOUNT_ID }}:role/github-duckgres-e2e | |
| aws-region: ${{ env.AWS_REGION }} | |
| - name: Connect to Tailscale | |
| uses: tailscale/github-action@306e68a486fd2350f2bfc3b19fcd143891a4a2d8 # v4.1.2 | |
| with: | |
| oauth-client-id: ${{ vars.TS_WIF_CLIENT_ID_MW_DEV }} | |
| audience: ${{ vars.TS_WIF_AUDIENCE_MW_DEV }} | |
| tags: tag:github-runner | |
| - name: Install kubectl | |
| uses: azure/setup-kubectl@776406bce94f63e41d621b960d78ee25c8b76ede # v4.0.1 | |
| - name: Update kubeconfig | |
| # --alias names the context after KUBE_CONTEXT instead of the cluster ARN. | |
| run: aws eks update-kubeconfig --name "$CLUSTER_NAME" --region "$AWS_REGION" --alias "$KUBE_CONTEXT" | |
| # Deprovision the ci-pr ducklings (so no S3 / cnpg role+db / lakekeeper CR | |
| # leaks on shared infra) then delete the namespace. | |
| - name: Teardown | |
| run: bash tests/e2e-mw-dev/run.sh teardown | |
| # Backstop sweep of orphaned per-PR namespaces. Scheduled-trigger only (the | |
| # PR/manual triggers run build+e2e instead). Reuses the same Tailscale + e2e | |
| # IAM role as the e2e job; run.sh e2e-cleanup discovers stale namespaces by the | |
| # managed-by=e2e-mw-dev label and reaps anything older than 6h. Named | |
| # e2e-cleanup (not "janitor") to avoid colliding with duckgres's own janitor. | |
| e2e-cleanup: | |
| if: github.event_name == 'schedule' | |
| runs-on: ubuntu-24.04-arm | |
| timeout-minutes: 20 | |
| env: | |
| # No PR_NUMBER / NAMESPACE here: the sweep is not tied to a single PR. | |
| KUBE_CONTEXT: posthog-mw-dev | |
| CLUSTER_NAME: posthog-mw-dev | |
| EKS_CLUSTER_NAME: posthog-mw-dev | |
| # Single source of truth for the region: the credentials step and | |
| # update-kubeconfig below both read it instead of repeating the literal. | |
| AWS_REGION: us-east-1 | |
| steps: | |
| - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 | |
| - name: Configure AWS credentials (OIDC) | |
| uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 | |
| with: | |
| role-to-assume: arn:aws:iam::${{ secrets.MW_DEV_ACCOUNT_ID }}:role/github-duckgres-e2e | |
| aws-region: ${{ env.AWS_REGION }} | |
| - name: Connect to Tailscale | |
| uses: tailscale/github-action@306e68a486fd2350f2bfc3b19fcd143891a4a2d8 # v4.1.2 | |
| with: | |
| oauth-client-id: ${{ vars.TS_WIF_CLIENT_ID_MW_DEV }} | |
| audience: ${{ vars.TS_WIF_AUDIENCE_MW_DEV }} | |
| tags: tag:github-runner | |
| - name: Install kubectl | |
| uses: azure/setup-kubectl@776406bce94f63e41d621b960d78ee25c8b76ede # v4.0.1 | |
| - name: Update kubeconfig | |
| # --alias names the context after KUBE_CONTEXT instead of the cluster ARN. | |
| run: aws eks update-kubeconfig --name "$CLUSTER_NAME" --region "$AWS_REGION" --alias "$KUBE_CONTEXT" | |
| - name: Sweep stale per-PR namespaces | |
| run: bash tests/e2e-mw-dev/run.sh e2e-cleanup | |