# Source: PostHog/duckgres, .github/workflows/e2e-mw-dev.yml
# at commit ac58d71747bdbc27e6aaa57546fa9889da20e774.
#
# Per-PR end-to-end test against the real posthog-mw-dev EKS cluster.
#
# Replaces what kind (tests/k8s/) cannot exercise: real Cilium network
# policies, real Crossplane Duckling provisioning, real cnpg-shard + external
# RDS metadata stores, and the real per-org Lakekeeper operator — the layers
# where this quarter's bugs actually lived (Cilium egress, lakekeeper
# encryption-key drift, cnpg role drift, RBAC delete gaps).
#
# Flow:
#   1. build one arm64-only all-in-one duckgres image (serves as both control
#      plane and worker), tagged pr-<N>-<sha>-arm64, pushed to ECR (mw-dev is
#      arm64, so one arch is enough for a PR).
#   2. join the org tailnet via OIDC/WIF → reach the private mw-dev EKS API.
#   3. stand up an isolated namespace duckgres-ci-pr-<N>: a throwaway
#      config-store Postgres + a control-plane Deployment running the PR image,
#      spawning worker pods in the same namespace.
#   4. run the e2e harness as a Job INSIDE the namespace, talking to the CP
#      ClusterIP service (no public DNS / NLB needed). Covers the cnpg-shard
#      and external metadata backends.
#   5. always tear the namespace down (and deprovision the ci-pr ducklings so
#      no S3 / cnpg role / lakekeeper CR leaks on shared infra).
#
# SECURITY — who can run this:
#   Same model as the AWS/OIDC job in ci.yml: the gate is the repo setting
#   "Require approval for all outside collaborators". Members' PRs run
#   automatically; fork PRs from outside collaborators get NO secrets and don't
#   run until a maintainer clicks approve-and-run, so they can't reach the
#   cluster or assume the IAM role unapproved. (No per-workflow guard job /
#   required-reviewer Environment — that would either block external PRs even
#   after approval, or force an approval click on every maintainer push.)
#
# Required repo configuration (one-time, see tests/e2e-mw-dev/README.md):
#   vars:    TS_WIF_CLIENT_ID_MW_DEV, TS_WIF_AUDIENCE_MW_DEV
#   secrets: AWS_ECR_PUBLISH_IAM_ROLE (already exists, used by CD),
#            MW_DEV_ACCOUNT_ID (mw-dev AWS account id; ARNs built from it)
#   IAM:     github-duckgres-e2e role in the mw-dev account (posthog-cloud-infra)
#            — stripped down, NOT the account-admin terraform-infra role.
#   Repo setting: "Require approval for all outside collaborators".
name: e2e-mw-dev

on:
  # PRs into main, restricted to changes that can alter runtime behavior so a
  # docs-only PR never spins up a cluster namespace.
  pull_request:
    branches:
      - main
    paths:
      - '**/*.go'
      - 'go.mod'
      - 'go.sum'
      - 'Dockerfile*'
      - 'tests/e2e-mw-dev/**'
      - '.github/workflows/e2e-mw-dev.yml'
      - '.github/workflows/_image-build.yml'

  # Manual runs from the Actions UI.
  workflow_dispatch:

  # Cleanup backstop, every 6h: sweeps per-PR namespaces left behind by runs
  # that died hard (cancelled mid-flight, runner OOM) before their always()
  # teardown could fire. Only the `e2e-cleanup` job runs on this trigger.
  schedule:
    - cron: '0 */6 * * *'

# One in-flight run per PR; a new push cancels the old run (and its namespace
# is GC'd by the always() teardown of the cancelled run + the e2e-cleanup sweep).
#
# The event name is part of the group key on purpose: for scheduled sweeps and
# manual dispatches there is no PR number, so the key falls back to github.ref.
# Without the event name, a cron sweep and a manual run on the same branch
# would land in one group and cancel each other via cancel-in-progress.
concurrency:
  group: e2e-mw-dev-${{ github.event_name }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

# Workflow-level token permissions.
permissions:
  # Read-only repository access (the jobs check out the repo).
  contents: read
  # Allows requesting a GitHub OIDC token; the AWS credential and Tailscale
  # steps in this workflow authenticate via OIDC rather than static keys.
  id-token: write

jobs:
  # Image build. mw-dev ships a single all-in-one `duckgres` image that plays
  # both roles: the control plane runs it with `--mode control-plane`, and
  # DUCKGRES_K8S_WORKER_IMAGE points at that same image for the workers
  # (`--mode duckdb-service`). The e2e flow therefore builds exactly this one
  # image, not the separate worker/controlplane CD images, so the test matches
  # what actually ships to mw-dev.
  build:
    # Skipped on the cron trigger, which exists only to run e2e-cleanup.
    if: github.event_name != 'schedule'
    uses: ./.github/workflows/_image-build.yml
    secrets:
      ecr-role: ${{ secrets.AWS_ECR_PUBLISH_IAM_ROLE }}
    with:
      image-name: duckgres
      dockerfile: Dockerfile
      platform: linux/arm64
      tag: pr-${{ github.event.pull_request.number }}-${{ github.sha }}-arm64
      cache-scope: e2e-duckgres-arm64
      # Pinned to the default DuckDB row (1.5.3), mirroring the default:true
      # matrix entry in container-image-worker-cd.yml. Bump both together.
      build-args: |
        DUCKDB_EXTENSION_VERSION=1.5.3
        HTTPFS_EXTENSION_TAG=v1.5.3-stoi-fix
        DUCKLAKE_EXTENSION_TAG=v1.0-posthog.4
        DUCKDB_EXTENSION_REPOSITORY=https://extensions.duckdb.org
        POSTGRES_SCANNER_REPOSITORY=https://extensions.duckdb.org

  # Deploys the freshly built image into an isolated per-run namespace on
  # mw-dev, runs the e2e harness there, then always collects diagnostics and
  # tears the namespace down.
  e2e:
    needs: [build]
    if: github.event_name != 'schedule'
    runs-on: ubuntu-24.04-arm
    timeout-minutes: 40
    env:
      # github.event.pull_request.number is empty on workflow_dispatch, which
      # would produce the namespace "duckgres-ci-pr-" — rejected by Kubernetes
      # because a namespace name must end with an alphanumeric character. Fall
      # back to the numeric, unique run id so manual runs get a valid and
      # isolated namespace too.
      PR_NUMBER: ${{ github.event.pull_request.number || github.run_id }}
      NAMESPACE: duckgres-ci-pr-${{ github.event.pull_request.number || github.run_id }}
      # Single all-in-one image for both CP and workers (mw-dev parity).
      WORKER_IMAGE: ${{ needs.build.outputs.image }}
      CONTROLPLANE_IMAGE: ${{ needs.build.outputs.image }}
      KUBE_CONTEXT: posthog-mw-dev
      CLUSTER_NAME: posthog-mw-dev
      EKS_CLUSTER_NAME: posthog-mw-dev
      # Single source of truth for the region; the credential and kubeconfig
      # steps below read it instead of repeating the literal.
      AWS_REGION: us-east-1
      # The per-PR CP assumes the SAME EKS Pod Identity role as the real mw-dev
      # control plane, so STS-brokered S3 activation works with no cred
      # injection. Built from the account-id secret (no account id committed) +
      # the non-sensitive role name.
      CP_POD_IDENTITY_ROLE: arn:aws:iam::${{ secrets.MW_DEV_ACCOUNT_ID }}:role/duckgres-control-plane-dev
    steps:
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1

      - name: Configure AWS credentials (OIDC)
        uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1
        with:
          # Dedicated, stripped-down e2e role (NOT the account-admin
          # terraform-infra role): eks:DescribeCluster + Pod Identity
          # association calls + iam:PassRole on the CP role, plus an EKS access
          # entry for the kubectl it needs. Defined in posthog-cloud-infra
          # (mw-dev account). Account id comes from a repo secret so no AWS
          # account id is committed; the role name is not sensitive.
          role-to-assume: arn:aws:iam::${{ secrets.MW_DEV_ACCOUNT_ID }}:role/github-duckgres-e2e
          aws-region: ${{ env.AWS_REGION }}

      # Private EKS API — join the org tailnet so kubectl can reach it. OIDC/WIF,
      # no static key. Mirrors PostHog/hogland deploy.yml. The subnet router
      # advertises the mw-dev VPC incl. the private API endpoint.
      - name: Connect to Tailscale
        uses: tailscale/github-action@306e68a486fd2350f2bfc3b19fcd143891a4a2d8 # v4.1.2
        with:
          oauth-client-id: ${{ vars.TS_WIF_CLIENT_ID_MW_DEV }}
          audience: ${{ vars.TS_WIF_AUDIENCE_MW_DEV }}
          tags: tag:github-runner

      - name: Install kubectl
        uses: azure/setup-kubectl@776406bce94f63e41d621b960d78ee25c8b76ede # v4.0.1

      - name: Update kubeconfig
        # --alias so the context is named posthog-mw-dev (matches KUBE_CONTEXT /
        # run.sh's explicit --context). Without it the context defaults to the
        # full cluster ARN and `kubectl --context posthog-mw-dev` fails with
        # "context does not exist".
        run: aws eks update-kubeconfig --name "$CLUSTER_NAME" --region "$AWS_REGION" --alias "$KUBE_CONTEXT"

      - name: Deploy isolated namespace
        run: bash tests/e2e-mw-dev/run.sh deploy

      - name: Run e2e harness (in-cluster Job)
        run: bash tests/e2e-mw-dev/run.sh test

      # always(): gather diagnostics even when deploy or the harness failed.
      - name: Collect diagnostics
        if: always()
        run: bash tests/e2e-mw-dev/run.sh diagnostics

      # Always tear down — deprovision the ci-pr ducklings (so no S3 / cnpg
      # role+db / lakekeeper CR leaks on shared infra) then delete the
      # namespace. Runs even on cancel/failure.
      - name: Teardown
        if: always()
        run: bash tests/e2e-mw-dev/run.sh teardown

  # Scheduled backstop that reaps orphaned per-PR namespaces. It runs only on
  # the cron trigger; PR and manual triggers run build + e2e instead. It uses
  # the same Tailscale identity and e2e IAM role as the e2e job. `run.sh
  # e2e-cleanup` finds stale namespaces via the managed-by=e2e-mw-dev label and
  # removes anything older than 6h. The job is called e2e-cleanup rather than
  # "janitor" so it is not confused with duckgres's own janitor.
  e2e-cleanup:
    if: github.event_name == 'schedule'
    runs-on: ubuntu-24.04-arm
    timeout-minutes: 20
    env:
      AWS_REGION: us-east-1
      CLUSTER_NAME: posthog-mw-dev
      EKS_CLUSTER_NAME: posthog-mw-dev
      KUBE_CONTEXT: posthog-mw-dev
    steps:
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1

      # Assume the e2e role; the ARN is assembled from the account-id secret.
      - name: Configure AWS credentials (OIDC)
        uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1
        with:
          role-to-assume: arn:aws:iam::${{ secrets.MW_DEV_ACCOUNT_ID }}:role/github-duckgres-e2e
          aws-region: us-east-1

      # Same tailnet join as the e2e job.
      - name: Connect to Tailscale
        uses: tailscale/github-action@306e68a486fd2350f2bfc3b19fcd143891a4a2d8 # v4.1.2
        with:
          oauth-client-id: ${{ vars.TS_WIF_CLIENT_ID_MW_DEV }}
          audience: ${{ vars.TS_WIF_AUDIENCE_MW_DEV }}
          tags: tag:github-runner

      - name: Install kubectl
        uses: azure/setup-kubectl@776406bce94f63e41d621b960d78ee25c8b76ede # v4.0.1

      # --alias names the kubeconfig context after KUBE_CONTEXT.
      - name: Update kubeconfig
        run: aws eks update-kubeconfig --name "$CLUSTER_NAME" --region us-east-1 --alias "$KUBE_CONTEXT"

      - name: Sweep stale per-PR namespaces
        run: bash tests/e2e-mw-dev/run.sh e2e-cleanup