Skip to content

Commit 79cfde5

Browse files
committed
chore: infra rewrite
1 parent 9c0fef6 commit 79cfde5

61 files changed

Lines changed: 5123 additions & 5546 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/deploy.yml

Lines changed: 72 additions & 109 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,40 @@ jobs:
407407
SERVICES=$(echo "$BUILD_IMAGES_MATRIX" | jq -r '[.[].service] | join(",")')
408408
pnpm --filter infra wait-for-images --registry "$REGISTRY" --ns "$REGISTRY_NS" --tag "$TAG" --services "$SERVICES"
409409
410+
- name: Set generation + release SHA (immutable-node)
411+
working-directory: infra
412+
env:
413+
AWS_ACCESS_KEY_ID: ${{ secrets.SCW_ACCESS_KEY }}
414+
AWS_SECRET_ACCESS_KEY: ${{ secrets.SCW_SECRET_KEY }}
415+
PULUMI_CONFIG_PASSPHRASE: ${{ secrets.PULUMI_CONFIG_PASSPHRASE }}
416+
STACK: ${{ needs.setup.outputs.pulumi_stack }}
417+
IMAGE_TAG: ${{ needs.setup.outputs.image_tag }}
418+
RUN_NUMBER: ${{ github.run_number }}
419+
HAS_YJS: ${{ needs.setup.outputs.yjs_url != '' && 'true' || 'false' }}
420+
run: |
421+
set -euo pipefail
422+
# Immutable-node model: each deploy provisions a NEW VM generation
423+
# `vm-<svc>-<gen>` with the image SHA baked into its cloud-init. The
424+
# generation number is the monotonic CI run number (no state to
425+
# persist between runs); the SHA is this commit. compute.ts reads
426+
# these to name + bake each generation; Pulumi's state deletes the
427+
# previous generation. The SHA must be a pinned commit, never :latest.
428+
case "$IMAGE_TAG" in
429+
''|latest|*:latest)
430+
echo "::error::refusing to deploy a non-pinned image tag '$IMAGE_TAG'"
431+
exit 1
432+
;;
433+
esac
434+
# ai reuses the backend image at the same SHA; cdc/frontend always
435+
# deploy; yjs only when enabled. Setting config for a disabled service
436+
# is harmless (compute only reads enabled ones), so the list is fixed.
437+
# Keys are underscore-flat under the `infra` namespace — a colon in the
438+
# key would collide with Pulumi's `<namespace>:<key>` syntax.
439+
for svc in backend cdc frontend yjs ai; do
440+
pulumi config set "infra:gen_$svc" "$RUN_NUMBER" --stack "$STACK"
441+
pulumi config set "infra:sha_$svc" "$IMAGE_TAG" --stack "$STACK"
442+
done
443+
410444
- name: Pulumi up
411445
working-directory: infra
412446
env:
@@ -456,6 +490,28 @@ jobs:
456490
# directly.
457491
pnpm --filter infra assert-vm-grants --application-name "$VM_READER_APP" --project-id "$SCW_DEFAULT_PROJECT_ID" --organization-id "$SCW_DEFAULT_ORGANIZATION_ID"
458492
493+
- name: Verify runtime secrets are deliverable
494+
# Second belt-and-suspenders preflight for a class of prod-down failure: a `required`
495+
# runtime secret that cannot be written into the line-based
496+
# /opt/app/.env.runtime (e.g. a raw multi-line PEM) fails the on-VM
497+
# runtime-secret-sync, which by design blocks the service from booting —
498+
# exactly how a multi-line DATABASE_SSL_CA took the backend down. `pulumi
499+
# up` writing the value isn't enough; this read-only check fetches each
500+
# required secret the way a VM will and asserts it is single-line and
501+
# present BEFORE any VM rolls, failing the deploy with the offending env
502+
# vars instead of after a fleet-wide outage. Scoped to the services this
503+
# fork actually deploys (verify_rollout_matrix) so a disabled service's
504+
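# optional secret never trips it. NOTE(review): this step runs AFTER the `Pulumi up` step, which the rollout comment further down says already provisions the new VM generation — confirm this check really precedes any VM roll, or move it ahead of `Pulumi up`.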
505+
working-directory: infra
506+
env:
507+
SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }}
508+
REGION: ${{ needs.setup.outputs.region }}
509+
SCW_DEFAULT_PROJECT_ID: ${{ secrets.SCW_PROJECT_ID }}
510+
SERVICES_MATRIX: ${{ needs.setup.outputs.verify_rollout_matrix }}
511+
run: |
512+
SERVICES=$(echo "$SERVICES_MATRIX" | jq -r '[.[].service] | join(",")')
513+
pnpm --filter infra assert-secrets-deliverable --region "$REGION" --project-id "$SCW_DEFAULT_PROJECT_ID" --services "$SERVICES"
514+
459515
- name: Capture stack outputs
460516
id: outputs
461517
working-directory: infra
@@ -512,26 +568,17 @@ jobs:
512568
|| echo "- **Domains:** no domain" >> $GITHUB_STEP_SUMMARY
513569
514570
# -------------------------------------------------------------------------
515-
# Roll services — for each backend service (backend, cdc, yjs, ai):
516-
# 1. PUT the new image SHA into s3://<deploy_tags_bucket>/deploy/<svc>.tag
517-
# 2. Poll the public health endpoint until X-App-Version == SHA
571+
# Verify the rollout — `pulumi up` above already provisioned the new VM
572+
# generation per service (image SHA baked into cloud-init, the migrate
573+
# companion run at the backend generation's boot). These jobs only confirm
574+
# each public service serves the expected SHA (X-App-Version == SHA).
518575
#
519-
# Matrix runs in parallel; `fail-fast: true` cancels siblings the moment one
520-
# service refuses to roll, so a broken cdc/yjs deploy doesn't have to wait
521-
# 5 minutes on a separately-stuck backend before the workflow turns red.
576+
# Backend first (it owns the expand migration), then the rest in parallel.
577+
# cdc has no public health endpoint; its replacement is confirmed indirectly
578+
# by the backend coming up healthy (the cdc worker reconnects to it).
522579
#
523-
# Tag rollback: re-running the workflow on the previous commit re-PUTs the
524-
# old SHA. The on-VM reconciler sees the change and rolls back the same way
525-
# it rolled forward — no separate code path. CI doesn't try to "undo" on
526-
# failure; that would race the reconciler's own rollback (which only the VM
527-
# has the local context to do safely).
528-
# -------------------------------------------------------------------------
529-
# -------------------------------------------------------------------------
530-
# Roll backend FIRST — backend is the schema owner. Its reconciler runs the
531-
# expand (additive) migration before swapping the app container, so the
532-
# backend deploy-tag write + health gate MUST go green before any other
533-
# service rolls. This guarantees the new schema is present before ai/yjs/cdc/
534-
# frontend pick up code that may depend on it. roll-rest gates on this job.
580+
# Rollback: re-run the workflow on the previous commit — it provisions a new
581+
# generation from that SHA the same way it rolled forward (no separate path).
535582
# -------------------------------------------------------------------------
536583
roll-backend:
537584
runs-on: ubuntu-latest
@@ -556,16 +603,10 @@ jobs:
556603
- service: backend
557604
health_url: ${{ needs.setup.outputs.backend_url }}
558605
env:
559-
AWS_ACCESS_KEY_ID: ${{ secrets.SCW_ACCESS_KEY }}
560-
AWS_SECRET_ACCESS_KEY: ${{ secrets.SCW_SECRET_KEY }}
561-
TAG_BUCKET: ${{ needs.setup.outputs.deploy_tags_bucket }}
562606
REGION: ${{ needs.setup.outputs.region }}
563607
EXPECTED_SHA: ${{ needs.setup.outputs.image_tag }}
564608
SERVICE: ${{ matrix.service }}
565609
steps:
566-
# Checkout + Node are needed for the wait-for-version task below. The
567-
# deploy-tag write only needs the preinstalled aws CLI, but keeping setup
568-
# at the top means both steps share one toolchain install.
569610
- name: Checkout
570611
uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5
571612

@@ -581,52 +622,21 @@ jobs:
581622
- name: Install dependencies
582623
run: pnpm install --frozen-lockfile --filter infra...
583624

584-
- name: Write deploy tag for ${{ matrix.service }}
585-
run: |
586-
ENDPOINT="https://s3.$REGION.scw.cloud"
587-
# Immutability guard: the reconciler pulls whatever SHA we publish
588-
# here, and Pulumi can no longer reject a mutable tag at plan time
589-
# (image tags left cloud-init entirely). So the write path is now the
590-
# sole gate — refuse to publish an empty value or a mutable :latest.
591-
case "$EXPECTED_SHA" in
592-
''|latest|*:latest)
593-
echo "::error::refusing to publish non-pinned tag '$EXPECTED_SHA' to deploy/$SERVICE.tag"
594-
exit 1
595-
;;
596-
esac
597-
echo "Publishing $EXPECTED_SHA to s3://$TAG_BUCKET/deploy/$SERVICE.tag"
598-
# Plain text body, no newline — reconciler reads it as-is.
599-
printf '%s' "$EXPECTED_SHA" \
600-
| aws --endpoint-url "$ENDPOINT" s3 cp - "s3://$TAG_BUCKET/deploy/$SERVICE.tag" \
601-
--content-type 'text/plain'
602-
603625
- name: Wait for ${{ matrix.service }} to serve ${{ needs.setup.outputs.image_tag }}
604626
if: matrix.health_url != ''
605627
env:
606628
# URLs come from setup outputs (derived from shared/ appConfig).
607629
# appConfig URLs are canonical https:// origins; CI probes /health
608-
# directly without any scheme transformation.
630+
# directly. `pulumi up` already provisioned the new generation (SHA
631+
# baked into cloud-init); this only confirms it is serving. Frontend
632+
# (Caddy) returns 200, backend/yjs/ai 204; both emit X-App-Version.
609633
BASE: ${{ matrix.health_url }}
610-
# Status/header poll lives in infra/tasks/wait-for-version.ts (unit
611-
# tested) instead of an inline awk loop. Frontend (Caddy) returns 200,
612-
# backend/yjs/ai return 204; both emit X-App-Version. 100 × 3s = 5 min
613-
# budget per service; the matrix runs in parallel. The status triple
614-
# lets the poll fast-fail on a reconciler-reported failure (and surface
615-
# its phase/reason) instead of blindly waiting out the whole budget.
616-
run: pnpm --filter infra wait-for-version --url "$BASE/health" --sha "$EXPECTED_SHA" --status-bucket "$TAG_BUCKET" --service "$SERVICE" --region "$REGION"
617-
618-
- name: Fetch ${{ matrix.service }} boot diagnostics on failure
619-
if: failure()
620-
env:
621-
STATE_BUCKET: ${{ needs.setup.outputs.state_bucket }}
622-
# Key selection (which stage markers / latest full log to show) lives in
623-
# the unit-tested infra/tasks/fetch-boot-diag.ts; the aws calls are the
624-
# only side effect. roll-backend already has Node installed above.
625-
run: pnpm --filter infra fetch-boot-diag --bucket "$STATE_BUCKET" --service "$SERVICE" --region "$REGION"
634+
run: pnpm --filter infra wait-for-version --url "$BASE/health" --sha "$EXPECTED_SHA"
626635

627636
# -------------------------------------------------------------------------
628637
# Roll the remaining services — runs ONLY after roll-backend is green, so the
629-
# expand migration has already been applied. These four roll in parallel.
638+
# expand migration (run at the backend generation's boot) is applied. These
639+
# roll in parallel.
630640
# -------------------------------------------------------------------------
631641
roll-rest:
632642
runs-on: ubuntu-latest
@@ -650,16 +660,10 @@ jobs:
650660
# the backend image at this SHA; the probe confirms the AI LB serves it.
651661
include: ${{ fromJSON(needs.setup.outputs.roll_rest_matrix) }}
652662
env:
653-
AWS_ACCESS_KEY_ID: ${{ secrets.SCW_ACCESS_KEY }}
654-
AWS_SECRET_ACCESS_KEY: ${{ secrets.SCW_SECRET_KEY }}
655-
TAG_BUCKET: ${{ needs.setup.outputs.deploy_tags_bucket }}
656663
REGION: ${{ needs.setup.outputs.region }}
657664
EXPECTED_SHA: ${{ needs.setup.outputs.image_tag }}
658665
SERVICE: ${{ matrix.service }}
659666
steps:
660-
# Checkout + Node are needed for the wait-for-version task below. The
661-
# deploy-tag write only needs the preinstalled aws CLI, but keeping setup
662-
# at the top means both steps share one toolchain install.
663667
- name: Checkout
664668
uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5
665669

@@ -675,48 +679,11 @@ jobs:
675679
- name: Install dependencies
676680
run: pnpm install --frozen-lockfile --filter infra...
677681

678-
- name: Write deploy tag for ${{ matrix.service }}
679-
run: |
680-
ENDPOINT="https://s3.$REGION.scw.cloud"
681-
# Immutability guard: the reconciler pulls whatever SHA we publish
682-
# here, and Pulumi can no longer reject a mutable tag at plan time
683-
# (image tags left cloud-init entirely). So the write path is now the
684-
# sole gate — refuse to publish an empty value or a mutable :latest.
685-
case "$EXPECTED_SHA" in
686-
''|latest|*:latest)
687-
echo "::error::refusing to publish non-pinned tag '$EXPECTED_SHA' to deploy/$SERVICE.tag"
688-
exit 1
689-
;;
690-
esac
691-
echo "Publishing $EXPECTED_SHA to s3://$TAG_BUCKET/deploy/$SERVICE.tag"
692-
# Plain text body, no newline — reconciler reads it as-is.
693-
printf '%s' "$EXPECTED_SHA" \
694-
| aws --endpoint-url "$ENDPOINT" s3 cp - "s3://$TAG_BUCKET/deploy/$SERVICE.tag" \
695-
--content-type 'text/plain'
696-
697682
- name: Wait for ${{ matrix.service }} to serve ${{ needs.setup.outputs.image_tag }}
698683
if: matrix.health_url != ''
699684
env:
700-
# URLs come from setup outputs (derived from shared/ appConfig).
701-
# appConfig URLs are canonical https:// origins; CI probes /health
702-
# directly without any scheme transformation.
703685
BASE: ${{ matrix.health_url }}
704-
# Status/header poll lives in infra/tasks/wait-for-version.ts (unit
705-
# tested) instead of an inline awk loop. Frontend (Caddy) returns 200,
706-
# backend/yjs/ai return 204; both emit X-App-Version. 100 × 3s = 5 min
707-
# budget per service; the matrix runs in parallel. The status triple
708-
# lets the poll fast-fail on a reconciler-reported failure (and surface
709-
# its phase/reason) instead of blindly waiting out the whole budget.
710-
run: pnpm --filter infra wait-for-version --url "$BASE/health" --sha "$EXPECTED_SHA" --status-bucket "$TAG_BUCKET" --service "$SERVICE" --region "$REGION"
711-
712-
- name: Fetch ${{ matrix.service }} boot diagnostics on failure
713-
if: failure()
714-
env:
715-
STATE_BUCKET: ${{ needs.setup.outputs.state_bucket }}
716-
# Key selection (which stage markers / latest full log to show) lives in
717-
# the unit-tested infra/tasks/fetch-boot-diag.ts; the aws calls are the
718-
# only side effect. roll-rest already has Node installed above.
719-
run: pnpm --filter infra fetch-boot-diag --bucket "$STATE_BUCKET" --service "$SERVICE" --region "$REGION"
686+
run: pnpm --filter infra wait-for-version --url "$BASE/health" --sha "$EXPECTED_SHA"
720687

721688
# -------------------------------------------------------------------------
722689
# Verify rollout — backward-compatible alias that depends on the roll jobs.
@@ -742,10 +709,6 @@ jobs:
742709
matrix:
743710
include: ${{ fromJSON(needs.setup.outputs.verify_rollout_matrix) }}
744711
env:
745-
AWS_ACCESS_KEY_ID: ${{ secrets.SCW_ACCESS_KEY }}
746-
AWS_SECRET_ACCESS_KEY: ${{ secrets.SCW_SECRET_KEY }}
747-
TAG_BUCKET: ${{ needs.setup.outputs.deploy_tags_bucket }}
748-
REGION: ${{ needs.setup.outputs.region }}
749712
EXPECTED_SHA: ${{ needs.setup.outputs.image_tag }}
750713
SERVICE: ${{ matrix.service }}
751714
steps:

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ frontend/stats.html
5454
.junie
5555
.claude
5656
.custom
57+
.kilo
5758

5859
# Cella
5960
*storybook.log
@@ -75,3 +76,4 @@ terraform.tfstate*
7576
coverage/
7677
*.lcov
7778
.nyc_output/
79+

backend/package.json

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -35,65 +35,65 @@
3535
"s3:cors": "tsx scripts/set-bucket-cors.ts"
3636
},
3737
"dependencies": {
38-
"@ag-ui/core": "^0.0.56",
38+
"@ag-ui/core": "catalog:",
3939
"@asteasolutions/zod-to-openapi": "^8.5.0",
40-
"@aws-sdk/client-s3": "^3.1065.0",
41-
"@aws-sdk/s3-request-presigner": "^3.1065.0",
40+
"@aws-sdk/client-s3": "catalog:",
41+
"@aws-sdk/s3-request-presigner": "catalog:",
4242
"@blocknote/core": "^0.51.4",
4343
"@blocknote/server-util": "^0.51.4",
4444
"@dotenv-run/core": "^1.3.8",
4545
"@getbrevo/brevo": "^5.0.4",
46-
"@hono/node-server": "^2.0.4",
46+
"@hono/node-server": "catalog:",
4747
"@hono/otel": "^1.1.1",
4848
"@hono/zod-openapi": "^1.3.0",
4949
"@isaacs/ttlcache": "^2.1.5",
5050
"@napi-rs/canvas": "^1.0.0",
5151
"@opentelemetry/api": "^1.9.1",
52-
"@opentelemetry/auto-instrumentations-node": "^0.76.0",
53-
"@opentelemetry/exporter-logs-otlp-http": "^0.218.0",
54-
"@opentelemetry/exporter-metrics-otlp-http": "^0.218.0",
55-
"@opentelemetry/exporter-trace-otlp-http": "^0.218.0",
56-
"@opentelemetry/resources": "^2.7.1",
57-
"@opentelemetry/sdk-logs": "^0.218.0",
58-
"@opentelemetry/sdk-metrics": "^2.7.1",
59-
"@opentelemetry/sdk-node": "^0.218.0",
52+
"@opentelemetry/auto-instrumentations-node": "catalog:",
53+
"@opentelemetry/exporter-logs-otlp-http": "catalog:",
54+
"@opentelemetry/exporter-metrics-otlp-http": "catalog:",
55+
"@opentelemetry/exporter-trace-otlp-http": "catalog:",
56+
"@opentelemetry/resources": "catalog:",
57+
"@opentelemetry/sdk-logs": "catalog:",
58+
"@opentelemetry/sdk-metrics": "catalog:",
59+
"@opentelemetry/sdk-node": "catalog:",
6060
"@opentelemetry/semantic-conventions": "^1.40.0",
6161
"@oslojs/crypto": "^1.0.1",
6262
"@oslojs/encoding": "^1.1.0",
6363
"@oslojs/otp": "^1.1.0",
6464
"@oslojs/webauthn": "^1.0.0",
6565
"@t3-oss/env-core": "^0.13.11",
66-
"@tanstack/ai": "^0.28.0",
66+
"@tanstack/ai": "catalog:",
6767
"arctic": "^3.7.0",
6868
"drizzle-orm": "1.0.0-rc.3",
6969
"enforce-unique": "^1.3.0",
7070
"hono": "catalog:",
7171
"html-to-text": "^10.0.0",
7272
"i18next": "^26.3.1",
73-
"isbot": "^5.1.42",
73+
"isbot": "catalog:",
7474
"locales": "workspace:*",
7575
"lru-cache": "^11.5.1",
7676
"maxmind": "^5.0.0",
7777
"nanoid": "^5.1.11",
78-
"openai": "^6.42.0",
78+
"openai": "catalog:",
7979
"ora": "^9.4.0",
8080
"pg": "^8.21.0",
81-
"pg-boss": "^12.18.2",
81+
"pg-boss": "catalog:",
8282
"pg-logical-replication": "^2.5.0",
8383
"pino": "^10.3.1",
8484
"pino-http": "^11.0.0",
8585
"pino-opentelemetry-transport": "^3.0.0",
86-
"rate-limiter-flexible": "^11.2.0",
86+
"rate-limiter-flexible": "catalog:",
8787
"react": "^19.2.7",
8888
"react-dom": "^19.2.7",
8989
"react-i18next": "^17.0.7",
9090
"rehype": "^13.0.2",
9191
"rehype-stringify": "^10.0.1",
92-
"sanitize-html": "^2.17.3",
92+
"sanitize-html": "catalog:",
9393
"sdk": "workspace:*",
9494
"shared": "workspace:*",
9595
"slugify": "^1.6.9",
96-
"transloadit": "4.10.6",
96+
"transloadit": "catalog:",
9797
"ua-parser-js": "^2.0.10",
9898
"uuidv7": "^1.2.1",
9999
"ws": "^8.21.0",
@@ -108,7 +108,7 @@
108108
"@types/html-to-text": "^9.0.4",
109109
"@types/node": "catalog:",
110110
"@types/pg": "^8.20.0",
111-
"@types/react": "19.2.15",
111+
"@types/react": "catalog:",
112112
"@types/sanitize-html": "^2.16.1",
113113
"@types/ua-parser-js": "^0.7.39",
114114
"@types/ws": "^8.18.1",

backend/src/db/db.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ export type DbOrTx = DB | Tx;
2020
// The CA (Scaleway RDB instance cert) is provisioned automatically into the
2121
// DATABASE_SSL_CA runtime secret by `pulumi up`, so a missing value is a
2222
// misconfiguration we fail fast on rather than silently downgrading security.
23+
// The secret is base64-encoded (the PEM is multi-line and would break the
24+
// line-based `.env.runtime` delivery), so decode it back to PEM here.
2325
const sslConfig =
2426
env.NODE_ENV === 'production' && !env.NODB
2527
? (() => {
@@ -30,7 +32,7 @@ const sslConfig =
3032
"CLI → 'Apply infra change', or check the database-ssl-ca runtime secret.",
3133
);
3234
}
33-
return { ca: env.DATABASE_SSL_CA, rejectUnauthorized: true };
35+
return { ca: Buffer.from(env.DATABASE_SSL_CA, 'base64').toString('utf-8'), rejectUnauthorized: true };
3436
})()
3537
: undefined;
3638

bench/package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121
"devDependencies": {
2222
"@types/node": "catalog:",
2323
"@types/pg": "^8.20.0",
24-
"artillery": "^2.0.31",
25-
"artillery-plugin-ensure": "^1.25.0",
24+
"artillery": "catalog:",
25+
"artillery-plugin-ensure": "catalog:",
2626
"tsx": "catalog:",
2727
"typescript": "catalog:"
2828
}

0 commit comments

Comments
 (0)