@@ -407,6 +407,40 @@ jobs:
407407 SERVICES=$(echo "$BUILD_IMAGES_MATRIX" | jq -r '[.[].service] | join(",")')
408408 pnpm --filter infra wait-for-images --registry "$REGISTRY" --ns "$REGISTRY_NS" --tag "$TAG" --services "$SERVICES"
409409
- name: Set generation + release SHA (immutable-node)
  # Writes the per-service generation number and image SHA into the Pulumi
  # stack config before the `Pulumi up` step runs. Fails the job early when
  # the image tag is empty or a mutable `latest`.
  working-directory: infra
  env:
    AWS_ACCESS_KEY_ID: ${{ secrets.SCW_ACCESS_KEY }}
    AWS_SECRET_ACCESS_KEY: ${{ secrets.SCW_SECRET_KEY }}
    PULUMI_CONFIG_PASSPHRASE: ${{ secrets.PULUMI_CONFIG_PASSPHRASE }}
    STACK: ${{ needs.setup.outputs.pulumi_stack }}
    IMAGE_TAG: ${{ needs.setup.outputs.image_tag }}
    RUN_NUMBER: ${{ github.run_number }}
  run: |
    set -euo pipefail
    # Immutable-node model: each deploy provisions a NEW VM generation
    # `vm-<svc>-<gen>` with the image SHA baked into its cloud-init. The
    # generation number is the CI run number (no state to persist between
    # runs); the SHA is this commit. compute.ts reads these to name + bake
    # each generation; Pulumi's state deletes the previous generation.
    # NOTE(review): github.run_number does not change when the same run is
    # re-run (only run_attempt does), so a re-run attempt reuses the same
    # generation number — confirm that is the intended behavior.
    #
    # The SHA must be a pinned commit, never :latest.
    case "$IMAGE_TAG" in
      ''|latest|*:latest)
        echo "::error::refusing to deploy a non-pinned image tag '$IMAGE_TAG'"
        exit 1
        ;;
    esac
    # ai reuses the backend image at the same SHA; cdc/frontend always
    # deploy; yjs only when enabled. Setting config for a disabled service
    # is harmless (compute only reads enabled ones), so the list is fixed
    # and no per-service enable flag is consulted here.
    # Keys are underscore-flat under the `infra` namespace — a colon in the
    # key would collide with Pulumi's `<namespace>:<key>` syntax.
    for svc in backend cdc frontend yjs ai; do
      pulumi config set "infra:gen_${svc}" "$RUN_NUMBER" --stack "$STACK"
      pulumi config set "infra:sha_${svc}" "$IMAGE_TAG" --stack "$STACK"
    done
443+
410444 - name : Pulumi up
411445 working-directory : infra
412446 env :
@@ -456,6 +490,28 @@ jobs:
456490 # directly.
457491 pnpm --filter infra assert-vm-grants --application-name "$VM_READER_APP" --project-id "$SCW_DEFAULT_PROJECT_ID" --organization-id "$SCW_DEFAULT_ORGANIZATION_ID"
458492
- name: Verify runtime secrets are deliverable
  # Second belt-and-suspenders preflight for a prod-down class: a `required`
  # runtime secret that cannot be written into the line-based
  # /opt/app/.env.runtime (e.g. a raw multi-line PEM) fails the on-VM
  # runtime-secret-sync, which by design blocks the service from booting —
  # exactly how a multi-line DATABASE_SSL_CA took the backend down. `pulumi
  # up` writing the value isn't enough; this read-only check fetches each
  # required secret the way a VM will and asserts it is single-line and
  # present BEFORE any VM rolls, failing the deploy with the offending env
  # vars instead of after a fleet-wide outage. Scoped to the services this
  # fork actually deploys (verify_rollout_matrix) so a disabled service's
  # optional secret never trips it.
  working-directory: infra
  env:
    SCW_SECRET_KEY: ${{ secrets.SCW_SECRET_KEY }}
    REGION: ${{ needs.setup.outputs.region }}
    SCW_DEFAULT_PROJECT_ID: ${{ secrets.SCW_PROJECT_ID }}
    SERVICES_MATRIX: ${{ needs.setup.outputs.verify_rollout_matrix }}
  run: |
    # Strict mode, matching the other deploy steps: unless the workflow sets
    # a default shell elsewhere, a step without `shell:` runs as `bash -e`
    # only, so an unset variable or a failure on the left side of the pipe
    # below would otherwise go unnoticed.
    set -euo pipefail
    # Flatten the matrix JSON ([{"service": ...}, ...]) into the
    # comma-separated list the task takes via --services.
    SERVICES=$(echo "$SERVICES_MATRIX" | jq -r '[.[].service] | join(",")')
    pnpm --filter infra assert-secrets-deliverable --region "$REGION" --project-id "$SCW_DEFAULT_PROJECT_ID" --services "$SERVICES"
514+
459515 - name : Capture stack outputs
460516 id : outputs
461517 working-directory : infra
@@ -512,26 +568,17 @@ jobs:
512568 || echo "- **Domains:** no domain" >> $GITHUB_STEP_SUMMARY
513569
514570 # -------------------------------------------------------------------------
515- # Roll services — for each backend service (backend, cdc, yjs, ai):
516- # 1. PUT the new image SHA into s3://<deploy_tags_bucket>/deploy/<svc>.tag
517- # 2. Poll the public health endpoint until X-App-Version == SHA
571+ # Verify the rollout — `pulumi up` above already provisioned the new VM
572+ # generation per service (image SHA baked into cloud-init, the migrate
573+ # companion run at the backend generation's boot). These jobs only confirm
574+ # each public service serves the expected SHA (X-App-Version == SHA).
518575 #
519- # Matrix runs in parallel; `fail-fast: true` cancels siblings the moment one
520- # service refuses to roll, so a broken cdc/yjs deploy doesn't have to wait
521- # 5 minutes on a separately-stuck backend before the workflow turns red .
576+ # Backend first (it owns the expand migration), then the rest in parallel.
577+ # cdc has no public health endpoint; its replacement is confirmed indirectly
578+ # by the backend coming up healthy (the cdc worker reconnects to it).
522579 #
523- # Tag rollback: re-running the workflow on the previous commit re-PUTs the
524- # old SHA. The on-VM reconciler sees the change and rolls back the same way
525- # it rolled forward — no separate code path. CI doesn't try to "undo" on
526- # failure; that would race the reconciler's own rollback (which only the VM
527- # has the local context to do safely).
528- # -------------------------------------------------------------------------
529- # -------------------------------------------------------------------------
530- # Roll backend FIRST — backend is the schema owner. Its reconciler runs the
531- # expand (additive) migration before swapping the app container, so the
532- # backend deploy-tag write + health gate MUST go green before any other
533- # service rolls. This guarantees the new schema is present before ai/yjs/cdc/
534- # frontend pick up code that may depend on it. roll-rest gates on this job.
580+ # Rollback: re-run the workflow on the previous commit — it provisions a new
581+ # generation from that SHA the same way it rolled forward (no separate path).
535582 # -------------------------------------------------------------------------
536583 roll-backend :
537584 runs-on : ubuntu-latest
@@ -556,16 +603,10 @@ jobs:
556603 - service : backend
557604 health_url : ${{ needs.setup.outputs.backend_url }}
558605 env :
559- AWS_ACCESS_KEY_ID : ${{ secrets.SCW_ACCESS_KEY }}
560- AWS_SECRET_ACCESS_KEY : ${{ secrets.SCW_SECRET_KEY }}
561- TAG_BUCKET : ${{ needs.setup.outputs.deploy_tags_bucket }}
562606 REGION : ${{ needs.setup.outputs.region }}
563607 EXPECTED_SHA : ${{ needs.setup.outputs.image_tag }}
564608 SERVICE : ${{ matrix.service }}
565609 steps :
566- # Checkout + Node are needed for the wait-for-version task below. The
567- # deploy-tag write only needs the preinstalled aws CLI, but keeping setup
568- # at the top means both steps share one toolchain install.
569610 - name : Checkout
570611 uses : actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5
571612
@@ -581,52 +622,21 @@ jobs:
581622 - name : Install dependencies
582623 run : pnpm install --frozen-lockfile --filter infra...
583624
584- - name : Write deploy tag for ${{ matrix.service }}
585- run : |
586- ENDPOINT="https://s3.$REGION.scw.cloud"
587- # Immutability guard: the reconciler pulls whatever SHA we publish
588- # here, and Pulumi can no longer reject a mutable tag at plan time
589- # (image tags left cloud-init entirely). So the write path is now the
590- # sole gate — refuse to publish an empty value or a mutable :latest.
591- case "$EXPECTED_SHA" in
592- ''|latest|*:latest)
593- echo "::error::refusing to publish non-pinned tag '$EXPECTED_SHA' to deploy/$SERVICE.tag"
594- exit 1
595- ;;
596- esac
597- echo "Publishing $EXPECTED_SHA to s3://$TAG_BUCKET/deploy/$SERVICE.tag"
598- # Plain text body, no newline — reconciler reads it as-is.
599- printf '%s' "$EXPECTED_SHA" \
600- | aws --endpoint-url "$ENDPOINT" s3 cp - "s3://$TAG_BUCKET/deploy/$SERVICE.tag" \
601- --content-type 'text/plain'
602-
603625 - name : Wait for ${{ matrix.service }} to serve ${{ needs.setup.outputs.image_tag }}
604626 if : matrix.health_url != ''
605627 env :
606628 # URLs come from setup outputs (derived from shared/ appConfig).
607629 # appConfig URLs are canonical https:// origins; CI probes /health
608- # directly without any scheme transformation.
630+ # directly. `pulumi up` already provisioned the new generation (SHA
631+ # baked into cloud-init); this only confirms it is serving. Frontend
632+ # (Caddy) returns 200, backend/yjs/ai 204; both emit X-App-Version.
609633 BASE : ${{ matrix.health_url }}
610- # Status/header poll lives in infra/tasks/wait-for-version.ts (unit
611- # tested) instead of an inline awk loop. Frontend (Caddy) returns 200,
612- # backend/yjs/ai return 204; both emit X-App-Version. 100 × 3s = 5 min
613- # budget per service; the matrix runs in parallel. The status triple
614- # lets the poll fast-fail on a reconciler-reported failure (and surface
615- # its phase/reason) instead of blindly waiting out the whole budget.
616- run : pnpm --filter infra wait-for-version --url "$BASE/health" --sha "$EXPECTED_SHA" --status-bucket "$TAG_BUCKET" --service "$SERVICE" --region "$REGION"
617-
618- - name : Fetch ${{ matrix.service }} boot diagnostics on failure
619- if : failure()
620- env :
621- STATE_BUCKET : ${{ needs.setup.outputs.state_bucket }}
622- # Key selection (which stage markers / latest full log to show) lives in
623- # the unit-tested infra/tasks/fetch-boot-diag.ts; the aws calls are the
624- # only side effect. roll-backend already has Node installed above.
625- run : pnpm --filter infra fetch-boot-diag --bucket "$STATE_BUCKET" --service "$SERVICE" --region "$REGION"
634+ run : pnpm --filter infra wait-for-version --url "$BASE/health" --sha "$EXPECTED_SHA"
626635
627636 # -------------------------------------------------------------------------
628637 # Roll the remaining services — runs ONLY after roll-backend is green, so the
629- # expand migration has already been applied. These four roll in parallel.
638+ # expand migration (run at the backend generation's boot) is applied. These
639+ # roll in parallel.
630640 # -------------------------------------------------------------------------
631641 roll-rest :
632642 runs-on : ubuntu-latest
@@ -650,16 +660,10 @@ jobs:
650660 # the backend image at this SHA; the probe confirms the AI LB serves it.
651661 include : ${{ fromJSON(needs.setup.outputs.roll_rest_matrix) }}
652662 env :
653- AWS_ACCESS_KEY_ID : ${{ secrets.SCW_ACCESS_KEY }}
654- AWS_SECRET_ACCESS_KEY : ${{ secrets.SCW_SECRET_KEY }}
655- TAG_BUCKET : ${{ needs.setup.outputs.deploy_tags_bucket }}
656663 REGION : ${{ needs.setup.outputs.region }}
657664 EXPECTED_SHA : ${{ needs.setup.outputs.image_tag }}
658665 SERVICE : ${{ matrix.service }}
659666 steps :
660- # Checkout + Node are needed for the wait-for-version task below. The
661- # deploy-tag write only needs the preinstalled aws CLI, but keeping setup
662- # at the top means both steps share one toolchain install.
663667 - name : Checkout
664668 uses : actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5
665669
@@ -675,48 +679,11 @@ jobs:
675679 - name : Install dependencies
676680 run : pnpm install --frozen-lockfile --filter infra...
677681
678- - name : Write deploy tag for ${{ matrix.service }}
679- run : |
680- ENDPOINT="https://s3.$REGION.scw.cloud"
681- # Immutability guard: the reconciler pulls whatever SHA we publish
682- # here, and Pulumi can no longer reject a mutable tag at plan time
683- # (image tags left cloud-init entirely). So the write path is now the
684- # sole gate — refuse to publish an empty value or a mutable :latest.
685- case "$EXPECTED_SHA" in
686- ''|latest|*:latest)
687- echo "::error::refusing to publish non-pinned tag '$EXPECTED_SHA' to deploy/$SERVICE.tag"
688- exit 1
689- ;;
690- esac
691- echo "Publishing $EXPECTED_SHA to s3://$TAG_BUCKET/deploy/$SERVICE.tag"
692- # Plain text body, no newline — reconciler reads it as-is.
693- printf '%s' "$EXPECTED_SHA" \
694- | aws --endpoint-url "$ENDPOINT" s3 cp - "s3://$TAG_BUCKET/deploy/$SERVICE.tag" \
695- --content-type 'text/plain'
696-
697682 - name : Wait for ${{ matrix.service }} to serve ${{ needs.setup.outputs.image_tag }}
698683 if : matrix.health_url != ''
699684 env :
700- # URLs come from setup outputs (derived from shared/ appConfig).
701- # appConfig URLs are canonical https:// origins; CI probes /health
702- # directly without any scheme transformation.
703685 BASE : ${{ matrix.health_url }}
704- # Status/header poll lives in infra/tasks/wait-for-version.ts (unit
705- # tested) instead of an inline awk loop. Frontend (Caddy) returns 200,
706- # backend/yjs/ai return 204; both emit X-App-Version. 100 × 3s = 5 min
707- # budget per service; the matrix runs in parallel. The status triple
708- # lets the poll fast-fail on a reconciler-reported failure (and surface
709- # its phase/reason) instead of blindly waiting out the whole budget.
710- run : pnpm --filter infra wait-for-version --url "$BASE/health" --sha "$EXPECTED_SHA" --status-bucket "$TAG_BUCKET" --service "$SERVICE" --region "$REGION"
711-
712- - name : Fetch ${{ matrix.service }} boot diagnostics on failure
713- if : failure()
714- env :
715- STATE_BUCKET : ${{ needs.setup.outputs.state_bucket }}
716- # Key selection (which stage markers / latest full log to show) lives in
717- # the unit-tested infra/tasks/fetch-boot-diag.ts; the aws calls are the
718- # only side effect. roll-rest already has Node installed above.
719- run : pnpm --filter infra fetch-boot-diag --bucket "$STATE_BUCKET" --service "$SERVICE" --region "$REGION"
686+ run : pnpm --filter infra wait-for-version --url "$BASE/health" --sha "$EXPECTED_SHA"
720687
721688 # -------------------------------------------------------------------------
722689 # Verify rollout — backward-compatible alias that depends on the roll jobs.
@@ -742,10 +709,6 @@ jobs:
742709 matrix :
743710 include : ${{ fromJSON(needs.setup.outputs.verify_rollout_matrix) }}
744711 env :
745- AWS_ACCESS_KEY_ID : ${{ secrets.SCW_ACCESS_KEY }}
746- AWS_SECRET_ACCESS_KEY : ${{ secrets.SCW_SECRET_KEY }}
747- TAG_BUCKET : ${{ needs.setup.outputs.deploy_tags_bucket }}
748- REGION : ${{ needs.setup.outputs.region }}
749712 EXPECTED_SHA : ${{ needs.setup.outputs.image_tag }}
750713 SERVICE : ${{ matrix.service }}
751714 steps :
0 commit comments