From c4f1f96ac01b7437442be79ad1578a3084fb7a43 Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Sat, 20 Jun 2026 16:44:03 +0100 Subject: [PATCH 01/25] Environment selection --- .github/workflows/deploy.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 2477db934..82945a3d3 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -2,17 +2,24 @@ name: deploy on: workflow_dispatch: + inputs: + environment: + type: choice + options: [dev, staging, production] + required: true concurrency: group: deploy-${{ github.ref }} cancel-in-progress: false jobs: - ecr: + deploy: runs-on: ubuntu-latest + environment: ${{ inputs.environment }} permissions: id-token: write # This is required for requesting the JWT contents: read # This is required for actions/checkout + steps: - name: Checkout code uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd From 5b254bf0ce0acd12bd454ec445e40efa0ea06af4 Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Sat, 20 Jun 2026 17:11:12 +0100 Subject: [PATCH 02/25] Helm deployments (with hmpps charts) --- .github/workflows/deploy.yml | 80 +++--- .gitignore | 3 + helm_deploy/README.md | 101 ++++++++ helm_deploy/cats/.helmignore | 7 + helm_deploy/cats/Chart.lock | 12 + helm_deploy/cats/Chart.yaml | 32 +++ helm_deploy/cats/templates/_helpers.tpl | 43 ++++ helm_deploy/cats/templates/migrator-job.yaml | 38 +++ .../cats/templates/rabbitmq.yaml | 41 +++- helm_deploy/cats/templates/redis.yaml | 56 +++++ helm_deploy/cats/templates/seeder-job.yaml | 40 +++ helm_deploy/cats/values-dev.yaml | 19 ++ helm_deploy/cats/values-production.yaml | 34 +++ helm_deploy/cats/values-staging.yaml | 20 ++ helm_deploy/cats/values.yaml | 228 ++++++++++++++++++ infra/cats-deployment.yml | 113 --------- infra/cats-ingress.yml | 28 --- infra/cats-service.yml | 11 - infra/cats-worker-deployment.yml | 94 -------- infra/cats-worker-service.yml | 11 - 
infra/migrator-pod.yml | 41 ---- infra/rabbitmq-service.yml | 14 -- infra/redis-deployment.yml | 40 --- infra/redis-service.yml | 11 - infra/seeder-pod.yml | 43 ---- 25 files changed, 698 insertions(+), 462 deletions(-) create mode 100644 helm_deploy/README.md create mode 100644 helm_deploy/cats/.helmignore create mode 100644 helm_deploy/cats/Chart.lock create mode 100644 helm_deploy/cats/Chart.yaml create mode 100644 helm_deploy/cats/templates/_helpers.tpl create mode 100644 helm_deploy/cats/templates/migrator-job.yaml rename infra/rabbitmq-deployment.yml => helm_deploy/cats/templates/rabbitmq.yaml (50%) create mode 100644 helm_deploy/cats/templates/redis.yaml create mode 100644 helm_deploy/cats/templates/seeder-job.yaml create mode 100644 helm_deploy/cats/values-dev.yaml create mode 100644 helm_deploy/cats/values-production.yaml create mode 100644 helm_deploy/cats/values-staging.yaml create mode 100644 helm_deploy/cats/values.yaml delete mode 100644 infra/cats-deployment.yml delete mode 100644 infra/cats-ingress.yml delete mode 100644 infra/cats-service.yml delete mode 100644 infra/cats-worker-deployment.yml delete mode 100644 infra/cats-worker-service.yml delete mode 100644 infra/migrator-pod.yml delete mode 100644 infra/rabbitmq-service.yml delete mode 100644 infra/redis-deployment.yml delete mode 100644 infra/redis-service.yml delete mode 100644 infra/seeder-pod.yml diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 82945a3d3..58abc8d7e 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -19,7 +19,7 @@ jobs: permissions: id-token: write # This is required for requesting the JWT contents: read # This is required for actions/checkout - + steps: - name: Checkout code uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd @@ -46,7 +46,7 @@ jobs: with: role-to-assume: ${{ secrets.ECR_ROLE_TO_ASSUME }} aws-region: ${{ vars.ECR_REGION }} - + - name: Login to ECR uses: 
aws-actions/amazon-ecr-login@33f92af657bba1882ab79d8621debd2f6769a0c9 id: login-ecr @@ -70,7 +70,7 @@ jobs: /p:ContainerRegistry=${{ steps.login-ecr.outputs.registry }} \ /p:ContainerRepository=${{ vars.ECR_REPOSITORY }} \ /p:ContainerImageTag=worker-${{ github.sha }} - + - name: Build and Push DatabaseSeeding Container run: | dotnet publish src/DatabaseSeeding/DatabaseSeeding.csproj \ @@ -88,25 +88,11 @@ jobs: -t ${{ steps.login-ecr.outputs.registry }}/${{ vars.ECR_REPOSITORY }}:migrator-${{ github.sha }} \ . docker push ${{ steps.login-ecr.outputs.registry }}/${{ vars.ECR_REPOSITORY }}:migrator-${{ github.sha }} - + - name: Generate app version id: version run: echo "app_version=$(date +'%Y.%m.%d').${{ github.run_number }}" >> $GITHUB_OUTPUT - - name: Generate Kubernetes Manifests - run: | - mkdir -p deploy - for file in infra/*.yml; do - envsubst < "$file" > "deploy/$(basename "$file")" - done - env: - IMAGE_TAG: ${{ github.sha }} - APP_VERSION: ${{ steps.version.outputs.app_version }} - REGISTRY: ${{ steps.login-ecr.outputs.registry }} - REPOSITORY: ${{ vars.ECR_REPOSITORY }} - NAMESPACE: ${{ secrets.KUBE_NAMESPACE }} - DOTNET_ENVIRONMENT: "Development" - - name: Configure kubectl run: | echo "${{ secrets.KUBE_CERT }}" > ca.crt @@ -118,33 +104,37 @@ jobs: KUBE_NAMESPACE: ${{ secrets.KUBE_NAMESPACE }} KUBE_CLUSTER: ${{ secrets.KUBE_CLUSTER }} - - name: Run database migration - run: | - kubectl -n ${KUBE_NAMESPACE} delete pod -l app=migrator --wait=false || true - kubectl -n ${KUBE_NAMESPACE} apply -f deploy/migrator-pod.yml - if ! kubectl -n ${KUBE_NAMESPACE} wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=300s pod/migrator-${{ github.sha }}; then - echo "Migration pod did not succeed within timeout." 
- kubectl -n ${KUBE_NAMESPACE} describe pod/migrator-${{ github.sha }} || true - exit 1 - fi - env: - KUBE_NAMESPACE: ${{ secrets.KUBE_NAMESPACE }} - - - name: Run database seeding - run: | - kubectl -n ${KUBE_NAMESPACE} delete pod -l app=seeder --wait=false || true - kubectl -n ${KUBE_NAMESPACE} apply -f deploy/seeder-pod.yml - if ! kubectl -n ${KUBE_NAMESPACE} wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=300s pod/seeder-${{ github.sha }}; then - echo "Seeder pod did not succeed within timeout." - kubectl -n ${KUBE_NAMESPACE} describe pod/seeder-${{ github.sha }} || true - exit 1 - fi - env: - KUBE_NAMESPACE: ${{ secrets.KUBE_NAMESPACE }} - - - name: Deploy to Kubernetes + - name: Deploy CATS with Helm + # A single release deploys the web tier, worker, ephemeral RabbitMQ/Redis, and runs + # the migrator + seeder as pre-upgrade hook Jobs (Helm waits for them before rollout). run: | - rm -f deploy/migrator-pod.yml deploy/seeder-pod.yml - kubectl -n ${KUBE_NAMESPACE} apply -f deploy/ + set -euo pipefail + helm repo add hmpps-helm-charts https://ministryofjustice.github.io/hmpps-helm-charts + helm dependency build ./helm_deploy/cats + + IMAGE_REPOSITORY="${REGISTRY}/${REPOSITORY}" + + helm upgrade --install cats ./helm_deploy/cats \ + --namespace "${KUBE_NAMESPACE}" \ + --values ./helm_deploy/cats/values-${{ inputs.environment }}.yaml \ + --set serviceAccountName="${KUBE_NAMESPACE}" \ + --set app.serviceAccountName="${KUBE_NAMESPACE}" \ + --set app.image.repository="${IMAGE_REPOSITORY}" \ + --set app.image.tag="cats-${{ github.sha }}" \ + --set app.env.Sentry__Release="${APP_VERSION}" \ + --set app.env.AppConfigurationSettings__Version="${APP_VERSION}" \ + --set worker.serviceAccountName="${KUBE_NAMESPACE}" \ + --set worker.image.repository="${IMAGE_REPOSITORY}" \ + --set worker.image.tag="worker-${{ github.sha }}" \ + --set worker.env.Sentry__Release="${APP_VERSION}" \ + --set worker.env.AppConfigurationSettings__Version="${APP_VERSION}" \ + --set 
migrator.image.repository="${IMAGE_REPOSITORY}" \ + --set migrator.image.tag="migrator-${{ github.sha }}" \ + --set seeder.image.repository="${IMAGE_REPOSITORY}" \ + --set seeder.image.tag="seeder-${{ github.sha }}" \ + --wait --timeout 10m env: KUBE_NAMESPACE: ${{ secrets.KUBE_NAMESPACE }} + REGISTRY: ${{ steps.login-ecr.outputs.registry }} + REPOSITORY: ${{ vars.ECR_REPOSITORY }} + APP_VERSION: ${{ steps.version.outputs.app_version }} diff --git a/.gitignore b/.gitignore index 664db9536..1d3927c6b 100644 --- a/.gitignore +++ b/.gitignore @@ -481,3 +481,6 @@ aspire-output/ # ls cache files for C# develop extension *csproj.lscache + +# Helm chart dependencies (fetched via `helm dependency build`) +helm_deploy/*/charts/ diff --git a/helm_deploy/README.md b/helm_deploy/README.md new file mode 100644 index 000000000..24d3b42e2 --- /dev/null +++ b/helm_deploy/README.md @@ -0,0 +1,101 @@ +# CATS Helm deployment + +A **single Helm release** deploys the entire Case Assessment and Tracking System (CATS) +to the MoJ Cloud Platform. There is one templating mechanism at the deploy layer — Helm — +so there is no `envsubst` or `kubectl apply` of raw manifests in the pipeline. 
+ +## What the release contains + +| Resource | Source | +|-------------------------------|---------------------------------------------------| +| Web tier (Blazor Server UI) | `generic-service` dependency, alias `app` | +| Worker (Quartz jobs) | `generic-service` dependency, alias `worker` | +| RabbitMQ (ephemeral) | local template `templates/rabbitmq.yaml` | +| Redis (ephemeral backplane) | local template `templates/redis.yaml` | +| DB migrator (run-once) | `templates/migrator-job.yaml` — pre-upgrade hook | +| DB seeder (run-once) | `templates/seeder-job.yaml` — pre-upgrade hook | +| Prometheus alerts | `generic-prometheus-alerts` dependency | + +The migrator (hook-weight `-5`) and seeder (hook-weight `0`) run as **`pre-install`/ +`pre-upgrade` Helm hooks**: Helm runs them in weight order and waits for each to succeed +before rolling out the app — so a failed migration fails the deploy. The web tier reaches +the worker at `http://cats-worker:8080` and the in-cluster broker/cache at +`rabbitmq-service:5672` / `redis-service:6379`. + +> RabbitMQ and Redis are **ephemeral** (no persistence): RabbitMQ carries only the +> transient Rebus message flow and Redis is purely a SignalR backplane / Fusion cache. + +## Layout + +``` +helm_deploy/cats/ + Chart.yaml # app + worker (aliased generic-service) + alerts + values.yaml # shared defaults + values-dev.yaml # namespace: cfocats-dev + values-staging.yaml + values-production.yaml + templates/ + _helpers.tpl + rabbitmq.yaml + redis.yaml + migrator-job.yaml + seeder-job.yaml +``` + +`infra/` now contains only `port-forward-deployment.yml`, a manual developer convenience +for reaching the RDS instance (applied ad hoc, not part of the pipeline). + +## Per-deploy values (supplied by CI) + +The image registry/tags, app version, and service account are passed at deploy time. 
Note +the four images share one ECR repository but use different tag prefixes +(`cats-`, `worker-`, `migrator-`, `seeder-`): + +```bash +helm dependency build ./helm_deploy/cats + +helm upgrade --install cats ./helm_deploy/cats \ + --namespace "$KUBE_NAMESPACE" \ + --values ./helm_deploy/cats/values-$ENV.yaml \ + --set serviceAccountName="$KUBE_NAMESPACE" \ + --set app.serviceAccountName="$KUBE_NAMESPACE" \ + --set app.image.repository="$REGISTRY/$ECR_REPOSITORY" \ + --set app.image.tag="cats-$SHA" \ + --set app.env.Sentry__Release="$APP_VERSION" \ + --set app.env.AppConfigurationSettings__Version="$APP_VERSION" \ + --set worker.serviceAccountName="$KUBE_NAMESPACE" \ + --set worker.image.repository="$REGISTRY/$ECR_REPOSITORY" \ + --set worker.image.tag="worker-$SHA" \ + --set worker.env.Sentry__Release="$APP_VERSION" \ + --set worker.env.AppConfigurationSettings__Version="$APP_VERSION" \ + --set migrator.image.repository="$REGISTRY/$ECR_REPOSITORY" \ + --set migrator.image.tag="migrator-$SHA" \ + --set seeder.image.repository="$REGISTRY/$ECR_REPOSITORY" \ + --set seeder.image.tag="seeder-$SHA" \ + --wait --timeout 10m +``` + +## Local validation + +```bash +helm dependency build ./helm_deploy/cats +helm lint ./helm_deploy/cats -f ./helm_deploy/cats/values-dev.yaml +helm template cats ./helm_deploy/cats \ + --namespace cfocats-dev --values ./helm_deploy/cats/values-dev.yaml +``` + +## First-time migration from the previous (kubectl) deploy + +The previous pipeline created differently-named objects. 
On the **first** Helm deploy to a +namespace that already ran the old pipeline, delete the legacy resources once so they don't +collide (notably the old ingress vs the new `cats-v1-2` ingress on the same host): + +```bash +kubectl -n "$KUBE_NAMESPACE" delete deploy cats-deployment cats-worker-deployment \ + rabbitmq-deployment redis-deployment +kubectl -n "$KUBE_NAMESPACE" delete svc cats-service cats-worker-service \ + rabbitmq-service redis-service +kubectl -n "$KUBE_NAMESPACE" delete ingress cats-ingress +``` + +Helm then owns `cats`, `cats-worker`, `rabbitmq-*`, `redis-*`, `cats-v1-2` and the hook Jobs. diff --git a/helm_deploy/cats/.helmignore b/helm_deploy/cats/.helmignore new file mode 100644 index 000000000..c33b73443 --- /dev/null +++ b/helm_deploy/cats/.helmignore @@ -0,0 +1,7 @@ +.git/ +.gitignore +*.tmproj +*.bak +*.orig +.vscode/ +.idea/ diff --git a/helm_deploy/cats/Chart.lock b/helm_deploy/cats/Chart.lock new file mode 100644 index 000000000..afa701e6e --- /dev/null +++ b/helm_deploy/cats/Chart.lock @@ -0,0 +1,12 @@ +dependencies: +- name: generic-service + repository: https://ministryofjustice.github.io/hmpps-helm-charts + version: 3.17.2 +- name: generic-service + repository: https://ministryofjustice.github.io/hmpps-helm-charts + version: 3.17.2 +- name: generic-prometheus-alerts + repository: https://ministryofjustice.github.io/hmpps-helm-charts + version: 1.17.1 +digest: sha256:47030f5b5200f30b142f40b35f6f26120db6ba6113a3c5c15ec6a8ea3911e2c4 +generated: "2026-06-20T16:28:55.187412+01:00" diff --git a/helm_deploy/cats/Chart.yaml b/helm_deploy/cats/Chart.yaml new file mode 100644 index 000000000..5b1324432 --- /dev/null +++ b/helm_deploy/cats/Chart.yaml @@ -0,0 +1,32 @@ +apiVersion: v2 +name: cats +description: | + Case Assessment and Tracking System (CATS) — HMPPS Creating Future Opportunities (CFO).
+ A single release deploying the Blazor Server web tier and the Quartz worker (both via the + HMPPS generic-service chart), the ephemeral in-cluster RabbitMQ and Redis dependencies, + and the database migrator/seeder as pre-upgrade Helm hook Jobs. +type: application + +# Version of this chart. Bump on every change to the chart/values. +version: "0.1.0" + +# Mirrors the application version; the running image is selected via image tags at deploy time. +appVersion: "0.1.0" + +dependencies: + # Web tier (Blazor Server UI) — ingress, SignalR sticky sessions, multiple replicas. + - name: generic-service + alias: app + version: "3.17.2" + repository: https://ministryofjustice.github.io/hmpps-helm-charts + + # Background worker (Quartz jobs) — single instance, no ingress. + - name: generic-service + alias: worker + version: "3.17.2" + repository: https://ministryofjustice.github.io/hmpps-helm-charts + + # Standard HMPPS Prometheus alert rules. + - name: generic-prometheus-alerts + version: "1.17.1" + repository: https://ministryofjustice.github.io/hmpps-helm-charts diff --git a/helm_deploy/cats/templates/_helpers.tpl b/helm_deploy/cats/templates/_helpers.tpl new file mode 100644 index 000000000..33ff54d35 --- /dev/null +++ b/helm_deploy/cats/templates/_helpers.tpl @@ -0,0 +1,43 @@ +{{/* +Service account name. Cloud Platform creates a service account named after the +namespace (with IRSA role bindings for S3/RDS), so default to the release namespace. +*/}} +{{- define "cats.serviceAccountName" -}} +{{- .Values.serviceAccountName | default .Release.Namespace -}} +{{- end -}} + +{{/* +Pod-level security context shared by the local (non generic-service) workloads. +*/}} +{{- define "cats.podSecurityContext" -}} +seccompProfile: + type: RuntimeDefault +runAsUser: 1001 +runAsGroup: 1001 +runAsNonRoot: true +{{- end -}} + +{{/* +Environment variables that expose the MSSQL connection details from the +rds-mssql-instance-output namespace secret, plus the composed connection string. 
+Used by the migrator and seeder Jobs. +*/}} +{{- define "cats.databaseEnv" -}} +- name: DATABASE_ADDRESS + valueFrom: + secretKeyRef: + name: rds-mssql-instance-output + key: rds_instance_address +- name: DATABASE_USERNAME + valueFrom: + secretKeyRef: + name: rds-mssql-instance-output + key: database_username +- name: DATABASE_PASSWORD + valueFrom: + secretKeyRef: + name: rds-mssql-instance-output + key: database_password +- name: ConnectionStrings__CatsDb + value: "Server=$(DATABASE_ADDRESS);Database=CatsDb;User Id=$(DATABASE_USERNAME);Password=$(DATABASE_PASSWORD);TrustServerCertificate=True;" +{{- end -}} diff --git a/helm_deploy/cats/templates/migrator-job.yaml b/helm_deploy/cats/templates/migrator-job.yaml new file mode 100644 index 000000000..1120a7142 --- /dev/null +++ b/helm_deploy/cats/templates/migrator-job.yaml @@ -0,0 +1,38 @@ +{{- if .Values.migrator.enabled }} +apiVersion: batch/v1 +kind: Job +metadata: + name: cats-migrator + labels: + app: migrator + annotations: + # Run before the application is upgraded; deploy fails if the migration fails. + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + backoffLimit: {{ .Values.migrator.backoffLimit }} + template: + metadata: + labels: + app: migrator + spec: + serviceAccountName: {{ include "cats.serviceAccountName" . }} + restartPolicy: Never + securityContext: + {{- include "cats.podSecurityContext" . | nindent 8 }} + containers: + - name: migrator + image: "{{ .Values.migrator.image.repository }}:{{ .Values.migrator.image.tag }}" + securityContext: + allowPrivilegeEscalation: false + privileged: false + capabilities: + drop: ["ALL"] + env: + {{- include "cats.databaseEnv" . | nindent 12 }} + {{- with .Values.migrator.resources }} + resources: + {{- toYaml . 
| nindent 12 }} + {{- end }} +{{- end }} diff --git a/infra/rabbitmq-deployment.yml b/helm_deploy/cats/templates/rabbitmq.yaml similarity index 50% rename from infra/rabbitmq-deployment.yml rename to helm_deploy/cats/templates/rabbitmq.yaml index 4ea6ba7d3..4e9f9ec93 100644 --- a/infra/rabbitmq-deployment.yml +++ b/helm_deploy/cats/templates/rabbitmq.yaml @@ -1,3 +1,4 @@ +{{- if .Values.rabbitmq.enabled }} apiVersion: apps/v1 kind: Deployment metadata: @@ -8,23 +9,19 @@ spec: replicas: 1 selector: matchLabels: - app: rabbitmq # this should match the selector in service.yml + app: rabbitmq template: metadata: labels: - app: rabbitmq # this should match the selector in service.yml + app: rabbitmq spec: securityContext: - seccompProfile: - type: RuntimeDefault - runAsUser: 1001 - runAsGroup: 1001 - runAsNonRoot: true - serviceAccountName: ${NAMESPACE} + {{- include "cats.podSecurityContext" . | nindent 8 }} + serviceAccountName: {{ include "cats.serviceAccountName" . }} containers: - name: rabbitmq - image: rabbitmq:4.3-management-alpine@sha256:1a43764bdcf116542e7c8c794adc67c79461727da16d474e9e21483fe7f716d3 - imagePullPolicy: Always + image: {{ .Values.rabbitmq.image | quote }} + imagePullPolicy: Always ports: - containerPort: 5672 - containerPort: 15672 @@ -41,4 +38,26 @@ spec: valueFrom: secretKeyRef: name: config - key: RABBIT_PASS \ No newline at end of file + key: RABBIT_PASS + {{- with .Values.rabbitmq.resources }} + resources: + {{- toYaml . 
| nindent 12 }} + {{- end }} +--- +apiVersion: v1 +kind: Service +metadata: + name: rabbitmq-service + labels: + app: rabbitmq +spec: + selector: + app: rabbitmq + ports: + - name: amqp + port: 5672 + targetPort: 5672 + - name: management + port: 15672 + targetPort: 15672 +{{- end }} diff --git a/helm_deploy/cats/templates/redis.yaml b/helm_deploy/cats/templates/redis.yaml new file mode 100644 index 000000000..27da27a53 --- /dev/null +++ b/helm_deploy/cats/templates/redis.yaml @@ -0,0 +1,56 @@ +{{- if .Values.redis.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: redis-deployment + labels: + app: redis +spec: + replicas: 1 + selector: + matchLabels: + app: redis + template: + metadata: + labels: + app: redis + spec: + securityContext: + {{- include "cats.podSecurityContext" . | nindent 8 }} + serviceAccountName: {{ include "cats.serviceAccountName" . }} + containers: + - name: redis + image: {{ .Values.redis.image | quote }} + imagePullPolicy: Always + # Used only as a SignalR backplane and Fusion cache, so all data is ephemeral + # and rebuildable. Disable RDB/AOF persistence to avoid the MISCONF + # "stop-writes-on-bgsave-error" failure when /data is not writable. + args: + - --save + - "" + - --appendonly + - "no" + ports: + - containerPort: 6379 + securityContext: + allowPrivilegeEscalation: false + privileged: false + {{- with .Values.redis.resources }} + resources: + {{- toYaml . 
| nindent 12 }} + {{- end }} +--- +apiVersion: v1 +kind: Service +metadata: + name: redis-service + labels: + app: redis +spec: + selector: + app: redis + ports: + - name: redis + port: 6379 + targetPort: 6379 +{{- end }} diff --git a/helm_deploy/cats/templates/seeder-job.yaml b/helm_deploy/cats/templates/seeder-job.yaml new file mode 100644 index 000000000..03929dcec --- /dev/null +++ b/helm_deploy/cats/templates/seeder-job.yaml @@ -0,0 +1,40 @@ +{{- if .Values.seeder.enabled }} +apiVersion: batch/v1 +kind: Job +metadata: + name: cats-seeder + labels: + app: seeder + annotations: + # Runs after the migrator (higher hook-weight) and before the application upgrade. + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "0" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + backoffLimit: {{ .Values.seeder.backoffLimit }} + template: + metadata: + labels: + app: seeder + spec: + serviceAccountName: {{ include "cats.serviceAccountName" . }} + restartPolicy: Never + securityContext: + {{- include "cats.podSecurityContext" . | nindent 8 }} + containers: + - name: seeder + image: "{{ .Values.seeder.image.repository }}:{{ .Values.seeder.image.tag }}" + securityContext: + allowPrivilegeEscalation: false + privileged: false + capabilities: + drop: ["ALL"] + env: + {{- include "cats.databaseEnv" . | nindent 12 }} + - name: DOTNET_ENVIRONMENT + value: {{ .Values.dotnetEnvironment | quote }} + {{- with .Values.seeder.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} +{{- end }} diff --git a/helm_deploy/cats/values-dev.yaml b/helm_deploy/cats/values-dev.yaml new file mode 100644 index 000000000..c59116040 --- /dev/null +++ b/helm_deploy/cats/values-dev.yaml @@ -0,0 +1,19 @@ +# Development overrides. 
Namespace: cfocats-dev +dotnetEnvironment: Development + +app: + replicaCount: 3 + ingress: + host: cfocats-dev.live.cloud-platform.service.justice.gov.uk + env: + DOTNET_ENVIRONMENT: "Development" + Sentry__Environment: "Development-CloudPlatform" + +worker: + replicaCount: 1 + env: + DOTNET_ENVIRONMENT: "Development" + Sentry__Environment: "Development-CloudPlatform" + +generic-prometheus-alerts: + alertSeverity: cfo-alerts-nonprod diff --git a/helm_deploy/cats/values-production.yaml b/helm_deploy/cats/values-production.yaml new file mode 100644 index 000000000..9feeaeeb6 --- /dev/null +++ b/helm_deploy/cats/values-production.yaml @@ -0,0 +1,34 @@ +# Production overrides. +# NOTE: confirm the production namespace / hostname before first deploy. +dotnetEnvironment: Production + +app: + replicaCount: 4 + ingress: + host: cfocats-production.live.cloud-platform.service.justice.gov.uk + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: "2" + memory: 2Gi + env: + DOTNET_ENVIRONMENT: "Production" + Sentry__Environment: "Production-CloudPlatform" + +worker: + replicaCount: 1 + resources: + requests: + cpu: 250m + memory: 512Mi + limits: + cpu: "1" + memory: 1Gi + env: + DOTNET_ENVIRONMENT: "Production" + Sentry__Environment: "Production-CloudPlatform" + +generic-prometheus-alerts: + alertSeverity: cfo-alerts diff --git a/helm_deploy/cats/values-staging.yaml b/helm_deploy/cats/values-staging.yaml new file mode 100644 index 000000000..0dbadb8d0 --- /dev/null +++ b/helm_deploy/cats/values-staging.yaml @@ -0,0 +1,20 @@ +# Staging overrides. +# NOTE: confirm the staging namespace / hostname before first deploy. 
+dotnetEnvironment: Staging + +app: + replicaCount: 3 + ingress: + host: cfocats-staging.live.cloud-platform.service.justice.gov.uk + env: + DOTNET_ENVIRONMENT: "Staging" + Sentry__Environment: "Staging-CloudPlatform" + +worker: + replicaCount: 1 + env: + DOTNET_ENVIRONMENT: "Staging" + Sentry__Environment: "Staging-CloudPlatform" + +generic-prometheus-alerts: + alertSeverity: cfo-alerts-nonprod diff --git a/helm_deploy/cats/values.yaml b/helm_deploy/cats/values.yaml new file mode 100644 index 000000000..7982d63a6 --- /dev/null +++ b/helm_deploy/cats/values.yaml @@ -0,0 +1,228 @@ +# Default values for the CATS umbrella chart (single release). +# Environment-specific overrides live in values-{dev,staging,production}.yaml. +# +# generic-service values are nested under the dependency aliases `app:` (web) and +# `worker:`. Local workloads (rabbitmq, redis, migrator, seeder) and alerts use the +# top-level keys below. +# +# Values that change per deploy (image registry/tags, app version, service account) +# are supplied by CI via --set. See helm_deploy/README.md. + +# Service account for the local workloads (rabbitmq/redis/jobs). Defaults to the +# release namespace, which is the Cloud Platform IRSA-enabled service account. +serviceAccountName: "" + +# .NET environment for the seeder Job. The app/worker set their own DOTNET_ENVIRONMENT +# via app.env / worker.env. Overridden per environment. +dotnetEnvironment: Development + +# --------------------------------------------------------------------------- +# Web tier (Blazor Server UI) — generic-service +# --------------------------------------------------------------------------- +app: + nameOverride: cats + fullnameOverride: cats + # serviceAccountName is supplied by CI via --set (the namespace's IRSA account).
+ + replicaCount: 3 + + image: + repository: example.dkr.ecr.eu-west-2.amazonaws.com/cfocats + tag: latest + pullPolicy: IfNotPresent + port: 8080 + + service: + enabled: true + type: ClusterIP + port: 8080 + + ingress: + enabled: true + path: / + healthPath: /health + tlsSecretName: "" + annotations: + nginx.ingress.kubernetes.io/affinity: "cookie" + nginx.ingress.kubernetes.io/session-cookie-name: "http-cookie" + nginx.ingress.kubernetes.io/session-cookie-expires: "172800" + nginx.ingress.kubernetes.io/session-cookie-max-age: "172800" + + startupProbe: + httpGet: + path: /alive + port: http + failureThreshold: 30 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: http + periodSeconds: 10 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /alive + port: http + periodSeconds: 20 + failureThreshold: 3 + + resources: + requests: + cpu: 250m + memory: 512Mi + limits: + cpu: "1" + memory: 1Gi + + poddisruptionbudget: + enabled: true + minAvailable: 1 + + namespace_secrets: + rds-mssql-instance-output: + DATABASE_ADDRESS: "rds_instance_address" + DATABASE_USERNAME: "database_username" + DATABASE_PASSWORD: "database_password" + s3-bucket-output: + AWS__Bucket: "bucket_name" + config: + RABBIT_USER: "RABBIT_USER" + RABBIT_PASS: "RABBIT_PASS" + Sentry__Dsn: "SentryDsn" + + env: + ConnectionStrings__CatsDb: "Server=$(DATABASE_ADDRESS);Database=CatsDb;User Id=$(DATABASE_USERNAME);Password=$(DATABASE_PASSWORD);TrustServerCertificate=True;" + ConnectionStrings__rabbit: "amqp://$(RABBIT_USER):$(RABBIT_PASS)@rabbitmq-service:5672" + ConnectionStrings__redis: "redis-service:6379" + AWS__RootFolder: "Files" + Features__UseWorkerForJobs: "true" + Features__PresenceHub__Enabled: "true" + Features__PresenceHub__RelayUserPresenceNotifications: "true" + Features__UseSignalRBackplane: "true" + WorkerOptions__BaseUrl: "http://cats-worker:8080" + # Overridden by CI at deploy time. 
+ Sentry__Release: "0.0.0" + AppConfigurationSettings__Version: "0.0.0" + # DOTNET_ENVIRONMENT and Sentry__Environment are set per environment. + +# --------------------------------------------------------------------------- +# Background worker (Quartz jobs) — generic-service +# --------------------------------------------------------------------------- +worker: + nameOverride: cats-worker + fullnameOverride: cats-worker + + # MUST remain a single instance: Quartz jobs must not run concurrently across pods. + replicaCount: 1 + + autoscaling: + enabled: false + + # Recreate (not RollingUpdate) so a new worker pod never overlaps the old one. + strategy: + type: Recreate + + image: + repository: example.dkr.ecr.eu-west-2.amazonaws.com/cfocats + tag: latest + pullPolicy: IfNotPresent + port: 8080 + + service: + enabled: true + type: ClusterIP + port: 8080 + + ingress: + enabled: false + + poddisruptionbudget: + enabled: false + + startupProbe: + httpGet: + path: /alive + port: http + failureThreshold: 30 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: http + periodSeconds: 10 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /alive + port: http + periodSeconds: 20 + failureThreshold: 3 + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + + namespace_secrets: + rds-mssql-instance-output: + DATABASE_ADDRESS: "rds_instance_address" + DATABASE_USERNAME: "database_username" + DATABASE_PASSWORD: "database_password" + config: + RABBIT_USER: "RABBIT_USER" + RABBIT_PASS: "RABBIT_PASS" + Sentry__Dsn: "SentryDsn" + + env: + ConnectionStrings__CatsDb: "Server=$(DATABASE_ADDRESS);Database=CatsDb;User Id=$(DATABASE_USERNAME);Password=$(DATABASE_PASSWORD);TrustServerCertificate=True;" + ConnectionStrings__rabbit: "amqp://$(RABBIT_USER):$(RABBIT_PASS)@rabbitmq-service:5672" + Sentry__Release: "0.0.0" + AppConfigurationSettings__Version: "0.0.0" + +# 
--------------------------------------------------------------------------- +# Ephemeral in-cluster dependencies (no persistence) +# --------------------------------------------------------------------------- +rabbitmq: + enabled: true + # rabbitmq:4.3-management-alpine + image: "rabbitmq:4.3-management-alpine@sha256:1a43764bdcf116542e7c8c794adc67c79461727da16d474e9e21483fe7f716d3" + resources: {} + +redis: + enabled: true + # redis:7.4-alpine + image: "redis:7.4-alpine@sha256:b1addbe72465a718643cff9e60a58e6df1841e29d6d7d60c9a85d8d72f08d1a7" + resources: {} + +# --------------------------------------------------------------------------- +# Database lifecycle Jobs (run as pre-upgrade Helm hooks) +# --------------------------------------------------------------------------- +migrator: + enabled: true + image: + repository: example.dkr.ecr.eu-west-2.amazonaws.com/cfocats + # migrator-$SHA (git commit SHA), supplied by CI via --set. + tag: latest + backoffLimit: 3 + resources: {} + +seeder: + enabled: true + image: + repository: example.dkr.ecr.eu-west-2.amazonaws.com/cfocats + # seeder-$SHA (git commit SHA), supplied by CI via --set.
+ tag: latest + backoffLimit: 3 + resources: {} + +# --------------------------------------------------------------------------- +# Prometheus alerts +# --------------------------------------------------------------------------- +generic-prometheus-alerts: + targetApplication: cats + businessUnit: hmpps + alertSeverity: cfo-alerts diff --git a/infra/cats-deployment.yml b/infra/cats-deployment.yml deleted file mode 100644 index 2870dc6dc..000000000 --- a/infra/cats-deployment.yml +++ /dev/null @@ -1,113 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: cats-deployment - labels: - app: cats -spec: - replicas: 3 - selector: - matchLabels: - app: cats # this should match the selector in service.yml - template: - metadata: - labels: - app: cats # this should match the selector in service.yml - spec: - securityContext: - seccompProfile: - type: RuntimeDefault - runAsUser: 1001 - runAsGroup: 1001 - runAsNonRoot: true - serviceAccountName: ${NAMESPACE} - containers: - - name: cats - image: ${REGISTRY}/${REPOSITORY}:cats-${IMAGE_TAG} - ports: - - containerPort: 8080 - startupProbe: - httpGet: - path: /alive - port: 8080 - failureThreshold: 30 - periodSeconds: 5 - readinessProbe: - httpGet: - path: /health - port: 8080 - periodSeconds: 10 - failureThreshold: 3 - livenessProbe: - httpGet: - path: /alive - port: 8080 - periodSeconds: 20 - failureThreshold: 3 - securityContext: - allowPrivilegeEscalation: false - privileged: false - capabilities: - drop: ["ALL"] - env: - - name: DATABASE_ADDRESS - valueFrom: - secretKeyRef: - name: rds-mssql-instance-output - key: rds_instance_address - - name: DATABASE_USERNAME - valueFrom: - secretKeyRef: - name: rds-mssql-instance-output - key: database_username - - name: DATABASE_PASSWORD - valueFrom: - secretKeyRef: - name: rds-mssql-instance-output - key: database_password - - name: AWS__Bucket - valueFrom: - secretKeyRef: - name: s3-bucket-output - key: bucket_name - - name: RABBIT_USER - valueFrom: - secretKeyRef: - 
name: config - key: RABBIT_USER - - name: RABBIT_PASS - valueFrom: - secretKeyRef: - name: config - key: RABBIT_PASS - - name: DOTNET_ENVIRONMENT - value: "${DOTNET_ENVIRONMENT}" - - name: ConnectionStrings__CatsDb - value: "Server=$(DATABASE_ADDRESS);Database=CatsDb;User Id=$(DATABASE_USERNAME);Password=$(DATABASE_PASSWORD);TrustServerCertificate=True;" - - name: ConnectionStrings__rabbit - value: "amqp://$(RABBIT_USER):$(RABBIT_PASS)@rabbitmq-service:5672" - - name: ConnectionStrings__redis - value: "redis-service:6379" - - name: AWS__RootFolder - value: "Files" - - name: Sentry__Dsn - valueFrom: - secretKeyRef: - name: config - key: SentryDsn - - name: Sentry__Environment - value: "${DOTNET_ENVIRONMENT}-CloudPlatform" - - name: Sentry__Release - value: "${APP_VERSION}" - - name: AppConfigurationSettings__Version - value: "${APP_VERSION}" - - name: Features__UseWorkerForJobs - value: "true" - - name: Features__PresenceHub__Enabled - value: "true" - - name: Features__PresenceHub__RelayUserPresenceNotifications - value: "true" - - name: Features__UseSignalRBackplane - value: "true" - - name: WorkerOptions__BaseUrl - value: "http://cats-worker-service:8080" diff --git a/infra/cats-ingress.yml b/infra/cats-ingress.yml deleted file mode 100644 index 9ffde4ef4..000000000 --- a/infra/cats-ingress.yml +++ /dev/null @@ -1,28 +0,0 @@ -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: cats-ingress - annotations: - external-dns.alpha.kubernetes.io/set-identifier: cats-ingress-${NAMESPACE}-green - external-dns.alpha.kubernetes.io/aws-weight: "100" - # Enable stickiness - nginx.ingress.kubernetes.io/affinity: "cookie" - nginx.ingress.kubernetes.io/session-cookie-name: "http-cookie" - nginx.ingress.kubernetes.io/session-cookie-expires: "172800" - nginx.ingress.kubernetes.io/session-cookie-max-age: "172800" -spec: - ingressClassName: default # modsec - tls: - - hosts: - - ${NAMESPACE}.live.cloud-platform.service.justice.gov.uk - rules: - - host: 
${NAMESPACE}.live.cloud-platform.service.justice.gov.uk - http: - paths: - - path: / - pathType: ImplementationSpecific - backend: - service: - name: cats-service # this should match the metadata.name in service.yml - port: - number: 8080 diff --git a/infra/cats-service.yml b/infra/cats-service.yml deleted file mode 100644 index d4a74dc2a..000000000 --- a/infra/cats-service.yml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: cats-service -spec: - selector: - app: cats # this should match the pod label in deployment.yml - ports: - - name: http - port: 8080 - targetPort: 8080 diff --git a/infra/cats-worker-deployment.yml b/infra/cats-worker-deployment.yml deleted file mode 100644 index 1baed419e..000000000 --- a/infra/cats-worker-deployment.yml +++ /dev/null @@ -1,94 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: cats-worker-deployment - labels: - app: cats-worker -spec: - replicas: 1 # Quartz jobs must not run concurrently across multiple pods - selector: - matchLabels: - app: cats-worker - template: - metadata: - labels: - app: cats-worker - spec: - securityContext: - seccompProfile: - type: RuntimeDefault - runAsUser: 1001 - runAsGroup: 1001 - runAsNonRoot: true - serviceAccountName: ${NAMESPACE} - containers: - - name: cats-worker - image: ${REGISTRY}/${REPOSITORY}:worker-${IMAGE_TAG} - ports: - - containerPort: 8080 - startupProbe: - httpGet: - path: /alive - port: 8080 - failureThreshold: 30 - periodSeconds: 5 - readinessProbe: - httpGet: - path: /health - port: 8080 - periodSeconds: 10 - failureThreshold: 3 - livenessProbe: - httpGet: - path: /alive - port: 8080 - periodSeconds: 20 - failureThreshold: 3 - securityContext: - allowPrivilegeEscalation: false - privileged: false - capabilities: - drop: ["ALL"] - env: - - name: DATABASE_ADDRESS - valueFrom: - secretKeyRef: - name: rds-mssql-instance-output - key: rds_instance_address - - name: DATABASE_USERNAME - valueFrom: - secretKeyRef: - name: 
rds-mssql-instance-output - key: database_username - - name: DATABASE_PASSWORD - valueFrom: - secretKeyRef: - name: rds-mssql-instance-output - key: database_password - - name: RABBIT_USER - valueFrom: - secretKeyRef: - name: config - key: RABBIT_USER - - name: RABBIT_PASS - valueFrom: - secretKeyRef: - name: config - key: RABBIT_PASS - - name: DOTNET_ENVIRONMENT - value: "${DOTNET_ENVIRONMENT}" - - name: ConnectionStrings__CatsDb - value: "Server=$(DATABASE_ADDRESS);Database=CatsDb;User Id=$(DATABASE_USERNAME);Password=$(DATABASE_PASSWORD);TrustServerCertificate=True;" - - name: ConnectionStrings__rabbit - value: "amqp://$(RABBIT_USER):$(RABBIT_PASS)@rabbitmq-service:5672" - - name: Sentry__Dsn - valueFrom: - secretKeyRef: - name: config - key: SentryDsn - - name: Sentry__Environment - value: "${DOTNET_ENVIRONMENT}-CloudPlatform" - - name: Sentry__Release - value: "${APP_VERSION}" - - name: AppConfigurationSettings__Version - value: "${APP_VERSION}" diff --git a/infra/cats-worker-service.yml b/infra/cats-worker-service.yml deleted file mode 100644 index 6b52dc57e..000000000 --- a/infra/cats-worker-service.yml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: cats-worker-service -spec: - selector: - app: cats-worker - ports: - - name: http - port: 8080 - targetPort: 8080 diff --git a/infra/migrator-pod.yml b/infra/migrator-pod.yml deleted file mode 100644 index ae3fccfa0..000000000 --- a/infra/migrator-pod.yml +++ /dev/null @@ -1,41 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: migrator-${IMAGE_TAG} - labels: - app: migrator -spec: - serviceAccountName: ${NAMESPACE} - restartPolicy: OnFailure - securityContext: - seccompProfile: - type: RuntimeDefault - runAsUser: 1001 - runAsGroup: 1001 - runAsNonRoot: true - containers: - - name: migrator - image: ${REGISTRY}/${REPOSITORY}:migrator-${IMAGE_TAG} - securityContext: - allowPrivilegeEscalation: false - privileged: false - capabilities: - drop: ["ALL"] - env: - - name: 
DATABASE_ADDRESS - valueFrom: - secretKeyRef: - name: rds-mssql-instance-output - key: rds_instance_address - - name: DATABASE_USERNAME - valueFrom: - secretKeyRef: - name: rds-mssql-instance-output - key: database_username - - name: DATABASE_PASSWORD - valueFrom: - secretKeyRef: - name: rds-mssql-instance-output - key: database_password - - name: ConnectionStrings__CatsDb - value: "Server=$(DATABASE_ADDRESS);Database=CatsDb;User Id=$(DATABASE_USERNAME);Password=$(DATABASE_PASSWORD);TrustServerCertificate=True;" diff --git a/infra/rabbitmq-service.yml b/infra/rabbitmq-service.yml deleted file mode 100644 index dab2d7963..000000000 --- a/infra/rabbitmq-service.yml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: rabbitmq-service -spec: - selector: - app: rabbitmq # this should match the pod label in deployment.yml - ports: - - name: amqp - port: 5672 - targetPort: 5672 - - name: management - port: 15672 - targetPort: 15672 \ No newline at end of file diff --git a/infra/redis-deployment.yml b/infra/redis-deployment.yml deleted file mode 100644 index 6acf8e3b3..000000000 --- a/infra/redis-deployment.yml +++ /dev/null @@ -1,40 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: redis-deployment - labels: - app: redis -spec: - replicas: 1 - selector: - matchLabels: - app: redis # this should match the selector in service.yml - template: - metadata: - labels: - app: redis # this should match the selector in service.yml - spec: - securityContext: - seccompProfile: - type: RuntimeDefault - runAsUser: 1001 - runAsGroup: 1001 - runAsNonRoot: true - serviceAccountName: ${NAMESPACE} - containers: - - name: redis - image: redis:7.4-alpine@sha256:b1addbe72465a718643cff9e60a58e6df1841e29d6d7d60c9a85d8d72f08d1a7 - imagePullPolicy: Always - # Used only as a SignalR backplane and Fusion cache, so all data is - # ephemeral and rebuildable. 
Disable RDB/AOF persistence to avoid the - # MISCONF "stop-writes-on-bgsave-error" failure when /data is not writable. - args: - - --save - - "" - - --appendonly - - "no" - ports: - - containerPort: 6379 - securityContext: - allowPrivilegeEscalation: false - privileged: false diff --git a/infra/redis-service.yml b/infra/redis-service.yml deleted file mode 100644 index f7cd28ed2..000000000 --- a/infra/redis-service.yml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: redis-service -spec: - selector: - app: redis # this should match the pod label in deployment.yml - ports: - - name: redis - port: 6379 - targetPort: 6379 diff --git a/infra/seeder-pod.yml b/infra/seeder-pod.yml deleted file mode 100644 index ca8c18acb..000000000 --- a/infra/seeder-pod.yml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: seeder-${IMAGE_TAG} - labels: - app: seeder -spec: - serviceAccountName: ${NAMESPACE} - restartPolicy: OnFailure - securityContext: - seccompProfile: - type: RuntimeDefault - runAsUser: 1001 - runAsGroup: 1001 - runAsNonRoot: true - containers: - - name: seeder - image: ${REGISTRY}/${REPOSITORY}:seeder-${IMAGE_TAG} - securityContext: - allowPrivilegeEscalation: false - privileged: false - capabilities: - drop: ["ALL"] - env: - - name: DATABASE_ADDRESS - valueFrom: - secretKeyRef: - name: rds-mssql-instance-output - key: rds_instance_address - - name: DATABASE_USERNAME - valueFrom: - secretKeyRef: - name: rds-mssql-instance-output - key: database_username - - name: DATABASE_PASSWORD - valueFrom: - secretKeyRef: - name: rds-mssql-instance-output - key: database_password - - name: DOTNET_ENVIRONMENT - value: "${DOTNET_ENVIRONMENT}" - - name: ConnectionStrings__CatsDb - value: "Server=$(DATABASE_ADDRESS);Database=CatsDb;User Id=$(DATABASE_USERNAME);Password=$(DATABASE_PASSWORD);TrustServerCertificate=True;" From c1c52398bca20e84243eaa9f702d36f86856031b Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Sat, 20 Jun 
2026 17:55:44 +0100 Subject: [PATCH 03/25] Pin helm setup --- .github/workflows/deploy.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 58abc8d7e..8f3da9ab0 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -93,6 +93,11 @@ jobs: id: version run: echo "app_version=$(date +'%Y.%m.%d').${{ github.run_number }}" >> $GITHUB_OUTPUT + - name: Setup Helm + uses: azure/setup-helm@1a275c3b69536ee54be43f2070a358922e12c8d4 # v4.3.1 + with: + version: v3.21.2 + - name: Configure kubectl run: | echo "${{ secrets.KUBE_CERT }}" > ca.crt From ed253002590f382708bed430e96adc0ca928c3a0 Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Sat, 20 Jun 2026 17:55:52 +0100 Subject: [PATCH 04/25] Add helm validation ci step --- .github/workflows/validate-helm.yml | 58 +++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 .github/workflows/validate-helm.yml diff --git a/.github/workflows/validate-helm.yml b/.github/workflows/validate-helm.yml new file mode 100644 index 000000000..d37c5c00d --- /dev/null +++ b/.github/workflows/validate-helm.yml @@ -0,0 +1,58 @@ +name: Validate Helm + +on: + pull_request: + branches: + - main + paths: + - helm_deploy/** + - .github/workflows/validate-helm.yml + +permissions: + contents: read + +jobs: + validate: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd + + - name: Setup Helm + uses: azure/setup-helm@1a275c3b69536ee54be43f2070a358922e12c8d4 # v4.3.1 + with: + version: v3.21.2 + + - name: Build chart dependencies + run: | + helm repo add hmpps-helm-charts https://ministryofjustice.github.io/hmpps-helm-charts + helm dependency build ./helm_deploy/cats + + - name: Lint and template all environments + run: | + set -euo pipefail + for env in dev staging production; do + echo "::group::helm lint ($env)" + helm lint ./helm_deploy/cats --values 
./helm_deploy/cats/values-$env.yaml + echo "::endgroup::" + + echo "::group::helm template ($env)" + # Render with placeholder per-deploy values that CI normally supplies via --set, + # so templating exercises the same paths as a real deploy. + helm template cats ./helm_deploy/cats \ + --namespace "cfocats-$env" \ + --values ./helm_deploy/cats/values-$env.yaml \ + --set serviceAccountName="cfocats-$env" \ + --set app.serviceAccountName="cfocats-$env" \ + --set app.image.repository="example/cfocats" \ + --set app.image.tag="cats-validate" \ + --set worker.serviceAccountName="cfocats-$env" \ + --set worker.image.repository="example/cfocats" \ + --set worker.image.tag="worker-validate" \ + --set migrator.image.repository="example/cfocats" \ + --set migrator.image.tag="migrator-validate" \ + --set seeder.image.repository="example/cfocats" \ + --set seeder.image.tag="seeder-validate" \ + > /dev/null + echo "::endgroup::" + done From 51b9bbcb31ee9d425fa1ad85fe8a99b9725e0f99 Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Sat, 20 Jun 2026 18:28:50 +0100 Subject: [PATCH 05/25] Health checks for redis/rabbit + templates --- .github/workflows/deploy.yml | 2 +- helm_deploy/cats/templates/_helpers.tpl | 14 ++++++- helm_deploy/cats/templates/migrator-job.yaml | 5 +-- helm_deploy/cats/templates/rabbitmq.yaml | 20 +++++++++- helm_deploy/cats/templates/redis.yaml | 20 +++++++++- helm_deploy/cats/templates/seeder-job.yaml | 5 +-- helm_deploy/cats/values.yaml | 40 ++++++++++++++++---- 7 files changed, 85 insertions(+), 21 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 8f3da9ab0..f6a5db4e2 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -137,7 +137,7 @@ jobs: --set migrator.image.tag="migrator-${{ github.sha }}" \ --set seeder.image.repository="${IMAGE_REPOSITORY}" \ --set seeder.image.tag="seeder-${{ github.sha }}" \ - --wait --timeout 10m + --atomic --wait --timeout 10m env: KUBE_NAMESPACE: ${{ 
secrets.KUBE_NAMESPACE }} REGISTRY: ${{ steps.login-ecr.outputs.registry }} diff --git a/helm_deploy/cats/templates/_helpers.tpl b/helm_deploy/cats/templates/_helpers.tpl index 33ff54d35..c6609c8e5 100644 --- a/helm_deploy/cats/templates/_helpers.tpl +++ b/helm_deploy/cats/templates/_helpers.tpl @@ -17,6 +17,18 @@ runAsGroup: 1001 runAsNonRoot: true {{- end -}} +{{/* +Restricted container-level security context shared by the local workloads. Matches the +generic-service (app/worker) posture and the Cloud Platform Gatekeeper defaults. +*/}} +{{- define "cats.containerSecurityContext" -}} +allowPrivilegeEscalation: false +privileged: false +capabilities: + drop: + - ALL +{{- end -}} + {{/* Environment variables that expose the MSSQL connection details from the rds-mssql-instance-output namespace secret, plus the composed connection string. @@ -39,5 +51,5 @@ Used by the migrator and seeder Jobs. name: rds-mssql-instance-output key: database_password - name: ConnectionStrings__CatsDb - value: "Server=$(DATABASE_ADDRESS);Database=CatsDb;User Id=$(DATABASE_USERNAME);Password=$(DATABASE_PASSWORD);TrustServerCertificate=True;" + value: {{ .Values.connectionStrings.catsDb | quote }} {{- end -}} diff --git a/helm_deploy/cats/templates/migrator-job.yaml b/helm_deploy/cats/templates/migrator-job.yaml index 1120a7142..edae3fd58 100644 --- a/helm_deploy/cats/templates/migrator-job.yaml +++ b/helm_deploy/cats/templates/migrator-job.yaml @@ -25,10 +25,7 @@ spec: - name: migrator image: "{{ .Values.migrator.image.repository }}:{{ .Values.migrator.image.tag }}" securityContext: - allowPrivilegeEscalation: false - privileged: false - capabilities: - drop: ["ALL"] + {{- include "cats.containerSecurityContext" . | nindent 12 }} env: {{- include "cats.databaseEnv" . 
| nindent 12 }} {{- with .Values.migrator.resources }} diff --git a/helm_deploy/cats/templates/rabbitmq.yaml b/helm_deploy/cats/templates/rabbitmq.yaml index 4e9f9ec93..576c4202d 100644 --- a/helm_deploy/cats/templates/rabbitmq.yaml +++ b/helm_deploy/cats/templates/rabbitmq.yaml @@ -26,8 +26,24 @@ spec: - containerPort: 5672 - containerPort: 15672 securityContext: - allowPrivilegeEscalation: false - privileged: false + {{- include "cats.containerSecurityContext" . | nindent 12 }} + startupProbe: + exec: + command: ["rabbitmq-diagnostics", "-q", "ping"] + periodSeconds: 10 + failureThreshold: 30 + livenessProbe: + exec: + command: ["rabbitmq-diagnostics", "-q", "ping"] + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 3 + readinessProbe: + exec: + command: ["rabbitmq-diagnostics", "-q", "check_port_connectivity"] + periodSeconds: 15 + timeoutSeconds: 10 + failureThreshold: 3 env: - name: RABBITMQ_DEFAULT_USER valueFrom: diff --git a/helm_deploy/cats/templates/redis.yaml b/helm_deploy/cats/templates/redis.yaml index 27da27a53..7e0c85744 100644 --- a/helm_deploy/cats/templates/redis.yaml +++ b/helm_deploy/cats/templates/redis.yaml @@ -33,8 +33,24 @@ spec: ports: - containerPort: 6379 securityContext: - allowPrivilegeEscalation: false - privileged: false + {{- include "cats.containerSecurityContext" . | nindent 12 }} + startupProbe: + exec: + command: ["redis-cli", "ping"] + periodSeconds: 5 + failureThreshold: 20 + livenessProbe: + exec: + command: ["redis-cli", "ping"] + periodSeconds: 15 + timeoutSeconds: 5 + failureThreshold: 3 + readinessProbe: + exec: + command: ["redis-cli", "ping"] + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 {{- with .Values.redis.resources }} resources: {{- toYaml . 
| nindent 12 }} diff --git a/helm_deploy/cats/templates/seeder-job.yaml b/helm_deploy/cats/templates/seeder-job.yaml index 03929dcec..2d0247ba8 100644 --- a/helm_deploy/cats/templates/seeder-job.yaml +++ b/helm_deploy/cats/templates/seeder-job.yaml @@ -25,10 +25,7 @@ spec: - name: seeder image: "{{ .Values.seeder.image.repository }}:{{ .Values.seeder.image.tag }}" securityContext: - allowPrivilegeEscalation: false - privileged: false - capabilities: - drop: ["ALL"] + {{- include "cats.containerSecurityContext" . | nindent 12 }} env: {{- include "cats.databaseEnv" . | nindent 12 }} - name: DOTNET_ENVIRONMENT diff --git a/helm_deploy/cats/values.yaml b/helm_deploy/cats/values.yaml index 7982d63a6..e1f17b6f6 100644 --- a/helm_deploy/cats/values.yaml +++ b/helm_deploy/cats/values.yaml @@ -16,6 +16,20 @@ serviceAccountName: "" # via app.env / worker.env. Overridden per environment. dotnetEnvironment: Development +# --------------------------------------------------------------------------- +# Connection strings — single source of truth +# --------------------------------------------------------------------------- +# Defined once here and consumed by every workload so the composed strings can +# never drift between web, worker and the migrator/seeder Jobs: +# * app.env / worker.env reference the YAML anchors below (&catsDb etc.) +# * the cats.databaseEnv helper (migrator/seeder) reads .Values.connectionStrings +# The $(VAR) tokens are resolved by Kubernetes from each container's env in the +# same pod — namespace_secrets for app/worker, cats.databaseEnv for the Jobs. 
+connectionStrings: + catsDb: &catsDb "Server=$(DATABASE_ADDRESS);Database=CatsDb;User Id=$(DATABASE_USERNAME);Password=$(DATABASE_PASSWORD);TrustServerCertificate=True;" + rabbit: &rabbit "amqp://$(RABBIT_USER):$(RABBIT_PASS)@rabbitmq-service:5672" + redis: &redis "redis-service:6379" + # --------------------------------------------------------------------------- # Web tier (Blazor Server UI) — generic-service # --------------------------------------------------------------------------- @@ -92,9 +106,9 @@ app: Sentry__Dsn: "SentryDsn" env: - ConnectionStrings__CatsDb: "Server=$(DATABASE_ADDRESS);Database=CatsDb;User Id=$(DATABASE_USERNAME);Password=$(DATABASE_PASSWORD);TrustServerCertificate=True;" - ConnectionStrings__rabbit: "amqp://$(RABBIT_USER):$(RABBIT_PASS)@rabbitmq-service:5672" - ConnectionStrings__redis: "redis-service:6379" + ConnectionStrings__CatsDb: *catsDb + ConnectionStrings__rabbit: *rabbit + ConnectionStrings__redis: *redis AWS__RootFolder: "Files" Features__UseWorkerForJobs: "true" Features__PresenceHub__Enabled: "true" @@ -178,8 +192,8 @@ worker: Sentry__Dsn: "SentryDsn" env: - ConnectionStrings__CatsDb: "Server=$(DATABASE_ADDRESS);Database=CatsDb;User Id=$(DATABASE_USERNAME);Password=$(DATABASE_PASSWORD);TrustServerCertificate=True;" - ConnectionStrings__rabbit: "amqp://$(RABBIT_USER):$(RABBIT_PASS)@rabbitmq-service:5672" + ConnectionStrings__CatsDb: *catsDb + ConnectionStrings__rabbit: *rabbit Sentry__Release: "0.0.0" AppConfigurationSettings__Version: "0.0.0" @@ -190,13 +204,25 @@ rabbitmq: enabled: true # rabbitmq:4.3-management-alpine image: "rabbitmq:4.3-management-alpine@sha256:1a43764bdcf116542e7c8c794adc67c79461727da16d474e9e21483fe7f716d3" - resources: {} + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi redis: enabled: true # redis:7.4-alpine image: "redis:7.4-alpine@sha256:b1addbe72465a718643cff9e60a58e6df1841e29d6d7d60c9a85d8d72f08d1a7" - resources: {} + resources: + requests: + cpu: 50m + 
memory: 64Mi + limits: + cpu: 250m + memory: 256Mi # --------------------------------------------------------------------------- # Database lifecycle Jobs (run as pre-upgrade Helm hooks) From a986944e23dd46d26268432e8762c08f97221426 Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Sat, 20 Jun 2026 18:47:18 +0100 Subject: [PATCH 06/25] Bump sdk --- global.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/global.json b/global.json index 8b2eee7ba..48c708fdf 100644 --- a/global.json +++ b/global.json @@ -1,6 +1,6 @@ { "sdk": { - "version": "10.0.300", + "version": "10.0.301", "rollForward": "disable", "allowPrerelease": false } From a82a3dd8b9e9d4b5faefbeb837d4344cb0d727d1 Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Sat, 20 Jun 2026 19:02:11 +0100 Subject: [PATCH 07/25] Add flag for enabling/disabling prometheus alerts Currently set to false for now --- helm_deploy/README.md | 7 ++++++- helm_deploy/cats/Chart.lock | 4 ++-- helm_deploy/cats/Chart.yaml | 4 +++- helm_deploy/cats/values.yaml | 6 ++++-- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/helm_deploy/README.md b/helm_deploy/README.md index 24d3b42e2..0720fc4aa 100644 --- a/helm_deploy/README.md +++ b/helm_deploy/README.md @@ -14,7 +14,7 @@ so there is no `envsubst` or `kubectl apply` of raw manifests in the pipeline. 
| Redis (ephemeral backplane) | local template `templates/redis.yaml` | | DB migrator (run-once) | `templates/migrator-job.yaml` — pre-upgrade hook | | DB seeder (run-once) | `templates/seeder-job.yaml` — pre-upgrade hook | -| Prometheus alerts | `generic-prometheus-alerts` dependency | +| Prometheus alerts | `generic-prometheus-alerts` dependency (off by default) | The migrator (hook-weight `-5`) and seeder (hook-weight `0`) run as **`pre-install`/ `pre-upgrade` Helm hooks**: Helm runs them in weight order and waits for each to succeed @@ -25,6 +25,11 @@ the worker at `http://cats-worker:8080` and the in-cluster broker/cache at > RabbitMQ and Redis are **ephemeral** (no persistence): RabbitMQ carries only the > transient Rebus message flow and Redis is purely a SignalR backplane / Fusion cache. +> **Prometheus alerts are disabled by default** (`generic-prometheus-alerts.enabled: false`). +> The rules only reach a human once an Alertmanager receiver is configured for the +> `alertSeverity` (a separate `cloud-platform-environments` change). To turn them on, +> set `generic-prometheus-alerts.enabled: true` in the relevant `values-<env>.yaml`.
+ ## Layout ``` diff --git a/helm_deploy/cats/Chart.lock b/helm_deploy/cats/Chart.lock index afa701e6e..c40012510 100644 --- a/helm_deploy/cats/Chart.lock +++ b/helm_deploy/cats/Chart.lock @@ -8,5 +8,5 @@ dependencies: - name: generic-prometheus-alerts repository: https://ministryofjustice.github.io/hmpps-helm-charts version: 1.17.1 -digest: sha256:47030f5b5200f30b142f40b35f6f26120db6ba6113a3c5c15ec6a8ea3911e2c4 -generated: "2026-06-20T16:28:55.187412+01:00" +digest: sha256:65428269d771a264e9e0c76fd61f1cec55fb40a4d0e2f55f2be59cca75ea1d0e +generated: "2026-06-20T18:55:41.454334+01:00" diff --git a/helm_deploy/cats/Chart.yaml b/helm_deploy/cats/Chart.yaml index 5b1324432..8459c72c0 100644 --- a/helm_deploy/cats/Chart.yaml +++ b/helm_deploy/cats/Chart.yaml @@ -26,7 +26,9 @@ dependencies: version: "3.17.2" repository: https://ministryofjustice.github.io/hmpps-helm-charts - # Standard HMPPS Prometheus alert rules. + # Standard HMPPS Prometheus alert rules. Disabled by default — enable once an + # Alertmanager receiver exists for the alertSeverity (see helm_deploy/README.md). - name: generic-prometheus-alerts version: "1.17.1" repository: https://ministryofjustice.github.io/hmpps-helm-charts + condition: generic-prometheus-alerts.enabled diff --git a/helm_deploy/cats/values.yaml b/helm_deploy/cats/values.yaml index e1f17b6f6..e07f178fb 100644 --- a/helm_deploy/cats/values.yaml +++ b/helm_deploy/cats/values.yaml @@ -246,9 +246,11 @@ seeder: resources: {} # --------------------------------------------------------------------------- -# Prometheus alerts -# --------------------------------------------------------------------------- +# Prometheus alerts. Disabled by default: the rules only reach a human once an +# Alertmanager receiver is configured for the alertSeverity below (a separate +# cloud-platform-environments change). Set enabled: true per environment to turn on. 
generic-prometheus-alerts: + enabled: false targetApplication: cats businessUnit: hmpps alertSeverity: cfo-alerts From 9da8e74f694d9ba232fcd58e450270a385e0f803 Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Sat, 20 Jun 2026 19:02:36 +0100 Subject: [PATCH 08/25] Fix for recreate strategy - must override rolling updates --- helm_deploy/cats/values.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/helm_deploy/cats/values.yaml b/helm_deploy/cats/values.yaml index e07f178fb..50c9555a7 100644 --- a/helm_deploy/cats/values.yaml +++ b/helm_deploy/cats/values.yaml @@ -134,8 +134,11 @@ worker: enabled: false # Recreate (not RollingUpdate) so a new worker pod never overlaps the old one. + # rollingUpdate must be nulled: it deep-merges from the generic-service default and + # is forbidden by Kubernetes when type is Recreate. strategy: type: Recreate + rollingUpdate: null image: repository: example.dkr.ecr.eu-west-2.amazonaws.com/cfocats From b9614d3e9917a6003d59f2cb6927b33c7c618112 Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Sat, 20 Jun 2026 21:04:51 +0100 Subject: [PATCH 09/25] Chart cleanup --- .github/workflows/deploy.yml | 45 +++++++-- .github/workflows/validate-helm.yml | 22 ++++- helm_deploy/README.md | 78 +++++++++++---- helm_deploy/cats/Chart.lock | 4 +- helm_deploy/cats/Chart.yaml | 10 +- helm_deploy/cats/templates/_helpers.tpl | 31 ------ helm_deploy/cats/templates/migrator-job.yaml | 31 +++--- helm_deploy/cats/templates/rabbitmq.yaml | 38 +++++--- helm_deploy/cats/templates/redis.yaml | 34 ++++--- helm_deploy/cats/templates/seeder-job.yaml | 33 +++---- helm_deploy/cats/values-dev.yaml | 1 - helm_deploy/cats/values-production.yaml | 1 - helm_deploy/cats/values-staging.yaml | 1 - helm_deploy/cats/values.yaml | 99 +++++--------------- 14 files changed, 228 insertions(+), 200 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index f6a5db4e2..7cbc7ec32 100644 --- a/.github/workflows/deploy.yml +++
b/.github/workflows/deploy.yml @@ -109,14 +109,49 @@ jobs: KUBE_NAMESPACE: ${{ secrets.KUBE_NAMESPACE }} KUBE_CLUSTER: ${{ secrets.KUBE_CLUSTER }} - - name: Deploy CATS with Helm - # A single release deploys the web tier, worker, ephemeral RabbitMQ/Redis, and runs - # the migrator + seeder as pre-upgrade hook Jobs (Helm waits for them before rollout). + - name: Build Helm dependencies run: | set -euo pipefail helm repo add hmpps-helm-charts https://ministryofjustice.github.io/hmpps-helm-charts helm dependency build ./helm_deploy/cats + - name: Run database migrations + run: | + set -euo pipefail + helm upgrade --install cats-migrate ./helm_deploy/cats \ + --namespace "${KUBE_NAMESPACE}" \ + --values ./helm_deploy/cats/values-${{ inputs.environment }}.yaml \ + --set application.enabled=false \ + --set job=migrate \ + --set serviceAccountName="${KUBE_NAMESPACE}" \ + --set migrator.image.repository="${REGISTRY}/${REPOSITORY}" \ + --set migrator.image.tag="migrator-${{ github.sha }}" \ + --wait --wait-for-jobs --timeout 5m + env: + KUBE_NAMESPACE: ${{ secrets.KUBE_NAMESPACE }} + REGISTRY: ${{ steps.login-ecr.outputs.registry }} + REPOSITORY: ${{ vars.ECR_REPOSITORY }} + + - name: Seed the database + run: | + set -euo pipefail + helm upgrade --install cats-seed ./helm_deploy/cats \ + --namespace "${KUBE_NAMESPACE}" \ + --values ./helm_deploy/cats/values-${{ inputs.environment }}.yaml \ + --set application.enabled=false \ + --set job=seed \ + --set serviceAccountName="${KUBE_NAMESPACE}" \ + --set seeder.image.repository="${REGISTRY}/${REPOSITORY}" \ + --set seeder.image.tag="seeder-${{ github.sha }}" \ + --wait --wait-for-jobs --timeout 5m + env: + KUBE_NAMESPACE: ${{ secrets.KUBE_NAMESPACE }} + REGISTRY: ${{ steps.login-ecr.outputs.registry }} + REPOSITORY: ${{ vars.ECR_REPOSITORY }} + + - name: Deploy CATS and Worker + run: | + set -euo pipefail IMAGE_REPOSITORY="${REGISTRY}/${REPOSITORY}" helm upgrade --install cats ./helm_deploy/cats \ @@ -133,10 +168,6 @@ jobs: --set worker.image.tag="worker-${{
github.sha }}" \ --set worker.env.Sentry__Release="${APP_VERSION}" \ --set worker.env.AppConfigurationSettings__Version="${APP_VERSION}" \ - --set migrator.image.repository="${IMAGE_REPOSITORY}" \ - --set migrator.image.tag="migrator-${{ github.sha }}" \ - --set seeder.image.repository="${IMAGE_REPOSITORY}" \ - --set seeder.image.tag="seeder-${{ github.sha }}" \ --atomic --wait --timeout 10m env: KUBE_NAMESPACE: ${{ secrets.KUBE_NAMESPACE }} diff --git a/.github/workflows/validate-helm.yml b/.github/workflows/validate-helm.yml index d37c5c00d..c74652e1e 100644 --- a/.github/workflows/validate-helm.yml +++ b/.github/workflows/validate-helm.yml @@ -36,7 +36,7 @@ jobs: helm lint ./helm_deploy/cats --values ./helm_deploy/cats/values-$env.yaml echo "::endgroup::" - echo "::group::helm template ($env)" + echo "::group::helm template app ($env)" # Render with placeholder per-deploy values that CI normally supplies via --set, # so templating exercises the same paths as a real deploy. helm template cats ./helm_deploy/cats \ @@ -49,8 +49,28 @@ jobs: --set worker.serviceAccountName="cfocats-$env" \ --set worker.image.repository="example/cfocats" \ --set worker.image.tag="worker-validate" \ + > /dev/null + echo "::endgroup::" + + echo "::group::helm template migrate ($env)" + helm template cats-migrate ./helm_deploy/cats \ + --namespace "cfocats-$env" \ + --values ./helm_deploy/cats/values-$env.yaml \ + --set application.enabled=false \ + --set job=migrate \ + --set serviceAccountName="cfocats-$env" \ --set migrator.image.repository="example/cfocats" \ --set migrator.image.tag="migrator-validate" \ + > /dev/null + echo "::endgroup::" + + echo "::group::helm template seed ($env)" + helm template cats-seed ./helm_deploy/cats \ + --namespace "cfocats-$env" \ + --values ./helm_deploy/cats/values-$env.yaml \ + --set application.enabled=false \ + --set job=seed \ + --set serviceAccountName="cfocats-$env" \ --set seeder.image.repository="example/cfocats" \ --set 
seeder.image.tag="seeder-validate" \ > /dev/null diff --git a/helm_deploy/README.md b/helm_deploy/README.md index 0720fc4aa..860d007bb 100644 --- a/helm_deploy/README.md +++ b/helm_deploy/README.md @@ -1,10 +1,21 @@ # CATS Helm deployment -A **single Helm release** deploys the entire Case Assessment and Tracking System (CATS) -to the MoJ Cloud Platform. There is one templating mechanism at the deploy layer — Helm — +Helm deploys the entire Case Assessment and Tracking System (CATS) to the MoJ Cloud +Platform from **one chart**. There is one templating mechanism at the deploy layer — Helm — so there is no `envsubst` or `kubectl apply` of raw manifests in the pipeline. -## What the release contains +The CI pipeline installs **three releases from this single chart**, run in order so each +stage has isolated logs and its own timeout. Which slice of the chart renders is chosen by +two `--set` selectors — `application.enabled` (the long-running stack) and `job` (the one-off +Job to run) — so there are no per-mode values files: + +| Release | Contains | Selectors | +|----------------|--------------------------------------------|--------------------------------------------| +| `cats-migrate` | DB migrator Job | `application.enabled=false`, `job=migrate` | +| `cats-seed` | DB seeder Job | `application.enabled=false`, `job=seed` | +| `cats` | web tier, worker, ephemeral RabbitMQ/Redis | *(defaults: `application.enabled=true`, `job=""`)* | + +## What the `cats` release contains | Resource | Source | |-------------------------------|---------------------------------------------------| @@ -12,15 +23,21 @@ so there is no `envsubst` or `kubectl apply` of raw manifests in the pipeline. 
| Worker (Quartz jobs) | `generic-service` dependency, alias `worker` | | RabbitMQ (ephemeral) | local template `templates/rabbitmq.yaml` | | Redis (ephemeral backplane) | local template `templates/redis.yaml` | -| DB migrator (run-once) | `templates/migrator-job.yaml` — pre-upgrade hook | -| DB seeder (run-once) | `templates/seeder-job.yaml` — pre-upgrade hook | | Prometheus alerts | `generic-prometheus-alerts` dependency (off by default) | -The migrator (hook-weight `-5`) and seeder (hook-weight `0`) run as **`pre-install`/ -`pre-upgrade` Helm hooks**: Helm runs them in weight order and waits for each to succeed -before rolling out the app — so a failed migration fails the deploy. The web tier reaches -the worker at `http://cats-worker:8080` and the in-cluster broker/cache at -`rabbitmq-service:5672` / `redis-service:6379`. +The web tier reaches the worker at `http://cats-worker:8080` and the in-cluster broker/cache +at `rabbitmq-service:5672` / `redis-service:6379`. + +## Migrator / seeder Jobs + +The migrator and seeder live in this same chart but render **only when selected** +(`--set job=migrate` / `--set job=seed`); the default `job=""` renders neither. The pipeline +runs one at a time — with `--set application.enabled=false` so the app, worker and ephemeral +deps are skipped — installing each as its own release **before** the `cats` application +release. Each Job is named per release revision (`cats-migrator-<revision>`), so every deploy +runs a fresh Job — Job pod templates are immutable, so a stable name could not be re-applied +— and `helm upgrade --wait` blocks until it completes. A failed migration therefore fails +its own step (with the Job left in place for log inspection) and the app is never rolled out. > RabbitMQ and Redis are **ephemeral** (no persistence): RabbitMQ carries only the > transient Rebus message flow and Redis is purely a SignalR backplane / Fusion cache.
@@ -35,7 +52,7 @@ the worker at `http://cats-worker:8080` and the in-cluster broker/cache at ``` helm_deploy/cats/ Chart.yaml # app + worker (aliased generic-service) + alerts - values.yaml # shared defaults + values.yaml # shared defaults (Jobs disabled) values-dev.yaml # namespace: cfocats-dev values-staging.yaml values-production.yaml @@ -54,11 +71,32 @@ for reaching the RDS instance (applied ad hoc, not part of the pipeline). The image registry/tags, app version, and service account are passed at deploy time. Note the four images share one ECR repository but use different tag prefixes -(`cats-`, `worker-`, `migrator-`, `seeder-`): +(`cats-`, `worker-`, `migrator-`, `seeder-`). The pipeline runs three releases in order: ```bash helm dependency build ./helm_deploy/cats +# 1. Migrate +helm upgrade --install cats-migrate ./helm_deploy/cats \ + --namespace "$KUBE_NAMESPACE" \ + --values ./helm_deploy/cats/values-$ENV.yaml \ + --set application.enabled=false --set job=migrate \ + --set serviceAccountName="$KUBE_NAMESPACE" \ + --set migrator.image.repository="$REGISTRY/$ECR_REPOSITORY" \ + --set migrator.image.tag="migrator-$SHA" \ + --wait --timeout 5m + +# 2. Seed +helm upgrade --install cats-seed ./helm_deploy/cats \ + --namespace "$KUBE_NAMESPACE" \ + --values ./helm_deploy/cats/values-$ENV.yaml \ + --set application.enabled=false --set job=seed \ + --set serviceAccountName="$KUBE_NAMESPACE" \ + --set seeder.image.repository="$REGISTRY/$ECR_REPOSITORY" \ + --set seeder.image.tag="seeder-$SHA" \ + --wait --timeout 5m + +# 3. 
Deploy the application helm upgrade --install cats ./helm_deploy/cats \ --namespace "$KUBE_NAMESPACE" \ --values ./helm_deploy/cats/values-$ENV.yaml \ @@ -73,11 +111,7 @@ helm upgrade --install cats ./helm_deploy/cats \ --set worker.image.tag="worker-$SHA" \ --set worker.env.Sentry__Release="$APP_VERSION" \ --set worker.env.AppConfigurationSettings__Version="$APP_VERSION" \ - --set migrator.image.repository="$REGISTRY/$ECR_REPOSITORY" \ - --set migrator.image.tag="migrator-$SHA" \ - --set seeder.image.repository="$REGISTRY/$ECR_REPOSITORY" \ - --set seeder.image.tag="seeder-$SHA" \ - --wait --timeout 10m + --atomic --wait --timeout 10m ``` ## Local validation @@ -85,8 +119,14 @@ helm upgrade --install cats ./helm_deploy/cats \ ```bash helm dependency build ./helm_deploy/cats helm lint ./helm_deploy/cats -f ./helm_deploy/cats/values-dev.yaml +# app release helm template cats ./helm_deploy/cats \ --namespace cfocats-dev --values ./helm_deploy/cats/values-dev.yaml +# migrate / seed releases +helm template cats-migrate ./helm_deploy/cats --namespace cfocats-dev \ + -f ./helm_deploy/cats/values-dev.yaml --set application.enabled=false --set job=migrate +helm template cats-seed ./helm_deploy/cats --namespace cfocats-dev \ + -f ./helm_deploy/cats/values-dev.yaml --set application.enabled=false --set job=seed ``` ## First-time migration from the previous (kubectl) deploy @@ -103,4 +143,6 @@ kubectl -n <namespace> delete svc cats-service cats-worker-service \ kubectl -n <namespace> delete ingress cats-ingress ``` -Helm then owns `cats`, `cats-worker`, `rabbitmq-*`, `redis-*`, `cats-v1-2` and the hook Jobs. +Helm then owns `cats`, `cats-worker`, `rabbitmq-*`, `redis-*` and `cats-v1-2` in the `cats` +release, with the migrator/seeder Jobs owned by the separate `cats-migrate` / `cats-seed` +releases.
diff --git a/helm_deploy/cats/Chart.lock b/helm_deploy/cats/Chart.lock index c40012510..86b6e6f4e 100644 --- a/helm_deploy/cats/Chart.lock +++ b/helm_deploy/cats/Chart.lock @@ -8,5 +8,5 @@ dependencies: - name: generic-prometheus-alerts repository: https://ministryofjustice.github.io/hmpps-helm-charts version: 1.17.1 -digest: sha256:65428269d771a264e9e0c76fd61f1cec55fb40a4d0e2f55f2be59cca75ea1d0e -generated: "2026-06-20T18:55:41.454334+01:00" +digest: sha256:46af457269c636c4c1c39c8b1d604212e9f88bf26a993637e1a8bf9da7288d9e +generated: "2026-06-20T20:17:02.299359+01:00" diff --git a/helm_deploy/cats/Chart.yaml b/helm_deploy/cats/Chart.yaml index 8459c72c0..eb3bebe90 100644 --- a/helm_deploy/cats/Chart.yaml +++ b/helm_deploy/cats/Chart.yaml @@ -2,9 +2,11 @@ apiVersion: v2 name: cats description: | Case Assessment and Tracking System (CATS) — HMPPS Creating Future Opportunities (CFO). - A single release deploying the Blazor Server web tier and the Quartz worker (both via the - HMPPS generic-service chart), the ephemeral in-cluster RabbitMQ and Redis dependencies, - and the database migrator/seeder as pre-upgrade Helm hook Jobs. + Deploys the Blazor Server web tier and the Quartz worker (both via the HMPPS + generic-service chart) and the ephemeral in-cluster RabbitMQ and Redis dependencies. + The same chart also provides the database migrator and seeder Jobs, which the CI + pipeline installs as their own short-lived releases (cats-migrate, cats-seed) before + the application release. type: application # Version of this chart. Bump on every change to the chart/values. @@ -19,12 +21,14 @@ dependencies: alias: app version: "3.17.2" repository: https://ministryofjustice.github.io/hmpps-helm-charts + condition: application.enabled # Background worker (Quartz jobs) — single instance, no ingress. 
- name: generic-service alias: worker version: "3.17.2" repository: https://ministryofjustice.github.io/hmpps-helm-charts + condition: application.enabled # Standard HMPPS Prometheus alert rules. Disabled by default — enable once an # Alertmanager receiver exists for the alertSeverity (see helm_deploy/README.md). diff --git a/helm_deploy/cats/templates/_helpers.tpl b/helm_deploy/cats/templates/_helpers.tpl index c6609c8e5..793bcf3b8 100644 --- a/helm_deploy/cats/templates/_helpers.tpl +++ b/helm_deploy/cats/templates/_helpers.tpl @@ -1,34 +1,3 @@ -{{/* -Service account name. Cloud Platform creates a service account named after the -namespace (with IRSA role bindings for S3/RDS), so default to the release namespace. -*/}} -{{- define "cats.serviceAccountName" -}} -{{- .Values.serviceAccountName | default .Release.Namespace -}} -{{- end -}} - -{{/* -Pod-level security context shared by the local (non generic-service) workloads. -*/}} -{{- define "cats.podSecurityContext" -}} -seccompProfile: - type: RuntimeDefault -runAsUser: 1001 -runAsGroup: 1001 -runAsNonRoot: true -{{- end -}} - -{{/* -Restricted container-level security context shared by the local workloads. Matches the -generic-service (app/worker) posture and the Cloud Platform Gatekeeper defaults. -*/}} -{{- define "cats.containerSecurityContext" -}} -allowPrivilegeEscalation: false -privileged: false -capabilities: - drop: - - ALL -{{- end -}} - {{/* Environment variables that expose the MSSQL connection details from the rds-mssql-instance-output namespace secret, plus the composed connection string. 
diff --git a/helm_deploy/cats/templates/migrator-job.yaml b/helm_deploy/cats/templates/migrator-job.yaml index edae3fd58..1a1204cf7 100644 --- a/helm_deploy/cats/templates/migrator-job.yaml +++ b/helm_deploy/cats/templates/migrator-job.yaml @@ -1,35 +1,38 @@ -{{- if .Values.migrator.enabled }} +{{- if eq .Values.job "migrate" }} apiVersion: batch/v1 kind: Job metadata: - name: cats-migrator + # Revision-suffixed so each upgrade of the cats-migrate release runs a fresh Job + # (a Job's pod template is immutable, so a stable name could not be re-applied). + name: cats-migrator-{{ .Release.Revision }} labels: app: migrator - annotations: - # Run before the application is upgraded; deploy fails if the migration fails. - "helm.sh/hook": pre-install,pre-upgrade - "helm.sh/hook-weight": "-5" - "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded spec: backoffLimit: {{ .Values.migrator.backoffLimit }} + # Auto-clean finished Jobs; Helm also prunes the previous revision's Job on upgrade. + ttlSecondsAfterFinished: 3600 template: metadata: labels: app: migrator spec: - serviceAccountName: {{ include "cats.serviceAccountName" . }} + serviceAccountName: {{ .Values.serviceAccountName }} restartPolicy: Never securityContext: - {{- include "cats.podSecurityContext" . | nindent 8 }} + seccompProfile: + type: RuntimeDefault + runAsUser: 1001 + runAsGroup: 1001 + runAsNonRoot: true containers: - name: migrator image: "{{ .Values.migrator.image.repository }}:{{ .Values.migrator.image.tag }}" securityContext: - {{- include "cats.containerSecurityContext" . | nindent 12 }} + allowPrivilegeEscalation: false + privileged: false + capabilities: + drop: + - ALL env: {{- include "cats.databaseEnv" . | nindent 12 }} - {{- with .Values.migrator.resources }} - resources: - {{- toYaml . 
| nindent 12 }} - {{- end }} {{- end }} diff --git a/helm_deploy/cats/templates/rabbitmq.yaml b/helm_deploy/cats/templates/rabbitmq.yaml index 576c4202d..755e1291d 100644 --- a/helm_deploy/cats/templates/rabbitmq.yaml +++ b/helm_deploy/cats/templates/rabbitmq.yaml @@ -1,4 +1,4 @@ -{{- if .Values.rabbitmq.enabled }} +{{- if .Values.application.enabled }} apiVersion: apps/v1 kind: Deployment metadata: @@ -16,8 +16,12 @@ spec: app: rabbitmq spec: securityContext: - {{- include "cats.podSecurityContext" . | nindent 8 }} - serviceAccountName: {{ include "cats.serviceAccountName" . }} + seccompProfile: + type: RuntimeDefault + runAsUser: 1001 + runAsGroup: 1001 + runAsNonRoot: true + serviceAccountName: {{ .Values.serviceAccountName }} containers: - name: rabbitmq image: {{ .Values.rabbitmq.image | quote }} @@ -26,23 +30,24 @@ spec: - containerPort: 5672 - containerPort: 15672 securityContext: - {{- include "cats.containerSecurityContext" . | nindent 12 }} + allowPrivilegeEscalation: false + privileged: false startupProbe: - exec: - command: ["rabbitmq-diagnostics", "-q", "ping"] + tcpSocket: + port: 5672 periodSeconds: 10 failureThreshold: 30 livenessProbe: - exec: - command: ["rabbitmq-diagnostics", "-q", "ping"] + tcpSocket: + port: 5672 periodSeconds: 30 - timeoutSeconds: 10 + timeoutSeconds: 5 failureThreshold: 3 readinessProbe: - exec: - command: ["rabbitmq-diagnostics", "-q", "check_port_connectivity"] + tcpSocket: + port: 5672 periodSeconds: 15 - timeoutSeconds: 10 + timeoutSeconds: 5 failureThreshold: 3 env: - name: RABBITMQ_DEFAULT_USER @@ -55,10 +60,13 @@ spec: secretKeyRef: name: config key: RABBIT_PASS - {{- with .Values.rabbitmq.resources }} resources: - {{- toYaml . 
| nindent 12 }} - {{- end }} + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi --- apiVersion: v1 kind: Service diff --git a/helm_deploy/cats/templates/redis.yaml b/helm_deploy/cats/templates/redis.yaml index 7e0c85744..dda6d59f2 100644 --- a/helm_deploy/cats/templates/redis.yaml +++ b/helm_deploy/cats/templates/redis.yaml @@ -1,4 +1,4 @@ -{{- if .Values.redis.enabled }} +{{- if .Values.application.enabled }} apiVersion: apps/v1 kind: Deployment metadata: @@ -16,8 +16,12 @@ spec: app: redis spec: securityContext: - {{- include "cats.podSecurityContext" . | nindent 8 }} - serviceAccountName: {{ include "cats.serviceAccountName" . }} + seccompProfile: + type: RuntimeDefault + runAsUser: 1001 + runAsGroup: 1001 + runAsNonRoot: true + serviceAccountName: {{ .Values.serviceAccountName }} containers: - name: redis image: {{ .Values.redis.image | quote }} @@ -33,28 +37,32 @@ spec: ports: - containerPort: 6379 securityContext: - {{- include "cats.containerSecurityContext" . | nindent 12 }} + allowPrivilegeEscalation: false + privileged: false startupProbe: - exec: - command: ["redis-cli", "ping"] + tcpSocket: + port: 6379 periodSeconds: 5 failureThreshold: 20 livenessProbe: - exec: - command: ["redis-cli", "ping"] + tcpSocket: + port: 6379 periodSeconds: 15 timeoutSeconds: 5 failureThreshold: 3 readinessProbe: - exec: - command: ["redis-cli", "ping"] + tcpSocket: + port: 6379 periodSeconds: 10 timeoutSeconds: 5 failureThreshold: 3 - {{- with .Values.redis.resources }} resources: - {{- toYaml . 
| nindent 12 }} - {{- end }} + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 250m + memory: 256Mi --- apiVersion: v1 kind: Service diff --git a/helm_deploy/cats/templates/seeder-job.yaml b/helm_deploy/cats/templates/seeder-job.yaml index 2d0247ba8..b6dc77423 100644 --- a/helm_deploy/cats/templates/seeder-job.yaml +++ b/helm_deploy/cats/templates/seeder-job.yaml @@ -1,37 +1,38 @@ -{{- if .Values.seeder.enabled }} +{{- if eq .Values.job "seed" }} apiVersion: batch/v1 kind: Job metadata: - name: cats-seeder + # Revision-suffixed so each upgrade of the cats-seed release runs a fresh Job + # (a Job's pod template is immutable, so a stable name could not be re-applied). + name: cats-seeder-{{ .Release.Revision }} labels: app: seeder - annotations: - # Runs after the migrator (higher hook-weight) and before the application upgrade. - "helm.sh/hook": pre-install,pre-upgrade - "helm.sh/hook-weight": "0" - "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded spec: backoffLimit: {{ .Values.seeder.backoffLimit }} + # Auto-clean finished Jobs; Helm also prunes the previous revision's Job on upgrade. + ttlSecondsAfterFinished: 3600 template: metadata: labels: app: seeder spec: - serviceAccountName: {{ include "cats.serviceAccountName" . }} + serviceAccountName: {{ .Values.serviceAccountName }} restartPolicy: Never securityContext: - {{- include "cats.podSecurityContext" . | nindent 8 }} + seccompProfile: + type: RuntimeDefault + runAsUser: 1001 + runAsGroup: 1001 + runAsNonRoot: true containers: - name: seeder image: "{{ .Values.seeder.image.repository }}:{{ .Values.seeder.image.tag }}" securityContext: - {{- include "cats.containerSecurityContext" . | nindent 12 }} + allowPrivilegeEscalation: false + privileged: false + capabilities: + drop: + - ALL env: {{- include "cats.databaseEnv" . | nindent 12 }} - - name: DOTNET_ENVIRONMENT - value: {{ .Values.dotnetEnvironment | quote }} - {{- with .Values.seeder.resources }} - resources: - {{- toYaml . 
| nindent 12 }} - {{- end }} {{- end }} diff --git a/helm_deploy/cats/values-dev.yaml b/helm_deploy/cats/values-dev.yaml index c59116040..05014abc3 100644 --- a/helm_deploy/cats/values-dev.yaml +++ b/helm_deploy/cats/values-dev.yaml @@ -1,5 +1,4 @@ # Development overrides. Namespace: cfocats-dev -dotnetEnvironment: Development app: replicaCount: 3 diff --git a/helm_deploy/cats/values-production.yaml b/helm_deploy/cats/values-production.yaml index 9feeaeeb6..14d131c5c 100644 --- a/helm_deploy/cats/values-production.yaml +++ b/helm_deploy/cats/values-production.yaml @@ -1,6 +1,5 @@ # Production overrides. # NOTE: confirm the production namespace / hostname before first deploy. -dotnetEnvironment: Production app: replicaCount: 4 diff --git a/helm_deploy/cats/values-staging.yaml b/helm_deploy/cats/values-staging.yaml index 0dbadb8d0..7ce3e4638 100644 --- a/helm_deploy/cats/values-staging.yaml +++ b/helm_deploy/cats/values-staging.yaml @@ -1,6 +1,5 @@ # Staging overrides. # NOTE: confirm the staging namespace / hostname before first deploy. -dotnetEnvironment: Staging app: replicaCount: 3 diff --git a/helm_deploy/cats/values.yaml b/helm_deploy/cats/values.yaml index 50c9555a7..5081f0e38 100644 --- a/helm_deploy/cats/values.yaml +++ b/helm_deploy/cats/values.yaml @@ -1,48 +1,28 @@ -# Default values for the CATS umbrella chart (single release). -# Environment-specific overrides live in values-.yaml. -# -# generic-service values are nested under the dependency aliases `app:` (web) and -# `worker:`. Local workloads (rabbitmq, redis, migrator, seeder) and alerts use the -# top-level keys below. -# -# Values that change per deploy (image registry/tags, app version, service account) -# are supplied by CI via --set. See helm_deploy/README.md. - -# Service account for the local workloads (rabbitmq/redis/jobs). Defaults to the -# release namespace, which is the Cloud Platform IRSA-enabled service account. serviceAccountName: "" -# .NET environment for the seeder Job. 
The app/worker set their own DOTNET_ENVIRONMENT -# via app.env / worker.env. Overridden per environment. -dotnetEnvironment: Development - # --------------------------------------------------------------------------- -# Connection strings — single source of truth +# Deploy-mode selectors (set by CI via --set; see README) # --------------------------------------------------------------------------- -# Defined once here and consumed by every workload so the composed strings can -# never drift between web, worker and the migrator/seeder Jobs: -# * app.env / worker.env reference the YAML anchors below (&catsDb etc.) -# * the cats.databaseEnv helper (migrator/seeder) reads .Values.connectionStrings -# The $(VAR) tokens are resolved by Kubernetes from each container's env in the -# same pod — namespace_secrets for app/worker, cats.databaseEnv for the Jobs. +# application.enabled — render the application stack (web + worker + rabbitmq/redis) +# job — "migrate" or "seed" renders that one-off Job; "" renders none +application: + enabled: true +job: "" + connectionStrings: catsDb: &catsDb "Server=$(DATABASE_ADDRESS);Database=CatsDb;User Id=$(DATABASE_USERNAME);Password=$(DATABASE_PASSWORD);TrustServerCertificate=True;" rabbit: &rabbit "amqp://$(RABBIT_USER):$(RABBIT_PASS)@rabbitmq-service:5672" redis: &redis "redis-service:6379" -# --------------------------------------------------------------------------- -# Web tier (Blazor Server UI) — generic-service -# --------------------------------------------------------------------------- app: nameOverride: cats fullnameOverride: cats - # serviceAccountName is supplied by CI via --set (the namespace's IRSA account). 
replicaCount: 3 image: - repository: example.dkr.ecr.eu-west-2.amazonaws.com/cfocats - tag: latest + repository: "" # set by CI via --set + tag: "" # set by CI via --set pullPolicy: IfNotPresent port: 8080 @@ -115,34 +95,28 @@ app: Features__PresenceHub__RelayUserPresenceNotifications: "true" Features__UseSignalRBackplane: "true" WorkerOptions__BaseUrl: "http://cats-worker:8080" - # Overridden by CI at deploy time. - Sentry__Release: "0.0.0" - AppConfigurationSettings__Version: "0.0.0" - # DOTNET_ENVIRONMENT and Sentry__Environment are set per environment. + Sentry__Release: "0.0.0" # overridden by CI + AppConfigurationSettings__Version: "0.0.0" # overridden by CI # --------------------------------------------------------------------------- -# Background worker (Quartz jobs) — generic-service +# Background worker # --------------------------------------------------------------------------- worker: nameOverride: cats-worker fullnameOverride: cats-worker - # MUST remain a single instance: Quartz jobs must not run concurrently across pods. replicaCount: 1 autoscaling: enabled: false - # Recreate (not RollingUpdate) so a new worker pod never overlaps the old one. - # rollingUpdate must be nulled: it deep-merges from the generic-service default and - # is forbidden by Kubernetes when type is Recreate. 
strategy: type: Recreate rollingUpdate: null image: - repository: example.dkr.ecr.eu-west-2.amazonaws.com/cfocats - tag: latest + repository: "" # set by CI via --set + tag: "" # set by CI via --set pullPolicy: IfNotPresent port: 8080 @@ -200,58 +174,29 @@ worker: Sentry__Release: "0.0.0" AppConfigurationSettings__Version: "0.0.0" -# --------------------------------------------------------------------------- -# Ephemeral in-cluster dependencies (no persistence) -# --------------------------------------------------------------------------- rabbitmq: - enabled: true - # rabbitmq:4.3-management-alpine image: "rabbitmq:4.3-management-alpine@sha256:1a43764bdcf116542e7c8c794adc67c79461727da16d474e9e21483fe7f716d3" - resources: - requests: - cpu: 100m - memory: 256Mi - limits: - cpu: 500m - memory: 512Mi redis: - enabled: true - # redis:7.4-alpine image: "redis:7.4-alpine@sha256:b1addbe72465a718643cff9e60a58e6df1841e29d6d7d60c9a85d8d72f08d1a7" - resources: - requests: - cpu: 50m - memory: 64Mi - limits: - cpu: 250m - memory: 256Mi -# --------------------------------------------------------------------------- -# Database lifecycle Jobs (run as pre-upgrade Helm hooks) -# --------------------------------------------------------------------------- migrator: - enabled: true image: - repository: example.dkr.ecr.eu-west-2.amazonaws.com/cfocats - # migrator-, supplied by CI via --set. - tag: latest + repository: "" # set by CI via --set + tag: "" # set by CI via --set (migrator-) backoffLimit: 3 - resources: {} seeder: - enabled: true image: - repository: example.dkr.ecr.eu-west-2.amazonaws.com/cfocats - # seeder-, supplied by CI via --set. - tag: latest + repository: "" # set by CI via --set + tag: "" # set by CI via --set (seeder-) backoffLimit: 3 - resources: {} # --------------------------------------------------------------------------- -# Prometheus alerts. 
Disabled by default: the rules only reach a human once an -# Alertmanager receiver is configured for the alertSeverity below (a separate -# cloud-platform-environments change). Set enabled: true per environment to turn on. +# Prometheus alerts — disabled by default; rules only reach a human once an +# Alertmanager receiver exists for alertSeverity (a separate cloud-platform +# change). Set enabled: true per environment to turn on. +# --------------------------------------------------------------------------- generic-prometheus-alerts: enabled: false targetApplication: cats From bdb96e30464fc218d0d51e6df8c227617bdb7d41 Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Sun, 21 Jun 2026 00:00:37 +0100 Subject: [PATCH 10/25] Add gating to manifests --- .github/workflows/deploy.yml | 10 +++-- .github/workflows/validate-helm.yml | 10 +++-- helm_deploy/README.md | 47 +++++++++++--------- helm_deploy/cats/Chart.lock | 4 +- helm_deploy/cats/Chart.yaml | 4 +- helm_deploy/cats/templates/migrator-job.yaml | 2 +- helm_deploy/cats/templates/rabbitmq.yaml | 2 +- helm_deploy/cats/templates/redis.yaml | 2 +- helm_deploy/cats/templates/seeder-job.yaml | 2 +- helm_deploy/cats/values.yaml | 18 +++++--- 10 files changed, 58 insertions(+), 43 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 7cbc7ec32..7cf51b67d 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -121,8 +121,7 @@ jobs: helm upgrade --install cats-migrate ./helm_deploy/cats \ --namespace "${KUBE_NAMESPACE}" \ --values ./helm_deploy/cats/values-${{ inputs.environment }}.yaml \ - --set application.enabled=false \ - --set job=migrate \ + --set migrator.enabled=true \ --set serviceAccountName="${KUBE_NAMESPACE}" \ --set migrator.image.repository="${REGISTRY}/${REPOSITORY}" \ --set migrator.image.tag="migrator-${{ github.sha }}" \ @@ -138,8 +137,7 @@ jobs: helm upgrade --install cats-seed ./helm_deploy/cats \ --namespace "${KUBE_NAMESPACE}" \ --values 
./helm_deploy/cats/values-${{ inputs.environment }}.yaml \ - --set application.enabled=false \ - --set job=seed \ + --set seeder.enabled=true \ --set serviceAccountName="${KUBE_NAMESPACE}" \ --set seeder.image.repository="${REGISTRY}/${REPOSITORY}" \ --set seeder.image.tag="seeder-${{ github.sha }}" \ @@ -157,6 +155,10 @@ jobs: helm upgrade --install cats ./helm_deploy/cats \ --namespace "${KUBE_NAMESPACE}" \ --values ./helm_deploy/cats/values-${{ inputs.environment }}.yaml \ + --set app.enabled=true \ + --set worker.enabled=true \ + --set rabbitmq.enabled=true \ + --set redis.enabled=true \ --set serviceAccountName="${KUBE_NAMESPACE}" \ --set app.serviceAccountName="${KUBE_NAMESPACE}" \ --set app.image.repository="${IMAGE_REPOSITORY}" \ diff --git a/.github/workflows/validate-helm.yml b/.github/workflows/validate-helm.yml index c74652e1e..81835cb2a 100644 --- a/.github/workflows/validate-helm.yml +++ b/.github/workflows/validate-helm.yml @@ -42,6 +42,10 @@ jobs: helm template cats ./helm_deploy/cats \ --namespace "cfocats-$env" \ --values ./helm_deploy/cats/values-$env.yaml \ + --set app.enabled=true \ + --set worker.enabled=true \ + --set rabbitmq.enabled=true \ + --set redis.enabled=true \ --set serviceAccountName="cfocats-$env" \ --set app.serviceAccountName="cfocats-$env" \ --set app.image.repository="example/cfocats" \ @@ -56,8 +60,7 @@ jobs: helm template cats-migrate ./helm_deploy/cats \ --namespace "cfocats-$env" \ --values ./helm_deploy/cats/values-$env.yaml \ - --set application.enabled=false \ - --set job=migrate \ + --set migrator.enabled=true \ --set serviceAccountName="cfocats-$env" \ --set migrator.image.repository="example/cfocats" \ --set migrator.image.tag="migrator-validate" \ @@ -68,8 +71,7 @@ jobs: helm template cats-seed ./helm_deploy/cats \ --namespace "cfocats-$env" \ --values ./helm_deploy/cats/values-$env.yaml \ - --set application.enabled=false \ - --set job=seed \ + --set seeder.enabled=true \ --set serviceAccountName="cfocats-$env" \ 
--set seeder.image.repository="example/cfocats" \ --set seeder.image.tag="seeder-validate" \ diff --git a/helm_deploy/README.md b/helm_deploy/README.md index 860d007bb..528b6fb5f 100644 --- a/helm_deploy/README.md +++ b/helm_deploy/README.md @@ -5,15 +5,15 @@ Platform from **one chart**. There is one templating mechanism at the deploy lay so there is no `envsubst` or `kubectl apply` of raw manifests in the pipeline. The CI pipeline installs **three releases from this single chart**, run in order so each -stage has isolated logs and its own timeout. Which slice of the chart renders is chosen by -two `--set` selectors — `application.enabled` (the long-running stack) and `job` (the one-off -Job to run) — so there are no per-mode values files: +stage has isolated logs and its own timeout. Every workload is **off by default** and each +release opts in to exactly the components it needs via per-component `--set X.enabled=true` +flags — so there are no per-mode values files: -| Release | Contains | Selectors | -|----------------|--------------------------------------------|--------------------------------------------| -| `cats-migrate` | DB migrator Job | `application.enabled=false`, `job=migrate` | -| `cats-seed` | DB seeder Job | `application.enabled=false`, `job=seed` | -| `cats` | web tier, worker, ephemeral RabbitMQ/Redis | *(defaults: `application.enabled=true`, `job=""`)* | +| Release | Contains | Enabled components | +|----------------|--------------------------------------------|----------------------------------------------------------| +| `cats-migrate` | DB migrator Job | `migrator.enabled=true` | +| `cats-seed` | DB seeder Job | `seeder.enabled=true` | +| `cats` | web tier, worker, ephemeral RabbitMQ/Redis | `app.enabled`, `worker.enabled`, `rabbitmq.enabled`, `redis.enabled` = `true` | ## What the `cats` release contains @@ -30,14 +30,15 @@ at `rabbitmq-service:5672` / `redis-service:6379`. 
## Migrator / seeder Jobs -The migrator and seeder live in this same chart but render **only when selected** -(`--set job=migrate` / `--set job=seed`); the default `job=""` renders neither. The pipeline -runs one at a time — with `--set application.enabled=false` so the app, worker and ephemeral -deps are skipped — installing each as its own release **before** the `cats` application -release. Each Job is named per release revision (`cats-migrator-<revision>`), so every deploy -runs a fresh Job — Job pod templates are immutable, so a stable name could not be re-applied -— and `helm upgrade --wait` blocks until it completes. A failed migration therefore fails -its own step (with the Job left in place for log inspection) and the app is never rolled out. +The migrator and seeder live in this same chart but render **only when enabled** +(`--set migrator.enabled=true` / `--set seeder.enabled=true`); by default every component is +off. The pipeline runs one at a time — enabling only that Job, so the app, worker and +ephemeral deps are skipped — installing each as its own release **before** the `cats` +application release. Each Job is named per release revision (`cats-migrator-<revision>`), so every +deploy runs a fresh Job — Job pod templates are immutable, so a stable name could not be +re-applied — and `helm upgrade --wait` blocks until it completes. A failed migration +therefore fails its own step (with the Job left in place for log inspection) and the app is +never rolled out. > RabbitMQ and Redis are **ephemeral** (no persistence): RabbitMQ carries only the > transient Rebus message flow and Redis is purely a SignalR backplane / Fusion cache.
@@ -80,7 +81,7 @@ helm dependency build ./helm_deploy/cats helm upgrade --install cats-migrate ./helm_deploy/cats \ --namespace "$KUBE_NAMESPACE" \ --values ./helm_deploy/cats/values-$ENV.yaml \ - --set application.enabled=false --set job=migrate \ + --set migrator.enabled=true \ --set serviceAccountName="$KUBE_NAMESPACE" \ --set migrator.image.repository="$REGISTRY/$ECR_REPOSITORY" \ --set migrator.image.tag="migrator-$SHA" \ @@ -90,7 +91,7 @@ helm upgrade --install cats-migrate ./helm_deploy/cats \ helm upgrade --install cats-seed ./helm_deploy/cats \ --namespace "$KUBE_NAMESPACE" \ --values ./helm_deploy/cats/values-$ENV.yaml \ - --set application.enabled=false --set job=seed \ + --set seeder.enabled=true \ --set serviceAccountName="$KUBE_NAMESPACE" \ --set seeder.image.repository="$REGISTRY/$ECR_REPOSITORY" \ --set seeder.image.tag="seeder-$SHA" \ @@ -100,6 +101,8 @@ helm upgrade --install cats-seed ./helm_deploy/cats \ helm upgrade --install cats ./helm_deploy/cats \ --namespace "$KUBE_NAMESPACE" \ --values ./helm_deploy/cats/values-$ENV.yaml \ + --set app.enabled=true --set worker.enabled=true \ + --set rabbitmq.enabled=true --set redis.enabled=true \ --set serviceAccountName="$KUBE_NAMESPACE" \ --set app.serviceAccountName="$KUBE_NAMESPACE" \ --set app.image.repository="$REGISTRY/$ECR_REPOSITORY" \ @@ -121,12 +124,14 @@ helm dependency build ./helm_deploy/cats helm lint ./helm_deploy/cats -f ./helm_deploy/cats/values-dev.yaml # app release helm template cats ./helm_deploy/cats \ - --namespace cfocats-dev --values ./helm_deploy/cats/values-dev.yaml + --namespace cfocats-dev --values ./helm_deploy/cats/values-dev.yaml \ + --set app.enabled=true --set worker.enabled=true \ + --set rabbitmq.enabled=true --set redis.enabled=true # migrate / seed releases helm template cats-migrate ./helm_deploy/cats --namespace cfocats-dev \ - -f ./helm_deploy/cats/values-dev.yaml --set application.enabled=false --set job=migrate + -f ./helm_deploy/cats/values-dev.yaml --set 
migrator.enabled=true helm template cats-seed ./helm_deploy/cats --namespace cfocats-dev \ - -f ./helm_deploy/cats/values-dev.yaml --set application.enabled=false --set job=seed + -f ./helm_deploy/cats/values-dev.yaml --set seeder.enabled=true ``` ## First-time migration from the previous (kubectl) deploy diff --git a/helm_deploy/cats/Chart.lock b/helm_deploy/cats/Chart.lock index 86b6e6f4e..53dc0154e 100644 --- a/helm_deploy/cats/Chart.lock +++ b/helm_deploy/cats/Chart.lock @@ -8,5 +8,5 @@ dependencies: - name: generic-prometheus-alerts repository: https://ministryofjustice.github.io/hmpps-helm-charts version: 1.17.1 -digest: sha256:46af457269c636c4c1c39c8b1d604212e9f88bf26a993637e1a8bf9da7288d9e -generated: "2026-06-20T20:17:02.299359+01:00" +digest: sha256:fc136023a49121310e9edcb1214b9265e3d8cf4459251eeb98ede9a10ebd5b89 +generated: "2026-06-20T21:22:56.468025+01:00" diff --git a/helm_deploy/cats/Chart.yaml b/helm_deploy/cats/Chart.yaml index eb3bebe90..5d7fba640 100644 --- a/helm_deploy/cats/Chart.yaml +++ b/helm_deploy/cats/Chart.yaml @@ -21,14 +21,14 @@ dependencies: alias: app version: "3.17.2" repository: https://ministryofjustice.github.io/hmpps-helm-charts - condition: application.enabled + condition: app.enabled # Background worker (Quartz jobs) — single instance, no ingress. - name: generic-service alias: worker version: "3.17.2" repository: https://ministryofjustice.github.io/hmpps-helm-charts - condition: application.enabled + condition: worker.enabled # Standard HMPPS Prometheus alert rules. Disabled by default — enable once an # Alertmanager receiver exists for the alertSeverity (see helm_deploy/README.md). 
diff --git a/helm_deploy/cats/templates/migrator-job.yaml b/helm_deploy/cats/templates/migrator-job.yaml index 1a1204cf7..ae686a930 100644 --- a/helm_deploy/cats/templates/migrator-job.yaml +++ b/helm_deploy/cats/templates/migrator-job.yaml @@ -1,4 +1,4 @@ -{{- if eq .Values.job "migrate" }} +{{- if .Values.migrator.enabled }} apiVersion: batch/v1 kind: Job metadata: diff --git a/helm_deploy/cats/templates/rabbitmq.yaml b/helm_deploy/cats/templates/rabbitmq.yaml index 755e1291d..1a97241e5 100644 --- a/helm_deploy/cats/templates/rabbitmq.yaml +++ b/helm_deploy/cats/templates/rabbitmq.yaml @@ -1,4 +1,4 @@ -{{- if .Values.application.enabled }} +{{- if .Values.rabbitmq.enabled }} apiVersion: apps/v1 kind: Deployment metadata: diff --git a/helm_deploy/cats/templates/redis.yaml b/helm_deploy/cats/templates/redis.yaml index dda6d59f2..168f606b9 100644 --- a/helm_deploy/cats/templates/redis.yaml +++ b/helm_deploy/cats/templates/redis.yaml @@ -1,4 +1,4 @@ -{{- if .Values.application.enabled }} +{{- if .Values.redis.enabled }} apiVersion: apps/v1 kind: Deployment metadata: diff --git a/helm_deploy/cats/templates/seeder-job.yaml b/helm_deploy/cats/templates/seeder-job.yaml index b6dc77423..af01cd59b 100644 --- a/helm_deploy/cats/templates/seeder-job.yaml +++ b/helm_deploy/cats/templates/seeder-job.yaml @@ -1,4 +1,4 @@ -{{- if eq .Values.job "seed" }} +{{- if .Values.seeder.enabled }} apiVersion: batch/v1 kind: Job metadata: diff --git a/helm_deploy/cats/values.yaml b/helm_deploy/cats/values.yaml index 5081f0e38..cf1245bce 100644 --- a/helm_deploy/cats/values.yaml +++ b/helm_deploy/cats/values.yaml @@ -1,13 +1,13 @@ serviceAccountName: "" # --------------------------------------------------------------------------- -# Deploy-mode selectors (set by CI via --set; see README) +# Component toggles — every workload is off by default and each release opts in +# via --set (see README). 
The shared values-<env>.yaml files are used by all +# three releases, so enabling happens per release in CI, not in those files: +# app deploy : --set app.enabled=true,worker.enabled=true,rabbitmq.enabled=true,redis.enabled=true +# migrate : --set migrator.enabled=true +# seed : --set seeder.enabled=true # --------------------------------------------------------------------------- -# application.enabled — render the application stack (web + worker + rabbitmq/redis) -# job — "migrate" or "seed" renders that one-off Job; "" renders none -application: - enabled: true -job: "" connectionStrings: catsDb: &catsDb "Server=$(DATABASE_ADDRESS);Database=CatsDb;User Id=$(DATABASE_USERNAME);Password=$(DATABASE_PASSWORD);TrustServerCertificate=True;" @@ -15,6 +15,7 @@ connectionStrings: redis: &redis "redis-service:6379" app: + enabled: false nameOverride: cats fullnameOverride: cats @@ -102,6 +103,7 @@ app: # Background worker # --------------------------------------------------------------------------- worker: + enabled: false nameOverride: cats-worker fullnameOverride: cats-worker @@ -175,18 +177,22 @@ worker: AppConfigurationSettings__Version: "0.0.0" rabbitmq: + enabled: false image: "rabbitmq:4.3-management-alpine@sha256:1a43764bdcf116542e7c8c794adc67c79461727da16d474e9e21483fe7f716d3" redis: + enabled: false image: "redis:7.4-alpine@sha256:b1addbe72465a718643cff9e60a58e6df1841e29d6d7d60c9a85d8d72f08d1a7" migrator: + enabled: false image: repository: "" # set by CI via --set tag: "" # set by CI via --set (migrator-) backoffLimit: 3 seeder: + enabled: false image: repository: "" # set by CI via --set tag: "" # set by CI via --set (seeder-) From 8b90b6d0f7e64b40a2a7a45ebdfc905713a913fd Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Sun, 21 Jun 2026 00:01:56 +0100 Subject: [PATCH 11/25] Add --wait-for-jobs to wait until completion --- .github/workflows/deploy.yml | 4 ++-- helm_deploy/README.md | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git
a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 7cf51b67d..253610e82 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -125,7 +125,7 @@ jobs: --set serviceAccountName="${KUBE_NAMESPACE}" \ --set migrator.image.repository="${REGISTRY}/${REPOSITORY}" \ --set migrator.image.tag="migrator-${{ github.sha }}" \ - --wait --timeout 5m + --wait --wait-for-jobs --timeout 5m env: KUBE_NAMESPACE: ${{ secrets.KUBE_NAMESPACE }} REGISTRY: ${{ steps.login-ecr.outputs.registry }} @@ -141,7 +141,7 @@ jobs: --set serviceAccountName="${KUBE_NAMESPACE}" \ --set seeder.image.repository="${REGISTRY}/${REPOSITORY}" \ --set seeder.image.tag="seeder-${{ github.sha }}" \ - --wait --timeout 5m + --wait --wait-for-jobs --timeout 5m env: KUBE_NAMESPACE: ${{ secrets.KUBE_NAMESPACE }} REGISTRY: ${{ steps.login-ecr.outputs.registry }} diff --git a/helm_deploy/README.md b/helm_deploy/README.md index 528b6fb5f..0341eaf55 100644 --- a/helm_deploy/README.md +++ b/helm_deploy/README.md @@ -36,7 +36,8 @@ off. The pipeline runs one at a time — enabling only that Job, so the app, wor ephemeral deps are skipped — installing each as its own release **before** the `cats` application release. Each Job is named per release revision (`cats-migrator-`), so every deploy runs a fresh Job — Job pod templates are immutable, so a stable name could not be -re-applied — and `helm upgrade --wait` blocks until it completes. A failed migration +re-applied — and `helm upgrade --wait --wait-for-jobs` blocks until it completes (`--wait` +alone does **not** wait for Jobs, only for Pods/Deployments). A failed migration therefore fails its own step (with the Job left in place for log inspection) and the app is never rolled out. 
@@ -85,7 +86,7 @@ helm upgrade --install cats-migrate ./helm_deploy/cats \ --set serviceAccountName="$KUBE_NAMESPACE" \ --set migrator.image.repository="$REGISTRY/$ECR_REPOSITORY" \ --set migrator.image.tag="migrator-$SHA" \ - --wait --timeout 5m + --wait --wait-for-jobs --timeout 5m # 2. Seed helm upgrade --install cats-seed ./helm_deploy/cats \ @@ -95,7 +96,7 @@ helm upgrade --install cats-seed ./helm_deploy/cats \ --set serviceAccountName="$KUBE_NAMESPACE" \ --set seeder.image.repository="$REGISTRY/$ECR_REPOSITORY" \ --set seeder.image.tag="seeder-$SHA" \ - --wait --timeout 5m + --wait --wait-for-jobs --timeout 5m # 3. Deploy the application helm upgrade --install cats ./helm_deploy/cats \ From a7bc4fb70890a45ca890f2d01ca69f8464f6fe34 Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Sun, 21 Jun 2026 00:30:50 +0100 Subject: [PATCH 12/25] Cleanup prometheus alerts (temporarily disabled) --- helm_deploy/README.md | 11 ++++++----- helm_deploy/cats/Chart.lock | 7 ++----- helm_deploy/cats/Chart.yaml | 19 +++++++------------ helm_deploy/cats/values-dev.yaml | 5 +++-- helm_deploy/cats/values-production.yaml | 5 +++-- helm_deploy/cats/values-staging.yaml | 5 +++-- helm_deploy/cats/values.yaml | 15 +++++---------- 7 files changed, 29 insertions(+), 38 deletions(-) diff --git a/helm_deploy/README.md b/helm_deploy/README.md index 0341eaf55..9a992b179 100644 --- a/helm_deploy/README.md +++ b/helm_deploy/README.md @@ -23,7 +23,6 @@ flags — so there are no per-mode values files: | Worker (Quartz jobs) | `generic-service` dependency, alias `worker` | | RabbitMQ (ephemeral) | local template `templates/rabbitmq.yaml` | | Redis (ephemeral backplane) | local template `templates/redis.yaml` | -| Prometheus alerts | `generic-prometheus-alerts` dependency (off by default) | The web tier reaches the worker at `http://cats-worker:8080` and the in-cluster broker/cache at `rabbitmq-service:5672` / `redis-service:6379`. @@ -44,10 +43,12 @@ never rolled out. 
> RabbitMQ and Redis are **ephemeral** (no persistence): RabbitMQ carries only the > transient Rebus message flow and Redis is purely a SignalR backplane / Fusion cache. -> **Prometheus alerts are disabled by default** (`generic-prometheus-alerts.enabled: false`). -> The rules only reach a human once an Alertmanager receiver is configured for the -> `alertSeverity` (a separate `cloud-platform-environments` change). To turn them on, -> set `generic-prometheus-alerts.enabled: true` in the relevant `values-<env>.yaml`. +> **Prometheus alerts are commented out** for now. The `generic-prometheus-alerts` +> dependency (Chart.yaml) and its values blocks (`values.yaml` + each `values-<env>.yaml`) +> are left in place but commented, because the rules only reach a human once an Alertmanager +> receiver is configured for the `alertSeverity` (a separate `cloud-platform-environments` +> change). To re-enable: uncomment those blocks and run +> `helm dependency update ./helm_deploy/cats` to refresh `Chart.lock`.
## Layout diff --git a/helm_deploy/cats/Chart.lock b/helm_deploy/cats/Chart.lock index 53dc0154e..716430d81 100644 --- a/helm_deploy/cats/Chart.lock +++ b/helm_deploy/cats/Chart.lock @@ -5,8 +5,5 @@ dependencies: - name: generic-service repository: https://ministryofjustice.github.io/hmpps-helm-charts version: 3.17.2 -- name: generic-prometheus-alerts - repository: https://ministryofjustice.github.io/hmpps-helm-charts - version: 1.17.1 -digest: sha256:fc136023a49121310e9edcb1214b9265e3d8cf4459251eeb98ede9a10ebd5b89 -generated: "2026-06-20T21:22:56.468025+01:00" +digest: sha256:10c652856891266a73e6a1796ccf775df48e540e7dfac61812fd28cbf09b8a68 +generated: "2026-06-21T00:22:42.226118+01:00" diff --git a/helm_deploy/cats/Chart.yaml b/helm_deploy/cats/Chart.yaml index 5d7fba640..cc0c2edb4 100644 --- a/helm_deploy/cats/Chart.yaml +++ b/helm_deploy/cats/Chart.yaml @@ -1,12 +1,7 @@ apiVersion: v2 name: cats description: | - Case Assessment and Tracking System (CATS) — HMPPS Creating Future Opportunities (CFO). - Deploys the Blazor Server web tier and the Quartz worker (both via the HMPPS - generic-service chart) and the ephemeral in-cluster RabbitMQ and Redis dependencies. - The same chart also provides the database migrator and seeder Jobs, which the CI - pipeline installs as their own short-lived releases (cats-migrate, cats-seed) before - the application release. + HMPPS - Case Assessment and Tracking System (CATS) type: application # Version of this chart. Bump on every change to the chart/values. @@ -30,9 +25,9 @@ dependencies: repository: https://ministryofjustice.github.io/hmpps-helm-charts condition: worker.enabled - # Standard HMPPS Prometheus alert rules. Disabled by default — enable once an - # Alertmanager receiver exists for the alertSeverity (see helm_deploy/README.md). 
- - name: generic-prometheus-alerts - version: "1.17.1" - repository: https://ministryofjustice.github.io/hmpps-helm-charts - condition: generic-prometheus-alerts.enabled + # todo: enable prometheus alerts + # https://user-guide.cloud-platform.service.justice.gov.uk/documentation/monitoring-an-app/how-to-create-alarms.html#creating-your-own-custom-alerts + # uncomment and `helm dependency update ./helm_deploy/cats` to refresh Chart.lock. + # - name: generic-prometheus-alerts + # version: "1.17.1" + # repository: https://ministryofjustice.github.io/hmpps-helm-charts diff --git a/helm_deploy/cats/values-dev.yaml b/helm_deploy/cats/values-dev.yaml index 05014abc3..544e4adba 100644 --- a/helm_deploy/cats/values-dev.yaml +++ b/helm_deploy/cats/values-dev.yaml @@ -14,5 +14,6 @@ worker: DOTNET_ENVIRONMENT: "Development" Sentry__Environment: "Development-CloudPlatform" -generic-prometheus-alerts: - alertSeverity: cfo-alerts-nonprod +# todo: enable prometheus alerts +# generic-prometheus-alerts: +# alertSeverity: cfo-alerts-nonprod diff --git a/helm_deploy/cats/values-production.yaml b/helm_deploy/cats/values-production.yaml index 14d131c5c..d984b39bd 100644 --- a/helm_deploy/cats/values-production.yaml +++ b/helm_deploy/cats/values-production.yaml @@ -29,5 +29,6 @@ worker: DOTNET_ENVIRONMENT: "Production" Sentry__Environment: "Production-CloudPlatform" -generic-prometheus-alerts: - alertSeverity: cfo-alerts +# todo: enable prometheus alerts +# generic-prometheus-alerts: +# alertSeverity: cfo-alerts diff --git a/helm_deploy/cats/values-staging.yaml b/helm_deploy/cats/values-staging.yaml index 7ce3e4638..ddf84bab2 100644 --- a/helm_deploy/cats/values-staging.yaml +++ b/helm_deploy/cats/values-staging.yaml @@ -15,5 +15,6 @@ worker: DOTNET_ENVIRONMENT: "Staging" Sentry__Environment: "Staging-CloudPlatform" -generic-prometheus-alerts: - alertSeverity: cfo-alerts-nonprod +# todo: enable prometheus alerts +# generic-prometheus-alerts: +# alertSeverity: cfo-alerts-nonprod diff 
--git a/helm_deploy/cats/values.yaml b/helm_deploy/cats/values.yaml index cf1245bce..2ab403e3a 100644 --- a/helm_deploy/cats/values.yaml +++ b/helm_deploy/cats/values.yaml @@ -198,13 +198,8 @@ seeder: tag: "" # set by CI via --set (seeder-) backoffLimit: 3 -# --------------------------------------------------------------------------- -# Prometheus alerts — disabled by default; rules only reach a human once an -# Alertmanager receiver exists for alertSeverity (a separate cloud-platform -# change). Set enabled: true per environment to turn on. -# --------------------------------------------------------------------------- -generic-prometheus-alerts: - enabled: false - targetApplication: cats - businessUnit: hmpps - alertSeverity: cfo-alerts +# todo: enable prometheus alerts +# generic-prometheus-alerts: +# targetApplication: cats +# businessUnit: hmpps +# alertSeverity: cfo-alerts From 332ec6dc846d8fd73b165428e91b4a0f7f86fbe7 Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Sun, 21 Jun 2026 10:30:14 +0100 Subject: [PATCH 13/25] ModSec: enable WAF (detection only) --- helm_deploy/README.md | 33 +++++++++++++++++++++++++ helm_deploy/cats/Chart.lock | 2 +- helm_deploy/cats/values-production.yaml | 6 ++--- helm_deploy/cats/values-staging.yaml | 3 --- helm_deploy/cats/values.yaml | 20 +++++++++++++++ 5 files changed, 57 insertions(+), 7 deletions(-) diff --git a/helm_deploy/README.md b/helm_deploy/README.md index 9a992b179..10e10b809 100644 --- a/helm_deploy/README.md +++ b/helm_deploy/README.md @@ -50,6 +50,39 @@ never rolled out. > change). To re-enable: uncomment those blocks and run > `helm dependency update ./helm_deploy/cats` to refresh `Chart.lock`. +## ModSecurity WAF + +The web tier's ingress runs behind Cloud Platform's ModSecurity ingress controllers +(OWASP Core Rule Set, anomaly scoring). 
Config lives entirely in values — the +`generic-service` subchart renders the annotations from `app.ingress`: + +| Setting | Where | Value | +| --- | --- | --- | +| `className` | `values.yaml` / `values-production.yaml` | `modsec-non-prod` (dev/staging), `modsec` (production) | +| `modsecurity_enabled` | `values.yaml` | `true` | +| `modsecurity_github_team` | `values.yaml` | `hmpps-creating-future-opportunities-devs` (controls who can read the logs) | +| `modsecurity_mode` | `values.yaml` (+ overlays) | `DetectionOnly` (current) → `On` (block) | + +`modsecurity_mode` is a local knob fed into the snippet's `SecRuleEngine`. The snippet +also pins Paranoia Level 1 and tags every event with the GitHub team + namespace so the +logs are reachable in OpenSearch (`live_kubernetes_ingress*`, search `ModSecurity`). + +> **Class default is non-prod by design.** The subchart hardcodes the *production* +> `modsec` class whenever `modsecurity_enabled` is true and no `className` is set, so the +> base `values.yaml` pins `modsec-non-prod` and only `values-production.yaml` overrides it +> to `modsec`. + +### Phased rollout (monitor → block) + +Everything currently runs in **`DetectionOnly`** (logs, never blocks). To promote: + +1. Exercise CATS in dev/staging — uploads, long forms, SignalR/WebSocket sessions. +2. Review WAF hits in OpenSearch; add `SecRuleRemoveById <id>` lines to the snippet for + any false positives. +3. Flip dev/staging to blocking: set `modsecurity_mode: On` in `values-dev.yaml` / + `values-staging.yaml`. +4. Once staging is clean, set `modsecurity_mode: On` in `values-production.yaml`.
+ ## Layout ``` diff --git a/helm_deploy/cats/Chart.lock b/helm_deploy/cats/Chart.lock index 716430d81..1da374c32 100644 --- a/helm_deploy/cats/Chart.lock +++ b/helm_deploy/cats/Chart.lock @@ -6,4 +6,4 @@ dependencies: repository: https://ministryofjustice.github.io/hmpps-helm-charts version: 3.17.2 digest: sha256:10c652856891266a73e6a1796ccf775df48e540e7dfac61812fd28cbf09b8a68 -generated: "2026-06-21T00:22:42.226118+01:00" +generated: "2026-06-21T00:51:49.053976+01:00" diff --git a/helm_deploy/cats/values-production.yaml b/helm_deploy/cats/values-production.yaml index d984b39bd..9778db03b 100644 --- a/helm_deploy/cats/values-production.yaml +++ b/helm_deploy/cats/values-production.yaml @@ -1,10 +1,10 @@ -# Production overrides. -# NOTE: confirm the production namespace / hostname before first deploy. - app: replicaCount: 4 ingress: host: cfocats-production.live.cloud-platform.service.justice.gov.uk + className: modsec + # modsecurity_mode: On # Default to modsecurity_mode "DetectionOnly" until staging is tuned, then "On". + resources: requests: cpu: 500m diff --git a/helm_deploy/cats/values-staging.yaml b/helm_deploy/cats/values-staging.yaml index ddf84bab2..788ce9e5b 100644 --- a/helm_deploy/cats/values-staging.yaml +++ b/helm_deploy/cats/values-staging.yaml @@ -1,6 +1,3 @@ -# Staging overrides. -# NOTE: confirm the staging namespace / hostname before first deploy. - app: replicaCount: 3 ingress: diff --git a/helm_deploy/cats/values.yaml b/helm_deploy/cats/values.yaml index 2ab403e3a..7667f897c 100644 --- a/helm_deploy/cats/values.yaml +++ b/helm_deploy/cats/values.yaml @@ -37,6 +37,26 @@ app: path: / healthPath: /health tlsSecretName: "" + # ModSecurity WAF. Default class is the non-prod controller; production overrides to "modsec". + className: modsec-non-prod + modsecurity_enabled: true + # WAF engine: "DetectionOnly" (log only) or "On" (block). Flip per-env in overlays once tuned. 
+ modsecurity_mode: DetectionOnly + modsecurity_snippet: | + SecRuleEngine {{ .Values.ingress.modsecurity_mode }} + SecAuditEngine On + SecAuditLog /var/log/nginx/error.log + SecAuditLogType Serial + SecDefaultAction "phase:2,pass,log,tag:github_team=hmpps-creating-future-opportunities-devs,tag:namespace={{ .Release.Namespace }}" + SecRuleUpdateActionById 949110 "t:none,deny,status:406,logdata:%{SERVER_NAME}" + SecRuleUpdateActionById 959100 "t:none,deny,status:406,logdata:%{SERVER_NAME}" + SecAction \ + "id:900000,\ + phase:1,\ + nolog,\ + pass,\ + t:none,\ + setvar:tx.paranoia_level=1" annotations: nginx.ingress.kubernetes.io/affinity: "cookie" nginx.ingress.kubernetes.io/session-cookie-name: "http-cookie" From 5d9ddd32690bacce857ed6af34d5d3a59c4da711 Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Sun, 21 Jun 2026 14:12:06 +0100 Subject: [PATCH 14/25] use pods instead of jobs for migrate/seeding --- .github/workflows/deploy.yml | 6 ++-- helm_deploy/README.md | 37 ++++++++++++------- helm_deploy/cats/templates/migrator-job.yaml | 38 -------------------- helm_deploy/cats/templates/migrator-pod.yaml | 31 ++++++++++++++++ helm_deploy/cats/templates/seeder-job.yaml | 38 -------------------- helm_deploy/cats/templates/seeder-pod.yaml | 31 ++++++++++++++++ helm_deploy/cats/values-production.yaml | 1 + helm_deploy/cats/values-staging.yaml | 1 + helm_deploy/cats/values.yaml | 2 -- 9 files changed, 93 insertions(+), 92 deletions(-) delete mode 100644 helm_deploy/cats/templates/migrator-job.yaml create mode 100644 helm_deploy/cats/templates/migrator-pod.yaml delete mode 100644 helm_deploy/cats/templates/seeder-job.yaml create mode 100644 helm_deploy/cats/templates/seeder-pod.yaml diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 253610e82..27b03bece 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -125,7 +125,8 @@ jobs: --set serviceAccountName="${KUBE_NAMESPACE}" \ --set 
migrator.image.repository="${REGISTRY}/${REPOSITORY}" \ --set migrator.image.tag="migrator-${{ github.sha }}" \ - --wait --wait-for-jobs --timeout 5m + --timeout 5m + kubectl -n "${KUBE_NAMESPACE}" wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=300s pod -l app=migrator env: KUBE_NAMESPACE: ${{ secrets.KUBE_NAMESPACE }} REGISTRY: ${{ steps.login-ecr.outputs.registry }} @@ -141,7 +142,8 @@ jobs: --set serviceAccountName="${KUBE_NAMESPACE}" \ --set seeder.image.repository="${REGISTRY}/${REPOSITORY}" \ --set seeder.image.tag="seeder-${{ github.sha }}" \ - --wait --wait-for-jobs --timeout 5m + --timeout 5m + kubectl -n "${KUBE_NAMESPACE}" wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=300s pod -l app=seeder env: KUBE_NAMESPACE: ${{ secrets.KUBE_NAMESPACE }} REGISTRY: ${{ steps.login-ecr.outputs.registry }} diff --git a/helm_deploy/README.md b/helm_deploy/README.md index 10e10b809..619b0bb76 100644 --- a/helm_deploy/README.md +++ b/helm_deploy/README.md @@ -27,18 +27,29 @@ flags — so there are no per-mode values files: The web tier reaches the worker at `http://cats-worker:8080` and the in-cluster broker/cache at `rabbitmq-service:5672` / `redis-service:6379`. -## Migrator / seeder Jobs +## Migrator / seeder Pods The migrator and seeder live in this same chart but render **only when enabled** (`--set migrator.enabled=true` / `--set seeder.enabled=true`); by default every component is -off. The pipeline runs one at a time — enabling only that Job, so the app, worker and +off. The pipeline runs one at a time — enabling only that component, so the app, worker and ephemeral deps are skipped — installing each as its own release **before** the `cats` -application release. 
Each Job is named per release revision (`cats-migrator-`), so every -deploy runs a fresh Job — Job pod templates are immutable, so a stable name could not be -re-applied — and `helm upgrade --wait --wait-for-jobs` blocks until it completes (`--wait` -alone does **not** wait for Jobs, only for Pods/Deployments). A failed migration -therefore fails its own step (with the Job left in place for log inspection) and the app is -never rolled out. +application release. Each is named per release revision (`cats-migrator-`), so every +deploy runs a fresh Pod (a Pod spec is immutable, so a stable name could not be re-applied) +and Helm prunes the previous revision's Pod on upgrade. `restartPolicy: OnFailure` retries a +transient failure in place. + +They are plain **Pods**, not Jobs, deliberately. Measured on Cloud Platform, the migrate/seed +*work* takes 2–15s, but a **Job** object takes a further **~70–90s** to be marked `Complete` +after its pod has already reached `Succeeded` — control-plane (kube-controller-manager) +latency that no chart/Job setting can influence. A bare Pod's `Succeeded` phase is set by the +kubelet ~1s after the container exits, so the pipeline waits on that instead. + +Because they run to completion, the pipeline does **not** use `helm --wait` (Helm judges a Pod +ready via its `Ready` condition, which is never true for a `Succeeded` pod, so `--wait` would +hang until timeout even on success). Each step runs `helm upgrade` (no wait) then +`kubectl wait --for=jsonpath='{.status.phase}'=Succeeded -l app=`. A failed +migration therefore fails its own step (Pod left in place, with `describe`/`logs` dumped for +inspection) and the app is never rolled out. > RabbitMQ and Redis are **ephemeral** (no persistence): RabbitMQ carries only the > transient Rebus message flow and Redis is purely a SignalR backplane / Fusion cache. 
@@ -96,8 +107,8 @@ helm_deploy/cats/ _helpers.tpl rabbitmq.yaml redis.yaml - migrator-job.yaml - seeder-job.yaml + migrator-pod.yaml + seeder-pod.yaml ``` `infra/` now contains only `port-forward-deployment.yml`, a manual developer convenience @@ -120,7 +131,8 @@ helm upgrade --install cats-migrate ./helm_deploy/cats \ --set serviceAccountName="$KUBE_NAMESPACE" \ --set migrator.image.repository="$REGISTRY/$ECR_REPOSITORY" \ --set migrator.image.tag="migrator-$SHA" \ - --wait --wait-for-jobs --timeout 5m + --timeout 5m +kubectl -n "$KUBE_NAMESPACE" wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=300s pod -l app=migrator # 2. Seed helm upgrade --install cats-seed ./helm_deploy/cats \ @@ -130,7 +142,8 @@ helm upgrade --install cats-seed ./helm_deploy/cats \ --set serviceAccountName="$KUBE_NAMESPACE" \ --set seeder.image.repository="$REGISTRY/$ECR_REPOSITORY" \ --set seeder.image.tag="seeder-$SHA" \ - --wait --wait-for-jobs --timeout 5m + --timeout 5m +kubectl -n "$KUBE_NAMESPACE" wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=300s pod -l app=seeder # 3. Deploy the application helm upgrade --install cats ./helm_deploy/cats \ diff --git a/helm_deploy/cats/templates/migrator-job.yaml b/helm_deploy/cats/templates/migrator-job.yaml deleted file mode 100644 index ae686a930..000000000 --- a/helm_deploy/cats/templates/migrator-job.yaml +++ /dev/null @@ -1,38 +0,0 @@ -{{- if .Values.migrator.enabled }} -apiVersion: batch/v1 -kind: Job -metadata: - # Revision-suffixed so each upgrade of the cats-migrate release runs a fresh Job - # (a Job's pod template is immutable, so a stable name could not be re-applied). - name: cats-migrator-{{ .Release.Revision }} - labels: - app: migrator -spec: - backoffLimit: {{ .Values.migrator.backoffLimit }} - # Auto-clean finished Jobs; Helm also prunes the previous revision's Job on upgrade. 
- ttlSecondsAfterFinished: 3600 - template: - metadata: - labels: - app: migrator - spec: - serviceAccountName: {{ .Values.serviceAccountName }} - restartPolicy: Never - securityContext: - seccompProfile: - type: RuntimeDefault - runAsUser: 1001 - runAsGroup: 1001 - runAsNonRoot: true - containers: - - name: migrator - image: "{{ .Values.migrator.image.repository }}:{{ .Values.migrator.image.tag }}" - securityContext: - allowPrivilegeEscalation: false - privileged: false - capabilities: - drop: - - ALL - env: - {{- include "cats.databaseEnv" . | nindent 12 }} -{{- end }} diff --git a/helm_deploy/cats/templates/migrator-pod.yaml b/helm_deploy/cats/templates/migrator-pod.yaml new file mode 100644 index 000000000..d205e1db6 --- /dev/null +++ b/helm_deploy/cats/templates/migrator-pod.yaml @@ -0,0 +1,31 @@ +{{- if .Values.migrator.enabled }} +apiVersion: v1 +kind: Pod +metadata: + # Revision-suffixed so each upgrade of the cats-migrate release runs a fresh Pod + # (a Pod spec is immutable, so a stable name could not be re-applied). Helm prunes + # the previous revision's Pod on upgrade. + name: cats-migrator-{{ .Release.Revision }} + labels: + app: migrator +spec: + serviceAccountName: {{ .Values.serviceAccountName }} + restartPolicy: OnFailure + securityContext: + seccompProfile: + type: RuntimeDefault + runAsUser: 1001 + runAsGroup: 1001 + runAsNonRoot: true + containers: + - name: migrator + image: "{{ .Values.migrator.image.repository }}:{{ .Values.migrator.image.tag }}" + securityContext: + allowPrivilegeEscalation: false + privileged: false + capabilities: + drop: + - ALL + env: + {{- include "cats.databaseEnv" . 
| nindent 8 }} +{{- end }} diff --git a/helm_deploy/cats/templates/seeder-job.yaml b/helm_deploy/cats/templates/seeder-job.yaml deleted file mode 100644 index af01cd59b..000000000 --- a/helm_deploy/cats/templates/seeder-job.yaml +++ /dev/null @@ -1,38 +0,0 @@ -{{- if .Values.seeder.enabled }} -apiVersion: batch/v1 -kind: Job -metadata: - # Revision-suffixed so each upgrade of the cats-seed release runs a fresh Job - # (a Job's pod template is immutable, so a stable name could not be re-applied). - name: cats-seeder-{{ .Release.Revision }} - labels: - app: seeder -spec: - backoffLimit: {{ .Values.seeder.backoffLimit }} - # Auto-clean finished Jobs; Helm also prunes the previous revision's Job on upgrade. - ttlSecondsAfterFinished: 3600 - template: - metadata: - labels: - app: seeder - spec: - serviceAccountName: {{ .Values.serviceAccountName }} - restartPolicy: Never - securityContext: - seccompProfile: - type: RuntimeDefault - runAsUser: 1001 - runAsGroup: 1001 - runAsNonRoot: true - containers: - - name: seeder - image: "{{ .Values.seeder.image.repository }}:{{ .Values.seeder.image.tag }}" - securityContext: - allowPrivilegeEscalation: false - privileged: false - capabilities: - drop: - - ALL - env: - {{- include "cats.databaseEnv" . | nindent 12 }} -{{- end }} diff --git a/helm_deploy/cats/templates/seeder-pod.yaml b/helm_deploy/cats/templates/seeder-pod.yaml new file mode 100644 index 000000000..48027f6b0 --- /dev/null +++ b/helm_deploy/cats/templates/seeder-pod.yaml @@ -0,0 +1,31 @@ +{{- if .Values.seeder.enabled }} +apiVersion: v1 +kind: Pod +metadata: + # Revision-suffixed so each upgrade of the cats-seed release runs a fresh Pod + # (a Pod spec is immutable, so a stable name could not be re-applied). Helm prunes + # the previous revision's Pod on upgrade. 
+ name: cats-seeder-{{ .Release.Revision }} + labels: + app: seeder +spec: + serviceAccountName: {{ .Values.serviceAccountName }} + restartPolicy: OnFailure + securityContext: + seccompProfile: + type: RuntimeDefault + runAsUser: 1001 + runAsGroup: 1001 + runAsNonRoot: true + containers: + - name: seeder + image: "{{ .Values.seeder.image.repository }}:{{ .Values.seeder.image.tag }}" + securityContext: + allowPrivilegeEscalation: false + privileged: false + capabilities: + drop: + - ALL + env: + {{- include "cats.databaseEnv" . | nindent 8 }} +{{- end }} diff --git a/helm_deploy/cats/values-production.yaml b/helm_deploy/cats/values-production.yaml index 9778db03b..978c1eb90 100644 --- a/helm_deploy/cats/values-production.yaml +++ b/helm_deploy/cats/values-production.yaml @@ -28,6 +28,7 @@ worker: env: DOTNET_ENVIRONMENT: "Production" Sentry__Environment: "Production-CloudPlatform" + Features__PresenceHub__RelayUserPresenceNotifications: "false" # todo: enable prometheus alerts # generic-prometheus-alerts: diff --git a/helm_deploy/cats/values-staging.yaml b/helm_deploy/cats/values-staging.yaml index 788ce9e5b..66949b4e4 100644 --- a/helm_deploy/cats/values-staging.yaml +++ b/helm_deploy/cats/values-staging.yaml @@ -11,6 +11,7 @@ worker: env: DOTNET_ENVIRONMENT: "Staging" Sentry__Environment: "Staging-CloudPlatform" + Features__PresenceHub__RelayUserPresenceNotifications: "false" # todo: enable prometheus alerts # generic-prometheus-alerts: diff --git a/helm_deploy/cats/values.yaml b/helm_deploy/cats/values.yaml index 7667f897c..6d03c700f 100644 --- a/helm_deploy/cats/values.yaml +++ b/helm_deploy/cats/values.yaml @@ -209,14 +209,12 @@ migrator: image: repository: "" # set by CI via --set tag: "" # set by CI via --set (migrator-) - backoffLimit: 3 seeder: enabled: false image: repository: "" # set by CI via --set tag: "" # set by CI via --set (seeder-) - backoffLimit: 3 # todo: enable prometheus alerts # generic-prometheus-alerts: From 
e0116b0826a0514aaacf865aa5b9d505b2f13f89 Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Sun, 21 Jun 2026 15:31:09 +0100 Subject: [PATCH 15/25] Remove/ignore Chart.lock --- .github/workflows/deploy.yml | 2 +- .github/workflows/validate-helm.yml | 2 +- .gitignore | 3 ++- helm_deploy/README.md | 6 +++--- helm_deploy/cats/Chart.lock | 9 --------- helm_deploy/cats/Chart.yaml | 2 +- 6 files changed, 8 insertions(+), 16 deletions(-) delete mode 100644 helm_deploy/cats/Chart.lock diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 27b03bece..ff33bccd0 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -113,7 +113,7 @@ jobs: run: | set -euo pipefail helm repo add hmpps-helm-charts https://ministryofjustice.github.io/hmpps-helm-charts - helm dependency build ./helm_deploy/cats + helm dependency update ./helm_deploy/cats - name: Run database migrations run: | diff --git a/.github/workflows/validate-helm.yml b/.github/workflows/validate-helm.yml index 81835cb2a..0101f378f 100644 --- a/.github/workflows/validate-helm.yml +++ b/.github/workflows/validate-helm.yml @@ -26,7 +26,7 @@ jobs: - name: Build chart dependencies run: | helm repo add hmpps-helm-charts https://ministryofjustice.github.io/hmpps-helm-charts - helm dependency build ./helm_deploy/cats + helm dependency update ./helm_deploy/cats - name: Lint and template all environments run: | diff --git a/.gitignore b/.gitignore index 1d3927c6b..800ed87f3 100644 --- a/.gitignore +++ b/.gitignore @@ -482,5 +482,6 @@ aspire-output/ # ls cache files for C# develop extension *csproj.lscache -# Helm chart dependencies (fetched via `helm dependency build`) +# Helm helm_deploy/*/charts/ +helm_deploy/*/Chart.lock diff --git a/helm_deploy/README.md b/helm_deploy/README.md index 619b0bb76..2cccab98d 100644 --- a/helm_deploy/README.md +++ b/helm_deploy/README.md @@ -59,7 +59,7 @@ inspection) and the app is never rolled out. 
> are left in place but commented, because the rules only reach a human once an Alertmanager > receiver is configured for the `alertSeverity` (a separate `cloud-platform-environments` > change). To re-enable: uncomment those blocks and run -> `helm dependency update ./helm_deploy/cats` to refresh `Chart.lock`. +> `helm dependency update ./helm_deploy/cats` to fetch the new dependency. ## ModSecurity WAF @@ -121,7 +121,7 @@ the four images share one ECR repository but use different tag prefixes (`cats-`, `worker-`, `migrator-`, `seeder-`). The pipeline runs three releases in order: ```bash -helm dependency build ./helm_deploy/cats +helm dependency update ./helm_deploy/cats # 1. Migrate helm upgrade --install cats-migrate ./helm_deploy/cats \ @@ -168,7 +168,7 @@ helm upgrade --install cats ./helm_deploy/cats \ ## Local validation ```bash -helm dependency build ./helm_deploy/cats +helm dependency update ./helm_deploy/cats helm lint ./helm_deploy/cats -f ./helm_deploy/cats/values-dev.yaml # app release helm template cats ./helm_deploy/cats \ diff --git a/helm_deploy/cats/Chart.lock b/helm_deploy/cats/Chart.lock deleted file mode 100644 index 1da374c32..000000000 --- a/helm_deploy/cats/Chart.lock +++ /dev/null @@ -1,9 +0,0 @@ -dependencies: -- name: generic-service - repository: https://ministryofjustice.github.io/hmpps-helm-charts - version: 3.17.2 -- name: generic-service - repository: https://ministryofjustice.github.io/hmpps-helm-charts - version: 3.17.2 -digest: sha256:10c652856891266a73e6a1796ccf775df48e540e7dfac61812fd28cbf09b8a68 -generated: "2026-06-21T00:51:49.053976+01:00" diff --git a/helm_deploy/cats/Chart.yaml b/helm_deploy/cats/Chart.yaml index cc0c2edb4..39ad19796 100644 --- a/helm_deploy/cats/Chart.yaml +++ b/helm_deploy/cats/Chart.yaml @@ -27,7 +27,7 @@ dependencies: # todo: enable prometheus alerts # https://user-guide.cloud-platform.service.justice.gov.uk/documentation/monitoring-an-app/how-to-create-alarms.html#creating-your-own-custom-alerts - # 
uncomment and `helm dependency update ./helm_deploy/cats` to refresh Chart.lock. + # uncomment and re-run `helm dependency update ./helm_deploy/cats` to fetch it. # - name: generic-prometheus-alerts # version: "1.17.1" # repository: https://ministryofjustice.github.io/hmpps-helm-charts From 57b8dc6c054931b30c02e52d160899b8e3385eb9 Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Mon, 22 Jun 2026 11:01:22 +0100 Subject: [PATCH 16/25] Use CP's modsec defaults --- helm_deploy/cats/values.yaml | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/helm_deploy/cats/values.yaml b/helm_deploy/cats/values.yaml index 6d03c700f..11b37251b 100644 --- a/helm_deploy/cats/values.yaml +++ b/helm_deploy/cats/values.yaml @@ -43,20 +43,9 @@ app: # WAF engine: "DetectionOnly" (log only) or "On" (block). Flip per-env in overlays once tuned. modsecurity_mode: DetectionOnly modsecurity_snippet: | - SecRuleEngine {{ .Values.ingress.modsecurity_mode }} SecAuditEngine On - SecAuditLog /var/log/nginx/error.log - SecAuditLogType Serial + SecRuleEngine {{ .Values.ingress.modsecurity_mode }} SecDefaultAction "phase:2,pass,log,tag:github_team=hmpps-creating-future-opportunities-devs,tag:namespace={{ .Release.Namespace }}" - SecRuleUpdateActionById 949110 "t:none,deny,status:406,logdata:%{SERVER_NAME}" - SecRuleUpdateActionById 959100 "t:none,deny,status:406,logdata:%{SERVER_NAME}" - SecAction \ - "id:900000,\ - phase:1,\ - nolog,\ - pass,\ - t:none,\ - setvar:tx.paranoia_level=1" annotations: nginx.ingress.kubernetes.io/affinity: "cookie" nginx.ingress.kubernetes.io/session-cookie-name: "http-cookie" From 345acafc01be89c608a35e508e6c9a0d3c278093 Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Mon, 22 Jun 2026 14:35:16 +0100 Subject: [PATCH 17/25] Translate port-forward-deployment to helm --- .github/workflows/deploy.yml | 1 + .github/workflows/validate-helm.yml | 1 + .../rds-port-forward-deployment.yaml | 34 +++++++++++++++++++ helm_deploy/cats/values.yaml | 11 ++++++ 
infra/port-forward-deployment.yml | 32 ----------------- 5 files changed, 47 insertions(+), 32 deletions(-) create mode 100644 helm_deploy/cats/templates/rds-port-forward-deployment.yaml delete mode 100644 infra/port-forward-deployment.yml diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index ff33bccd0..8a506c8b3 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -161,6 +161,7 @@ jobs: --set worker.enabled=true \ --set rabbitmq.enabled=true \ --set redis.enabled=true \ + --set rdsPortForward.enabled=true \ --set serviceAccountName="${KUBE_NAMESPACE}" \ --set app.serviceAccountName="${KUBE_NAMESPACE}" \ --set app.image.repository="${IMAGE_REPOSITORY}" \ diff --git a/.github/workflows/validate-helm.yml b/.github/workflows/validate-helm.yml index 0101f378f..2ca1b0437 100644 --- a/.github/workflows/validate-helm.yml +++ b/.github/workflows/validate-helm.yml @@ -46,6 +46,7 @@ jobs: --set worker.enabled=true \ --set rabbitmq.enabled=true \ --set redis.enabled=true \ + --set rdsPortForward.enabled=true \ --set serviceAccountName="cfocats-$env" \ --set app.serviceAccountName="cfocats-$env" \ --set app.image.repository="example/cfocats" \ diff --git a/helm_deploy/cats/templates/rds-port-forward-deployment.yaml b/helm_deploy/cats/templates/rds-port-forward-deployment.yaml new file mode 100644 index 000000000..5bd03d83a --- /dev/null +++ b/helm_deploy/cats/templates/rds-port-forward-deployment.yaml @@ -0,0 +1,34 @@ +{{- if .Values.rdsPortForward.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: rds-port-forward-deployment + labels: + app: rds-port-forward +spec: + replicas: 1 + selector: + matchLabels: + app: rds-port-forward + template: + metadata: + labels: + app: rds-port-forward + spec: + serviceAccountName: {{ .Values.serviceAccountName }} + containers: + - name: rds-port-forward + image: {{ .Values.rdsPortForward.image | quote }} + ports: + - containerPort: {{ .Values.rdsPortForward.localPort }} # 
this is your LOCAL_PORT inside the pod + env: + - name: REMOTE_HOST + valueFrom: + secretKeyRef: + name: rds-mssql-instance-output + key: rds_instance_address + - name: LOCAL_PORT + value: {{ .Values.rdsPortForward.localPort | quote }} # what the pod listens on + - name: REMOTE_PORT + value: {{ .Values.rdsPortForward.remotePort | quote }} # SQL Server in the cloud +{{- end }} diff --git a/helm_deploy/cats/values.yaml b/helm_deploy/cats/values.yaml index 11b37251b..2a7cd8800 100644 --- a/helm_deploy/cats/values.yaml +++ b/helm_deploy/cats/values.yaml @@ -205,6 +205,17 @@ seeder: repository: "" # set by CI via --set tag: "" # set by CI via --set (seeder-) +# --------------------------------------------------------------------------- +# RDS port-forward — an ad-hoc helper Deployment that bridges to the cloud RDS +# SQL Server so it can be reached via `kubectl port-forward`. Off by default; +# enable on demand with --set rdsPortForward.enabled=true. +# --------------------------------------------------------------------------- +rdsPortForward: + enabled: false + image: "ministryofjustice/port-forward@sha256:eaed873978acf6ccf23f08315f56ef71f0a7ffcd6a0bea821459bfb363b65e76" + localPort: 11433 # port the pod listens on (your LOCAL_PORT) + remotePort: 1433 # SQL Server port in the cloud + # todo: enable prometheus alerts # generic-prometheus-alerts: # targetApplication: cats diff --git a/infra/port-forward-deployment.yml b/infra/port-forward-deployment.yml deleted file mode 100644 index a31abad30..000000000 --- a/infra/port-forward-deployment.yml +++ /dev/null @@ -1,32 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: port-forward-deployment - labels: - app: port-forward -spec: - replicas: 1 - selector: - matchLabels: - app: port-forward - template: - metadata: - labels: - app: port-forward - spec: - serviceAccountName: ${NAMESPACE} - containers: - - name: port-forward - image: 
ministryofjustice/port-forward@sha256:eaed873978acf6ccf23f08315f56ef71f0a7ffcd6a0bea821459bfb363b65e76 - ports: - - containerPort: 11433 # this is your LOCAL_PORT inside the pod - env: - - name: REMOTE_HOST - valueFrom: - secretKeyRef: - name: rds-mssql-instance-output - key: rds_instance_address - - name: LOCAL_PORT - value: "11433" # what the pod listens on - - name: REMOTE_PORT - value: "1433" # SQL Server in the cloud \ No newline at end of file From 90e50d1dffa3c588c45733847f01f041d3c1a847 Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Mon, 22 Jun 2026 14:35:25 +0100 Subject: [PATCH 18/25] Remove readme --- helm_deploy/README.md | 201 ------------------------------------------ 1 file changed, 201 deletions(-) delete mode 100644 helm_deploy/README.md diff --git a/helm_deploy/README.md b/helm_deploy/README.md deleted file mode 100644 index 2cccab98d..000000000 --- a/helm_deploy/README.md +++ /dev/null @@ -1,201 +0,0 @@ -# CATS Helm deployment - -Helm deploys the entire Case Assessment and Tracking System (CATS) to the MoJ Cloud -Platform from **one chart**. There is one templating mechanism at the deploy layer — Helm — -so there is no `envsubst` or `kubectl apply` of raw manifests in the pipeline. - -The CI pipeline installs **three releases from this single chart**, run in order so each -stage has isolated logs and its own timeout. 
Every workload is **off by default** and each -release opts in to exactly the components it needs via per-component `--set X.enabled=true` -flags — so there are no per-mode values files: - -| Release | Contains | Enabled components | -|----------------|--------------------------------------------|----------------------------------------------------------| -| `cats-migrate` | DB migrator Job | `migrator.enabled=true` | -| `cats-seed` | DB seeder Job | `seeder.enabled=true` | -| `cats` | web tier, worker, ephemeral RabbitMQ/Redis | `app.enabled`, `worker.enabled`, `rabbitmq.enabled`, `redis.enabled` = `true` | - -## What the `cats` release contains - -| Resource | Source | -|-------------------------------|---------------------------------------------------| -| Web tier (Blazor Server UI) | `generic-service` dependency, alias `app` | -| Worker (Quartz jobs) | `generic-service` dependency, alias `worker` | -| RabbitMQ (ephemeral) | local template `templates/rabbitmq.yaml` | -| Redis (ephemeral backplane) | local template `templates/redis.yaml` | - -The web tier reaches the worker at `http://cats-worker:8080` and the in-cluster broker/cache -at `rabbitmq-service:5672` / `redis-service:6379`. - -## Migrator / seeder Pods - -The migrator and seeder live in this same chart but render **only when enabled** -(`--set migrator.enabled=true` / `--set seeder.enabled=true`); by default every component is -off. The pipeline runs one at a time — enabling only that component, so the app, worker and -ephemeral deps are skipped — installing each as its own release **before** the `cats` -application release. Each is named per release revision (`cats-migrator-`), so every -deploy runs a fresh Pod (a Pod spec is immutable, so a stable name could not be re-applied) -and Helm prunes the previous revision's Pod on upgrade. `restartPolicy: OnFailure` retries a -transient failure in place. - -They are plain **Pods**, not Jobs, deliberately. 
Measured on Cloud Platform, the migrate/seed -*work* takes 2–15s, but a **Job** object takes a further **~70–90s** to be marked `Complete` -after its pod has already reached `Succeeded` — control-plane (kube-controller-manager) -latency that no chart/Job setting can influence. A bare Pod's `Succeeded` phase is set by the -kubelet ~1s after the container exits, so the pipeline waits on that instead. - -Because they run to completion, the pipeline does **not** use `helm --wait` (Helm judges a Pod -ready via its `Ready` condition, which is never true for a `Succeeded` pod, so `--wait` would -hang until timeout even on success). Each step runs `helm upgrade` (no wait) then -`kubectl wait --for=jsonpath='{.status.phase}'=Succeeded -l app=`. A failed -migration therefore fails its own step (Pod left in place, with `describe`/`logs` dumped for -inspection) and the app is never rolled out. - -> RabbitMQ and Redis are **ephemeral** (no persistence): RabbitMQ carries only the -> transient Rebus message flow and Redis is purely a SignalR backplane / Fusion cache. - -> **Prometheus alerts are commented out** for now. The `generic-prometheus-alerts` -> dependency (Chart.yaml) and its values blocks (`values.yaml` + each `values-.yaml`) -> are left in place but commented, because the rules only reach a human once an Alertmanager -> receiver is configured for the `alertSeverity` (a separate `cloud-platform-environments` -> change). To re-enable: uncomment those blocks and run -> `helm dependency update ./helm_deploy/cats` to fetch the new dependency. - -## ModSecurity WAF - -The web tier's ingress runs behind Cloud Platform's ModSecurity ingress controllers -(OWASP Core Rule Set, anomaly scoring). 
Config lives entirely in values — the -`generic-service` subchart renders the annotations from `app.ingress`: - -| Setting | Where | Value | -| --- | --- | --- | -| `className` | `values.yaml` / `values-production.yaml` | `modsec-non-prod` (dev/staging), `modsec` (production) | -| `modsecurity_enabled` | `values.yaml` | `true` | -| `modsecurity_github_team` | `values.yaml` | `hmpps-creating-future-opportunities-devs` (controls who can read the logs) | -| `modsecurity_mode` | `values.yaml` (+ overlays) | `DetectionOnly` (current) → `On` (block) | - -`modsecurity_mode` is a local knob fed into the snippet's `SecRuleEngine`. The snippet -also pins Paranoia Level 1 and tags every event with the GitHub team + namespace so the -logs are reachable in OpenSearch (`live_kubernetes_ingress*`, search `ModSecurity`). - -> **Class default is non-prod by design.** The subchart hardcodes the *production* -> `modsec` class whenever `modsecurity_enabled` is true and no `className` is set, so the -> base `values.yaml` pins `modsec-non-prod` and only `values-production.yaml` overrides it -> to `modsec`. - -### Phased rollout (monitor → block) - -Everything currently runs in **`DetectionOnly`** (logs, never blocks). To promote: - -1. Exercise CATS in dev/staging — uploads, long forms, SignalR/WebSocket sessions. -2. Review WAF hits in OpenSearch; add `SecRuleRemoveById ` lines to the snippet for - any false positives. -3. Flip dev/staging to blocking: set `modsecurity_mode: On` in `values-dev.yaml` / - `values-staging.yaml`. -4. Once staging is clean, set `modsecurity_mode: On` in `values-production.yaml`. 
- -## Layout - -``` -helm_deploy/cats/ - Chart.yaml # app + worker (aliased generic-service) + alerts - values.yaml # shared defaults (Jobs disabled) - values-dev.yaml # namespace: cfocats-dev - values-staging.yaml - values-production.yaml - templates/ - _helpers.tpl - rabbitmq.yaml - redis.yaml - migrator-pod.yaml - seeder-pod.yaml -``` - -`infra/` now contains only `port-forward-deployment.yml`, a manual developer convenience -for reaching the RDS instance (applied ad hoc, not part of the pipeline). - -## Per-deploy values (supplied by CI) - -The image registry/tags, app version, and service account are passed at deploy time. Note -the four images share one ECR repository but use different tag prefixes -(`cats-`, `worker-`, `migrator-`, `seeder-`). The pipeline runs three releases in order: - -```bash -helm dependency update ./helm_deploy/cats - -# 1. Migrate -helm upgrade --install cats-migrate ./helm_deploy/cats \ - --namespace "$KUBE_NAMESPACE" \ - --values ./helm_deploy/cats/values-$ENV.yaml \ - --set migrator.enabled=true \ - --set serviceAccountName="$KUBE_NAMESPACE" \ - --set migrator.image.repository="$REGISTRY/$ECR_REPOSITORY" \ - --set migrator.image.tag="migrator-$SHA" \ - --timeout 5m -kubectl -n "$KUBE_NAMESPACE" wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=300s pod -l app=migrator - -# 2. Seed -helm upgrade --install cats-seed ./helm_deploy/cats \ - --namespace "$KUBE_NAMESPACE" \ - --values ./helm_deploy/cats/values-$ENV.yaml \ - --set seeder.enabled=true \ - --set serviceAccountName="$KUBE_NAMESPACE" \ - --set seeder.image.repository="$REGISTRY/$ECR_REPOSITORY" \ - --set seeder.image.tag="seeder-$SHA" \ - --timeout 5m -kubectl -n "$KUBE_NAMESPACE" wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=300s pod -l app=seeder - -# 3. 
Deploy the application -helm upgrade --install cats ./helm_deploy/cats \ - --namespace "$KUBE_NAMESPACE" \ - --values ./helm_deploy/cats/values-$ENV.yaml \ - --set app.enabled=true --set worker.enabled=true \ - --set rabbitmq.enabled=true --set redis.enabled=true \ - --set serviceAccountName="$KUBE_NAMESPACE" \ - --set app.serviceAccountName="$KUBE_NAMESPACE" \ - --set app.image.repository="$REGISTRY/$ECR_REPOSITORY" \ - --set app.image.tag="cats-$SHA" \ - --set app.env.Sentry__Release="$APP_VERSION" \ - --set app.env.AppConfigurationSettings__Version="$APP_VERSION" \ - --set worker.serviceAccountName="$KUBE_NAMESPACE" \ - --set worker.image.repository="$REGISTRY/$ECR_REPOSITORY" \ - --set worker.image.tag="worker-$SHA" \ - --set worker.env.Sentry__Release="$APP_VERSION" \ - --set worker.env.AppConfigurationSettings__Version="$APP_VERSION" \ - --atomic --wait --timeout 10m -``` - -## Local validation - -```bash -helm dependency update ./helm_deploy/cats -helm lint ./helm_deploy/cats -f ./helm_deploy/cats/values-dev.yaml -# app release -helm template cats ./helm_deploy/cats \ - --namespace cfocats-dev --values ./helm_deploy/cats/values-dev.yaml \ - --set app.enabled=true --set worker.enabled=true \ - --set rabbitmq.enabled=true --set redis.enabled=true -# migrate / seed releases -helm template cats-migrate ./helm_deploy/cats --namespace cfocats-dev \ - -f ./helm_deploy/cats/values-dev.yaml --set migrator.enabled=true -helm template cats-seed ./helm_deploy/cats --namespace cfocats-dev \ - -f ./helm_deploy/cats/values-dev.yaml --set seeder.enabled=true -``` - -## First-time migration from the previous (kubectl) deploy - -The previous pipeline created differently-named objects. 
On the **first** Helm deploy to a -namespace that already ran the old pipeline, delete the legacy resources once so they don't -collide (notably the old ingress vs the new `cats-v1-2` ingress on the same host): - -```bash -kubectl -n delete deploy cats-deployment cats-worker-deployment \ - rabbitmq-deployment redis-deployment -kubectl -n delete svc cats-service cats-worker-service \ - rabbitmq-service redis-service -kubectl -n delete ingress cats-ingress -``` - -Helm then owns `cats`, `cats-worker`, `rabbitmq-*`, `redis-*` and `cats-v1-2` in the `cats` -release, with the migrator/seeder Jobs owned by the separate `cats-migrate` / `cats-seed` -releases. From 80a69eec8b8f472f95a1023adf245cf6ce732a24 Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Mon, 22 Jun 2026 15:34:39 +0100 Subject: [PATCH 19/25] Reset global.json --- global.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/global.json b/global.json index 48c708fdf..8b2eee7ba 100644 --- a/global.json +++ b/global.json @@ -1,6 +1,6 @@ { "sdk": { - "version": "10.0.301", + "version": "10.0.300", "rollForward": "disable", "allowPrerelease": false } From d4c3c672bf3900c1e575f5ce1479de0cb03c520d Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Mon, 22 Jun 2026 16:14:10 +0100 Subject: [PATCH 20/25] Pin .NET runtime/sdk version in Dockerfile --- src/Database/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Database/Dockerfile b/src/Database/Dockerfile index c90657531..ac7c49d7d 100644 --- a/src/Database/Dockerfile +++ b/src/Database/Dockerfile @@ -1,4 +1,4 @@ -FROM mcr.microsoft.com/dotnet/sdk:10.0@sha256:548d93f8a18a1acbe6cc127bc4f47281430d34a9e35c18afa80a8d6741c2adc3 AS build +FROM mcr.microsoft.com/dotnet/sdk:10.0.300@sha256:c0790639332692a0d56cdd81ed581cfd24d040d9839764c138994866df89a3b6 AS build WORKDIR /src # Install sqlpackage to a fixed, non-root path @@ -12,7 +12,7 @@ COPY src/Database/CatsDb/ src/Database/CatsDb/ RUN dotnet build 
src/Database/CatsDb/CatsDb.sqlproj --configuration Release -FROM mcr.microsoft.com/dotnet/runtime:10.0@sha256:58318ab0733b63d3ac0d7609c46f2718244e623a176f45991ee01fad46fbf880 AS final +FROM mcr.microsoft.com/dotnet/runtime:10.0.9@sha256:23d7a947c6cf4160e37bde9e394c13ab8c725355288850d42e83d31126928d70 AS final WORKDIR /app # Copy sqlpackage and make it accessible to non-root users From 57085f22ba4f4e7df38d82290dc61bc74380f1ad Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Mon, 22 Jun 2026 16:18:27 +0100 Subject: [PATCH 21/25] Pin dotnet container publishes to immutable digests --- .github/workflows/deploy.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 8a506c8b3..66e6cd815 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -57,6 +57,7 @@ jobs: --configuration Release \ --no-build \ /t:PublishContainer \ + /p:ContainerBaseImage=mcr.microsoft.com/dotnet/aspnet:10.0.9@sha256:f30905fa931d5d33242b1c52ee3b4319e5eb4914821d79d4fde0d59eafdaea66 \ /p:ContainerRegistry=${{ steps.login-ecr.outputs.registry }} \ /p:ContainerRepository=${{ vars.ECR_REPOSITORY }} \ /p:ContainerImageTag=cats-${{ github.sha }} @@ -67,6 +68,7 @@ jobs: --configuration Release \ --no-build \ /t:PublishContainer \ + /p:ContainerBaseImage=mcr.microsoft.com/dotnet/aspnet:10.0.9@sha256:f30905fa931d5d33242b1c52ee3b4319e5eb4914821d79d4fde0d59eafdaea66 \ /p:ContainerRegistry=${{ steps.login-ecr.outputs.registry }} \ /p:ContainerRepository=${{ vars.ECR_REPOSITORY }} \ /p:ContainerImageTag=worker-${{ github.sha }} @@ -77,6 +79,7 @@ jobs: --configuration Release \ --no-build \ /t:PublishContainer \ + /p:ContainerBaseImage=mcr.microsoft.com/dotnet/runtime:10.0.9@sha256:23d7a947c6cf4160e37bde9e394c13ab8c725355288850d42e83d31126928d70 \ /p:ContainerRegistry=${{ steps.login-ecr.outputs.registry }} \ /p:ContainerRepository=${{ vars.ECR_REPOSITORY }} \ /p:ContainerImageTag=seeder-${{ github.sha }} From 
f4b2c40b7c99359cf5cc7a7c3f3611a7a1d7f899 Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Mon, 22 Jun 2026 16:25:54 +0100 Subject: [PATCH 22/25] Remove redundant sqlpackage install --- .github/workflows/deploy.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 66e6cd815..3c6349166 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -32,9 +32,6 @@ jobs: - name: Restore dependencies run: dotnet restore - - name: Install dotnet sql package - run: dotnet tool install --global microsoft.sqlpackage --version 170.3.93 - - name: Build run: dotnet build --configuration Release --no-restore From b0d90fedfc5b5443c9fbc04b9ad41c0266f31aaf Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Mon, 22 Jun 2026 17:34:13 +0100 Subject: [PATCH 23/25] Add Dockerfiles to individual projects Limitations with dotnet publish container mean digests cannot be appended to pin an image --- .dockerignore | 9 ++++++++ .github/workflows/deploy.yml | 39 +++++++++++++--------------------- src/Database/Dockerfile | 2 +- src/DatabaseSeeding/Dockerfile | 20 +++++++++++++++++ src/Server.UI/Dockerfile | 22 +++++++++++++++++++ src/Worker/Dockerfile | 22 +++++++++++++++++++ 6 files changed, 89 insertions(+), 25 deletions(-) create mode 100644 .dockerignore create mode 100644 src/DatabaseSeeding/Dockerfile create mode 100644 src/Server.UI/Dockerfile create mode 100644 src/Worker/Dockerfile diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..307975134 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,9 @@ +.git +.github +**/bin +**/obj +test +**/.vs +**/.idea +**/.vscode +*.user diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 3c6349166..027feac54 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -50,36 +50,27 @@ jobs: - name: Build and Push Server.UI Container run: | - dotnet publish src/Server.UI/Server.UI.csproj \ -
--configuration Release \ - --no-build \ - /t:PublishContainer \ - /p:ContainerBaseImage=mcr.microsoft.com/dotnet/aspnet:10.0.9@sha256:f30905fa931d5d33242b1c52ee3b4319e5eb4914821d79d4fde0d59eafdaea66 \ - /p:ContainerRegistry=${{ steps.login-ecr.outputs.registry }} \ - /p:ContainerRepository=${{ vars.ECR_REPOSITORY }} \ - /p:ContainerImageTag=cats-${{ github.sha }} + docker build \ + -f src/Server.UI/Dockerfile \ + -t ${{ steps.login-ecr.outputs.registry }}/${{ vars.ECR_REPOSITORY }}:cats-${{ github.sha }} \ + . + docker push ${{ steps.login-ecr.outputs.registry }}/${{ vars.ECR_REPOSITORY }}:cats-${{ github.sha }} - name: Build and Push Worker Container run: | - dotnet publish src/Worker/Worker.csproj \ - --configuration Release \ - --no-build \ - /t:PublishContainer \ - /p:ContainerBaseImage=mcr.microsoft.com/dotnet/aspnet:10.0.9@sha256:f30905fa931d5d33242b1c52ee3b4319e5eb4914821d79d4fde0d59eafdaea66 \ - /p:ContainerRegistry=${{ steps.login-ecr.outputs.registry }} \ - /p:ContainerRepository=${{ vars.ECR_REPOSITORY }} \ - /p:ContainerImageTag=worker-${{ github.sha }} + docker build \ + -f src/Worker/Dockerfile \ + -t ${{ steps.login-ecr.outputs.registry }}/${{ vars.ECR_REPOSITORY }}:worker-${{ github.sha }} \ + . + docker push ${{ steps.login-ecr.outputs.registry }}/${{ vars.ECR_REPOSITORY }}:worker-${{ github.sha }} - name: Build and Push DatabaseSeeding Container run: | - dotnet publish src/DatabaseSeeding/DatabaseSeeding.csproj \ - --configuration Release \ - --no-build \ - /t:PublishContainer \ - /p:ContainerBaseImage=mcr.microsoft.com/dotnet/runtime:10.0.9@sha256:23d7a947c6cf4160e37bde9e394c13ab8c725355288850d42e83d31126928d70 \ - /p:ContainerRegistry=${{ steps.login-ecr.outputs.registry }} \ - /p:ContainerRepository=${{ vars.ECR_REPOSITORY }} \ - /p:ContainerImageTag=seeder-${{ github.sha }} + docker build \ + -f src/DatabaseSeeding/Dockerfile \ + -t ${{ steps.login-ecr.outputs.registry }}/${{ vars.ECR_REPOSITORY }}:seeder-${{ github.sha }} \ + . 
+ docker push ${{ steps.login-ecr.outputs.registry }}/${{ vars.ECR_REPOSITORY }}:seeder-${{ github.sha }} - name: Build and Push DatabaseMigrator Container run: | diff --git a/src/Database/Dockerfile b/src/Database/Dockerfile index ac7c49d7d..732cac416 100644 --- a/src/Database/Dockerfile +++ b/src/Database/Dockerfile @@ -12,7 +12,7 @@ COPY src/Database/CatsDb/ src/Database/CatsDb/ RUN dotnet build src/Database/CatsDb/CatsDb.sqlproj --configuration Release -FROM mcr.microsoft.com/dotnet/runtime:10.0.9@sha256:23d7a947c6cf4160e37bde9e394c13ab8c725355288850d42e83d31126928d70 AS final +FROM mcr.microsoft.com/dotnet/runtime:10.0.9@sha256:58318ab0733b63d3ac0d7609c46f2718244e623a176f45991ee01fad46fbf880 AS final WORKDIR /app # Copy sqlpackage and make it accessible to non-root users diff --git a/src/DatabaseSeeding/Dockerfile b/src/DatabaseSeeding/Dockerfile new file mode 100644 index 000000000..b05f8d6a9 --- /dev/null +++ b/src/DatabaseSeeding/Dockerfile @@ -0,0 +1,20 @@ +FROM mcr.microsoft.com/dotnet/sdk:10.0.300@sha256:c0790639332692a0d56cdd81ed581cfd24d040d9839764c138994866df89a3b6 AS build +WORKDIR /src + +# Copy solution-level config required to restore and build +COPY Directory.Build.props Directory.Packages.props NuGet.config global.json ./ +COPY src/ src/ + +RUN dotnet restore src/DatabaseSeeding/DatabaseSeeding.csproj +RUN dotnet publish src/DatabaseSeeding/DatabaseSeeding.csproj \ + --configuration Release \ + --no-restore \ + --output /app/publish + + +FROM mcr.microsoft.com/dotnet/runtime:10.0.9@sha256:58318ab0733b63d3ac0d7609c46f2718244e623a176f45991ee01fad46fbf880 AS final +WORKDIR /app + +COPY --from=build /app/publish ./ + +ENTRYPOINT ["dotnet", "DatabaseSeeding.dll"] diff --git a/src/Server.UI/Dockerfile b/src/Server.UI/Dockerfile new file mode 100644 index 000000000..1af0998e8 --- /dev/null +++ b/src/Server.UI/Dockerfile @@ -0,0 +1,22 @@ +FROM mcr.microsoft.com/dotnet/sdk:10.0.300@sha256:c0790639332692a0d56cdd81ed581cfd24d040d9839764c138994866df89a3b6 
AS build +WORKDIR /src + +# Copy solution-level config required to restore and build +COPY Directory.Build.props Directory.Packages.props NuGet.config global.json ./ +COPY src/ src/ + +RUN dotnet restore src/Server.UI/Server.UI.csproj +RUN dotnet publish src/Server.UI/Server.UI.csproj \ + --configuration Release \ + --no-restore \ + --output /app/publish + + +FROM mcr.microsoft.com/dotnet/aspnet:10.0.9@sha256:ddcf70ad1ab963a4fcd41fbd722a6b660e404e87567cfbd46fd2809c21b02088 AS final +WORKDIR /app + +COPY --from=build /app/publish ./ +USER $APP_UID + +EXPOSE 8080 +ENTRYPOINT ["dotnet", "Cfo.Cats.Server.UI.dll"] diff --git a/src/Worker/Dockerfile b/src/Worker/Dockerfile new file mode 100644 index 000000000..3174c4084 --- /dev/null +++ b/src/Worker/Dockerfile @@ -0,0 +1,22 @@ +FROM mcr.microsoft.com/dotnet/sdk:10.0.300@sha256:c0790639332692a0d56cdd81ed581cfd24d040d9839764c138994866df89a3b6 AS build +WORKDIR /src + +# Copy solution-level config required to restore and build +COPY Directory.Build.props Directory.Packages.props NuGet.config global.json ./ +COPY src/ src/ + +RUN dotnet restore src/Worker/Worker.csproj +RUN dotnet publish src/Worker/Worker.csproj \ + --configuration Release \ + --no-restore \ + --output /app/publish + + +FROM mcr.microsoft.com/dotnet/aspnet:10.0.9@sha256:ddcf70ad1ab963a4fcd41fbd722a6b660e404e87567cfbd46fd2809c21b02088 AS final +WORKDIR /app + +COPY --from=build /app/publish ./ +USER $APP_UID + +EXPOSE 8080 +ENTRYPOINT ["dotnet", "Cfo.Cats.Worker.dll"] From 328a4c826999b1030cb56353f20e5e0e47ab4fe8 Mon Sep 17 00:00:00 2001 From: Sam Gibson <140488216+samgibsonmoj@users.noreply.github.com> Date: Tue, 23 Jun 2026 09:23:14 +0100 Subject: [PATCH 24/25] Update helper comment Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- helm_deploy/cats/templates/_helpers.tpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm_deploy/cats/templates/_helpers.tpl 
b/helm_deploy/cats/templates/_helpers.tpl index 793bcf3b8..147787657 100644 --- a/helm_deploy/cats/templates/_helpers.tpl +++ b/helm_deploy/cats/templates/_helpers.tpl @@ -1,7 +1,7 @@ {{/* Environment variables that expose the MSSQL connection details from the rds-mssql-instance-output namespace secret, plus the composed connection string. -Used by the migrator and seeder Jobs. +Used by the migrator and seeder Pods. */}} {{- define "cats.databaseEnv" -}} - name: DATABASE_ADDRESS From d69fa4a8a4d5b3b52c935fd05bc4d8d2be04611b Mon Sep 17 00:00:00 2001 From: Sam Gibson Date: Tue, 23 Jun 2026 10:17:36 +0100 Subject: [PATCH 25/25] Explicit port forward deploy for environments, excluding non-dev envs --- .github/workflows/deploy.yml | 1 - .github/workflows/validate-helm.yml | 1 - helm_deploy/cats/values-dev.yaml | 4 ++++ helm_deploy/cats/values-production.yaml | 4 ++++ helm_deploy/cats/values-staging.yaml | 4 ++++ helm_deploy/cats/values.yaml | 4 ++-- 6 files changed, 14 insertions(+), 4 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 027feac54..139a6d770 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -152,7 +152,6 @@ jobs: --set worker.enabled=true \ --set rabbitmq.enabled=true \ --set redis.enabled=true \ - --set rdsPortForward.enabled=true \ --set serviceAccountName="${KUBE_NAMESPACE}" \ --set app.serviceAccountName="${KUBE_NAMESPACE}" \ --set app.image.repository="${IMAGE_REPOSITORY}" \ diff --git a/.github/workflows/validate-helm.yml b/.github/workflows/validate-helm.yml index 2ca1b0437..0101f378f 100644 --- a/.github/workflows/validate-helm.yml +++ b/.github/workflows/validate-helm.yml @@ -46,7 +46,6 @@ jobs: --set worker.enabled=true \ --set rabbitmq.enabled=true \ --set redis.enabled=true \ - --set rdsPortForward.enabled=true \ --set serviceAccountName="cfocats-$env" \ --set app.serviceAccountName="cfocats-$env" \ --set app.image.repository="example/cfocats" \ diff --git 
a/helm_deploy/cats/values-dev.yaml b/helm_deploy/cats/values-dev.yaml index 544e4adba..8df5b263f 100644 --- a/helm_deploy/cats/values-dev.yaml +++ b/helm_deploy/cats/values-dev.yaml @@ -14,6 +14,10 @@ worker: DOTNET_ENVIRONMENT: "Development" Sentry__Environment: "Development-CloudPlatform" +# RDS bridge - enabled in development only (disabled in staging/production) +rdsPortForward: + enabled: true + # todo: enable prometheus alerts # generic-prometheus-alerts: # alertSeverity: cfo-alerts-nonprod diff --git a/helm_deploy/cats/values-production.yaml b/helm_deploy/cats/values-production.yaml index 978c1eb90..88592e30d 100644 --- a/helm_deploy/cats/values-production.yaml +++ b/helm_deploy/cats/values-production.yaml @@ -30,6 +30,10 @@ worker: Sentry__Environment: "Production-CloudPlatform" Features__PresenceHub__RelayUserPresenceNotifications: "false" +# RDS bridge - disabled in non-development environments +rdsPortForward: + enabled: false + # todo: enable prometheus alerts # generic-prometheus-alerts: # alertSeverity: cfo-alerts diff --git a/helm_deploy/cats/values-staging.yaml b/helm_deploy/cats/values-staging.yaml index 66949b4e4..3f7554bb0 100644 --- a/helm_deploy/cats/values-staging.yaml +++ b/helm_deploy/cats/values-staging.yaml @@ -13,6 +13,10 @@ worker: Sentry__Environment: "Staging-CloudPlatform" Features__PresenceHub__RelayUserPresenceNotifications: "false" +# RDS bridge - disabled in non-development environments +rdsPortForward: + enabled: false + # todo: enable prometheus alerts # generic-prometheus-alerts: # alertSeverity: cfo-alerts-nonprod diff --git a/helm_deploy/cats/values.yaml b/helm_deploy/cats/values.yaml index 2a7cd8800..a09120e33 100644 --- a/helm_deploy/cats/values.yaml +++ b/helm_deploy/cats/values.yaml @@ -207,8 +207,8 @@ seeder: # --------------------------------------------------------------------------- # RDS port-forward — an ad-hoc helper Deployment that bridges to the cloud RDS -# SQL Server so it can be reached via `kubectl port-forward`. 
Off by default; -# enable on demand with --set rdsPortForward.enabled=true. +# SQL Server so it can be reached via `kubectl port-forward`. Off by default; +# enabled only by the development overlay (values-dev.yaml). # --------------------------------------------------------------------------- rdsPortForward: enabled: false