From 2dca5c8a780c43914a6143c22a7b38eac4ba8c6d Mon Sep 17 00:00:00 2001 From: Sarah Young Date: Mon, 2 Jun 2025 13:46:08 +0100 Subject: [PATCH 1/5] Introduce SolidQueue ECS task * Create ECS service, ECS task definition, security group, and relevant security group rules. I used a pattern I noticed in mailchimp sync where I took the exported ECS task container definition and override some parts of it --- infra/modules/forms-runner/queue_worker.tf | 99 ++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 infra/modules/forms-runner/queue_worker.tf diff --git a/infra/modules/forms-runner/queue_worker.tf b/infra/modules/forms-runner/queue_worker.tf new file mode 100644 index 000000000..26d3f95bb --- /dev/null +++ b/infra/modules/forms-runner/queue_worker.tf @@ -0,0 +1,99 @@ +locals { + queue_worker_name = "forms-runner-queue-worker" + + # Take the exported task container definition and override some parts of it + queue_worker_container_definitions = merge( + module.ecs_service.task_container_definition, + { + name = local.queue_worker_name, + command = ["bin/jobs"] + + healthCheck = { + command = ["CMD-SHELL", "test -f tmp/solidqueue_healthcheck || exit 1"] + interval = 30 + timeout = 5 + retries = 3 + startPeriod = 10 + } + + logConfiguration = { + logDriver = "awslogs", + options = { + awslogs-group = module.ecs_service.application_log_group_name, + awslogs-region = "eu-west-2", + awslogs-stream-prefix = "forms-runner-${var.env_name}-queue-worker" + } + }, + } + ) +} + +resource "aws_ecs_task_definition" "queue_worker" { + family = "${var.env_name}-${local.queue_worker_name}" + container_definitions = jsonencode([local.queue_worker_container_definitions]) + execution_role_arn = module.ecs_service.task_definition.execution_role_arn + task_role_arn = module.ecs_service.task_definition.task_role_arn + requires_compatibilities = module.ecs_service.task_definition.requires_compatibilities + cpu = module.ecs_service.task_definition.cpu + memory = module.ecs_service.task_definition.memory + network_mode = "awsvpc" + + runtime_platform { + operating_system_family = "LINUX" + cpu_architecture = "ARM64" + } +} + +resource "aws_ecs_service" "queue_worker" { + #checkov:skip=CKV_AWS_332:We don't want to target "LATEST" and get a surprise when a new version is released. + #checkov:skip=CKV2_FORMS_AWS_2:The queue worker currently doesn't autoscale, revisit this decision by 23/06/2025 + name = local.queue_worker_name + cluster = var.ecs_cluster_arn + desired_count = 3 + + task_definition = aws_ecs_task_definition.queue_worker.arn + deployment_maximum_percent = "200" + deployment_minimum_healthy_percent = "100" + + launch_type = "FARGATE" + platform_version = "1.4.0" + + lifecycle { + prevent_destroy = true # ECS services cannot be destructively replaced without downtime. This helps to avoid accidentally doing so. + } + + network_configuration { + subnets = var.private_subnet_ids + security_groups = [aws_security_group.queue_worker.id] + assign_public_ip = false + } +} + +resource "aws_security_group" "queue_worker" { + name = local.queue_worker_name + description = "Restrict all ingress, allow egress to VPC, RDS, and internet" + vpc_id = var.vpc_id + egress { + description = "Permit outbound to VPC CIDR on 443" + from_port = 443 + to_port = 443 + protocol = "tcp" + cidr_blocks = [var.vpc_cidr_block] + } + + egress { + description = "Permit outbound to the RDS postgres port 5432" + from_port = 5432 + to_port = 5432 + protocol = "tcp" + cidr_blocks = [var.vpc_cidr_block] + } + + egress { + description = "Permit outbound 443 to the internet" + from_port = 443 + to_port = 443 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } +} \ No newline at end of file From eb4f41fe6eee24b562d8989bf66a88767ad45066 Mon Sep 17 00:00:00 2001 From: Sarah Young Date: Mon, 2 Jun 2025 13:59:25 +0100 Subject: [PATCH 2/5] Add required secrets * Provide the worker task access to a subset of secrets (I checked with the devs and it will only need access to these secrets, the worker won't need access to NOTIFY_API_KEY or SUBMISSION_STATUS_API) --- infra/modules/forms-runner/queue_worker.tf | 25 +++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/infra/modules/forms-runner/queue_worker.tf b/infra/modules/forms-runner/queue_worker.tf index 26d3f95bb..95bf25f78 100644 --- a/infra/modules/forms-runner/queue_worker.tf +++ b/infra/modules/forms-runner/queue_worker.tf @@ -23,7 +23,30 @@ locals { awslogs-region = "eu-west-2", awslogs-stream-prefix = "forms-runner-${var.env_name}-queue-worker" } - }, + } + + secrets = [ + { + name = "SETTINGS__FORMS_API__AUTH_KEY", + valueFrom = "arn:aws:ssm:eu-west-2:${data.aws_caller_identity.current.account_id}:parameter/forms-runner-${var.env_name}/forms-api-key" + }, + { + name = "SETTINGS__SENTRY__DSN", + valueFrom = "arn:aws:ssm:eu-west-2:${data.aws_caller_identity.current.account_id}:parameter/forms-runner-queue-worker-${var.env_name}/sentry/dsn" + }, + { + name = "SECRET_KEY_BASE", + valueFrom = "arn:aws:ssm:eu-west-2:${data.aws_caller_identity.current.account_id}:parameter/forms-runner-${var.env_name}/secret-key-base" + }, + { + name = "DATABASE_URL", + valueFrom = "arn:aws:ssm:eu-west-2:${data.aws_caller_identity.current.account_id}:parameter/forms-runner-${var.env_name}/database/url" + }, + { + name = "QUEUE_DATABASE_URL", + valueFrom = "arn:aws:ssm:eu-west-2:${data.aws_caller_identity.current.account_id}:parameter/forms-runner-queue-${var.env_name}/database/url" + } + ] } ) } From ec7129baf384b99d7da467a3edd6eb9c091d0037 Mon Sep 17 00:00:00 2001 From: Sarah Young Date: Mon, 2 Jun 2025 14:03:20 +0100 Subject: [PATCH 3/5] Add IAM roles * Create a specific IAM task execution role for the queue worker with permissions to read relevant Parameter Store values (again, I don't think the queue worker needs access to all the secrets forms-runner has access to). * This code somewhat duplicates what's in `ecs-service/iam.tf` and the policies declared in `forms-runner/main.tf` but I think the duplication is simpler than forcing the ECS task into the ecs-service module format. * The worker will use forms-runner task role (`module.ecs_service.task_definition.task_role_arn`) to ensure it has access to necessary forms-runner related resources (I believe this is required because the worker errors if I use a custom role is since it requires access to the submission_email_ses_bounces_and_complaints SQS queue). --- infra/modules/forms-runner/queue_worker.tf | 58 +++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/infra/modules/forms-runner/queue_worker.tf b/infra/modules/forms-runner/queue_worker.tf index 95bf25f78..be793b493 100644 --- a/infra/modules/forms-runner/queue_worker.tf +++ b/infra/modules/forms-runner/queue_worker.tf @@ -54,7 +54,7 @@ locals { resource "aws_ecs_task_definition" "queue_worker" { family = "${var.env_name}-${local.queue_worker_name}" container_definitions = jsonencode([local.queue_worker_container_definitions]) - execution_role_arn = module.ecs_service.task_definition.execution_role_arn + execution_role_arn = aws_iam_role.ecs_task_exec_role.arn task_role_arn = module.ecs_service.task_definition.task_role_arn requires_compatibilities = module.ecs_service.task_definition.requires_compatibilities cpu = module.ecs_service.task_definition.cpu @@ -119,4 +119,60 @@ resource "aws_security_group" "queue_worker" { protocol = "tcp" cidr_blocks = ["0.0.0.0/0"] } +} + +resource "aws_iam_role" "ecs_task_exec_role" { + name = "${var.env_name}-${local.queue_worker_name}-ecs-task-exec" + description = "Used by ECS to create forms-runner-queue-worker task" + assume_role_policy = data.aws_iam_policy_document.ecs_task_exec_role_assume_role.json +} + +data "aws_iam_policy_document" "ecs_task_exec_role_assume_role" { + statement { + sid = "AllowECS" + actions = ["sts:AssumeRole"] + effect = "Allow" + + principals { + type = "Service" + identifiers = ["ecs-tasks.amazonaws.com"] + } + } +} + +resource "aws_iam_role_policy_attachment" "ecs_task_exec_standard_policy" { + role = aws_iam_role.ecs_task_exec_role.name + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" +} + +resource "aws_iam_policy" "ecs_task_exec_additional_policy" { + name = "${var.env_name}-${local.queue_worker_name}-ecs-task-additional-policies" + policy = data.aws_iam_policy_document.queue_worker_ecs_task_exec_additional_policy.json +} + +resource "aws_iam_role_policy_attachment" "ecs_task_exec_additional_policy" { + role = aws_iam_role.ecs_task_exec_role.name + policy_arn = aws_iam_policy.ecs_task_exec_additional_policy.arn +} + +data "aws_iam_policy_document" "queue_worker_ecs_task_exec_additional_policy" { + statement { + actions = [ + "ssm:DescribeParameters" + ] + resources = ["*"] + effect = "Allow" + } + statement { + actions = [ + "ssm:GetParameters" + ] + resources = [ + "arn:aws:ssm:eu-west-2:${data.aws_caller_identity.current.account_id}:parameter/forms-runner-queue-worker-${var.env_name}/sentry/dsn", + "arn:aws:ssm:eu-west-2:${data.aws_caller_identity.current.account_id}:parameter/forms-runner-${var.env_name}/secret-key-base", + "arn:aws:ssm:eu-west-2:${data.aws_caller_identity.current.account_id}:parameter/forms-runner-${var.env_name}/database/url", + "arn:aws:ssm:eu-west-2:${data.aws_caller_identity.current.account_id}:parameter/forms-runner-queue-${var.env_name}/database/url" + ] + effect = "Allow" + } } \ No newline at end of file From 526b2d8c19f6dcc4daa19cc93be62ec3bf34cfb5 Mon Sep 17 00:00:00 2001 From: Sarah Young Date: Tue, 3 Jun 2025 14:23:09 +0100 Subject: [PATCH 4/5] Update queue worker desired count * Make the `desired_count` have a minimum capacity for each environment (for production this is 6, for all other environments it's 1). * The capacity is set relatively low for environments (except production) because so far the demand on the queue hasn't been high (queue length rarely exceeds 1) -- Cat raised the good point that we don't want to over-provision --- infra/deployments/forms/forms-runner/main.tf | 1 + infra/deployments/forms/inputs.tf | 1 + infra/deployments/forms/tfvars/dev.tfvars | 1 + infra/deployments/forms/tfvars/production.tfvars | 1 + infra/deployments/forms/tfvars/staging.tfvars | 1 + infra/deployments/forms/tfvars/user-research.tfvars | 1 + infra/modules/forms-runner/queue_worker.tf | 2 +- infra/modules/forms-runner/variables.tf | 5 +++++ 8 files changed, 12 insertions(+), 1 deletion(-) diff --git a/infra/deployments/forms/forms-runner/main.tf b/infra/deployments/forms/forms-runner/main.tf index 6bec2f877..e7a7d58f1 100644 --- a/infra/deployments/forms/forms-runner/main.tf +++ b/infra/deployments/forms/forms-runner/main.tf @@ -67,4 +67,5 @@ module "forms_runner" { send_logs_to_cyber = var.send_logs_to_cyber bounces_and_complaints_kms_key_arn = data.terraform_remote_state.forms_ses.outputs.submission_email_bounces_and_complaints_kms_key_arn deliveries_kms_key_arn = data.terraform_remote_state.forms_ses.outputs.submission_email_successful_deliveries_kms_key_arn + queue_worker_capacity = var.forms_runner_settings.queue_worker_capacity } diff --git a/infra/deployments/forms/inputs.tf b/infra/deployments/forms/inputs.tf index 7f8d3bb3b..7c4c2176d 100644 --- a/infra/deployments/forms/inputs.tf +++ b/infra/deployments/forms/inputs.tf @@ -182,6 +182,7 @@ variable "forms_runner_settings" { allow_human_readonly_roles_to_assume_submissions_to_runner_role = bool ses_submission_email_from_email_address = string ses_submission_email_reply_to_email_address = string + queue_worker_capacity = string }) } diff --git a/infra/deployments/forms/tfvars/dev.tfvars b/infra/deployments/forms/tfvars/dev.tfvars index 8da441889..9d03ed636 100644 --- a/infra/deployments/forms/tfvars/dev.tfvars +++ b/infra/deployments/forms/tfvars/dev.tfvars @@ -105,6 +105,7 @@ forms_runner_settings = { allow_human_readonly_roles_to_assume_submissions_to_runner_role = true ses_submission_email_from_email_address = "no-reply@dev.forms.service.gov.uk" ses_submission_email_reply_to_email_address = "no-reply@dev.forms.service.gov.uk" + queue_worker_capacity = 1 } scheduled_smoke_tests_settings = { enable_scheduled_smoke_tests = true diff --git a/infra/deployments/forms/tfvars/production.tfvars b/infra/deployments/forms/tfvars/production.tfvars index df49d67f4..76587d367 100644 --- a/infra/deployments/forms/tfvars/production.tfvars +++ b/infra/deployments/forms/tfvars/production.tfvars @@ -150,6 +150,7 @@ forms_runner_settings = { allow_human_readonly_roles_to_assume_submissions_to_runner_role = false ses_submission_email_from_email_address = "no-reply@forms.service.gov.uk" ses_submission_email_reply_to_email_address = "no-reply@forms.service.gov.uk" + queue_worker_capacity = 6 } scheduled_smoke_tests_settings = { enable_scheduled_smoke_tests = true diff --git a/infra/deployments/forms/tfvars/staging.tfvars b/infra/deployments/forms/tfvars/staging.tfvars index 8294e6a14..0fdcde308 100644 --- a/infra/deployments/forms/tfvars/staging.tfvars +++ b/infra/deployments/forms/tfvars/staging.tfvars @@ -70,6 +70,7 @@ forms_runner_settings = { allow_human_readonly_roles_to_assume_submissions_to_runner_role = false ses_submission_email_from_email_address = "no-reply@staging.forms.service.gov.uk" ses_submission_email_reply_to_email_address = "no-reply@staging.forms.service.gov.uk" + queue_worker_capacity = 1 } scheduled_smoke_tests_settings = { enable_scheduled_smoke_tests = true diff --git a/infra/deployments/forms/tfvars/user-research.tfvars b/infra/deployments/forms/tfvars/user-research.tfvars index 0d71fa55d..2865e5e0e 100644 --- a/infra/deployments/forms/tfvars/user-research.tfvars +++ b/infra/deployments/forms/tfvars/user-research.tfvars @@ -68,6 +68,7 @@ forms_runner_settings = { ses_submission_email_reply_to_email_address = "no-reply@research.forms.service.gov.uk" allow_human_readonly_roles_to_assume_submissions_to_s3_role = false allow_human_readonly_roles_to_assume_submissions_to_runner_role = false + queue_worker_capacity = 1 } scheduled_smoke_tests_settings = { enable_scheduled_smoke_tests = false diff --git a/infra/modules/forms-runner/queue_worker.tf b/infra/modules/forms-runner/queue_worker.tf index be793b493..549856bda 100644 --- a/infra/modules/forms-runner/queue_worker.tf +++ b/infra/modules/forms-runner/queue_worker.tf @@ -72,7 +72,7 @@ resource "aws_ecs_service" "queue_worker" { #checkov:skip=CKV2_FORMS_AWS_2:The queue worker currently doesn't autoscale, revisit this decision by 23/06/2025 name = local.queue_worker_name cluster = var.ecs_cluster_arn - desired_count = 3 + desired_count = var.queue_worker_capacity task_definition = aws_ecs_task_definition.queue_worker.arn deployment_maximum_percent = "200" diff --git a/infra/modules/forms-runner/variables.tf b/infra/modules/forms-runner/variables.tf index ad37d211a..8b92492bb 100644 --- a/infra/modules/forms-runner/variables.tf +++ b/infra/modules/forms-runner/variables.tf @@ -169,3 +169,8 @@ variable "deliveries_kms_key_arn" { type = string description = "The ARN of the KMS key to decrypt messages on the submission deliveries SQS queue" } + +variable "queue_worker_capacity" { + type = number + description = "Sets the desired number of tasks for the SolidQueue worker" +} \ No newline at end of file From 7685bf2c51da03f20a7d4fe75c3f27fab2b7879a Mon Sep 17 00:00:00 2001 From: Sarah Young Date: Wed, 4 Jun 2025 15:44:42 +0100 Subject: [PATCH 5/5] Fix rebase error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add SSM parameter created from PR #1591 * I rebased PR #1591 on to this branch in the GitHub UI (not a good idea), and it absorbed the queue_worker.tf file from PR #1591 as a separate file instead of merging the files together. * This happened because I didn't take into account the lack of shared file history from git's perspective. * The queue_worker.tf on the `create-solidqueue-worker-ecs-task` branch was never branched off from the queue_worker.tf in main (i.e., it was created independently rather than modified from the original). Therefore, Git doesn’t know they’re related. It treats them as two different files that just happen to have the same name. --- infra/modules/forms-runner/queue-worker.tf | 17 ----------------- infra/modules/forms-runner/queue_worker.tf | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 17 deletions(-) delete mode 100644 infra/modules/forms-runner/queue-worker.tf diff --git a/infra/modules/forms-runner/queue-worker.tf b/infra/modules/forms-runner/queue-worker.tf deleted file mode 100644 index f761a4d03..000000000 --- a/infra/modules/forms-runner/queue-worker.tf +++ /dev/null @@ -1,17 +0,0 @@ -locals { - queue_worker_name = "forms-runner-queue-worker" -} - -resource "aws_ssm_parameter" "queue_worker_sentry_dsn" { - #checkov:skip=CKV_AWS_337:The parameter is already using the default key - name = "/${local.queue_worker_name}-${var.env_name}/sentry/dsn" - type = "SecureString" - value = "dummy_value" - - description = "Sentry DSN value for ${local.queue_worker_name} in the ${var.env_name} environment" - - lifecycle { - ignore_changes = [value] - prevent_destroy = true - } -} \ No newline at end of file diff --git a/infra/modules/forms-runner/queue_worker.tf b/infra/modules/forms-runner/queue_worker.tf index 549856bda..3b4eae797 100644 --- a/infra/modules/forms-runner/queue_worker.tf +++ b/infra/modules/forms-runner/queue_worker.tf @@ -175,4 +175,18 @@ data "aws_iam_policy_document" "queue_worker_ecs_task_exec_additional_policy" { ] effect = "Allow" } +} + +resource "aws_ssm_parameter" "queue_worker_sentry_dsn" { + #checkov:skip=CKV_AWS_337:The parameter is already using the default key + name = "/${local.queue_worker_name}-${var.env_name}/sentry/dsn" + type = "SecureString" + value = "dummy_value" + + description = "Sentry DSN value for ${local.queue_worker_name} in the ${var.env_name} environment" + + lifecycle { + ignore_changes = [value] + prevent_destroy = true + } } \ No newline at end of file