Skip to content

Commit 71f11d2

Browse files
committed
Enable ADOT and OpenTelemetry tracing in forms-runner
Configure the ADOT sidecar for forms-runner and forms-queue-worker, and enable OpenTelemetry in the application containers. This is only enabled in dev and staging for now. This specifically configures OpenTelemetry to export traces to AWS X-Ray. A new buildspec was required for db-migrate, as the original task failed when the ADOT sidecar was added to the main forms-runner task definition. The new version selects the app container by name from the existing task definition, rather than just fetching the first container definition. This also makes it possible to enable the ADOT sidecar and OpenTelemetry in other apps. Only forms-runner is configured to use it for now.
1 parent 34fb0e5 commit 71f11d2

15 files changed

Lines changed: 350 additions & 13 deletions

File tree

infra/deployments/forms/forms-runner/main.tf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ module "forms_runner" {
4949
enable_maintenance_mode = var.forms_runner_settings.enable_maintenance_mode
5050
cloudwatch_metrics_enabled = var.forms_runner_settings.cloudwatch_metrics_enabled
5151
analytics_enabled = var.forms_runner_settings.analytics_enabled
52+
enable_opentelemetry = var.forms_runner_settings.enable_opentelemetry
5253
deploy_account_id = var.deploy_account_id
5354
ses_submission_email_from_email_address = var.forms_runner_settings.ses_submission_email_from_email_address
5455
ses_submission_email_reply_to_email_address = var.forms_runner_settings.ses_submission_email_reply_to_email_address

infra/deployments/forms/inputs.tf

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ variable "forms_runner_settings" {
166166
enable_maintenance_mode = bool
167167
cloudwatch_metrics_enabled = bool
168168
analytics_enabled = bool
169+
enable_opentelemetry = optional(bool, false)
169170
allow_human_readonly_roles_to_assume_submissions_to_s3_role = bool
170171
allow_human_readonly_roles_to_assume_submissions_to_runner_role = bool
171172
ses_submission_email_from_email_address = string
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
version: 0.2
2+
phases:
3+
pre_build:
4+
commands:
5+
# Check Image URI is not the default pipeline value
6+
- |
7+
if [[ "${IMAGE_URI}" = "MUST_BE_SET" ]]; then
8+
echo "The IMAGE_URI has not been set by the caller. The value of IMAGE_URI is \"${IMAGE_URI}\""
9+
exit 1
10+
fi
11+
12+
# Get task definition
13+
- echo "Existing ECS task definition name is ${TASK_DEFINITION_NAME}"
14+
- ECS_TASK_DEFINITION=$(aws ecs describe-task-definition --task-definition "${TASK_DEFINITION_NAME}")
15+
16+
# Delete any reference to the old image from the task definition and add the new image uri
17+
- NEW_ECS_TASK_DEFINITION=$(echo "${ECS_TASK_DEFINITION}" | jq --arg "INPUT_IMAGE_URI" "${IMAGE_URI}" --arg "APP_NAME" "${APP_NAME}" '.taskDefinition | .containerDefinitions |= map(if .name == $APP_NAME then .image = $INPUT_IMAGE_URI else . end) | del(.taskDefinitionArn) | del(.revision) | del(.status) | del(.requiresAttributes) | del(.compatibilities) | del(.registeredAt) | del(.registeredBy)')
18+
- echo "Replaced image uri with ${IMAGE_URI} for container ${APP_NAME}"
19+
20+
# Register a new task definition
21+
- |
22+
NEW_ECS_TASK_DEFINITION_ARN=$(aws ecs register-task-definition --cli-input-json "$NEW_ECS_TASK_DEFINITION" | jq -r '.taskDefinition.taskDefinitionArn' )
23+
echo "New ECS task definition ARN ${NEW_ECS_TASK_DEFINITION_ARN}"
24+
25+
# Get the existing configuration
26+
# Resolve the cluster ARN from its name. Uses the documented `--clusters` option rather than relying on the CLI accepting the `--cluster` prefix abbreviation.
- ECS_CLUSTER_ARN=$(aws ecs describe-clusters --clusters "${CLUSTER_NAME}" | jq -r '.clusters[].clusterArn')
27+
# Confirm a container named APP_NAME exists in the task definition (empty if not). Reuses the JSON already held in ECS_TASK_DEFINITION instead of calling the ECS API a second time.
- ECS_TASK_CONTAINER_DEFINITION_NAME=$(echo "${ECS_TASK_DEFINITION}" | jq --arg "APP_NAME" "${APP_NAME}" -r '.taskDefinition.containerDefinitions[] | select(.name == $APP_NAME) | .name')
28+
- ECS_TASK_NETWORK_CONFIGURATION=$(aws ecs describe-services --cluster "${CLUSTER_NAME}" --services "${APP_NAME}" | jq '.services[].networkConfiguration[]')
29+
30+
# Write new task definition overriding the command the container runs
31+
- |
32+
jq -nrM \
33+
--arg "ECS_CLUSTER_ARN" "${ECS_CLUSTER_ARN}" \
34+
--arg "CONTAINER_DEFINITION_NAME" "${ECS_TASK_CONTAINER_DEFINITION_NAME}" \
35+
--arg "ECS_TASK_DEFINITION_ARN" "${NEW_ECS_TASK_DEFINITION_ARN}" \
36+
--argjson "ECS_TASK_NETWORK_CONFIGURATION" "${ECS_TASK_NETWORK_CONFIGURATION}" \
37+
'{
38+
"cluster": $ECS_CLUSTER_ARN,
39+
"taskDefinition": $ECS_TASK_DEFINITION_ARN,
40+
"count": 1,
41+
"launchType": "FARGATE",
42+
"overrides": {
43+
"containerOverrides": [
44+
{
45+
"name": $CONTAINER_DEFINITION_NAME,
46+
"command": ["rake", "db:migrate"],
47+
"environment": [
48+
{ "name": "VERBOSE", "value": "true" }
49+
]
50+
}
51+
]
52+
},
53+
"networkConfiguration": {
54+
"awsvpcConfiguration": $ECS_TASK_NETWORK_CONFIGURATION
55+
}
56+
}' > db_migrate_task_definition.json
57+
58+
- echo -e "New task definition \n $(<db_migrate_task_definition.json )"
59+
build:
60+
commands:
61+
- echo "Running database migration for ${APP_NAME}"
62+
- RUNNING_TASK_ARN=$(aws ecs run-task --cli-input-json "file://db_migrate_task_definition.json" | jq -r '.tasks[].taskArn')
63+
- echo "Running task ARN ${RUNNING_TASK_ARN}"
64+
- echo "Waiting for the task to finish"
65+
- aws ecs wait tasks-stopped --tasks "${RUNNING_TASK_ARN}" --cluster "${ECS_CLUSTER_ARN}"
66+
# Determine the exit code for the running task
67+
# Determine if the running task or the CodeBuild build failed
68+
# No failures: 0
69+
# Any failures: 1
70+
- |
71+
RUNNING_TASK_EXIT_CODE=$(\
72+
aws ecs describe-tasks --tasks "${RUNNING_TASK_ARN}" --cluster "${ECS_CLUSTER_ARN}" \
73+
| jq --arg "CONTAINER_NAME" "${ECS_TASK_CONTAINER_DEFINITION_NAME}" -r '.tasks[0].containers[] | select(.name == $CONTAINER_NAME) | .exitCode'
74+
)
75+
# Propagate the app container's exit code to CodeBuild.
# If the container never started (e.g. image pull failure) jq prints "null" or nothing,
# which `exit` cannot parse; report that explicitly and fail the build with 1.
- |
  if [[ "${RUNNING_TASK_EXIT_CODE}" =~ ^[0-9]+$ ]]; then
    exit "${RUNNING_TASK_EXIT_CODE}"
  else
    echo "No numeric exit code found for container ${ECS_TASK_CONTAINER_DEFINITION_NAME} (got '${RUNNING_TASK_EXIT_CODE}'), treating as failure"
    exit 1
  fi

infra/deployments/forms/pipelines/deploy-forms-runner-container.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -328,7 +328,7 @@ module "db_migrate_runner" {
328328
project_description = "Run database migrations"
329329
environment = var.environment_name
330330
artifact_store_arn = module.artifact_bucket.arn
331-
buildspec = file("${path.root}/buildspecs/db-migrate/db-migrate.yml")
331+
buildspec = file("${path.root}/buildspecs/db-migrate/db-migrate-adot.yml")
332332
log_group_name = "codebuild/db_migrate_runner_${var.environment_name}"
333333
codebuild_service_role_arn = data.aws_iam_role.deployer_role.arn
334334
}

infra/deployments/forms/tfvars/dev.tfvars

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ forms_runner_settings = {
9696
enable_maintenance_mode = false
9797
cloudwatch_metrics_enabled = true
9898
analytics_enabled = true
99+
enable_opentelemetry = true
99100
allow_human_readonly_roles_to_assume_submissions_to_s3_role = true
100101
allow_human_readonly_roles_to_assume_submissions_to_runner_role = true
101102
ses_submission_email_from_email_address = "no-reply@dev.forms.service.gov.uk"

infra/deployments/forms/tfvars/staging.tfvars

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ forms_runner_settings = {
6161
enable_maintenance_mode = false
6262
cloudwatch_metrics_enabled = true
6363
analytics_enabled = true
64+
enable_opentelemetry = true
6465
allow_human_readonly_roles_to_assume_submissions_to_s3_role = false
6566
allow_human_readonly_roles_to_assume_submissions_to_runner_role = false
6667
ses_submission_email_from_email_address = "no-reply@staging.forms.service.gov.uk"

infra/modules/deployer-access/manage-ecs-service-policy.tf

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,11 @@ data "aws_iam_policy_document" "ecs" {
138138
]
139139
resources = [
140140
"arn:aws:iam::${var.account_id}:policy/${var.environment_name}-forms-admin-ecs-task-policy",
141+
"arn:aws:iam::${var.account_id}:policy/${var.environment_name}-forms-admin-adot-collector",
141142
"arn:aws:iam::${var.account_id}:policy/${var.environment_name}-forms-runner-ecs-task-policy",
142-
"arn:aws:iam::${var.account_id}:policy/${var.environment_name}-forms-product-page-ecs-task-policy"
143+
"arn:aws:iam::${var.account_id}:policy/${var.environment_name}-forms-runner-adot-collector",
144+
"arn:aws:iam::${var.account_id}:policy/${var.environment_name}-forms-product-page-ecs-task-policy",
145+
"arn:aws:iam::${var.account_id}:policy/${var.environment_name}-forms-product-page-adot-collector"
143146
]
144147
effect = "Allow"
145148
}
@@ -281,9 +284,12 @@ data "aws_iam_policy_document" "logs" {
281284
]
282285
resources = [
283286
"arn:aws:logs:eu-west-2:${var.account_id}:log-group:/aws/ecs/forms-admin-${var.environment_name}:*",
287+
"arn:aws:logs:eu-west-2:${var.account_id}:log-group:/aws/ecs/forms-admin-${var.environment_name}/adot-collector:*",
284288
"arn:aws:logs:eu-west-2:${var.account_id}:log-group:/aws/ecs/forms-runner-${var.environment_name}:*",
289+
"arn:aws:logs:eu-west-2:${var.account_id}:log-group:/aws/ecs/forms-runner-${var.environment_name}/adot-collector:*",
285290
"arn:aws:logs:eu-west-2:${var.account_id}:log-group:/aws/ecs/forms-runner-queue-worker-${var.environment_name}:*",
286291
"arn:aws:logs:eu-west-2:${var.account_id}:log-group:/aws/ecs/forms-product-page-${var.environment_name}:*",
292+
"arn:aws:logs:eu-west-2:${var.account_id}:log-group:/aws/ecs/forms-product-page-${var.environment_name}/adot-collector:*",
287293
]
288294
effect = "Allow"
289295
}

infra/modules/ecs-service/ecs.tf

Lines changed: 75 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,39 @@ data "aws_ecs_container_definition" "active_container" {
1111

1212
locals {
1313
log_group_name = "/aws/ecs/${var.application}-${var.env_name}"
14+
adot_log_group_name = "/aws/ecs/${var.application}-${var.env_name}/adot-collector"
1415
task_definition_family = "${var.env_name}_${var.application}"
1516

1617
image = var.image == null ? data.aws_ecs_container_definition.active_container[0].image : var.image
1718

1819
task_container_definition = {
19-
name = var.application,
20-
environment = var.environment_variables,
20+
name = var.application,
21+
environment = var.enable_opentelemetry ? concat(var.environment_variables, [
22+
{
23+
name = "ENABLE_OTEL"
24+
value = "true"
25+
},
26+
{
27+
name = "OTEL_EXPORTER_OTLP_ENDPOINT"
28+
value = "http://localhost:4318"
29+
},
30+
{
31+
name = "OTEL_SERVICE_NAME"
32+
value = var.application
33+
},
34+
{
35+
name = "OTEL_PROPAGATORS"
36+
value = "xray"
37+
}
38+
]) : var.environment_variables,
2139
mountPoints = [],
2240
secrets = var.secrets,
2341
image = local.image
2442
essential = true,
2543
readonlyRootFilesystem = var.readonly_root_filesystem
44+
command = null,
45+
cpu = null,
46+
memory = null,
2647
portMappings = [
2748
{
2849
hostPort = var.container_port,
@@ -40,8 +61,54 @@ locals {
4061
awslogs-stream-prefix = local.log_group_name
4162
}
4263
},
64+
healthCheck = null,
65+
dependsOn = var.enable_opentelemetry ? [
66+
{
67+
containerName = "aws-otel-collector",
68+
condition = "START"
69+
}
70+
] : []
4371
}
4472

73+
# ADOT collector sidecar container
74+
adot_container_definition = {
75+
name = "aws-otel-collector",
76+
image = var.adot_image,
77+
essential = true,
78+
readonlyRootFilesystem = false,
79+
command = [
80+
"--config=${var.adot_collector_config}"
81+
],
82+
cpu = var.adot_sidecar_cpu,
83+
memory = var.adot_sidecar_memory,
84+
logConfiguration = {
85+
logDriver = "awslogs",
86+
options = {
87+
awslogs-group = local.adot_log_group_name,
88+
awslogs-region = "eu-west-2",
89+
awslogs-stream-prefix = "adot"
90+
}
91+
},
92+
healthCheck = {
93+
command = [
94+
"CMD",
95+
"/healthcheck"
96+
],
97+
interval = 5,
98+
timeout = 6,
99+
retries = 5,
100+
startPeriod = 1
101+
},
102+
}
103+
104+
# Conditional container array composition
105+
container_definitions = var.enable_opentelemetry ? jsonencode([
106+
local.task_container_definition,
107+
local.adot_container_definition
108+
]) : jsonencode([
109+
local.task_container_definition
110+
])
111+
45112
# Extract the values needed for the ECS service network configuration
46113
# to local variable so we can ensure the same configuration is used
47114
# for any pre-deploy tasks
@@ -53,7 +120,7 @@ locals {
53120
}
54121
resource "aws_ecs_task_definition" "task" {
55122
family = local.task_definition_family
56-
container_definitions = jsonencode([local.task_container_definition])
123+
container_definitions = local.container_definitions
57124

58125
// As this terraform module doesn't deal with updating app code, we see drift every time it's applied because the image is changed elsewhere.
59126
// Enable tracking of the latest ACTIVE task definition revision rather than the one in terraform state, so that changes to the image / task revision outside of terraform are picked up and not considered drift.
@@ -70,8 +137,11 @@ resource "aws_ecs_task_definition" "task" {
70137
task_role_arn = aws_iam_role.ecs_task_role.arn
71138

72139
requires_compatibilities = ["FARGATE"]
73-
cpu = var.cpu
74-
memory = var.memory
140+
# When ADOT sidecar is enabled, round up to next valid Fargate CPU/memory configuration
141+
# Valid Fargate configs: 256/.5-2GB, 512/1-4GB, 1024/2-8GB, 2048/4-16GB, 4096/8-30GB
142+
# For forms-runner: 512 CPU + 1024 MB + ADOT (256 CPU + 512 MB) = needs 1024 CPU / 2048 MB minimum
143+
# With the ADOT sidecar enabled, choose the smallest Fargate CPU tier (512-4096) that fits app + sidecar.
# Totals up to 1024 resolve exactly as before; larger totals are no longer capped at 1024.
# A total above 4096 has no matching tier, so the index lookup errors instead of under-provisioning.
cpu = var.enable_opentelemetry ? [for tier in [512, 1024, 2048, 4096] : tier if tier >= var.cpu + var.adot_sidecar_cpu][0] : var.cpu
144+
# With the ADOT sidecar enabled, round app + sidecar memory up to a whole GB, and never go below
# the Fargate minimum for the CPU tier that fits app + sidecar (minimum memory is 2x the CPU units
# for the 512-4096 tiers). Larger totals are no longer capped at 2048.
# NOTE(review): the per-tier maximum memory is not enforced here; ECS rejects such a combination at registration.
memory = var.enable_opentelemetry ? max(ceil((var.memory + var.adot_sidecar_memory) / 1024) * 1024, 2 * [for tier in [512, 1024, 2048, 4096] : tier if tier >= var.cpu + var.adot_sidecar_cpu][0]) : var.memory
75145

76146
network_mode = "awsvpc"
77147

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# IAM permissions for AWS Distro for OpenTelemetry sidecar
2+
# These permissions allow the ADOT collector to send traces to X-Ray
3+
# Note: Container logs are handled by the task execution role, not the task role
4+
5+
data "aws_iam_policy_document" "adot_permissions" {
6+
count = var.enable_opentelemetry ? 1 : 0
7+
8+
# X-Ray permissions for trace collection
9+
statement {
10+
sid = "XRayAccess"
11+
effect = "Allow"
12+
actions = [
13+
"xray:PutTraceSegments",
14+
"xray:PutTelemetryRecords",
15+
"xray:GetSamplingRules",
16+
"xray:GetSamplingTargets",
17+
"xray:GetSamplingStatisticSummaries"
18+
]
19+
resources = ["*"]
20+
}
21+
}
22+
23+
resource "aws_iam_policy" "adot_policy" {
24+
count = var.enable_opentelemetry ? 1 : 0
25+
name = "${var.env_name}-${var.application}-adot-collector"
26+
policy = data.aws_iam_policy_document.adot_permissions[0].json
27+
}
28+
29+
resource "aws_iam_role_policy_attachment" "adot_policy_attachment" {
30+
count = var.enable_opentelemetry ? 1 : 0
31+
role = aws_iam_role.ecs_task_role.name
32+
policy_arn = aws_iam_policy.adot_policy[0].arn
33+
}

infra/modules/ecs-service/logging.tf

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,15 @@ resource "aws_cloudwatch_log_group" "log" {
55
retention_in_days = 30
66
}
77

8+
# Separate log group for ADOT collector
9+
resource "aws_cloudwatch_log_group" "adot_log" {
10+
count = var.enable_opentelemetry ? 1 : 0
11+
#checkov:skip=CKV_AWS_338:We're happy with 30 days retention for now
12+
#checkov:skip=CKV_AWS_158:Default AWS SSE is sufficient, no need for CM KMS.
13+
name = local.adot_log_group_name
14+
retention_in_days = 30
15+
}
16+
817
module "cribl_well_known" {
918
source = "../well-known/cribl"
1019
}
@@ -21,3 +30,17 @@ resource "aws_cloudwatch_log_subscription_filter" "via_cribl_to_splunk" {
2130
distribution = "ByLogStream"
2231
role_arn = var.kinesis_subscription_role_arn
2332
}
33+
34+
# Subscribe ADOT logs to Cribl/Splunk
35+
resource "aws_cloudwatch_log_subscription_filter" "adot_via_cribl_to_splunk" {
36+
count = var.enable_opentelemetry && var.kinesis_subscription_role_arn != "" ? 1 : 0
37+
38+
name = "adot-via-cribl-to-splunk"
39+
40+
log_group_name = aws_cloudwatch_log_group.adot_log[0].name
41+
42+
filter_pattern = ""
43+
destination_arn = module.cribl_well_known.kinesis_destination_arns["eu-west-2"]
44+
distribution = "ByLogStream"
45+
role_arn = var.kinesis_subscription_role_arn
46+
}

0 commit comments

Comments
 (0)