Skip to content

Commit 94c0beb

Browse files
committed
Don't enable OpenTelemetry for background jobs
We only want to enable tracing for user interactions in the main app, and not for background jobs like mailchimp sync, org sync, or the queue worker. These traces would be noisy and not very useful, as they aren't triggered by user interactions, and would also require the ADOT collector to be running for these containers, which adds extra complexity and resource usage. This is specifically an issue while we're using XRay for tracing. If we switch to a different tracing backend in the future, we can revisit this and consider enabling tracing for background jobs if it's more accessible with the new backend.
1 parent 5bc428d commit 94c0beb

3 files changed

Lines changed: 38 additions & 92 deletions

File tree

infra/modules/forms-admin/mailchimp-sync.tf

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,18 @@ locals {
1717
awslogs-stream-prefix = "forms-admin-${var.env_name}-mailchimp-sync"
1818
}
1919
},
20-
# Don't require the otel collector to be running for mailchimp sync.
21-
# This overrides the `dependsOn` set in the main task container definition.
22-
dependsOn = []
20+
# Explicitly disable opentelemetry for mailchimp sync, as we only want to trace user interactions in the main app, and not background jobs.
21+
# Override the `dependsOn` set in the main task container definition, as we don't provision the collector here.
22+
dependsOn = [],
23+
# Also, strip out any OTEL_ or _OTEL envars, to disable opentelemetry for this container, even if it's enabled for the main app.
24+
environment = [
25+
for env in module.ecs_service.task_container_definition.environment :
26+
env
27+
if !startswith(env.name, "OTEL_") && !endswith(env.name, "_OTEL")
28+
]
29+
# Explicitly set CPU / Memory, as this is increased in the module if OpenTelemetry is enabled.
30+
cpu = var.cpu
31+
memory = var.memory
2332
}
2433
)
2534
}

infra/modules/forms-admin/orgs-sync.tf

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,18 @@ locals {
1717
awslogs-stream-prefix = "forms-admin-${var.env_name}-organisations-sync"
1818
}
1919
},
20-
# Don't require the otel collector to be running for organisations sync.
21-
# This overrides the `dependsOn` set in the main task container definition.
22-
dependsOn = []
20+
# Explicitly disable opentelemetry for org sync, as we only want to trace user interactions in the main app, and not background jobs.
21+
# Override the `dependsOn` set in the main task container definition, as we don't provision the collector here.
22+
dependsOn = [],
23+
# Also, strip out any OTEL_ or _OTEL envars, to disable opentelemetry for this container, even if it's enabled for the main app.
24+
environment = [
25+
for env in module.ecs_service.task_container_definition.environment :
26+
env
27+
if !startswith(env.name, "OTEL_") && !endswith(env.name, "_OTEL")
28+
]
29+
# Explicitly set CPU / Memory, as this is increased in the module if OpenTelemetry is enabled.
30+
cpu = var.cpu
31+
memory = var.memory
2332
}
2433
)
2534
}

infra/modules/forms-runner/queue_worker.tf

Lines changed: 14 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -3,28 +3,14 @@ locals {
33
queue_worker_log_group_name = "/aws/ecs/${local.queue_worker_name}-${var.env_name}"
44
queue_worker_adot_log_group_name = "/aws/ecs/${local.queue_worker_name}-${var.env_name}/adot-collector"
55

6-
queue_worker_environment = var.enable_opentelemetry ? concat(
7-
[
8-
for env in module.ecs_service.task_container_definition.environment : env
9-
if env.name != "OTEL_SERVICE_NAME"
10-
],
11-
[
12-
{
13-
name = "OTEL_SERVICE_NAME"
14-
value = local.queue_worker_name
15-
}
16-
]
17-
) : module.ecs_service.task_container_definition.environment
18-
196
# Take the exported task container definition and override some parts of it
207
# Note: the ENV variables aren't overridden because it's not possible to cherry pick them
218
# This means DISABLE_SOLID_QUEUE is always set to true, but that instruction is overridden
229
# by the command `bin/jobs` which starts the SolidQueue worker but not the Rails server
2310
queue_worker_container_definitions = merge(
2411
module.ecs_service.task_container_definition,
2512
{
26-
name = local.queue_worker_name,
27-
environment = local.queue_worker_environment,
13+
name = local.queue_worker_name,
2814

2915
command = ["bin/jobs"]
3016
image = module.ecs_service.task_container_definition.image,
@@ -45,14 +31,18 @@ locals {
4531
awslogs-stream-prefix = local.queue_worker_log_group_name
4632
}
4733
}
48-
49-
# Queue worker has ADOT sidecar when enabled
50-
dependsOn = var.enable_opentelemetry ? [
51-
{
52-
containerName = "aws-otel-collector",
53-
condition = "START"
54-
}
55-
] : []
34+
# Explicitly disable opentelemetry for the queue worker, as we only want to trace user interactions in the main app, and not background jobs.
35+
# Override the `dependsOn` set in the main task container definition, as we don't provision the collector here.
36+
dependsOn = [],
37+
# Also, strip out any OTEL_ or _OTEL envars, to disable opentelemetry for this container, even if it's enabled for the main app.
38+
environment = [
39+
for env in module.ecs_service.task_container_definition.environment :
40+
env
41+
if !startswith(env.name, "OTEL_") && !endswith(env.name, "_OTEL")
42+
]
43+
# Explicitly set CPU / Memory, as this is increased in the module if OpenTelemetry is enabled.
44+
cpu = var.cpu
45+
memory = var.memory
5646

5747
secrets = [
5848
{
@@ -78,50 +68,11 @@ locals {
7868
]
7969
}
8070
)
81-
82-
# ADOT collector sidecar container for queue worker
83-
queue_worker_adot_container_definition = {
84-
name = "aws-otel-collector",
85-
image = module.ecs_service.adot_image,
86-
essential = false,
87-
readonlyRootFilesystem = true,
88-
command = [
89-
"--config=${module.ecs_service.adot_collector_config}"
90-
],
91-
cpu = module.ecs_service.adot_sidecar_cpu,
92-
memory = module.ecs_service.adot_sidecar_memory,
93-
logConfiguration = {
94-
logDriver = "awslogs",
95-
options = {
96-
awslogs-group = local.queue_worker_adot_log_group_name,
97-
awslogs-region = "eu-west-2",
98-
awslogs-stream-prefix = "adot"
99-
}
100-
},
101-
healthCheck = {
102-
command = [
103-
"CMD",
104-
"/healthcheck"
105-
],
106-
interval = 30,
107-
timeout = 5,
108-
retries = 5,
109-
startPeriod = 10
110-
},
111-
}
112-
113-
# Conditional container array composition - jsonencode in the local to avoid Terraform type issues
114-
queue_worker_all_container_definitions = var.enable_opentelemetry ? jsonencode([
115-
local.queue_worker_container_definitions,
116-
local.queue_worker_adot_container_definition
117-
]) : jsonencode([
118-
local.queue_worker_container_definitions
119-
])
12071
}
12172

12273
resource "aws_ecs_task_definition" "queue_worker" {
12374
family = "${var.env_name}-${local.queue_worker_name}"
124-
container_definitions = local.queue_worker_all_container_definitions
75+
container_definitions = jsonencode([local.queue_worker_container_definitions])
12576
execution_role_arn = aws_iam_role.ecs_task_exec_role.arn
12677
task_role_arn = module.ecs_service.task_definition.task_role_arn
12778
requires_compatibilities = module.ecs_service.task_definition.requires_compatibilities
@@ -272,15 +223,6 @@ resource "aws_cloudwatch_log_group" "queue_worker" {
272223
retention_in_days = 30
273224
}
274225

275-
# Separate log group for ADOT collector in queue worker
276-
resource "aws_cloudwatch_log_group" "queue_worker_adot_log" {
277-
count = var.enable_opentelemetry ? 1 : 0
278-
#checkov:skip=CKV_AWS_338:We're happy with 30 days retention for now
279-
#checkov:skip=CKV_AWS_158:Default AWS SSE is sufficient, no need for CM KMS.
280-
name = local.queue_worker_adot_log_group_name
281-
retention_in_days = 30
282-
}
283-
284226
module "cribl_well_known" {
285227
source = "../well-known/cribl"
286228
}
@@ -297,17 +239,3 @@ resource "aws_cloudwatch_log_subscription_filter" "via_cribl_to_splunk" {
297239
distribution = "ByLogStream"
298240
role_arn = var.kinesis_subscription_role_arn
299241
}
300-
301-
# Subscribe ADOT logs to Cribl/Splunk
302-
resource "aws_cloudwatch_log_subscription_filter" "adot_via_cribl_to_splunk" {
303-
count = var.enable_opentelemetry && var.kinesis_subscription_role_arn != "" ? 1 : 0
304-
305-
name = "adot-via-cribl-to-splunk"
306-
307-
log_group_name = aws_cloudwatch_log_group.queue_worker_adot_log[0].name
308-
309-
filter_pattern = ""
310-
destination_arn = module.cribl_well_known.kinesis_destination_arns["eu-west-2"]
311-
distribution = "ByLogStream"
312-
role_arn = var.kinesis_subscription_role_arn
313-
}

0 commit comments

Comments
 (0)