-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathagent-scaler.tf
More file actions
217 lines (187 loc) · 7.71 KB
/
agent-scaler.tf
File metadata and controls
217 lines (187 loc) · 7.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#tfsec:ignore:aws-lambda-enable-tracing X-Ray tracing is optional and can be enabled by users if required for debugging
resource "aws_lambda_function" "scaler" {
  # Scaler Lambda for the agent Auto Scaling group. Only created when the
  # stack has a variable size.
  count = local.has_variable_size ? 1 : 0

  function_name = format("%s-scaler", local.stack_name_full)
  description   = format("Scales %s based on Buildkite metrics", aws_autoscaling_group.agent_auto_scale_group.name)
  tags          = local.common_tags

  # Deployment package: pre-built handler zip in S3; arm64 builds carry an
  # "-arm64" suffix in the object key.
  s3_bucket = local.agent_scaler_s3_bucket
  s3_key = format(
    "buildkite-agent-scaler/v%s/handler%s.zip",
    local.buildkite_agent_scaler_version,
    var.lambda_architecture == "arm64" ? "-arm64" : "",
  )

  # Runtime settings
  runtime       = "provided.al2023"
  handler       = "bootstrap"
  architectures = [var.lambda_architecture]
  memory_size   = 128
  timeout       = 120

  # Execution role: caller-supplied ARN when a custom role is configured,
  # otherwise the role managed alongside this function.
  role = local.use_custom_scaler_lambda_role ? var.scaler_lambda_role_arn : aws_iam_role.scaler_lambda_role[0].arn

  environment {
    variables = {
      # --- Required: which queue / ASG to manage and where the agent token lives
      ASG_NAME                      = aws_autoscaling_group.agent_auto_scale_group.name
      BUILDKITE_QUEUE               = var.buildkite_queue
      AGENTS_PER_INSTANCE           = tostring(var.agents_per_instance)
      BUILDKITE_AGENT_TOKEN_SSM_KEY = local.use_custom_token_path ? var.buildkite_agent_token_parameter_store_path : aws_ssm_parameter.buildkite_agent_token_parameter[0].name

      # --- Optional agent endpoint
      BUILDKITE_AGENT_ENDPOINT = var.agent_endpoint

      # --- Scaling behaviour
      SCALE_OUT_FACTOR          = tostring(var.scale_out_factor)
      SCALE_OUT_COOLDOWN_PERIOD = "${var.scale_out_cooldown_period}s"
      SCALE_IN_COOLDOWN_PERIOD  = "${var.scale_in_cooldown_period}s"
      DISABLE_SCALE_IN          = var.disable_scale_in ? "true" : "false"
      INSTANCE_BUFFER           = tostring(var.instance_buffer)
      INCLUDE_WAITING           = var.scale_out_for_waiting_jobs ? "true" : "false"

      # --- Polling loop inside a single invocation
      LAMBDA_INTERVAL = var.scaler_min_poll_interval
      LAMBDA_TIMEOUT  = "50s" # Kept below the function timeout so the handler can exit gracefully

      # --- Elastic CI Mode (experimental)
      ELASTIC_CI_MODE = var.scaler_enable_elastic_ci_mode ? "true" : "false"

      # --- CloudWatch metrics (optional); currently fixed, could be exposed as a variable
      CLOUDWATCH_METRICS = "false"
    }
  }

  # Create the log group and the basic-execution policy attachment before the
  # function itself.
  depends_on = [
    aws_cloudwatch_log_group.scaler_lambda_logs[0],
    aws_iam_role_policy_attachment.scaler_lambda_policy[0]
  ]
}
#tfsec:ignore:aws-cloudwatch-log-group-customer-key Using default encryption for CloudWatch Logs; CMK can be added by users if required
resource "aws_cloudwatch_log_group" "scaler_lambda_logs" {
  # Log group whose name follows the "/aws/lambda/<function name>" pattern for
  # the "<stack>-scaler" function; only present for variable-size stacks.
  count = local.has_variable_size ? 1 : 0

  name              = format("/aws/lambda/%s-scaler", local.stack_name_full)
  retention_in_days = var.lambda_log_retention_days
  tags              = local.common_tags
}
resource "aws_cloudwatch_event_rule" "scaler_schedule" {
  # Fixed-rate schedule for the scaler; the period comes from
  # var.scaler_event_schedule_period and is wrapped in a rate() expression.
  count = local.has_variable_size ? 1 : 0

  name                = format("%s-scaler-schedule", local.stack_name_full)
  schedule_expression = format("rate(%s)", var.scaler_event_schedule_period)
  description         = format("Triggers Buildkite agent scaler Lambda every %s", var.scaler_event_schedule_period)
  tags                = local.common_tags
}
resource "aws_cloudwatch_event_target" "scaler_lambda" {
  # Points the scaler schedule rule at the scaler Lambda function.
  count = local.has_variable_size ? 1 : 0

  target_id = "BuildkiteAgentScalerLambda"
  arn       = aws_lambda_function.scaler[0].arn
  rule      = aws_cloudwatch_event_rule.scaler_schedule[0].name
}
resource "aws_lambda_permission" "allow_eventbridge" {
  # Lets the events.amazonaws.com principal invoke the scaler function, limited
  # to invocations originating from the scaler schedule rule.
  count = local.has_variable_size ? 1 : 0

  statement_id  = "AllowExecutionFromEventBridge"
  principal     = "events.amazonaws.com"
  source_arn    = aws_cloudwatch_event_rule.scaler_schedule[0].arn
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.scaler[0].function_name
}
resource "aws_iam_role" "scaler_lambda_role" {
  # Module-managed execution role for the scaler Lambda. Created only for
  # variable-size stacks that do not bring their own role.
  count = local.has_variable_size && !local.use_custom_scaler_lambda_role ? 1 : 0

  name = format("%s-scaler-lambda-role", local.stack_name_full)
  tags = local.common_tags

  # Optional permissions boundary; left unset (null) when none is configured.
  permissions_boundary = local.use_permissions_boundary ? var.instance_role_permissions_boundary_arn : null

  # Trust policy: only the Lambda service may assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect    = "Allow"
        Action    = "sts:AssumeRole"
        Principal = { Service = "lambda.amazonaws.com" }
      }
    ]
  })
}
#tfsec:ignore:aws-iam-no-policy-wildcards Lambda requires CloudWatch Logs CreateLogGroup permission with wildcard for dynamic log group creation
resource "aws_iam_role_policy" "scaler_lambda_policy" {
  # Inline policy for the module-managed scaler role. Skipped when a custom
  # role is supplied or when the stack has a fixed size.
  count = local.use_custom_scaler_lambda_role ? 0 : (local.has_variable_size ? 1 : 0)
  name  = "${local.stack_name_full}-scaler-lambda-policy"
  role  = aws_iam_role.scaler_lambda_role[0].id

  # The statement list is assembled with concat() so optional statements can be
  # contributed as either a one-element list or an empty list.
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = concat(
      [
        # CloudWatch Logs - write access to the scaler's own log group
        {
          Effect = "Allow"
          Action = [
            "logs:CreateLogGroup",
            "logs:CreateLogStream",
            "logs:PutLogEvents"
          ]
          Resource = "${aws_cloudwatch_log_group.scaler_lambda_logs[0].arn}:*"
        },
        # Auto Scaling - Core scaler permissions
        {
          Effect = "Allow"
          Action = [
            "autoscaling:DescribeAutoScalingGroups",
            "autoscaling:DescribeScalingActivities",
            "autoscaling:SetDesiredCapacity"
          ]
          Resource = "*"
        },
        # SSM Parameter Store - Token retrieval
        {
          Effect = "Allow"
          Action = [
            "ssm:GetParameter"
          ]
          # For a custom path the ARN is built by hand. Parameter ARNs always
          # contain exactly one "/" after "parameter", whether or not the
          # parameter name itself starts with a slash, so strip any leading
          # slash and add it back explicitly. Slash-prefixed paths produce the
          # same ARN as before; bare names now produce a valid one.
          Resource = local.use_custom_token_path ? "arn:aws:ssm:${data.aws_region.current.id}:${data.aws_caller_identity.current.account_id}:parameter/${trimprefix(var.buildkite_agent_token_parameter_store_path, "/")}" : aws_ssm_parameter.buildkite_agent_token_parameter[0].arn
        }
      ],
      # KMS for encrypted SSM parameter (if using customer-managed key)
      local.use_custom_token_kms ? [
        {
          Effect = "Allow"
          Action = [
            "kms:Decrypt"
          ]
          Resource = var.buildkite_agent_token_parameter_store_kms_key
        }
      ] : [],
      # Elastic CI Mode - Enhanced permissions for graceful scale-in
      # Split into separate conditionals to avoid type mismatch
      #
      # Read-only lookups that are granted on "*". ssm:GetCommandInvocation
      # lives here rather than next to ssm:SendCommand: per the Systems Manager
      # service authorization reference it is not authorized against the
      # document / EC2 instance ARNs used for SendCommand, so scoping it to
      # those ARNs would never grant it.
      var.scaler_enable_elastic_ci_mode ? [
        {
          Effect = "Allow"
          Action = [
            "ec2:DescribeInstances",
            "ec2:DescribeInstanceStatus",
            "ssm:DescribeInstanceInformation",
            "ssm:GetCommandInvocation"
          ]
          Resource = "*"
        }
      ] : [],
      # Running commands: limited to the AWS-RunShellScript document and to
      # EC2 instances in this account and region.
      var.scaler_enable_elastic_ci_mode ? [
        {
          Effect = "Allow"
          Action = [
            "ssm:SendCommand"
          ]
          Resource = [
            "arn:aws:ssm:${data.aws_region.current.id}::document/AWS-RunShellScript",
            "arn:aws:ec2:${data.aws_region.current.id}:${data.aws_caller_identity.current.account_id}:instance/*"
          ]
        }
      ] : [],
      # Instance termination: only instances tagged as members of the agent
      # Auto Scaling group.
      var.scaler_enable_elastic_ci_mode ? [
        {
          Effect = "Allow"
          Action = [
            "ec2:TerminateInstances"
          ]
          Resource = "arn:aws:ec2:${data.aws_region.current.id}:${data.aws_caller_identity.current.account_id}:instance/*"
          Condition = {
            StringEquals = {
              "ec2:ResourceTag/aws:autoscaling:groupName" = aws_autoscaling_group.agent_auto_scale_group.name
            }
          }
        }
      ] : []
    )
  })
}
resource "aws_iam_role_policy_attachment" "scaler_lambda_policy" {
  # Attaches the AWS-managed AWSLambdaBasicExecutionRole policy to the
  # module-managed scaler role (not applied to caller-supplied roles).
  count = local.has_variable_size && !local.use_custom_scaler_lambda_role ? 1 : 0

  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
  role       = aws_iam_role.scaler_lambda_role[0].name
}