Skip to content

Commit cfe5d2c

Browse files
committed
Disable spot worker pool by default
Frequent interruptions mean that spot instances have gone from being the backbone of TIS to an outright liability
1 parent a1a872b commit cfe5d2c

File tree

3 files changed

+19
-8
lines changed

3 files changed

+19
-8
lines changed

modules/imputation-server/main.tf

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,12 @@ EOF
242242
}
243243

244244
resource "aws_emr_instance_group" "task" {
245-
# EMR workers using spot instances. This is the preferred type due to cost, and has more favorable scaling options.
245+
# EMR workers using spot instances.
246+
# In theory, spot instances are great; in practice, their availability is unpredictable. Frequent interruptions can
247+
# actually cause more problems than they solve. This worker pool is defined in TF, but can be enabled/disabled depending on current availability in AWS.
248+
249+
count = var.task_instance_spot_enabled ? 1 : 0 # if spot instances are interrupted often, disable and prefer on-demand instead
250+
246251
name = "${var.name_prefix}-instance-group"
247252
cluster_id = aws_emr_cluster.cluster.id
248253
instance_count = max(var.task_instance_spot_count_min, var.task_instance_spot_count_current)

modules/imputation-server/outputs.tf

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,18 +34,18 @@ output "emr_ec2_attributes" {
3434
}
3535

3636
output "emr_instance_group_id" {
37-
description = "The EMR (spot) Instance Group ID"
38-
value = aws_emr_instance_group.task.id
37+
description = "The EMR (spot) Instance Group ID, if any"
38+
value = length(aws_emr_instance_group.task) > 0 ? one(aws_emr_instance_group.task).id : null
3939
}
4040

4141
output "emr_instance_group_name" {
42-
description = "The name of the (spot) Instance Group"
43-
value = aws_emr_instance_group.task.name
42+
description = "The name of the (spot) Instance Group, if any"
43+
value = length(aws_emr_instance_group.task) > 0 ? one(aws_emr_instance_group.task).name : null
4444
}
4545

4646
output "emr_instance_group_running_instance_count" {
47-
description = "The number of (spot) instances currently running in this instance group"
48-
value = aws_emr_instance_group.task.running_instance_count
47+
description = "The number of (spot) instances currently running in this instance group, if any"
48+
value = length(aws_emr_instance_group.task) > 0 ? one(aws_emr_instance_group.task).running_instance_count : 0
4949
}
5050

5151
output "emr_instance_group_ondemand_id" {

modules/imputation-server/variables.tf

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,13 @@ variable "task_instance_spot_count_current" {
224224
}
225225

226226

227-
# Spot instances are the preferred worker type and should usually have higher min/max values
227+
# Spot instances are the ideal worker type, but if they get interrupted often, they can make things worse (instead of better)
228+
variable "task_instance_spot_enabled" {
229+
description = "Whether to use spot instances at all. Use them when they are readily available, rarely interrupted, and jobs are short. Avoid them if big jobs will require many restarts to complete."
230+
default = false
231+
type = bool
232+
}
233+
228234
variable "task_instance_spot_count_max" {
229235
description = "Max capacity for task instance ASG (spot)"
230236
default = 50

0 commit comments

Comments
 (0)