Disable spot worker pool by default

abought · abought · commit cfe5d2ca4928 · 2024-08-14T13:56:22.000-04:00
Frequent interruptions mean that spot instances have gone from being the backbone of TIS to an outright liability.
diff --git a/modules/imputation-server/main.tf b/modules/imputation-server/main.tf
@@ -242,7 +242,12 @@ EOF
 }
 
 resource "aws_emr_instance_group" "task" {
-  # EMR workers using spot instances. This is the preferred type due to cost, and has more favorable scaling options.
+  # EMR workers using spot instances.
+  # In theory, spot instances are great; in practice, their availability is unpredictable. When interruptions are
+  #   frequent, spot workers can cause more problems than they solve. This worker pool is defined in TF, but can be enabled/disabled (via var.task_instance_spot_enabled) depending on current availability in AWS.
+
+  count = var.task_instance_spot_enabled ? 1 : 0 # if spot instances are interrupted often, disable and prefer on-demand instead
+
   name           = "${var.name_prefix}-instance-group"
   cluster_id     = aws_emr_cluster.cluster.id
   instance_count = max(var.task_instance_spot_count_min, var.task_instance_spot_count_current)
diff --git a/modules/imputation-server/outputs.tf b/modules/imputation-server/outputs.tf
@@ -34,18 +34,18 @@ output "emr_ec2_attributes" {
 }
 
 output "emr_instance_group_id" {
-  description = "The EMR (spot) Instance Group ID"
-  value       = aws_emr_instance_group.task.id
+  description = "The EMR (spot) Instance Group ID, if any"
+  value       = length(aws_emr_instance_group.task) > 0 ? one(aws_emr_instance_group.task).id : null
 }
 
 output "emr_instance_group_name" {
-  description = "The name of the (spot) Instance Group"
-  value       = aws_emr_instance_group.task.name
+  description = "The name of the (spot) Instance Group, if any"
+  value       = length(aws_emr_instance_group.task) > 0 ? one(aws_emr_instance_group.task).name : null
 }
 
 output "emr_instance_group_running_instance_count" {
-  description = "The number of (spot) instances currently running in this instance group"
-  value       = aws_emr_instance_group.task.running_instance_count
+  description = "The number of (spot) instances currently running in this instance group, if any"
+  value       = length(aws_emr_instance_group.task) > 0 ? one(aws_emr_instance_group.task).running_instance_count : 0
 }
 
 output "emr_instance_group_ondemand_id" {
diff --git a/modules/imputation-server/variables.tf b/modules/imputation-server/variables.tf
@@ -224,7 +224,13 @@ variable "task_instance_spot_count_current" {
 }
 
 
-# Spot instances are the preferred worker type and should usually have higher min/max values
+# Spot instances are the ideal worker type, but if they get interrupted often, they can make things worse (instead of better)
+variable "task_instance_spot_enabled" {
+  description = "Whether to use spot instances at all. Use them when they are readily available, rarely interrupted, and jobs are short. Avoid them if big jobs will require many restarts to complete."
+  default     = false
+  type        = bool
+}
+
 variable "task_instance_spot_count_max" {
   description = "Max capacity for task instance ASG (spot)"
   default     = 50

Original file line number	Diff line number	Diff line change
`@@ -34,18 +34,18 @@ output "emr_ec2_attributes" {`
`34`	`34`	`}`
`35`	`35`
`36`	`36`	`output "emr_instance_group_id" {`
`37`		`- description = "The EMR (spot) Instance Group ID"`
`38`		`- value = aws_emr_instance_group.task.id`
	`37`	`+ description = "The EMR (spot) Instance Group ID, if any"`
	`38`	`+ value = length(aws_emr_instance_group.task) > 0 ? one(aws_emr_instance_group.task).id : null`
`39`	`39`	`}`
`40`	`40`
`41`	`41`	`output "emr_instance_group_name" {`
`42`		`- description = "The name of the (spot) Instance Group"`
`43`		`- value = aws_emr_instance_group.task.name`
	`42`	`+ description = "The name of the (spot) Instance Group, if any"`
	`43`	`+ value = length(aws_emr_instance_group.task) > 0 ? one(aws_emr_instance_group.task).name : null`
`44`	`44`	`}`
`45`	`45`
`46`	`46`	`output "emr_instance_group_running_instance_count" {`
`47`		`- description = "The number of (spot) instances currently running in this instance group"`
`48`		`- value = aws_emr_instance_group.task.running_instance_count`
	`47`	`+ description = "The number of (spot) instances currently running in this instance group, if any"`
	`48`	`+ value = length(aws_emr_instance_group.task) > 0 ? one(aws_emr_instance_group.task).running_instance_count : 0`
`49`	`49`	`}`
`50`	`50`
`51`	`51`	`output "emr_instance_group_ondemand_id" {`