From a1a872b4f23e29f2541f6933c909c35c895eb0bb Mon Sep 17 00:00:00 2001 From: Andy Boughton Date: Wed, 14 Aug 2024 10:25:32 -0400 Subject: [PATCH 1/2] Disable SNS notifications for alarm that was noisier than intended DTADFIC, a queue of 100 items can sometimes not send any work to hadoop --- modules/imputation-server/monitoring.tf | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/imputation-server/monitoring.tf b/modules/imputation-server/monitoring.tf index 0878c92..bd51c55 100644 --- a/modules/imputation-server/monitoring.tf +++ b/modules/imputation-server/monitoring.tf @@ -1,3 +1,8 @@ +/// This alarm is a useful idea in theory, but it's noisy, and doesn't operate on the timescales we need +// It also can't account for the dual-queue design of cloudgene (the "active" vs "queued" feature): if 15 jobs +// are exporting, then even though there are 100 jobs in queue, hadoop won't be sent work, and will signal "all clear" +// No amount of alarm cleverness can compensate for a webapp that hides information from the system, which makes it hard to fix alarm just from the AWS side. +// We'll keep the alarm defined and tracking metrics, in case it aids future capacity planning. But it won't send alerts. resource "aws_cloudwatch_metric_alarm" "cluster_needs_resources" { # Warn if the system is unable to scale enough, after several hours of trying. Resolved by: # a) add spot capacity (if we're blitzed with lots of jobs), @@ -13,7 +18,7 @@ resource "aws_cloudwatch_metric_alarm" "cluster_needs_resources" { datapoints_to_alarm = 24 evaluation_periods = 24 - actions_enabled = true + actions_enabled = false # Don't send alerts- see notes. # Notify when the alarm changes state- for good or bad alarm_actions = [var.alert_sns_arn] From cfe5d2ca4928af1e51ba585a1ab22e8658ad9cbf Mon Sep 17 00:00:00 2001 From: Andy Boughton Date: Wed, 14 Aug 2024 13:56:22 -0400 Subject: [PATCH 2/2] Disable spot worker pool by default Frequent interruptions mean that spot instances have gone from being the backbone of TIS to an outright liability --- modules/imputation-server/main.tf | 7 ++++++- modules/imputation-server/outputs.tf | 12 ++++++------ modules/imputation-server/variables.tf | 8 +++++++- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/modules/imputation-server/main.tf b/modules/imputation-server/main.tf index 252f674..0c31160 100644 --- a/modules/imputation-server/main.tf +++ b/modules/imputation-server/main.tf @@ -242,7 +242,12 @@ EOF } resource "aws_emr_instance_group" "task" { - # EMR workers using spot instances. This is the preferred type due to cost, and has more favorable scaling options. + # EMR workers using spot instances. + # In theory, spot instances are great; in practice, their availability is unpredictable. Frequent interruptions can + # actually cause more problems than they solve. This worker pool is defined in TF, but can be enabled/disabled depending on current availability in AWS. + + count = var.task_instance_spot_enabled ? 1 : 0 # if spot instances are interrupted often, disable and prefer on-demand instead + name = "${var.name_prefix}-instance-group" cluster_id = aws_emr_cluster.cluster.id instance_count = max(var.task_instance_spot_count_min, var.task_instance_spot_count_current) diff --git a/modules/imputation-server/outputs.tf b/modules/imputation-server/outputs.tf index 689ae3d..ef56d64 100644 --- a/modules/imputation-server/outputs.tf +++ b/modules/imputation-server/outputs.tf @@ -34,18 +34,18 @@ output "emr_ec2_attributes" { } output "emr_instance_group_id" { - description = "The EMR (spot) Instance Group ID" - value = aws_emr_instance_group.task.id + description = "The EMR (spot) Instance Group ID, if any" + value = length(aws_emr_instance_group.task) > 0 ? one(aws_emr_instance_group.task).id : null } output "emr_instance_group_name" { - description = "The name of the (spot) Instance Group" - value = aws_emr_instance_group.task.name + description = "The name of the (spot) Instance Group, if any" + value = length(aws_emr_instance_group.task) > 0 ? one(aws_emr_instance_group.task).name : null } output "emr_instance_group_running_instance_count" { - description = "The number of (spot) instances currently running in this instance group" - value = aws_emr_instance_group.task.running_instance_count + description = "The number of (spot) instances currently running in this instance group, if any" + value = length(aws_emr_instance_group.task) > 0 ? one(aws_emr_instance_group.task).running_instance_count : 0 } output "emr_instance_group_ondemand_id" { diff --git a/modules/imputation-server/variables.tf b/modules/imputation-server/variables.tf index c30ae18..898309b 100644 --- a/modules/imputation-server/variables.tf +++ b/modules/imputation-server/variables.tf @@ -224,7 +224,13 @@ variable "task_instance_spot_count_current" { } -# Spot instances are the preferred worker type and should usually have higher min/max values +# Spot instances are the ideal worker type, but if they get interrupted often, they can make things worse (instead of better) +variable "task_instance_spot_enabled" { + description = "Whether to use spot instances at all. Use them when they are readily available, rarely interrupted, and jobs are short. Avoid them if big jobs will require many restarts to complete." + default = false + type = bool +} + variable "task_instance_spot_count_max" { description = "Max capacity for task instance ASG (spot)" default = 50