Skip to content

Commit

Permalink
Merge pull request #11 from TimBuss/master
Browse files Browse the repository at this point in the history
feat: Reduce alarm noise
  • Loading branch information
aoggz authored Apr 22, 2020
2 parents 0fc7d94 + b1f9ce9 commit 73f501e
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 27 deletions.
32 changes: 24 additions & 8 deletions autoscaling.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ resource "aws_cloudwatch_metric_alarm" "memory_or_cpu_high" {
count = var.enable_autoscale ? 1 : 0
alarm_name = "${aws_ecs_service.main.name}-Memory-${var.task_scale_out_memory_threshold_percent}-OR-CPU-${var.task_scale_out_cpu_threshold_percent}-High"
comparison_operator = "GreaterThanOrEqualToThreshold"
evaluation_periods = "1"
evaluation_periods = var.task_alarm_evaluation_periods
threshold = "1"
alarm_description = "Scale out ${aws_ecs_service.main.name} tasks"

Expand All @@ -32,7 +32,7 @@ resource "aws_cloudwatch_metric_alarm" "memory_or_cpu_high" {
metric {
metric_name = "CPUUtilization"
namespace = "AWS/ECS"
period = var.task_scale_out_alarm_evaluation_period
period = var.task_alarm_period
stat = "Average"
unit = "Percent"

Expand All @@ -49,7 +49,7 @@ resource "aws_cloudwatch_metric_alarm" "memory_or_cpu_high" {
metric {
metric_name = "MemoryUtilization"
namespace = "AWS/ECS"
period = var.task_scale_out_alarm_evaluation_period
period = var.task_alarm_period
stat = "Average"
unit = "Percent"

Expand All @@ -71,13 +71,13 @@ resource "aws_cloudwatch_metric_alarm" "memory_and_cpu_low" {
count = var.enable_autoscale ? 1 : 0
alarm_name = "${aws_ecs_service.main.name}-Memory-${var.task_scale_in_memory_threshold_percent}-AND-CPU-${var.task_scale_in_cpu_threshold_percent}-Low"
comparison_operator = "LessThanThreshold"
evaluation_periods = "1"
evaluation_periods = var.task_alarm_evaluation_periods
threshold = "1"
alarm_description = "Scale in ${aws_ecs_service.main.name} tasks"

metric_query {
  # Scale-in signal for the enclosing alarm, which fires when this expression
  # is below its threshold of 1 (comparison_operator = "LessThanThreshold").
  #
  # Terms, for utilization values in the 0-100 range:
  #   CEIL((cpu-T)/100)     0 when CPU is at or below its scale-in threshold, else 1
  #   CEIL((memory-T)/100)  0 when memory is at or below its scale-in threshold, else 1
  #   IF(healthyhosts>N,0,1) 0 only when the healthy host count exceeds var.task_count
  #
  # The sum is therefore 0 (alarm) only when CPU and memory are both low AND
  # there are more healthy hosts than var.task_count.
  #
  # The host-count guard is deliberately a 0/1 value. A raw difference such as
  # ((N+1)-healthyhosts) becomes negative once more than N+1 hosts are healthy
  # and cancels the CEIL terms, which would trigger scale-in while CPU and
  # memory are still above their thresholds.
  #
  # NOTE(review): presumably var.task_count is the minimum task count of the
  # autoscaling target - confirm against aws_appautoscaling_target.
  id          = "e1"
  expression  = "CEIL((cpu-${var.task_scale_in_cpu_threshold_percent})/(100))+CEIL((memory-${var.task_scale_in_memory_threshold_percent})/(100))+IF(healthyhosts>${var.task_count},0,1)"
  label       = "CPU and Memory Utilization low"
  return_data = "true"
}
Expand All @@ -88,7 +88,7 @@ resource "aws_cloudwatch_metric_alarm" "memory_and_cpu_low" {
metric {
metric_name = "CPUUtilization"
namespace = "AWS/ECS"
period = var.task_scale_in_alarm_evaluation_period
period = var.task_alarm_period
stat = "Average"
unit = "Percent"

Expand All @@ -105,7 +105,7 @@ resource "aws_cloudwatch_metric_alarm" "memory_and_cpu_low" {
metric {
metric_name = "MemoryUtilization"
namespace = "AWS/ECS"
period = var.task_scale_in_alarm_evaluation_period
period = var.task_alarm_period
stat = "Average"
unit = "Percent"

Expand All @@ -116,6 +116,22 @@ resource "aws_cloudwatch_metric_alarm" "memory_and_cpu_low" {
}
}

metric_query {
  # Input series for the e1 expression of this alarm, which refers to it by
  # the id below.
  id = "healthyhosts"

  metric {
    namespace   = "AWS/ApplicationELB"
    metric_name = "HealthyHostCount"
    stat        = "Average"
    # Same period variable as the CPU and memory queries of this alarm.
    period      = var.task_alarm_period

    # Scope the count to this module's target group behind its load balancer.
    dimensions = {
      LoadBalancer = aws_lb.main.arn_suffix
      TargetGroup  = aws_lb_target_group.app.arn_suffix
    }
  }
}

alarm_actions = var.enable_monitoring ? [aws_appautoscaling_policy.app_in[0].arn, module.notify-slack.this_slack_topic_arn] : [aws_appautoscaling_policy.app_in[0].arn]

depends_on = [
Expand Down Expand Up @@ -167,4 +183,4 @@ resource "aws_appautoscaling_policy" "app_in" {
depends_on = [
aws_appautoscaling_target.app_scale_target[0],
]
}
}
1 change: 1 addition & 0 deletions main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ resource "aws_lb_target_group" "app" {
protocol = "HTTPS"
vpc_id = var.vpc_id
target_type = "ip"
slow_start = var.target_group_slow_start

health_check {
path = var.app_healthcheck_endpoint
Expand Down
3 changes: 1 addition & 2 deletions monitoring.tf
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ resource "aws_cloudwatch_metric_alarm" "httpcode_target_5xx_count" {
alarm_description = "${var.resource_prefix}-${terraform.workspace}-5XX-target-group-errors"
alarm_actions = [module.notify-slack.this_slack_topic_arn]
ok_actions = [module.notify-slack.this_slack_topic_arn]
insufficient_data_actions = [module.notify-slack.this_slack_topic_arn]

dimensions = {
"TargetGroup" = aws_lb_target_group.app.arn_suffix
Expand All @@ -74,10 +73,10 @@ resource "aws_cloudwatch_metric_alarm" "unhealthy_host_count" {
period = var.monitoring_period
statistic = "Average"
threshold = 0
treat_missing_data = "notBreaching"
alarm_description = "${var.resource_prefix}-${terraform.workspace}-unhealthy-hosts"
alarm_actions = [module.notify-slack.this_slack_topic_arn]
ok_actions = [module.notify-slack.this_slack_topic_arn]
insufficient_data_actions = [module.notify-slack.this_slack_topic_arn]

dimensions = {
"TargetGroup" = aws_lb_target_group.app.arn_suffix
Expand Down
40 changes: 23 additions & 17 deletions variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -189,35 +189,41 @@ variable "task_scale_out_cpu_threshold_percent" {
variable "task_scale_in_memory_threshold_percent" {
type = number
default = 25
description= "[Optional] Autoscale - When both cpu and memory are below scale in thresholds decrease tasks by 1"
description= "[Optional] Autoscale - When both cpu and memory are below scale in thresholds decrease tasks by 1. Only applies when the total number of tasks exceed the minimum task count. number of running tasks determined by HealthyHostCount."
}

variable "task_scale_in_cpu_threshold_percent" {
type = number
default = 30
description= "[Optional] Autoscale - When both cpu and memory are below scale in thresholds decrease tasks by 1"
type = number
default = 30
description = "[Optional] Autoscale - When both cpu and memory are below scale in thresholds decrease tasks by 1. Only applies when the total number of tasks exceed the minimum task count. number of running tasks determined by HealthyHostCount."
}

variable "task_scale_in_cooldown_period" {
type = number
default = 300
type = number
default = 300
description = "[Optional] Autoscale - scale in cooldown period, in seconds"
}

variable "task_scale_out_cooldown_period" {
type = number
default = 300
type = number
default = 300
description = "[Optional] Autoscale - scale out cooldown period, in seconds"
}

variable "task_scale_in_alarm_evaluation_period" {
type = number
default = 60
description = "[Optional] Autoscale - scale in alarm evaluation period, in seconds"
variable "task_alarm_period" {
  # Used as the `period` of every metric query in both the scale-out and the
  # scale-in CloudWatch alarms.
  description = "[Optional] Autoscale - period of time to evaluate, in seconds"
  type        = number
  default     = 60
}

variable "task_scale_out_alarm_evaluation_period" {
type = number
default = 60
description = "[Optional] Autoscale - scale out alarm evaluation period, in seconds"
}
variable "task_alarm_evaluation_periods" {
  # Used as `evaluation_periods` on both the scale-out and the scale-in
  # CloudWatch alarms.
  description = "[Optional] Autoscale - number of data points to use for evaluation"
  type        = number
  default     = 3
}

variable "target_group_slow_start" {
  # Passed to `slow_start` on the app target group.
  # During slow start the load balancer still forwards requests to a new
  # target; it ramps the target's share up over this window rather than
  # withholding traffic until the window has elapsed.
  type        = number
  default     = 300
  description = "[Optional] Load balancer - slow start duration, in seconds: the warm-up window during which the load balancer gradually increases traffic to a newly registered target before sending it a full share of requests. Valid values are 30-900, or 0 to disable."
}

0 comments on commit 73f501e

Please sign in to comment.