From ff38fe1bb1d15012c8237f09baeb860f334c8b68 Mon Sep 17 00:00:00 2001
From: Mat Geist
Date: Fri, 3 Apr 2020 11:07:07 -0700
Subject: [PATCH 1/2] Oldest Replication Slot Lag RDS alarm

---
Notes (free-form text in this position is skipped when the patch is applied):

* Adds a CloudWatch alarm on the AWS/RDS metric OldestReplicationSlotLag
  (statistic Average, one 600-second evaluation period, GreaterThanThreshold)
  and a new input variable `oldest_replication_threshold` (default 1000).
  The value is clamped to >= 0 in `local.thresholds`, like the other
  thresholds in that map.
* `oldest_replication_threshold` is declared `type = number` so that it agrees
  with its numeric default, with the `max(..., 0)` clamp, and with the
  neighbouring `swap_usage_threshold` variable. Numeric strings such as "1000"
  should still be accepted through Terraform's automatic type conversion.
* NOTE(review): the variable description states the unit as Megabyte. Confirm
  the unit AWS reports for OldestReplicationSlotLag before relying on the
  default of 1000; if the metric is emitted in bytes, the default is far
  smaller than the "1 Gigabyte" the inline comment intends.
* NOTE(review): the post-image blob id on the variables.tf `index` line was
  not recomputed after the type change; the pre-image id is unchanged.

 alarms.tf    | 34 +++++++++++++++++++++++++++-------
 variables.tf |  7 +++++++
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/alarms.tf b/alarms.tf
index 3549863..93415ed 100644
--- a/alarms.tf
+++ b/alarms.tf
@@ -1,12 +1,13 @@
 locals {
   thresholds = {
-    BurstBalanceThreshold     = min(max(var.burst_balance_threshold, 0), 100)
-    CPUUtilizationThreshold   = min(max(var.cpu_utilization_threshold, 0), 100)
-    CPUCreditBalanceThreshold = max(var.cpu_credit_balance_threshold, 0)
-    DiskQueueDepthThreshold   = max(var.disk_queue_depth_threshold, 0)
-    FreeableMemoryThreshold   = max(var.freeable_memory_threshold, 0)
-    FreeStorageSpaceThreshold = max(var.free_storage_space_threshold, 0)
-    SwapUsageThreshold        = max(var.swap_usage_threshold, 0)
+    BurstBalanceThreshold      = min(max(var.burst_balance_threshold, 0), 100)
+    CPUUtilizationThreshold    = min(max(var.cpu_utilization_threshold, 0), 100)
+    CPUCreditBalanceThreshold  = max(var.cpu_credit_balance_threshold, 0)
+    DiskQueueDepthThreshold    = max(var.disk_queue_depth_threshold, 0)
+    FreeableMemoryThreshold    = max(var.freeable_memory_threshold, 0)
+    FreeStorageSpaceThreshold  = max(var.free_storage_space_threshold, 0)
+    OldestReplicationThreshold = max(var.oldest_replication_threshold, 0)
+    SwapUsageThreshold         = max(var.swap_usage_threshold, 0)
   }
 
   alarm_names = toset([
@@ -139,6 +140,25 @@ resource "aws_cloudwatch_metric_alarm" "free_storage_space_too_low" {
   }
 }
 
+resource "aws_cloudwatch_metric_alarm" "oldest_replication_too_high" {
+  alarm_name          = "oldest_replication_too_high"
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = "1"
+  metric_name         = "OldestReplicationSlotLag"
+  namespace           = "AWS/RDS"
+  period              = "600"
+  statistic           = "Average"
+  threshold           = local.thresholds["OldestReplicationThreshold"]
+  alarm_description   = "Average database replication lag over last 10 minutes too high, disk may fill"
+  alarm_actions       = [aws_sns_topic.default.arn]
+  ok_actions          = [aws_sns_topic.default.arn]
+
+  dimensions = {
+    DBInstanceIdentifier = var.db_instance_id
+  }
+}
+
+
 resource "aws_cloudwatch_metric_alarm" "swap_usage_too_high" {
   alarm_name          = module.label["swap_usage_too_high"].id
   comparison_operator = "GreaterThanThreshold"
diff --git a/variables.tf b/variables.tf
index 1d31986..1d1c8a3 100644
--- a/variables.tf
+++ b/variables.tf
@@ -43,6 +43,13 @@ variable "free_storage_space_threshold" {
   # 2 Gigabyte in Byte
 }
 
+variable "oldest_replication_threshold" {
+  description = "The maximum amount of replication lag space in Megabyte."
+  type        = number
+  default     = 1000
+  # 1 Gigabyte in Megabyte
+}
+
 variable "swap_usage_threshold" {
   description = "The maximum amount of swap space used on the DB instance in Byte."
   type        = number

From 2b4ac788c9c80ef5b8f3376d85807e651d2004b0 Mon Sep 17 00:00:00 2001
From: Darren Weber
Date: Thu, 2 Mar 2023 09:01:02 -0800
Subject: [PATCH 2/2] Update OldestReplicationSlotLag for cloudposse module

---
Notes (free-form text in this position is skipped when the patch is applied):

* Registers "oldest_replication_too_high" in `local.alarm_names` and switches
  the alarm's name from a hard-coded string to
  `module.label["oldest_replication_too_high"].id`, the same form the
  `swap_usage_too_high` alarm uses.
* Replaces the single-element action lists with the splat expression
  `aws_sns_topic.default.*.arn` for both `alarm_actions` and `ok_actions`.

 alarms.tf | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/alarms.tf b/alarms.tf
index 93415ed..1e04745 100644
--- a/alarms.tf
+++ b/alarms.tf
@@ -17,6 +17,7 @@ locals {
     "disk_queue_depth_too_high",
     "freeable_memory_too_low",
     "free_storage_space_threshold",
+    "oldest_replication_too_high",
     "swap_usage_too_high"
   ])
 }
@@ -141,7 +142,7 @@ resource "aws_cloudwatch_metric_alarm" "free_storage_space_too_low" {
 }
 
 resource "aws_cloudwatch_metric_alarm" "oldest_replication_too_high" {
-  alarm_name          = "oldest_replication_too_high"
+  alarm_name          = module.label["oldest_replication_too_high"].id
   comparison_operator = "GreaterThanThreshold"
   evaluation_periods  = "1"
   metric_name         = "OldestReplicationSlotLag"
@@ -150,8 +151,8 @@ resource "aws_cloudwatch_metric_alarm" "oldest_replication_too_high" {
   statistic           = "Average"
   threshold           = local.thresholds["OldestReplicationThreshold"]
   alarm_description   = "Average database replication lag over last 10 minutes too high, disk may fill"
-  alarm_actions       = [aws_sns_topic.default.arn]
-  ok_actions          = [aws_sns_topic.default.arn]
+  alarm_actions       = aws_sns_topic.default.*.arn
+  ok_actions          = aws_sns_topic.default.*.arn
 
   dimensions = {
     DBInstanceIdentifier = var.db_instance_id