From ff38fe1bb1d15012c8237f09baeb860f334c8b68 Mon Sep 17 00:00:00 2001
From: Mat Geist
Date: Fri, 3 Apr 2020 11:07:07 -0700
Subject: [PATCH] Oldest Replication Slot Lag RDS alarm

---
Review notes (this section is ignored when the patch is applied):

* oldest_replication_threshold is declared as `number` (was `string`) so it
  matches its numeric default and the sibling swap_usage_threshold variable.
  The `index` blob hash for variables.tf below predates that one-line change.
* NOTE(review): the new alarm hard-codes alarm_name, while swap_usage_too_high
  takes its name from module.label[...]. Aligning them needs an entry in
  local.alarm_names, whose contents are not part of this patch.
* NOTE(review): the variable description assumes the metric is reported in
  megabytes - confirm against the CloudWatch documentation for
  OldestReplicationSlotLag before relying on the default of 1000.

 alarms.tf    | 34 +++++++++++++++++++++++++++-------
 variables.tf |  7 +++++++
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/alarms.tf b/alarms.tf
index 3549863..93415ed 100644
--- a/alarms.tf
+++ b/alarms.tf
@@ -1,12 +1,13 @@
 locals {
   thresholds = {
-    BurstBalanceThreshold     = min(max(var.burst_balance_threshold, 0), 100)
-    CPUUtilizationThreshold   = min(max(var.cpu_utilization_threshold, 0), 100)
-    CPUCreditBalanceThreshold = max(var.cpu_credit_balance_threshold, 0)
-    DiskQueueDepthThreshold   = max(var.disk_queue_depth_threshold, 0)
-    FreeableMemoryThreshold   = max(var.freeable_memory_threshold, 0)
-    FreeStorageSpaceThreshold = max(var.free_storage_space_threshold, 0)
-    SwapUsageThreshold        = max(var.swap_usage_threshold, 0)
+    BurstBalanceThreshold      = min(max(var.burst_balance_threshold, 0), 100)
+    CPUUtilizationThreshold    = min(max(var.cpu_utilization_threshold, 0), 100)
+    CPUCreditBalanceThreshold  = max(var.cpu_credit_balance_threshold, 0)
+    DiskQueueDepthThreshold    = max(var.disk_queue_depth_threshold, 0)
+    FreeableMemoryThreshold    = max(var.freeable_memory_threshold, 0)
+    FreeStorageSpaceThreshold  = max(var.free_storage_space_threshold, 0)
+    OldestReplicationThreshold = max(var.oldest_replication_threshold, 0)
+    SwapUsageThreshold         = max(var.swap_usage_threshold, 0)
   }
 
   alarm_names = toset([
@@ -139,6 +140,25 @@ resource "aws_cloudwatch_metric_alarm" "free_storage_space_too_low" {
   }
 }
 
+resource "aws_cloudwatch_metric_alarm" "oldest_replication_too_high" {
+  alarm_name          = "oldest_replication_too_high"
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = "1"
+  metric_name         = "OldestReplicationSlotLag"
+  namespace           = "AWS/RDS"
+  period              = "600"
+  statistic           = "Average"
+  threshold           = local.thresholds["OldestReplicationThreshold"]
+  alarm_description   = "Average database replication lag over last 10 minutes too high, disk may fill"
+  alarm_actions       = [aws_sns_topic.default.arn]
+  ok_actions          = [aws_sns_topic.default.arn]
+
+  dimensions = {
+    DBInstanceIdentifier = var.db_instance_id
+  }
+}
+
+
 resource "aws_cloudwatch_metric_alarm" "swap_usage_too_high" {
   alarm_name          = module.label["swap_usage_too_high"].id
   comparison_operator = "GreaterThanThreshold"
diff --git a/variables.tf b/variables.tf
index 1d31986..1d1c8a3 100644
--- a/variables.tf
+++ b/variables.tf
@@ -43,6 +43,13 @@ variable "free_storage_space_threshold" {
   # 2 Gigabyte in Byte
 }
 
+variable "oldest_replication_threshold" {
+  description = "The maximum amount of replication lag space in Megabyte."
+  type        = number
+  default     = 1000
+  # 1 Gigabyte in Megabyte
+}
+
 variable "swap_usage_threshold" {
   description = "The maximum amount of swap space used on the DB instance in Byte."
   type        = number