From b3a506d1f1de5a1b51cc3fd2e190f19d9c5bd473 Mon Sep 17 00:00:00 2001
From: Andrew
Date: Wed, 17 Jan 2024 11:21:13 -0400
Subject: [PATCH] feat: add alarm and log metric for queues not being active (#1123)

* feat: add alarm and log metric for queues not being active

* chore: fix alarms
---
Reviewer notes (free-form text after the three-dash line; it is not part of
the commit message and is skipped when the patch is applied):

* cloudwatch_log.tf gains the "queues-are-active" log metric filter. It
  watches the log group referenced as
  aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0]
  and publishes the value "1" to the metric "queues-are-active" in the
  "LogMetrics" namespace for log events that match the pattern.
* cloudwatch_alarms.tf gains two alarms that read their metric_name and
  namespace from that filter's metric_transformation[0], so the alarm
  and the filter cannot drift apart on naming.
* All three resources use the same count gate (var.cloudwatch_enabled), so
  the [0] index on the filter is only evaluated when the filter exists.
* Both alarms use statistic "Sum", comparison "LessThanThreshold" and
  threshold 1 over a single evaluation period, with
  treat_missing_data = "breaching": they go into alarm when the period
  contains no matching log events or no data points at all.
    - warning:  period 60 s,  alarm_actions -> var.sns_alert_warning_arn
    - critical: period 300 s, alarm_actions and ok_actions ->
      var.sns_alert_critical_arn (the warning alarm sets no ok_actions)
* NOTE(review): the pattern "Batch saving with" is written without inner
  double quotes. In CloudWatch Logs filter syntax that appears to match
  events containing all three terms rather than the exact phrase - confirm
  this is the intended match.

 aws/eks/cloudwatch_alarms.tf | 31 +++++++++++++++++++++++++++++++
 aws/eks/cloudwatch_log.tf    | 13 +++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/aws/eks/cloudwatch_alarms.tf b/aws/eks/cloudwatch_alarms.tf
index 33d45bf34..4cc8cb901 100644
--- a/aws/eks/cloudwatch_alarms.tf
+++ b/aws/eks/cloudwatch_alarms.tf
@@ -907,3 +907,34 @@ resource "aws_cloudwatch_metric_alarm" "karpenter-replicas-unavailable" {
     }
   }
 }
+
+resource "aws_cloudwatch_metric_alarm" "queues-not-active-1-minute-warning" {
+  count               = var.cloudwatch_enabled ? 1 : 0
+  alarm_name          = "queues-not-active-1-minute-warning"
+  alarm_description   = "Queues have not been active for one minute"
+  comparison_operator = "LessThanThreshold"
+  evaluation_periods  = "1"
+  metric_name         = aws_cloudwatch_log_metric_filter.queues-are-active[0].metric_transformation[0].name
+  namespace           = aws_cloudwatch_log_metric_filter.queues-are-active[0].metric_transformation[0].namespace
+  period              = "60"
+  statistic           = "Sum"
+  threshold           = 1
+  treat_missing_data  = "breaching"
+  alarm_actions       = [var.sns_alert_warning_arn]
+}
+
+resource "aws_cloudwatch_metric_alarm" "queues-not-active-5-minutes-critical" {
+  count               = var.cloudwatch_enabled ? 1 : 0
+  alarm_name          = "queues-not-active-5-minutes-critical"
+  alarm_description   = "Queues have not been active for 5 minutes"
+  comparison_operator = "LessThanThreshold"
+  evaluation_periods  = "1"
+  metric_name         = aws_cloudwatch_log_metric_filter.queues-are-active[0].metric_transformation[0].name
+  namespace           = aws_cloudwatch_log_metric_filter.queues-are-active[0].metric_transformation[0].namespace
+  period              = "300"
+  statistic           = "Sum"
+  threshold           = 1
+  treat_missing_data  = "breaching"
+  alarm_actions       = [var.sns_alert_critical_arn]
+  ok_actions          = [var.sns_alert_critical_arn]
+}
diff --git a/aws/eks/cloudwatch_log.tf b/aws/eks/cloudwatch_log.tf
index a752ad3fe..746cb829b 100644
--- a/aws/eks/cloudwatch_log.tf
+++ b/aws/eks/cloudwatch_log.tf
@@ -153,3 +153,16 @@ resource "aws_cloudwatch_log_metric_filter" "documentation-evicted-pods" {
     value     = "1"
   }
 }
+
+resource "aws_cloudwatch_log_metric_filter" "queues-are-active" {
+  count          = var.cloudwatch_enabled ? 1 : 0
+  name           = "queues-are-active"
+  pattern        = "Batch saving with"
+  log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name
+
+  metric_transformation {
+    name      = "queues-are-active"
+    namespace = "LogMetrics"
+    value     = "1"
+  }
+}