From a0bc9dc8a9d41f3422a410ac07e8d30c2a90d980 Mon Sep 17 00:00:00 2001 From: Ben Larabie Date: Wed, 6 Dec 2023 08:55:27 -0500 Subject: [PATCH 1/2] Setting scalable and primary celery alarms --- aws/eks/cloudwatch_alarms.tf | 188 ++++++++++++++++++++++++++++++++--- env/dev/eks/terragrunt.hcl | 2 +- 2 files changed, 177 insertions(+), 13 deletions(-) diff --git a/aws/eks/cloudwatch_alarms.tf b/aws/eks/cloudwatch_alarms.tf index e186065ac..c69578496 100644 --- a/aws/eks/cloudwatch_alarms.tf +++ b/aws/eks/cloudwatch_alarms.tf @@ -202,10 +202,10 @@ resource "aws_cloudwatch_metric_alarm" "api-pods-high-cpu-warning" { } } -resource "aws_cloudwatch_metric_alarm" "celery-pods-high-cpu-warning" { +resource "aws_cloudwatch_metric_alarm" "celery-primary-pods-high-cpu-warning" { count = var.cloudwatch_enabled ? 1 : 0 - alarm_name = "celery-pods-high-cpu-warning" - alarm_description = "Average CPU of Celery pods >=50% during 10 minutes" + alarm_name = "celery-primary-pods-high-cpu-warning" + alarm_description = "Average CPU of Primary Celery pods >=50% during 10 minutes" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = "2" metric_name = "pod_cpu_utilization" @@ -218,7 +218,28 @@ resource "aws_cloudwatch_metric_alarm" "celery-pods-high-cpu-warning" { treat_missing_data = "missing" dimensions = { Namespace = "notification-canada-ca" - Service = "celery" + Service = "celery-primary" + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + } +} + +resource "aws_cloudwatch_metric_alarm" "celery-scalable-pods-high-cpu-warning" { + count = var.cloudwatch_enabled ? 1 : 0 + alarm_name = "celery-scalable-pods-high-cpu-warning" + alarm_description = "Average CPU of Scalable Celery pods >=50% during 10 minutes" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = "2" + metric_name = "pod_cpu_utilization" + namespace = "ContainerInsights" + period = 300 + statistic = "Average" + threshold = 50 + alarm_actions = [var.sns_alert_warning_arn] + insufficient_data_actions = [var.sns_alert_warning_arn] + treat_missing_data = "missing" + dimensions = { + Namespace = "notification-canada-ca" + Service = "celery-scalable" ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name } } @@ -287,10 +308,10 @@ resource "aws_cloudwatch_metric_alarm" "api-pods-high-memory-warning" { } } -resource "aws_cloudwatch_metric_alarm" "celery-pods-high-memory-warning" { +resource "aws_cloudwatch_metric_alarm" "celeryprimary--pods-high-memory-warning" { count = var.cloudwatch_enabled ? 1 : 0 - alarm_name = "celery-pods-high-memory-warning" - alarm_description = "Average memory of Celery pods >=50% during 10 minutes" + alarm_name = "celeryprimary--pods-high-memory-warning" + alarm_description = "Average memory of Primary Celery pods >=50% during 10 minutes" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = "2" metric_name = "pod_memory_utilization" @@ -303,7 +324,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-pods-high-memory-warning" { treat_missing_data = "missing" dimensions = { Namespace = "notification-canada-ca" - Service = "celery" + Service = "celery-primary" ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name } } @@ -435,12 +456,12 @@ resource "aws_cloudwatch_metric_alarm" "kubernetes-failed-nodes" { } } -resource "aws_cloudwatch_metric_alarm" "celery-replicas-unavailable" { +resource "aws_cloudwatch_metric_alarm" "celery-primary-replicas-unavailable" { count = var.cloudwatch_enabled ? 1 : 0 - alarm_name = "celery-replicas-unavailable" + alarm_name = "celery-primary-replicas-unavailable" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = 2 - alarm_description = "Celery Replicas Unavailable" + alarm_description = "Celery Primary Replicas Unavailable" #Setting to warn until we verify that it is working as expected alarm_actions = [var.sns_alert_warning_arn] treat_missing_data = "notBreaching" @@ -457,7 +478,36 @@ resource "aws_cloudwatch_metric_alarm" "celery-replicas-unavailable" { dimensions = { ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name namespace = var.notify_k8s_namespace - deployment = "celery" + deployment = "celery-primary" + } + } + } +} + + +resource "aws_cloudwatch_metric_alarm" "celery-scalable-replicas-unavailable" { + count = var.cloudwatch_enabled ? 1 : 0 + alarm_name = "celery-scalable-replicas-unavailable" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 3 + alarm_description = "Celery Scalable Replicas Unavailable" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "kube_deployment_status_replicas_unavailable" + namespace = "ContainerInsights/Prometheus" + period = 300 + stat = "Minimum" + dimensions = { + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + namespace = var.notify_k8s_namespace + deployment = "celery-scalable" } } } @@ -519,6 +569,120 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-replicas-unavailable" { } } +resource "aws_cloudwatch_metric_alarm" "celery-email-send-primary-replicas-unavailable" { + count = var.cloudwatch_enabled ? 1 : 0 + alarm_name = "celery-email-send-primary-replicas-unavailable" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 2 + alarm_description = "Celery Email Send Primary Replicas Unavailable" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "kube_deployment_status_replicas_unavailable" + namespace = "ContainerInsights/Prometheus" + period = 300 + stat = "Minimum" + dimensions = { + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + namespace = var.notify_k8s_namespace + deployment = "celery-email-send-primary" + } + } + } +} + + +resource "aws_cloudwatch_metric_alarm" "celery-email-send-scalable-replicas-unavailable" { + count = var.cloudwatch_enabled ? 1 : 0 + alarm_name = "celery-email-send-scalable-replicas-unavailable" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 3 + alarm_description = "Celery Email Send Scalable Replicas Unavailable" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "kube_deployment_status_replicas_unavailable" + namespace = "ContainerInsights/Prometheus" + period = 300 + stat = "Minimum" + dimensions = { + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + namespace = var.notify_k8s_namespace + deployment = "celery-email-send-scalable" + } + } + } +} + +resource "aws_cloudwatch_metric_alarm" "celery-sms-send-primary-replicas-unavailable" { + count = var.cloudwatch_enabled ? 1 : 0 + alarm_name = "celery-sms-send-primary-replicas-unavailable" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 2 + alarm_description = "Celery SMS Send Primary Replicas Unavailable" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "kube_deployment_status_replicas_unavailable" + namespace = "ContainerInsights/Prometheus" + period = 300 + stat = "Minimum" + dimensions = { + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + namespace = var.notify_k8s_namespace + deployment = "celery-sms-send-primary" + } + } + } +} + + +resource "aws_cloudwatch_metric_alarm" "celery-sms-send-scalable-replicas-unavailable" { + count = var.cloudwatch_enabled ? 1 : 0 + alarm_name = "celery-sms-send-scalable-replicas-unavailable" + comparison_operator = "GreaterThanOrEqualToThreshold" + evaluation_periods = 3 + alarm_description = "Celery SMS Send Scalable Replicas Unavailable" + #Setting to warn until we verify that it is working as expected + alarm_actions = [var.sns_alert_warning_arn] + treat_missing_data = "notBreaching" + threshold = 1 + + metric_query { + id = "m1" + return_data = "true" + metric { + metric_name = "kube_deployment_status_replicas_unavailable" + namespace = "ContainerInsights/Prometheus" + period = 300 + stat = "Minimum" + dimensions = { + ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name + namespace = var.notify_k8s_namespace + deployment = "celery-sms-send-scalable" + } + } + } +} + resource "aws_cloudwatch_metric_alarm" "admin-replicas-unavailable" { count = var.cloudwatch_enabled ? 1 : 0 alarm_name = "admin-replicas-unavailable" diff --git a/env/dev/eks/terragrunt.hcl b/env/dev/eks/terragrunt.hcl index 3e59f9713..c02d494e6 100644 --- a/env/dev/eks/terragrunt.hcl +++ b/env/dev/eks/terragrunt.hcl @@ -51,7 +51,7 @@ include { } inputs = { - primary_worker_desired_size = 3 + primary_worker_desired_size = 4 primary_worker_instance_types = ["m5.large"] secondary_worker_instance_types = ["m5.large"] nodeUpgrade = false From fd64c13717adff47cba8da631fd098129db9a8f2 Mon Sep 17 00:00:00 2001 From: Ben Larabie Date: Wed, 6 Dec 2023 12:52:12 -0500 Subject: [PATCH 2/2] fixing typo --- aws/eks/cloudwatch_alarms.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aws/eks/cloudwatch_alarms.tf b/aws/eks/cloudwatch_alarms.tf index c69578496..58de5613e 100644 --- a/aws/eks/cloudwatch_alarms.tf +++ b/aws/eks/cloudwatch_alarms.tf @@ -308,9 +308,9 @@ resource "aws_cloudwatch_metric_alarm" "api-pods-high-memory-warning" { } } -resource "aws_cloudwatch_metric_alarm" "celeryprimary--pods-high-memory-warning" { +resource "aws_cloudwatch_metric_alarm" "celery-primary-pods-high-memory-warning" { count = var.cloudwatch_enabled ? 1 : 0 - alarm_name = "celeryprimary--pods-high-memory-warning" + alarm_name = "celery-primary-pods-high-memory-warning" alarm_description = "Average memory of Primary Celery pods >=50% during 10 minutes" comparison_operator = "GreaterThanOrEqualToThreshold" evaluation_periods = "2"