-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Setting scalable and primary celery alarms #1067
Conversation
Updating alarms ⏰? Great! Please update the Google Sheet and add a 👍 to this message after 🙏 |
1 similar comment
Updating alarms ⏰? Great! Please update the Google Sheet and add a 👍 to this message after 🙏 |
We only have high cpu / memory warning alarms for celery-primary/scalable, not for the email or sms pods - is that intentional? |
|
Staging: eks✅ Terraform Init: Plan: 9 to add, 0 to change, 3 to destroy Show summary
Show plan
Resource actions are indicated with the following symbols:
+ create
- destroy
Terraform will perform the following actions:
# aws_cloudwatch_metric_alarm.celery-email-send-primary-replicas-unavailable[0] will be created
+ resource "aws_cloudwatch_metric_alarm" "celery-email-send-primary-replicas-unavailable" {
+ actions_enabled = true
+ alarm_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-warning",
]
+ alarm_description = "Celery Email Send Primary Replicas Unavailable"
+ alarm_name = "celery-email-send-primary-replicas-unavailable"
+ arn = (known after apply)
+ comparison_operator = "GreaterThanOrEqualToThreshold"
+ evaluate_low_sample_count_percentiles = (known after apply)
+ evaluation_periods = 2
+ id = (known after apply)
+ tags_all = (known after apply)
+ threshold = 1
+ treat_missing_data = "notBreaching"
+ metric_query {
+ id = "m1"
+ return_data = true
+ metric {
+ dimensions = {
+ "ClusterName" = "notification-canada-ca-staging-eks-cluster"
+ "deployment" = "celery-email-send-primary"
+ "namespace" = "notification-canada-ca"
}
+ metric_name = "kube_deployment_status_replicas_unavailable"
+ namespace = "ContainerInsights/Prometheus"
+ period = 300
+ stat = "Minimum"
}
}
}
# aws_cloudwatch_metric_alarm.celery-email-send-scalable-replicas-unavailable[0] will be created
+ resource "aws_cloudwatch_metric_alarm" "celery-email-send-scalable-replicas-unavailable" {
+ actions_enabled = true
+ alarm_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-warning",
]
+ alarm_description = "Celery Email Send Scalable Replicas Unavailable"
+ alarm_name = "celery-email-send-scalable-replicas-unavailable"
+ arn = (known after apply)
+ comparison_operator = "GreaterThanOrEqualToThreshold"
+ evaluate_low_sample_count_percentiles = (known after apply)
+ evaluation_periods = 3
+ id = (known after apply)
+ tags_all = (known after apply)
+ threshold = 1
+ treat_missing_data = "notBreaching"
+ metric_query {
+ id = "m1"
+ return_data = true
+ metric {
+ dimensions = {
+ "ClusterName" = "notification-canada-ca-staging-eks-cluster"
+ "deployment" = "celery-email-send-scalable"
+ "namespace" = "notification-canada-ca"
}
+ metric_name = "kube_deployment_status_replicas_unavailable"
+ namespace = "ContainerInsights/Prometheus"
+ period = 300
+ stat = "Minimum"
}
}
}
# aws_cloudwatch_metric_alarm.celery-pods-high-cpu-warning[0] will be destroyed
# (because aws_cloudwatch_metric_alarm.celery-pods-high-cpu-warning is not in configuration)
- resource "aws_cloudwatch_metric_alarm" "celery-pods-high-cpu-warning" {
- actions_enabled = true -> null
- alarm_actions = [
- "arn:aws:sns:ca-central-1:239043911459:alert-warning",
] -> null
- alarm_description = "Average CPU of Celery pods >=50% during 10 minutes" -> null
- alarm_name = "celery-pods-high-cpu-warning" -> null
- arn = "arn:aws:cloudwatch:ca-central-1:239043911459:alarm:celery-pods-high-cpu-warning" -> null
- comparison_operator = "GreaterThanOrEqualToThreshold" -> null
- datapoints_to_alarm = 0 -> null
- dimensions = {
- "ClusterName" = "notification-canada-ca-staging-eks-cluster"
- "Namespace" = "notification-canada-ca"
- "Service" = "celery"
} -> null
- evaluation_periods = 2 -> null
- id = "celery-pods-high-cpu-warning" -> null
- insufficient_data_actions = [
- "arn:aws:sns:ca-central-1:239043911459:alert-warning",
] -> null
- metric_name = "pod_cpu_utilization" -> null
- namespace = "ContainerInsights" -> null
- ok_actions = [] -> null
- period = 300 -> null
- statistic = "Average" -> null
- tags = {} -> null
- tags_all = {} -> null
- threshold = 50 -> null
- treat_missing_data = "missing" -> null
}
# aws_cloudwatch_metric_alarm.celery-pods-high-memory-warning[0] will be destroyed
# (because aws_cloudwatch_metric_alarm.celery-pods-high-memory-warning is not in configuration)
- resource "aws_cloudwatch_metric_alarm" "celery-pods-high-memory-warning" {
- actions_enabled = true -> null
- alarm_actions = [
- "arn:aws:sns:ca-central-1:239043911459:alert-warning",
] -> null
- alarm_description = "Average memory of Celery pods >=50% during 10 minutes" -> null
- alarm_name = "celery-pods-high-memory-warning" -> null
- arn = "arn:aws:cloudwatch:ca-central-1:239043911459:alarm:celery-pods-high-memory-warning" -> null
- comparison_operator = "GreaterThanOrEqualToThreshold" -> null
- datapoints_to_alarm = 0 -> null
- dimensions = {
- "ClusterName" = "notification-canada-ca-staging-eks-cluster"
- "Namespace" = "notification-canada-ca"
- "Service" = "celery"
} -> null
- evaluation_periods = 2 -> null
- id = "celery-pods-high-memory-warning" -> null
- insufficient_data_actions = [
- "arn:aws:sns:ca-central-1:239043911459:alert-warning",
] -> null
- metric_name = "pod_memory_utilization" -> null
- namespace = "ContainerInsights" -> null
- ok_actions = [] -> null
- period = 300 -> null
- statistic = "Average" -> null
- tags = {} -> null
- tags_all = {} -> null
- threshold = 50 -> null
- treat_missing_data = "missing" -> null
}
# aws_cloudwatch_metric_alarm.celery-primary-pods-high-cpu-warning[0] will be created
+ resource "aws_cloudwatch_metric_alarm" "celery-primary-pods-high-cpu-warning" {
+ actions_enabled = true
+ alarm_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-warning",
]
+ alarm_description = "Average CPU of Primary Celery pods >=50% during 10 minutes"
+ alarm_name = "celery-primary-pods-high-cpu-warning"
+ arn = (known after apply)
+ comparison_operator = "GreaterThanOrEqualToThreshold"
+ dimensions = {
+ "ClusterName" = "notification-canada-ca-staging-eks-cluster"
+ "Namespace" = "notification-canada-ca"
+ "Service" = "celery-primary"
}
+ evaluate_low_sample_count_percentiles = (known after apply)
+ evaluation_periods = 2
+ id = (known after apply)
+ insufficient_data_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-warning",
]
+ metric_name = "pod_cpu_utilization"
+ namespace = "ContainerInsights"
+ period = 300
+ statistic = "Average"
+ tags_all = (known after apply)
+ threshold = 50
+ treat_missing_data = "missing"
}
# aws_cloudwatch_metric_alarm.celery-primary-pods-high-memory-warning[0] will be created
+ resource "aws_cloudwatch_metric_alarm" "celery-primary-pods-high-memory-warning" {
+ actions_enabled = true
+ alarm_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-warning",
]
+ alarm_description = "Average memory of Primary Celery pods >=50% during 10 minutes"
+ alarm_name = "celery-primary-pods-high-memory-warning"
+ arn = (known after apply)
+ comparison_operator = "GreaterThanOrEqualToThreshold"
+ dimensions = {
+ "ClusterName" = "notification-canada-ca-staging-eks-cluster"
+ "Namespace" = "notification-canada-ca"
+ "Service" = "celery-primary"
}
+ evaluate_low_sample_count_percentiles = (known after apply)
+ evaluation_periods = 2
+ id = (known after apply)
+ insufficient_data_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-warning",
]
+ metric_name = "pod_memory_utilization"
+ namespace = "ContainerInsights"
+ period = 300
+ statistic = "Average"
+ tags_all = (known after apply)
+ threshold = 50
+ treat_missing_data = "missing"
}
# aws_cloudwatch_metric_alarm.celery-primary-replicas-unavailable[0] will be created
+ resource "aws_cloudwatch_metric_alarm" "celery-primary-replicas-unavailable" {
+ actions_enabled = true
+ alarm_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-warning",
]
+ alarm_description = "Celery Primary Replicas Unavailable"
+ alarm_name = "celery-primary-replicas-unavailable"
+ arn = (known after apply)
+ comparison_operator = "GreaterThanOrEqualToThreshold"
+ evaluate_low_sample_count_percentiles = (known after apply)
+ evaluation_periods = 2
+ id = (known after apply)
+ tags_all = (known after apply)
+ threshold = 1
+ treat_missing_data = "notBreaching"
+ metric_query {
+ id = "m1"
+ return_data = true
+ metric {
+ dimensions = {
+ "ClusterName" = "notification-canada-ca-staging-eks-cluster"
+ "deployment" = "celery-primary"
+ "namespace" = "notification-canada-ca"
}
+ metric_name = "kube_deployment_status_replicas_unavailable"
+ namespace = "ContainerInsights/Prometheus"
+ period = 300
+ stat = "Minimum"
}
}
}
# aws_cloudwatch_metric_alarm.celery-replicas-unavailable[0] will be destroyed
# (because aws_cloudwatch_metric_alarm.celery-replicas-unavailable is not in configuration)
- resource "aws_cloudwatch_metric_alarm" "celery-replicas-unavailable" {
- actions_enabled = true -> null
- alarm_actions = [
- "arn:aws:sns:ca-central-1:239043911459:alert-warning",
] -> null
- alarm_description = "Celery Replicas Unavailable" -> null
- alarm_name = "celery-replicas-unavailable" -> null
- arn = "arn:aws:cloudwatch:ca-central-1:239043911459:alarm:celery-replicas-unavailable" -> null
- comparison_operator = "GreaterThanOrEqualToThreshold" -> null
- datapoints_to_alarm = 0 -> null
- dimensions = {} -> null
- evaluation_periods = 2 -> null
- id = "celery-replicas-unavailable" -> null
- insufficient_data_actions = [] -> null
- ok_actions = [] -> null
- period = 0 -> null
- tags = {} -> null
- tags_all = {} -> null
- threshold = 1 -> null
- treat_missing_data = "notBreaching" -> null
- metric_query {
- id = "m1" -> null
- period = 0 -> null
- return_data = true -> null
- metric {
- dimensions = {
- "ClusterName" = "notification-canada-ca-staging-eks-cluster"
- "deployment" = "celery"
- "namespace" = "notification-canada-ca"
} -> null
- metric_name = "kube_deployment_status_replicas_unavailable" -> null
- namespace = "ContainerInsights/Prometheus" -> null
- period = 300 -> null
- stat = "Minimum" -> null
}
}
}
# aws_cloudwatch_metric_alarm.celery-scalable-pods-high-cpu-warning[0] will be created
+ resource "aws_cloudwatch_metric_alarm" "celery-scalable-pods-high-cpu-warning" {
+ actions_enabled = true
+ alarm_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-warning",
]
+ alarm_description = "Average CPU of Scalable Celery pods >=50% during 10 minutes"
+ alarm_name = "celery-scalable-pods-high-cpu-warning"
+ arn = (known after apply)
+ comparison_operator = "GreaterThanOrEqualToThreshold"
+ dimensions = {
+ "ClusterName" = "notification-canada-ca-staging-eks-cluster"
+ "Namespace" = "notification-canada-ca"
+ "Service" = "celery-scalable"
}
+ evaluate_low_sample_count_percentiles = (known after apply)
+ evaluation_periods = 2
+ id = (known after apply)
+ insufficient_data_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-warning",
]
+ metric_name = "pod_cpu_utilization"
+ namespace = "ContainerInsights"
+ period = 300
+ statistic = "Average"
+ tags_all = (known after apply)
+ threshold = 50
+ treat_missing_data = "missing"
}
# aws_cloudwatch_metric_alarm.celery-scalable-replicas-unavailable[0] will be created
+ resource "aws_cloudwatch_metric_alarm" "celery-scalable-replicas-unavailable" {
+ actions_enabled = true
+ alarm_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-warning",
]
+ alarm_description = "Celery Scalable Replicas Unavailable"
+ alarm_name = "celery-scalable-replicas-unavailable"
+ arn = (known after apply)
+ comparison_operator = "GreaterThanOrEqualToThreshold"
+ evaluate_low_sample_count_percentiles = (known after apply)
+ evaluation_periods = 3
+ id = (known after apply)
+ tags_all = (known after apply)
+ threshold = 1
+ treat_missing_data = "notBreaching"
+ metric_query {
+ id = "m1"
+ return_data = true
+ metric {
+ dimensions = {
+ "ClusterName" = "notification-canada-ca-staging-eks-cluster"
+ "deployment" = "celery-scalable"
+ "namespace" = "notification-canada-ca"
}
+ metric_name = "kube_deployment_status_replicas_unavailable"
+ namespace = "ContainerInsights/Prometheus"
+ period = 300
+ stat = "Minimum"
}
}
}
# aws_cloudwatch_metric_alarm.celery-sms-send-primary-replicas-unavailable[0] will be created
+ resource "aws_cloudwatch_metric_alarm" "celery-sms-send-primary-replicas-unavailable" {
+ actions_enabled = true
+ alarm_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-warning",
]
+ alarm_description = "Celery SMS Send Primary Replicas Unavailable"
+ alarm_name = "celery-sms-send-primary-replicas-unavailable"
+ arn = (known after apply)
+ comparison_operator = "GreaterThanOrEqualToThreshold"
+ evaluate_low_sample_count_percentiles = (known after apply)
+ evaluation_periods = 2
+ id = (known after apply)
+ tags_all = (known after apply)
+ threshold = 1
+ treat_missing_data = "notBreaching"
+ metric_query {
+ id = "m1"
+ return_data = true
+ metric {
+ dimensions = {
+ "ClusterName" = "notification-canada-ca-staging-eks-cluster"
+ "deployment" = "celery-sms-send-primary"
+ "namespace" = "notification-canada-ca"
}
+ metric_name = "kube_deployment_status_replicas_unavailable"
+ namespace = "ContainerInsights/Prometheus"
+ period = 300
+ stat = "Minimum"
}
}
}
# aws_cloudwatch_metric_alarm.celery-sms-send-scalable-replicas-unavailable[0] will be created
+ resource "aws_cloudwatch_metric_alarm" "celery-sms-send-scalable-replicas-unavailable" {
+ actions_enabled = true
+ alarm_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-warning",
]
+ alarm_description = "Celery SMS Send Scalable Replicas Unavailable"
+ alarm_name = "celery-sms-send-scalable-replicas-unavailable"
+ arn = (known after apply)
+ comparison_operator = "GreaterThanOrEqualToThreshold"
+ evaluate_low_sample_count_percentiles = (known after apply)
+ evaluation_periods = 3
+ id = (known after apply)
+ tags_all = (known after apply)
+ threshold = 1
+ treat_missing_data = "notBreaching"
+ metric_query {
+ id = "m1"
+ return_data = true
+ metric {
+ dimensions = {
+ "ClusterName" = "notification-canada-ca-staging-eks-cluster"
+ "deployment" = "celery-sms-send-scalable"
+ "namespace" = "notification-canada-ca"
}
+ metric_name = "kube_deployment_status_replicas_unavailable"
+ namespace = "ContainerInsights/Prometheus"
+ period = 300
+ stat = "Minimum"
}
}
}
Plan: 9 to add, 0 to change, 3 to destroy.
─────────────────────────────────────────────────────────────────────────────
Saved the plan to: plan.tfplan
To perform exactly these actions, run the following command to apply:
terraform apply "plan.tfplan"
Show Conftest results
WARN - plan.json - main - Cloudwatch log metric pattern is invalid: ["aws_cloudwatch_log_metric_filter.celery-error[0]"]
WARN - plan.json - main - Cloudwatch log metric pattern is invalid: ["aws_cloudwatch_log_metric_filter.scanfiles-timeout[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_acm_certificate.notification-canada-ca"]
WARN - plan.json - main - Missing Common Tags: ["aws_acm_certificate.notification-canada-ca-alt[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb.notification-canada-ca"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_listener.notification-canada-ca"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_target_group.notification-canada-ca-admin"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_target_group.notification-canada-ca-api"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_target_group.notification-canada-ca-document"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_target_group.notification-canada-ca-document-api"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_target_group.notification-canada-ca-documentation"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_log_group.notification-canada-ca-eks-cluster-logs[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.admin-evicted-pods[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.admin-pods-high-cpu-warning[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.admin-pods-high-memory-warning[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.admin-replicas-unavailable[0]"]
WARN - plan.json - main - Missing Common Tags:... |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM!
Summary | Résumé
Updating celery alarms to reflect primary and scalable replicas
Test instructions | Instructions pour tester la modification
Applied in dev for testing: