-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
chore: update alarm names/descriptions/metric filters to be more accu… #1131
Conversation
Updating alarms ⏰? Great! Please update the Google Sheet and add a 👍 to this message after 🙏 |
aws/eks/cloudwatch_alarms.tf
Outdated
period = "60" | ||
statistic = "Sum" | ||
threshold = 1 | ||
treat_missing_data = "breaching" | ||
alarm_actions = [var.sns_alert_warning_arn] | ||
} | ||
|
||
resource "aws_cloudwatch_metric_alarm" "queues-not-active-5-minutes-critical" { | ||
resource "aws_cloudwatch_metric_alarm" "beat-inbox-tasks-not-active-5-minutes-critical" { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we keep the queue
part in the name? "Beat" and "tasks" are heavy on Celery terminology and could equally apply to other components. A name such as aggregating-queues
would carry the meaning. Can we talk offline when we have 5 minutes?
Staging: eks✅ Terraform Init: Plan: 3 to add, 0 to change, 3 to destroy Show summary
Show plan
Resource actions are indicated with the following symbols:
+ create
- destroy
Terraform will perform the following actions:
# aws_cloudwatch_log_metric_filter.aggregating-queues-are-active[0] will be created
+ resource "aws_cloudwatch_log_metric_filter" "aggregating-queues-are-active" {
+ id = (known after apply)
+ log_group_name = "/aws/containerinsights/notification-canada-ca-staging-eks-cluster/application"
+ name = "aggregating-queues-are-active"
+ pattern = "Batch saving with"
+ metric_transformation {
+ name = "aggregating-queues-are-active"
+ namespace = "LogMetrics"
+ unit = "None"
+ value = "1"
}
}
# aws_cloudwatch_log_metric_filter.queues-are-active[0] will be destroyed
# (because aws_cloudwatch_log_metric_filter.queues-are-active is not in configuration)
- resource "aws_cloudwatch_log_metric_filter" "queues-are-active" {
- id = "queues-are-active" -> null
- log_group_name = "/aws/containerinsights/notification-canada-ca-staging-eks-cluster/application" -> null
- name = "queues-are-active" -> null
- pattern = "Batch saving with" -> null
- metric_transformation {
- dimensions = {} -> null
- name = "queues-are-active" -> null
- namespace = "LogMetrics" -> null
- unit = "None" -> null
- value = "1" -> null
}
}
# aws_cloudwatch_metric_alarm.aggregating-queues-not-active-1-minute-warning[0] will be created
+ resource "aws_cloudwatch_metric_alarm" "aggregating-queues-not-active-1-minute-warning" {
+ actions_enabled = true
+ alarm_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-warning",
]
+ alarm_description = "Beat inbox tasks have not been active for one minute"
+ alarm_name = "aggregating-queues-not-active-1-minute-warning"
+ arn = (known after apply)
+ comparison_operator = "LessThanThreshold"
+ evaluate_low_sample_count_percentiles = (known after apply)
+ evaluation_periods = 1
+ id = (known after apply)
+ metric_name = "aggregating-queues-are-active"
+ namespace = "LogMetrics"
+ period = 60
+ statistic = "Sum"
+ tags_all = (known after apply)
+ threshold = 1
+ treat_missing_data = "breaching"
}
# aws_cloudwatch_metric_alarm.aggregating-queues-not-active-5-minutes-critical[0] will be created
+ resource "aws_cloudwatch_metric_alarm" "aggregating-queues-not-active-5-minutes-critical" {
+ actions_enabled = true
+ alarm_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-critical",
]
+ alarm_description = "Beat inbox tasks have not been active for 5 minutes"
+ alarm_name = "aggregating-queues-not-active-5-minutes-critical"
+ arn = (known after apply)
+ comparison_operator = "LessThanThreshold"
+ evaluate_low_sample_count_percentiles = (known after apply)
+ evaluation_periods = 1
+ id = (known after apply)
+ metric_name = "aggregating-queues-are-active"
+ namespace = "LogMetrics"
+ ok_actions = [
+ "arn:aws:sns:ca-central-1:239043911459:alert-critical",
]
+ period = 300
+ statistic = "Sum"
+ tags_all = (known after apply)
+ threshold = 1
+ treat_missing_data = "breaching"
}
# aws_cloudwatch_metric_alarm.queues-not-active-1-minute-warning[0] will be destroyed
# (because aws_cloudwatch_metric_alarm.queues-not-active-1-minute-warning is not in configuration)
- resource "aws_cloudwatch_metric_alarm" "queues-not-active-1-minute-warning" {
- actions_enabled = true -> null
- alarm_actions = [
- "arn:aws:sns:ca-central-1:239043911459:alert-warning",
] -> null
- alarm_description = "Queues have not been active for one minute" -> null
- alarm_name = "queues-not-active-1-minute-warning" -> null
- arn = "arn:aws:cloudwatch:ca-central-1:239043911459:alarm:queues-not-active-1-minute-warning" -> null
- comparison_operator = "LessThanThreshold" -> null
- datapoints_to_alarm = 0 -> null
- dimensions = {} -> null
- evaluation_periods = 1 -> null
- id = "queues-not-active-1-minute-warning" -> null
- insufficient_data_actions = [] -> null
- metric_name = "queues-are-active" -> null
- namespace = "LogMetrics" -> null
- ok_actions = [] -> null
- period = 60 -> null
- statistic = "Sum" -> null
- tags = {} -> null
- tags_all = {} -> null
- threshold = 1 -> null
- treat_missing_data = "breaching" -> null
}
# aws_cloudwatch_metric_alarm.queues-not-active-5-minutes-critical[0] will be destroyed
# (because aws_cloudwatch_metric_alarm.queues-not-active-5-minutes-critical is not in configuration)
- resource "aws_cloudwatch_metric_alarm" "queues-not-active-5-minutes-critical" {
- actions_enabled = true -> null
- alarm_actions = [
- "arn:aws:sns:ca-central-1:239043911459:alert-critical",
] -> null
- alarm_description = "Queues have not been active for 5 minutes" -> null
- alarm_name = "queues-not-active-5-minutes-critical" -> null
- arn = "arn:aws:cloudwatch:ca-central-1:239043911459:alarm:queues-not-active-5-minutes-critical" -> null
- comparison_operator = "LessThanThreshold" -> null
- datapoints_to_alarm = 0 -> null
- dimensions = {} -> null
- evaluation_periods = 1 -> null
- id = "queues-not-active-5-minutes-critical" -> null
- insufficient_data_actions = [] -> null
- metric_name = "queues-are-active" -> null
- namespace = "LogMetrics" -> null
- ok_actions = [
- "arn:aws:sns:ca-central-1:239043911459:alert-critical",
] -> null
- period = 300 -> null
- statistic = "Sum" -> null
- tags = {} -> null
- tags_all = {} -> null
- threshold = 1 -> null
- treat_missing_data = "breaching" -> null
}
Plan: 3 to add, 0 to change, 3 to destroy.
─────────────────────────────────────────────────────────────────────────────
Saved the plan to: plan.tfplan
To perform exactly these actions, run the following command to apply:
terraform apply "plan.tfplan"
Show Conftest results
WARN - plan.json - main - Cloudwatch log metric pattern is invalid: ["aws_cloudwatch_log_metric_filter.celery-error[0]"]
WARN - plan.json - main - Cloudwatch log metric pattern is invalid: ["aws_cloudwatch_log_metric_filter.scanfiles-timeout[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_acm_certificate.notification-canada-ca"]
WARN - plan.json - main - Missing Common Tags: ["aws_acm_certificate.notification-canada-ca-alt[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb.notification-canada-ca"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_listener.notification-canada-ca"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_target_group.notification-canada-ca-admin"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_target_group.notification-canada-ca-api"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_target_group.notification-canada-ca-document"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_target_group.notification-canada-ca-document-api"]
WARN - plan.json - main - Missing Common Tags: ["aws_alb_target_group.notification-canada-ca-documentation"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_log_group.notification-canada-ca-eks-cluster-logs[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.admin-evicted-pods[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.admin-pods-high-cpu-warning[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.admin-pods-high-memory-warning[0]"]
WARN - plan.json - main - Missing Common Tags: ["aws_cloudwatch_metric_alarm.admin-replicas-unavailable[0]"]
WARN - plan.json - main - Missing Common Tags:... |
Summary | Résumé
This PR amends #1123 to make the alarm names, alarm descriptions, and metric filter names more accurate: the alarms monitor that the beat inbox Celery tasks are constantly firing, rather than watching the queues themselves.