Skip to content

Commit

Permalink
Updating cloudwatch stuff to reflect new pod and deployment names (#1691)
Browse files Browse the repository at this point in the history

* Updating cloudwatch stuff to reflect new pod and deployment names
[review]

* switch to breaching

* alarm fixes

* making these conditional for our production rollout.  Need them to work on staging first only for a while
[review]

* Revert "making these conditional for our production rollout.  Need them to work on staging first only for a while"

This reverts commit 621d0fe.

* formatting

* self referencing whoops

---------

Co-authored-by: Michael Pond <[email protected]>
  • Loading branch information
ben851 and P0NDER0SA authored Dec 12, 2024
1 parent 620c2bd commit b9c72ef
Show file tree
Hide file tree
Showing 6 changed files with 99 additions and 178 deletions.
84 changes: 34 additions & 50 deletions aws/eks/cloudwatch_alarms.tf
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ resource "aws_cloudwatch_metric_alarm" "admin-pods-high-cpu-warning" {
treat_missing_data = "missing"
dimensions = {
Namespace = "notification-canada-ca"
Service = "admin"
Service = var.env == "production" ? "admin" : "notify-admin"
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
}
}
Expand All @@ -197,7 +197,7 @@ resource "aws_cloudwatch_metric_alarm" "api-pods-high-cpu-warning" {
treat_missing_data = "missing"
dimensions = {
Namespace = "notification-canada-ca"
Service = "api"
Service = var.env == "production" ? "api" : "notify-api"
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
}
}
Expand All @@ -218,7 +218,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-primary-pods-high-cpu-warning" {
treat_missing_data = "missing"
dimensions = {
Namespace = "notification-canada-ca"
Service = "celery-primary"
Service = var.env == "production" ? "celery-primary" : "notify-celery-primary"
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
}
}
Expand All @@ -239,7 +239,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-scalable-pods-high-cpu-warning" {
treat_missing_data = "missing"
dimensions = {
Namespace = "notification-canada-ca"
Service = "celery-scalable"
Service = var.env == "production" ? "celery-scalable" : "notify-celery-scalable"
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
}
}
Expand All @@ -260,7 +260,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-pods-high-cpu-warning" {
treat_missing_data = "missing"
dimensions = {
Namespace = "notification-canada-ca"
Service = "celery-sms"
Service = var.env == "production" ? "celery-sms" : "notify-celery-sms"
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
}
}
Expand All @@ -282,7 +282,7 @@ resource "aws_cloudwatch_metric_alarm" "admin-pods-high-memory-warning" {
treat_missing_data = "missing"
dimensions = {
Namespace = "notification-canada-ca"
Service = "admin"
Service = var.env == "production" ? "admin" : "notify-admin"
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
}
}
Expand All @@ -303,7 +303,7 @@ resource "aws_cloudwatch_metric_alarm" "api-pods-high-memory-warning" {
treat_missing_data = "missing"
dimensions = {
Namespace = "notification-canada-ca"
Service = "api"
Service = var.env == "production" ? "api" : "notify-api"
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
}
}
Expand All @@ -324,7 +324,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-primary-pods-high-memory-warning"
treat_missing_data = "missing"
dimensions = {
Namespace = "notification-canada-ca"
Service = "celery-primary"
Service = var.env == "production" ? "celery-primary" : "notify-celery-primary"
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
}
}
Expand All @@ -345,7 +345,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-pods-high-memory-warning" {
treat_missing_data = "missing"
dimensions = {
Namespace = "notification-canada-ca"
Service = "celery-sms"
Service = var.env == "production" ? "celery-sms" : "notify-celery-sms"
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
}
}
Expand Down Expand Up @@ -464,7 +464,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-primary-replicas-unavailable" {
alarm_description = "Celery Primary Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -478,7 +478,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-primary-replicas-unavailable" {
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "celery-primary"
deployment = var.env == "production" ? "celery-primary" : "notify-celery-primary"
}
}
}
Expand All @@ -493,7 +493,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-scalable-replicas-unavailable" {
alarm_description = "Celery Scalable Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -507,7 +507,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-scalable-replicas-unavailable" {
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "celery-scalable"
deployment = var.env == "production" ? "celery-scalable" : "notify-celery-scalable"
}
}
}
Expand All @@ -521,7 +521,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-beat-replicas-unavailable" {
alarm_description = "Celery Beat Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -535,7 +535,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-beat-replicas-unavailable" {
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "celery-beat"
deployment = var.env == "production" ? "celery-beat" : "notify-celery-beat"
}
}
}
Expand All @@ -549,7 +549,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-replicas-unavailable" {
alarm_description = "Celery SMS Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -563,7 +563,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-replicas-unavailable" {
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "celery-sms"
deployment = var.env == "production" ? "celery-sms" : "notify-celery-sms"
}
}
}
Expand All @@ -577,7 +577,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-email-send-primary-replicas-unava
alarm_description = "Celery Email Send Primary Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -591,7 +591,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-email-send-primary-replicas-unava
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "celery-email-send-primary"
deployment = var.env == "production" ? "celery-email-send-primary" : "notify-celery-email-send-primary"
}
}
}
Expand All @@ -606,7 +606,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-email-send-scalable-replicas-unav
alarm_description = "Celery Email Send Scalable Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -620,7 +620,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-email-send-scalable-replicas-unav
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "celery-email-send-scalable"
deployment = var.env == "production" ? "celery-email-send-scalable" : "notify-celery-email-send-scalable"
}
}
}
Expand All @@ -634,7 +634,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-send-primary-replicas-unavail
alarm_description = "Celery SMS Send Primary Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -648,7 +648,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-send-primary-replicas-unavail
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "celery-sms-send-primary"
deployment = var.env == "production" ? "celery-sms-send-primary" : "notify-celery-sms-send-primary"
}
}
}
Expand All @@ -663,7 +663,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-send-scalable-replicas-unavai
alarm_description = "Celery SMS Send Scalable Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -677,7 +677,7 @@ resource "aws_cloudwatch_metric_alarm" "celery-sms-send-scalable-replicas-unavai
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "celery-sms-send-scalable"
deployment = var.env == "production" ? "celery-sms-send-scalable" : "notify-celery-sms-send-scalable"
}
}
}
Expand All @@ -691,7 +691,7 @@ resource "aws_cloudwatch_metric_alarm" "admin-replicas-unavailable" {
alarm_description = "Notify Admin Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -705,7 +705,7 @@ resource "aws_cloudwatch_metric_alarm" "admin-replicas-unavailable" {
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "admin"
deployment = var.env == "production" ? "admin" : "notify-admin"
}
}
}
Expand All @@ -719,7 +719,7 @@ resource "aws_cloudwatch_metric_alarm" "api-replicas-unavailable" {
alarm_description = "Notify K8S API Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -733,7 +733,7 @@ resource "aws_cloudwatch_metric_alarm" "api-replicas-unavailable" {
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "api"
deployment = var.env == "production" ? "api" : "notify-api"
}
}
}
Expand All @@ -747,7 +747,7 @@ resource "aws_cloudwatch_metric_alarm" "documentation-replicas-unavailable" {
alarm_description = "Notify Documentation Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -761,7 +761,7 @@ resource "aws_cloudwatch_metric_alarm" "documentation-replicas-unavailable" {
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "documentation"
deployment = var.env == "production" ? "documentation" : "notify-documentation"
}
}
}
Expand All @@ -775,7 +775,7 @@ resource "aws_cloudwatch_metric_alarm" "document-download-api-replicas-unavailab
alarm_description = "Notify Document Download API Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand All @@ -789,7 +789,7 @@ resource "aws_cloudwatch_metric_alarm" "document-download-api-replicas-unavailab
dimensions = {
ClusterName = aws_eks_cluster.notification-canada-ca-eks-cluster.name
namespace = var.notify_k8s_namespace
deployment = "document-download-api"
deployment = var.env == "production" ? "document-download" : "notify-document-download"
}
}
}
Expand Down Expand Up @@ -888,7 +888,7 @@ resource "aws_cloudwatch_metric_alarm" "karpenter-replicas-unavailable" {
alarm_description = "Karpenter Replicas Unavailable"
#Setting to warn until we verify that it is working as expected
alarm_actions = [var.sns_alert_warning_arn]
treat_missing_data = "notBreaching"
treat_missing_data = "breaching"
threshold = 1

metric_query {
Expand Down Expand Up @@ -939,22 +939,6 @@ resource "aws_cloudwatch_metric_alarm" "aggregating-queues-not-active-5-minutes-
ok_actions = [var.sns_alert_critical_arn]
}

# Critical alarm that fires when GitHub ARC runner pods log errors.
#
# The metric is taken from the `github-arc-runner-alarm` log metric filter,
# which publishes a value of 1 for every matching ERROR log line and declares
# no default value. The metric therefore has no datapoints at all while the
# runners are healthy, and every datapoint that does exist is >= 1.
resource "aws_cloudwatch_metric_alarm" "github-arc-runner-error-alarm" {
  # Only created when CloudWatch resources are enabled for this environment.
  count             = var.cloudwatch_enabled ? 1 : 0
  alarm_name        = "github-arc-runner-error-alarm"
  alarm_description = "GitHub ARC Runners Are Failing"

  # Alarm when at least one error was logged in the period. The previous
  # "LessThanThreshold" comparison could never be satisfied: Sum is always
  # >= 1 when data exists, and missing data is treated as notBreaching.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1
  evaluation_periods  = "1"
  period              = "300"
  statistic           = "Sum"

  # Reuse the name/namespace declared on the log metric filter so the alarm
  # and the filter cannot drift apart.
  metric_name = aws_cloudwatch_log_metric_filter.github-arc-runner-alarm[0].metric_transformation[0].name
  namespace   = aws_cloudwatch_log_metric_filter.github-arc-runner-alarm[0].metric_transformation[0].namespace

  # No datapoints means no ERROR lines were logged, which is the healthy state.
  treat_missing_data = "notBreaching"

  alarm_actions = [var.sns_alert_critical_arn]
  ok_actions    = [var.sns_alert_critical_arn]
}

resource "aws_cloudwatch_metric_alarm" "service-callback-too-many-failures-warning" {
count = var.cloudwatch_enabled ? 1 : 0
alarm_name = "service-callback-too-many-failures-warning"
Expand Down
23 changes: 5 additions & 18 deletions aws/eks/cloudwatch_log.tf
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ resource "aws_cloudwatch_log_metric_filter" "bounce-rate-critical" {
resource "aws_cloudwatch_log_metric_filter" "api-evicted-pods" {
count = var.cloudwatch_enabled ? 1 : 0
name = "api-evicted-pods"
pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"api-*\") }"
pattern = var.env == "production" ? "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"api-*\") }" : "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-api-*\") }"
log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name

metric_transformation {
Expand All @@ -111,7 +111,7 @@ resource "aws_cloudwatch_log_metric_filter" "api-evicted-pods" {
resource "aws_cloudwatch_log_metric_filter" "celery-evicted-pods" {
count = var.cloudwatch_enabled ? 1 : 0
name = "celery-evicted-pods"
pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"celery-*\") }"
pattern = var.env == "production" ? "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"celery-*\") }" : "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-celery-*\") }"
log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name

metric_transformation {
Expand All @@ -124,7 +124,7 @@ resource "aws_cloudwatch_log_metric_filter" "celery-evicted-pods" {
resource "aws_cloudwatch_log_metric_filter" "admin-evicted-pods" {
count = var.cloudwatch_enabled ? 1 : 0
name = "admin-evicted-pods"
pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"admin-*\") }"
pattern = var.env == "production" ? "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"admin-*\") }" : "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-admin-*\") }"
log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name

metric_transformation {
Expand All @@ -137,7 +137,7 @@ resource "aws_cloudwatch_log_metric_filter" "admin-evicted-pods" {
resource "aws_cloudwatch_log_metric_filter" "document-download-evicted-pods" {
count = var.cloudwatch_enabled ? 1 : 0
name = "document-download-evicted-pods"
pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"document-download-*\") }"
pattern = var.env == "production" ? "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"document-download-*\") }" : "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-document-download-*\") }"
log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name

metric_transformation {
Expand All @@ -150,7 +150,7 @@ resource "aws_cloudwatch_log_metric_filter" "document-download-evicted-pods" {
resource "aws_cloudwatch_log_metric_filter" "documentation-evicted-pods" {
count = var.cloudwatch_enabled ? 1 : 0
name = "documentation-evicted-pods"
pattern = "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"documentation-*\") }"
pattern = var.env == "production" ? "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"documentation-*\") }" : "{ ($.reason = \"Evicted\") && ($.kube_pod_status_reason = 1) && ($.pod = \"notify-documentation-*\") }"
log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-prometheus-logs[0].name

metric_transformation {
Expand All @@ -173,19 +173,6 @@ resource "aws_cloudwatch_log_metric_filter" "aggregating-queues-are-active" {
}
}

# Log metric filter that counts ERROR log lines written by GitHub ARC runner
# pods for the current environment.
resource "aws_cloudwatch_log_metric_filter" "github-arc-runner-alarm" {
  # Only created when CloudWatch resources are enabled for this environment.
  count = var.cloudwatch_enabled ? 1 : 0

  name           = "GitHub ARC Runners Write Alarm"
  log_group_name = aws_cloudwatch_log_group.notification-canada-ca-eks-application-logs[0].name

  # Matches JSON log events whose pod name follows the per-environment runner
  # naming scheme and whose log text contains "ERROR".
  pattern = "{ $.kubernetes.pod_name = \"github-arc-ss-${var.env}-*-runner-*\" && $.log = \"*ERROR*\" }"

  # Each matching event publishes a value of 1 to the LogMetrics namespace.
  metric_transformation {
    namespace = "LogMetrics"
    name      = "aggregating-github-arc-runner-alarm"
    value     = "1"
  }
}

resource "aws_cloudwatch_log_metric_filter" "callback-request-failures" {
count = var.cloudwatch_enabled ? 1 : 0
name = "callback-request-failures"
Expand Down
Loading

0 comments on commit b9c72ef

Please sign in to comment.