fix: more alarms (#297)
* fix: latency alarms

* fix: RDS CPU and memory alarms

* chore: revert

* fix: name

* chore: reduce threshold for HTTP latency alarm
chris13524 authored Jan 20, 2024
1 parent c4a0542 commit 41ae8c9
Showing 5 changed files with 90 additions and 0 deletions.
19 changes: 19 additions & 0 deletions terraform/monitoring/panels/app/http_request_latency.libsonnet
@@ -15,6 +15,25 @@ local targets = grafana.targets;
.withUnit('ms')
)

.setAlert(vars.environment, grafana.alert.new(
namespace = vars.namespace,
name = '%(env)s - HTTP request latency too high' % { env: vars.environment },
message = '%(env)s - HTTP request latency too high' % { env: vars.environment },
notifications = vars.notifications,
noDataState = 'no_data',
conditions = [
grafana.alertCondition.new(
evaluatorParams = [ 3000 ],
evaluatorType = 'gt',
operatorType = 'or',
queryRefId = 'HttpRequestLatency',
queryTimeStart = '5m',
queryTimeEnd = 'now',
reducerType = grafana.alert_reducers.Avg
),
],
))

.addTarget(targets.prometheus(
datasource = ds.prometheus,
expr = 'sum by (aws_ecs_task_revision, method, endpoint) (rate(http_request_latency_sum[$__rate_interval])) / sum by (aws_ecs_task_revision, method, endpoint) (rate(http_request_latency_count[$__rate_interval]))',
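
For context, the condition above reduces the HttpRequestLatency query to its average over the trailing 5 minutes and fires when that average exceeds 3000 ms. As a rough illustration (not part of this commit), and assuming the repository's grafana.alertCondition.new mirrors grafonnet-lib's alertCondition, the condition would render to classic-alert JSON roughly like this:

// Illustrative rendering only — assumes grafonnet-lib-style alertCondition output.
{
  evaluator: { params: [3000], type: 'gt' },               // fire above 3000 ms
  operator: { type: 'or' },                                 // OR-combined with other conditions
  query: { params: ['HttpRequestLatency', '5m', 'now'] },   // query ref and evaluation window
  reducer: { params: [], type: 'avg' },                     // reduce the series to its average
  type: 'query',
}
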
@@ -15,6 +15,25 @@ local targets = grafana.targets;
.withUnit('ms')
)

.setAlert(vars.environment, grafana.alert.new(
namespace = vars.namespace,
name = '%(env)s - Relay incoming message latency too high' % { env: vars.environment },
message = '%(env)s - Relay incoming message latency too high' % { env: vars.environment },
notifications = vars.notifications,
noDataState = 'no_data',
conditions = [
grafana.alertCondition.new(
evaluatorParams = [ 5000 ],
evaluatorType = 'gt',
operatorType = 'or',
queryRefId = 'RelayIncomingMessageLatency',
queryTimeStart = '5m',
queryTimeEnd = 'now',
reducerType = grafana.alert_reducers.Avg
),
],
))

.addTarget(targets.prometheus(
datasource = ds.prometheus,
expr = 'sum by (aws_ecs_task_revision, tag) (rate(relay_incoming_message_latency_sum[$__rate_interval])) / sum by (aws_ecs_task_revision, tag) (rate(relay_incoming_message_latency_count[$__rate_interval]))',
@@ -15,6 +15,26 @@ local targets = grafana.targets;
.withUnit('cps')
)

.setAlert(vars.environment, grafana.alert.new(
namespace = vars.namespace,
name = '%(env)s - Not receiving any watch subscriptions requests' % { env: vars.environment },
message = '%(env)s - Not receiving any watch subscriptions requests' % { env: vars.environment },
notifications = vars.notifications,
noDataState = 'no_data',
period = '30m',
conditions = [
grafana.alertCondition.new(
evaluatorParams = [ 1 ],
evaluatorType = 'lt',
operatorType = 'or',
queryRefId = 'RelayIncomingWatchSubscriptionsRate',
queryTimeStart = '5m',
queryTimeEnd = 'now',
reducerType = grafana.alert_reducers.Avg
),
],
))

.addTarget(targets.prometheus(
datasource = ds.prometheus,
expr = 'sum by (aws_ecs_task_revision, tag) (rate(relay_incoming_messages_total[$__rate_interval]))',
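
Unlike the latency alerts, this one is a liveness check: the RelayIncomingWatchSubscriptionsRate query yields messages per second, the alert fires when its averaged value over the 5-minute window drops below 1, and noDataState = 'no_data' puts the alert into the No Data state instead of leaving it OK when the series goes missing (the period = '30m' argument presumably widens the evaluation window, though the alert.new wrapper is outside this diff). A minimal sketch of pulling the floor into a named local for readability — a hypothetical refactor, not part of this commit:

// Hypothetical sketch — minWatchSubscriptionRate is an illustrative name, not in the repo.
local minWatchSubscriptionRate = 1;  // messages per second, averaged over the 5m window

grafana.alertCondition.new(
  evaluatorParams = [ minWatchSubscriptionRate ],
  evaluatorType = 'lt',                                   // fire when the rate drops below the floor
  operatorType = 'or',
  queryRefId = 'RelayIncomingWatchSubscriptionsRate',
  queryTimeStart = '5m',
  queryTimeEnd = 'now',
  reducerType = grafana.alert_reducers.Avg
)
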
13 changes: 13 additions & 0 deletions terraform/monitoring/panels/rds/cpu.libsonnet
@@ -12,10 +12,23 @@ local targets = grafana.targets;
)
.configure(defaults.configuration.timeseries)

.setAlert(
vars.environment,
defaults.alerts.cpu(
namespace = vars.namespace,
env = vars.environment,
title = 'RDS',
notifications = vars.notifications,
refid = 'CPU_Avg',
limit = 70,
)
)

.addTarget(targets.cloudwatch(
datasource = ds.cloudwatch,
namespace = 'AWS/RDS',
metricName = 'CPUUtilization',
statistic = 'Average',
refId = 'CPU_Avg'
))
}
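
The defaults.alerts.cpu helper itself is not part of this diff. As a rough guide to what the call above configures — a hypothetical sketch only, with names and structure assumed rather than taken from the repository — it could map onto the same grafana.alert.new pattern used in the other panels:

// Hypothetical helper sketch — not the repository's implementation.
// Assumes `grafana` is in scope the same way it is in the panel files.
{
  cpu(namespace, env, title, notifications, refid, limit):: grafana.alert.new(
    namespace = namespace,
    name = '%(env)s - %(title)s CPU utilization too high' % { env: env, title: title },
    message = '%(env)s - %(title)s CPU utilization too high' % { env: env, title: title },
    notifications = notifications,
    conditions = [
      grafana.alertCondition.new(
        evaluatorParams = [ limit ],   // 70 (%) in the call above
        evaluatorType = 'gt',
        operatorType = 'or',
        queryRefId = refid,            // 'CPU_Avg' in the call above
        queryTimeStart = '5m',
        queryTimeEnd = 'now',
        reducerType = grafana.alert_reducers.Avg
      ),
    ],
  ),
}
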
19 changes: 19 additions & 0 deletions terraform/monitoring/panels/rds/freeable_memory.libsonnet
@@ -15,10 +15,29 @@ local targets = grafana.targets;
.withUnit(grafana.fieldConfig.units.DecBytes)
)

.setAlert(vars.environment, grafana.alert.new(
namespace = vars.namespace,
name = '%(env)s - RDS freeable memory low' % { env: vars.environment },
message = '%(env)s - RDS freeable memory low' % { env: vars.environment },
notifications = vars.notifications,
conditions = [
grafana.alertCondition.new(
evaluatorParams = [ 30 ],
evaluatorType = 'lt',
operatorType = 'or',
queryRefId = 'Mem_Avg',
queryTimeStart = '5m',
queryTimeEnd = 'now',
reducerType = grafana.alert_reducers.Avg
),
],
))

.addTarget(targets.cloudwatch(
datasource = ds.cloudwatch,
namespace = 'AWS/RDS',
metricName = 'FreeableMemory',
statistic = 'Average',
refId = 'Mem_Avg',
))
}
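
A hypothetical consumption sketch, assuming each panel file exposes a new(ds, vars) constructor (the closing brace above suggests an object wrapper, but the constructor itself is outside this diff); because the alerts are attached with panel-level setAlert, they travel with the panels when these are placed on a dashboard:

// Hypothetical wiring — the file layout and the .new constructor are assumptions.
local panels = {
  rds: {
    cpu: (import 'panels/rds/cpu.libsonnet').new,
    freeable_memory: (import 'panels/rds/freeable_memory.libsonnet').new,
  },
};

// e.g. on the dashboard:
//   panels.rds.cpu(ds, vars),
//   panels.rds.freeable_memory(ds, vars),
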
