Skip to content

Commit

Permalink
🧱 Fikser prod-alerts.yaml
Browse files Browse the repository at this point in the history
Co-authored-by: Øivind Stensrud <[email protected]>
Co-authored-by: Tor Idland <[email protected]>
  • Loading branch information
3 people committed Jan 19, 2024
1 parent 127ef1f commit ef6c5ef
Showing 1 changed file with 21 additions and 22 deletions.
43 changes: 21 additions & 22 deletions .nais/prod-alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,25 +7,24 @@ metadata:
app: soknad
namespace: aap
spec:
# Pre-change side of this diff hunk: Slack receiver plus a flat `alerts` list.
# Indentation was lost in this view and is reconstructed from the key names.
receivers:
  slack:
    # Quoted because a leading '#' would otherwise start a YAML comment.
    channel: '#po-aap-innbygger-prod-varslinger'
alerts:
  # Fires when the soknad deployment has reported unavailable replicas
  # continuously for 10 minutes.
  - alert: Søknad - app nede
    expr: kube_deployment_status_replicas_unavailable{deployment="soknad",job="kubernetes-service-endpoints"} > 0
    for: 10m
    description: "soknad har utilgjengelige podder i aap"
    # Opening backtick added so both commands render as code in the message.
    action: "`kubectl describe pod -l app=soknad -n aap` for events og `kubectl get pods -l app=soknad -n aap` for å se feilende podder"
    sla: respond within 1h, during office hours
    severity: danger
  # Fires when the summed restart count of the soknad container over a
  # 5-minute window stays above 2 for 2 minutes.
  # NOTE(review): the name lacks a space after the dash, unlike its siblings;
  # left unchanged because the alert name may be matched elsewhere.
  - alert: Søknad -kontinuerlig restart
    expr: sum(increase(kube_pod_container_status_restarts_total{container=~"soknad"}[5m])) by (container) > 2
    for: 2m
    description: "soknad har restartet flere ganger de siste 5 minuttene!"
    # Uses the same label selector and namespace as the rule above; a bare
    # `soknad` is not the name of any pod created by the deployment.
    action: "Se `kubectl describe pod -l app=soknad -n aap` for events, og `kubectl logs -l app=soknad -n aap` for logger"
    sla: respond within 1h, during office hours
    severity: danger
  # Fires when Error-level messages exceed 15 % of all logd messages from
  # soknad (10-minute rate) for 5 minutes.
  # NOTE(review): no description/sla/severity here, unlike the rules above;
  # confirm that is intentional.
  - alert: Søknad - høy feilrate i logger
    expr: (100 * sum by (log_app, log_namespace) (rate(logd_messages_total{log_app="soknad",log_level=~"Error"}[10m])) / sum by (log_app, log_namespace) (rate(logd_messages_total{log_app="soknad"}[10m]))) > 15
    for: 5m
    action: "<https://logs.adeo.no/goto/1e8b3180-56c8-11ed-b3e8-d969437dd878|Check logs>"
# Post-change side of this diff hunk: the alerts expressed as a single rule
# group. Indentation was lost in this view and is reconstructed from the keys.
# NOTE(review): if the resource declared above this hunk is a
# prometheus-operator PrometheusRule, rule-level description/action/sla/severity
# are not part of the rule schema (they normally live under `annotations:` and
# `labels:`). The resource kind is not visible here — confirm before merging.
groups:
  - name: 'alerts'
    rules:
      # Fires when the soknad deployment has reported unavailable replicas
      # continuously for 10 minutes.
      - alert: Søknad - app nede
        expr: kube_deployment_status_replicas_unavailable{deployment="soknad",job="kubernetes-service-endpoints"} > 0
        for: 10m
        description: "soknad har utilgjengelige podder i aap"
        # Opening backtick added so both commands render as code in the message.
        action: "`kubectl describe pod -l app=soknad -n aap` for events og `kubectl get pods -l app=soknad -n aap` for å se feilende podder"
        sla: respond within 1h, during office hours
        severity: danger
      # Fires when the summed restart count of the soknad container over a
      # 5-minute window stays above 2 for 2 minutes.
      # NOTE(review): the name lacks a space after the dash, unlike its
      # siblings; left unchanged because the alert name may be matched elsewhere.
      - alert: Søknad -kontinuerlig restart
        expr: sum(increase(kube_pod_container_status_restarts_total{container=~"soknad"}[5m])) by (container) > 2
        for: 2m
        description: "soknad har restartet flere ganger de siste 5 minuttene!"
        # Uses the same label selector and namespace as the rule above; a bare
        # `soknad` is not the name of any pod created by the deployment.
        action: "Se `kubectl describe pod -l app=soknad -n aap` for events, og `kubectl logs -l app=soknad -n aap` for logger"
        sla: respond within 1h, during office hours
        severity: danger
      # Fires when Error-level messages exceed 15 % of all logd messages from
      # soknad (10-minute rate) for 5 minutes.
      # NOTE(review): no description/sla/severity here, unlike the rules
      # above; confirm that is intentional.
      - alert: Søknad - høy feilrate i logger
        expr: (100 * sum by (log_app, log_namespace) (rate(logd_messages_total{log_app="soknad",log_level=~"Error"}[10m])) / sum by (log_app, log_namespace) (rate(logd_messages_total{log_app="soknad"}[10m]))) > 15
        for: 5m
        action: "<https://logs.adeo.no/goto/1e8b3180-56c8-11ed-b3e8-d969437dd878|Check logs>"

0 comments on commit ef6c5ef

Please sign in to comment.