Skip to content

Commit

Permalink
🔧 Oppdaterer alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
mrbjoern committed Jan 23, 2024
1 parent fc5ad45 commit 2f7f534
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 25 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_and_deploy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,6 @@ jobs:
secrets: inherit
with:
cluster: prod-gcp
manifest: .nais/nais.yaml
manifest: .nais/nais.yaml,.nais/prod-alerts.yaml
imageSuffix: -prod-gcp
vars: .nais/prod.yaml
56 changes: 32 additions & 24 deletions .nais/prod-alerts.yaml
Original file line number Diff line number Diff line change
@@ -1,31 +1,39 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: innsyn-alerts
name: mine-aap-alerts
namespace: aap
labels:
team: aap
app: innsyn
namespace: aap
spec:
# NOTE(review): these lines look like the pre-change (removed) side of the
# diff, using a flat receivers/alerts layout - confirm before relying on them.
# Slack channel that receives the alerts defined below.
receivers:
slack:
channel: '#po-aap-innbygger-prod-varslinger'
alerts:
# Condition: the "innsyn" deployment reports unavailable replicas, held for 10m.
- alert: Mine AAP(aap-innsyn) er nede
expr: kube_deployment_status_replicas_unavailable{deployment="innsyn",job="kubernetes-service-endpoints"} > 0
for: 10m
description: "innsyn har utilgjengelige podder i aap"
# NOTE(review): the action string has a closing backtick after "-n aap" but no
# opening backtick before the first "kubectl", so the markup is unbalanced.
action: "kubectl describe pod -l app=innsyn -n aap` for events og `kubectl get pods -l app=innsyn -n aap` for å se feilende podder"
sla: respond within 1h, during office hours
severity: danger
# Condition: more than 2 restarts of the "innsyn" container within a 5m window,
# held for 2m.
- alert: Mine AAP(aap-innsyn) kontinuerlig restart
expr: sum(increase(kube_pod_container_status_restarts_total{container=~"innsyn"}[5m])) by (container) > 2
for: 2m
description: "innsyn har restartet flere ganger de siste 5 minuttene!"
action: "Se `kubectl describe pod innsyn` for events, og `kubectl logs innsyn` for logger"
sla: respond within 1h, during office hours
severity: danger
# Condition: Error-level messages exceed 15% of all log messages for "innsyn"
# (10m rate), held for 5m. This rule sets no description, sla or severity.
- alert: Mine AAP(aap-innsyn) - høy feilrate i logger
expr: (100 * sum by (log_app, log_namespace) (rate(logd_messages_total{log_app="innsyn",log_level=~"Error"}[10m])) / sum by (log_app, log_namespace) (rate(logd_messages_total{log_app="innsyn"}[10m]))) > 15
for: 5m
action: "<https://logs.adeo.no/goto/c5feac60-58ff-11ed-b3e8-d969437dd878|Check logs>"
# Alert rule group for the mine-aap application. Every rule carries the
# labels namespace=aap and severity=danger.
# Nesting is written out explicitly here; in the manifest this mapping sits
# under `spec:`.
groups:
  - name: 'aap-mine-aap-alerts'
    rules:
      # Fires when the deployment has reported unavailable replicas for 10m.
      # NOTE(review): this expr selects deployment="mine-aap" while the action
      # text and the other two rules select "innsyn" - confirm which name the
      # running workload actually uses.
      - alert: Mine aap - app nede
        expr: kube_deployment_status_replicas_unavailable{deployment="mine-aap",job="kubernetes-service-endpoints"} > 0
        for: 10m
        annotations:
          summary: "mine-aap har utilgjengelige podder i prod"
          # Opening backtick added so both kubectl commands are balanced
          # inline-code spans.
          action: "`kubectl describe pod -l app=innsyn -n aap` for events og `kubectl get pods -l app=innsyn -n aap` for å se feilende podder"
        labels:
          namespace: aap
          severity: danger
      # Fires when the "innsyn" container has restarted more than 2 times
      # within a 5m window, sustained for 2m.
      - alert: Mine aap - kontinuerlig restart
        expr: sum(increase(kube_pod_container_status_restarts_total{container=~"innsyn"}[5m])) by (container) > 2
        for: 2m
        annotations:
          summary: "Mine aap har restartet flere ganger de siste 5 minuttene"
          action: "Se `kubectl describe pod innsyn` for events, og `kubectl logs innsyn` for logger"
        labels:
          namespace: aap
          severity: danger
      # Fires when Error-level log messages exceed 15% of all log messages
      # for log_app="innsyn" (10m rate), sustained for 5m.
      - alert: Mine aap - høy feilrate i logger
        expr: (100 * sum by (log_app, log_namespace) (rate(logd_messages_total{log_app="innsyn",log_level=~"Error"}[10m])) / sum by (log_app, log_namespace) (rate(logd_messages_total{log_app="innsyn"}[10m]))) > 15
        for: 5m
        annotations:
          summary: "Høy feilrate i prod logger"
          action: "<https://logs.adeo.no/goto/c5feac60-58ff-11ed-b3e8-d969437dd878|Check logs>"
        labels:
          namespace: aap
          severity: danger

0 comments on commit 2f7f534

Please sign in to comment.