Skip to content

Commit

Permalink
feat: add influxdb pvc monitoring
Browse files Browse the repository at this point in the history
Signed-off-by: Christoph Görn <[email protected]>
  • Loading branch information
goern committed Sep 13, 2023
1 parent d4b9ea1 commit 670d746
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 20 deletions.
25 changes: 13 additions & 12 deletions manifests/base/influxdb/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
# Kustomize base for InfluxDB: collects the manifests under
# influxdb2/influxdb/templates plus the local config, monitoring and ingress
# manifests, then applies the local patches on top.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Every manifest is listed exactly once; kustomize fails when the same
# resource is registered twice.
resources:
  - influxdb2/influxdb/templates/configmap-backup.yaml
  - influxdb2/influxdb/templates/cronjob-backup.yaml
  - influxdb2/influxdb/templates/deployment.yaml
  - influxdb2/influxdb/templates/pvc-backup.yaml
  - influxdb2/influxdb/templates/pvc.yaml
  - influxdb2/influxdb/templates/service.yaml
  - influxdb2/influxdb/templates/serviceaccount.yaml
  - influx-config.yaml
  - influx-vars.yaml
  # - influxdb2-auth.yaml
  # - influxdb2-backup.yaml
  - monitoring_alerting.yaml
  - ingress.yaml

# Strategic-merge patches layered over the resources above.
patchesStrategicMerge:
  - deployment.yaml
  - cronjob-backup.yaml
36 changes: 36 additions & 0 deletions manifests/base/influxdb/monitoring_alerting.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
---
# ServiceMonitor "influxdb2": scrapes the port named "http" over plain HTTP
# every 15 seconds on targets labelled app.kubernetes.io/component=influxdb.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: influxdb2
spec:
  selector:
    matchLabels:
      app.kubernetes.io/component: influxdb
  endpoints:
    - port: http
      scheme: http
      interval: 15s
---
# PrometheusRule "influxdb2": alerts on the free space of the InfluxDB2
# persistent volume claims, based on the kubelet volume statistics.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: influxdb2
spec:
  groups:
    - name: InfluxDB2PersistentVolumes
      rules:
        # Fires when the available/capacity ratio of the "influxdb2" PVC
        # stays below 0.2, i.e. less than 20 percent of the volume is free.
        - alert: influxdb2
          annotations:
            description: InfluxDB2 data volume has less than 20 percent free space.
            summary: InfluxDB2 data volume is almost full.
          expr: 'kubelet_volume_stats_available_bytes{job="kubelet",persistentvolumeclaim="influxdb2"} / kubelet_volume_stats_capacity_bytes{job="kubelet",persistentvolumeclaim="influxdb2"} < 0.2'
          # Require the condition to hold for a while so that a single
          # sample below the threshold does not raise a critical alert.
          for: 5m
          labels:
            severity: critical
        # Disabled alert for the backup volume.
        # NOTE(review): the expression below still selects the "influxdb2"
        # data PVC; point it at the backup PVC before enabling this rule.
        # - alert: influxdb2-backup
        #   annotations:
        #     description: InfluxDB2 backup volume is full.
        #     summary: InfluxDB2 backup volume is full.
        #   expr: 'kubelet_volume_stats_available_bytes{job="kubelet",persistentvolumeclaim="influxdb2"} / kubelet_volume_stats_capacity_bytes{job="kubelet",persistentvolumeclaim="influxdb2"} < 0.2'
        #   labels:
        #     severity: critical
24 changes: 16 additions & 8 deletions manifests/env/nostromo-stage/postgresql/alerting.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,29 @@ spec:
expr: time() - max( kube_job_status_start_time * ON(job_name) GROUP_RIGHT() kube_job_labels{job_name=~"db-repo1-diff.*"} * ON(job_name) GROUP_RIGHT() label_replace(kube_pod_labels, "job_name", "$1", "label_job_name", "(.*)") * ON(pod) GROUP_RIGHT() kube_pod_status_phase{job="kube-state-metrics",phase=~"Succeeded"}) BY (job_name,label_postgres_operator_crunchydata_com_pgbackrest_cronjob) > 86400
labels:
severity: critical
for: 10m
annotations:
message: "Backup job {{ $labels.job_name }} has NOT been running for more than 24 hours."
# Fires when a pod in namespace b4mad-racing-stage that carries the pgBackRest
# cronjob label "diff" has been in the Failed phase for 10 minutes.
- alert: FailedBackupPods
  expr: kube_pod_status_phase{namespace="b4mad-racing-stage",phase="Failed"} * ON(pod) GROUP_RIGHT() kube_pod_labels{label_postgres_operator_crunchydata_com_pgbackrest_cronjob="diff",namespace="b4mad-racing-stage"} > 0
  labels:
    severity: critical
  for: 10m
  annotations:
    # The series are joined on "pod", so that label is available for
    # templating. NOTE(review): confirm no receiver depends on the old text.
    message: "Backup pod {{ $labels.pod }} has been in the Failed phase for more than 10 minutes."
# Free-space alerts for the PostgreSQL volumes: every PVC whose name matches
# "db-instance.*", evaluated from the kubelet volume statistics.
- name: PersistenVolumeFullAlertRule
  rules:
    # Less than 20 percent of the volume is available for 5 minutes.
    - alert: PostgresqlPersistenVolumeFull
      expr: kubelet_volume_stats_available_bytes{job="kubelet", persistentvolumeclaim=~"db-instance.*"} / kubelet_volume_stats_capacity_bytes{job="kubelet",persistentvolumeclaim=~"db-instance.*"} < 0.2
      labels:
        severity: critical
      for: 5m
      annotations:
        message: The Postgresql PersistentVolume claimed is under 20 percent free.
    # Less than 40 percent of the volume is available for 5 minutes.
    # NOTE(review): this early warning uses the same "critical" severity as
    # the 20 percent alert above; confirm that is intended.
    - alert: PostgresqlPersistenVolumeShort
      expr: kubelet_volume_stats_available_bytes{job="kubelet", persistentvolumeclaim=~"db-instance.*"} / kubelet_volume_stats_capacity_bytes{job="kubelet",persistentvolumeclaim=~"db-instance.*"} < 0.4
      labels:
        severity: critical
      for: 5m
      annotations:
        message: The Postgresql PersistentVolume claimed is under 40 percent free.

0 comments on commit 670d746

Please sign in to comment.