Skip to content

Commit

Permalink
feat(alertmanager): move to app-template, add VolSync jank
Browse files Browse the repository at this point in the history
  • Loading branch information
JJGadgets committed Apr 23, 2024
1 parent 4e7b955 commit 4a14ed3
Show file tree
Hide file tree
Showing 12 changed files with 464 additions and 90 deletions.
1 change: 1 addition & 0 deletions kube/clusters/biohazard/flux/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ resources:
- ../../../deploy/core/monitoring/metrics-server/
- ../../../deploy/core/monitoring/kps/
- ../../../deploy/core/monitoring/grafana/
- ../../../deploy/core/monitoring/alertmanager/
- ../../../deploy/core/monitoring/karma/
- ../../../deploy/core/monitoring/node-exporter/
- ../../../deploy/core/monitoring/smartctl-exporter/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
---
# Alertmanager runtime configuration (mounted at /etc/alertmanager/alertmanager.yaml).
# Everything falls through to the `discord` receiver unless a child route claims it.
global:
  resolve_timeout: 5m
route:
  group_by: ["alertname", "job"]
  group_interval: 10m
  group_wait: 1m
  receiver: discord
  repeat_interval: 12h
  routes:
    # - receiver: heartbeat
    #   group_interval: 5m
    #   group_wait: 0s
    #   matchers:
    #     - alertname =~ "Watchdog"
    #   repeat_interval: 5m
    # Drop InfoInhibitor alerts by sending them to the receiver with no integrations.
    - receiver: "null"
      matchers:
        - alertname =~ "InfoInhibitor"
    - receiver: discord
      continue: true
      matchers:
        # All matchers of a route must hold at once, so two equality matchers on
        # `severity` could never match; a single regex expresses "critical OR warning".
        # gotta try it out to know what's what
        - severity =~ "critical|warning"
inhibit_rules:
  # A critical alert silences the warning with the same alertname + namespace.
  - equal: ["alertname", "namespace"]
    source_matchers:
      - severity = "critical"
    target_matchers:
      - severity = "warning"
receivers:
  # - name: heartbeat
  #   webhook_configs:
  #     - send_resolved: true
  #       url: "${CLOUD_GATUS_HEARTBEAT_URL}"
  - name: "null"
  - name: discord
    discord_configs:
      - send_resolved: true
        # Webhook URL is read from the mounted secret file rather than inlined here.
        webhook_url_file: "/secrets/discord"
        # yoinked below from onedr0p, blame him if something doesn't work
        title: >-
          {{ .CommonLabels.alertname }}
          [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
        message: |-
          {{- range .Alerts }}
          {{- if ne .Annotations.description "" }}
          {{ .Annotations.description }}
          {{- else if ne .Annotations.summary "" }}
          {{ .Annotations.summary }}
          {{- else if ne .Annotations.message "" }}
          {{ .Annotations.message }}
          {{- else }}
          Alert description not available
          {{- if gt (len .Labels.SortedPairs) 0 }}
          <small>
          {{- range .Labels.SortedPairs }}
          <b>{{ .Name }}:</b> {{ .Value }}
          {{- end }}
          </small>
          {{- end }}
          {{- end }}
          {{ .Fingerprint }}
          {{ .GeneratorURL }}
          {{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
# Packages alertmanager.yaml into a ConfigMap for the Alertmanager pods.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
generatorOptions:
  # Keep the generated name stable (no content-hash suffix) so it can be
  # referenced literally as "alertmanager-config".
  disableNameSuffixHash: true
configMapGenerator:
  - name: alertmanager-config
    files:
      - ./alertmanager.yaml
19 changes: 19 additions & 0 deletions kube/deploy/core/monitoring/alertmanager/app/es.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
---
# yaml-language-server: $schema=https://crds.jank.ing/external-secrets.io/externalsecret_v1beta1.json
# Materialises the "AlertManager - <cluster>" item from the `1p` ClusterSecretStore
# into the Secret `alertmanager-secrets`, which the HelmRelease mounts at /secrets.
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: &name alertmanager-secrets
  # Must live beside the pods that mount the Secret; the Flux Kustomization for
  # this path sets targetNamespace "monitoring", so state that namespace here too.
  namespace: monitoring
spec:
  refreshInterval: 1m
  secretStoreRef:
    kind: ClusterSecretStore
    # Quoted: the name starts with a digit and must stay a string.
    name: "1p"
  dataFrom:
    - extract:
        key: "AlertManager - ${CLUSTER_NAME}"
  target:
    creationPolicy: Owner
    # Keep the Secret if the ExternalSecret is removed.
    deletionPolicy: Retain
    name: *name
222 changes: 222 additions & 0 deletions kube/deploy/core/monitoring/alertmanager/app/hr.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
---
# Alertmanager deployed through bjw-s app-template:
#   * controller `main`  - single-replica Deployment backed by the `alertmanager-data` PVC
#   * controller `local` - 3-replica StatefulSet with per-pod volumes on the `local` StorageClass
# Both controllers share one container spec (&ct) and gossip with each other on port 9094.
apiVersion: helm.toolkit.fluxcd.io/v2beta2
kind: HelmRelease
metadata:
  name: &app alertmanager
spec:
  interval: 5m
  chart:
    spec:
      chart: app-template
      version: "2.6.0"
      sourceRef:
        name: bjw-s
        kind: HelmRepository
        namespace: flux-system
  values:
    controllers:
      main:
        type: deployment
        replicas: 1
        pod:
          labels:
            ingress.home.arpa/nginx-internal: "allow"
            egress.home.arpa/discord: "allow"
        containers: &ct
          main:
            image:
              repository: "quay.io/prometheus/alertmanager"
              # TODO: using main because waiting on 0.27.1 for DiscordConfig webhook_url_file fix
              # (https://github.com/prometheus/alertmanager/pull/3728)
              tag: "main@sha256:8145bf0107b5106fc1c554f2fc0b628e9e1d2c818bfe4e78867d8f75046aa6c7"
              # tag: "v0.27.0@sha256:e13b6ed5cb929eeaee733479dce55e10eb3bc2e9c4586c705a4e8da41e5eacf5"
            args:
              - --config.file=/etc/alertmanager/alertmanager.yaml
              - --storage.path=/alertmanager
              - --data.retention=336h
              - "--web.external-url=https://${APP_DNS_ALERTMANAGER}/"
              - --web.route-prefix=/
              - --web.listen-address=:9093
              # $(POD_IP) is expanded by Kubernetes from the env var defined below.
              - "--cluster.listen-address=[$(POD_IP)]:9094"
              - "--cluster.label=${CLUSTER_NAME}"
              # Peers: the `main` Service plus one per-pod Service for each StatefulSet replica.
              - --cluster.peer=alertmanager.monitoring.svc.cluster.local:9094
              - --cluster.peer=alertmanager-local-0.monitoring.svc.cluster.local:9094
              - --cluster.peer=alertmanager-local-1.monitoring.svc.cluster.local:9094
              - --cluster.peer=alertmanager-local-2.monitoring.svc.cluster.local:9094
              - --cluster.reconnect-timeout=5m
            env:
              TZ: "${CONFIG_TZ}"
              POD_IP:
                valueFrom:
                  fieldRef:
                    apiVersion: v1
                    fieldPath: status.podIP
            ports:
              - name: http
                containerPort: 9093
            securityContext: &sc
              readOnlyRootFilesystem: true
              allowPrivilegeEscalation: false
              capabilities:
                drop: ["ALL"]
            resources:
              requests:
                cpu: "10m"
                memory: "128Mi"
              limits:
                cpu: "3000m"
                memory: "1Gi"
            probes:
              readiness:
                enabled: true
                custom: true
                spec: &ready
                  httpGet:
                    path: "/-/ready"
                    port: http
                    scheme: HTTP
                  initialDelaySeconds: 3
                  periodSeconds: 5
                  timeoutSeconds: 3
                  successThreshold: 1
                  failureThreshold: 10
              startup:
                enabled: true
                custom: true
                spec:
                  # Same check as readiness, polled every second for up to 60 tries.
                  <<: *ready
                  periodSeconds: 1
                  failureThreshold: 60
              liveness:
                enabled: true
                custom: true
                spec:
                  httpGet:
                    path: "/-/healthy"
                    port: http
                    scheme: HTTP
                  periodSeconds: 10
                  timeoutSeconds: 3
                  successThreshold: 1
                  failureThreshold: 10
      local:
        type: statefulset
        replicas: 3
        pod:
          labels:
            ingress.home.arpa/nginx-internal: "allow"
            egress.home.arpa/discord: "allow"
        containers: *ct
        statefulset:
          volumeClaimTemplates:
            - name: data
              size: 2Gi
              storageClass: local
              accessMode: ReadWriteOnce
              advancedMounts:
                main:
                  - path: /alertmanager
    service:
      main:
        controller: main
        ports: &ports
          http:
            port: 9093
          cluster:
            port: 9094
      local: &svc
        controller: local
        ports: *ports
      # One Service per StatefulSet replica, selected by pod index, so each
      # replica is individually addressable as a --cluster.peer.
      local-0:
        <<: *svc
        extraSelectorLabels:
          apps.kubernetes.io/pod-index: "0"
      local-1:
        <<: *svc
        extraSelectorLabels:
          apps.kubernetes.io/pod-index: "1"
      local-2:
        <<: *svc
        extraSelectorLabels:
          apps.kubernetes.io/pod-index: "2"
    ingress:
      main:
        enabled: true
        primary: true
        className: "nginx-internal"
        annotations:
          external-dns.alpha.kubernetes.io/target: "${DNS_CF}"
          external-dns.alpha.kubernetes.io/cloudflare-proxied: "true"
        hosts:
          - host: &host "${APP_DNS_ALERTMANAGER}"
            paths: &paths
              - path: /
                pathType: Prefix
                service:
                  name: local
                  port: http
        tls:
          - hosts: [*host]
      tailscale:
        enabled: true
        primary: false
        className: "tailscale"
        hosts:
          # Distinct anchor name: reusing `&host` here silently redefined the one above.
          - host: &hostTailscale "${APP_DNS_ALERTMANAGER_TS}"
            paths: *paths
        tls:
          - hosts: [*hostTailscale]
    persistence:
      data:
        existingClaim: alertmanager-data
        # Only the `main` controller uses this PVC; `local` uses its volumeClaimTemplates.
        advancedMounts:
          main:
            main:
              - subPath: data
                path: /alertmanager
      config:
        enabled: true
        type: configMap
        name: alertmanager-config
        globalMounts:
          - path: /etc/alertmanager
      secrets:
        type: secret
        name: alertmanager-secrets
        globalMounts:
          - path: /secrets
    defaultPodOptions:
      automountServiceAccountToken: false
      enableServiceLinks: false
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        runAsGroup: 1000
        fsGroup: 1000
        fsGroupChangePolicy: "Always"
        seccompProfile: { type: "RuntimeDefault" }
      topologySpreadConstraints:
        # Spread the `local` replicas across nodes.
        - maxSkew: 1
          topologyKey: "kubernetes.io/hostname"
          whenUnsatisfiable: "DoNotSchedule"
          labelSelector:
            matchLabels:
              app.kubernetes.io/name: *app
              app.kubernetes.io/component: local
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              # Nodes carrying this label key are excluded from scheduling.
              - matchExpressions:
                  - key: "fuckoff.home.arpa/alertmanager"
                    operator: "DoesNotExist"
    serviceMonitor:
      main:
        enabled: true
        # A ServiceMonitor selector is a LabelSelector, so the label map has to sit
        # under `matchLabels`.
        # NOTE(review): assumes the chart renders this value verbatim - confirm against app-template 2.6.0.
        selector:
          matchLabels:
            app.kubernetes.io/name: alertmanager
        endpoints:
          - port: http
            scheme: http
            enableHttp2: true
            path: "/metrics"
            interval: 1m
            scrapeTimeout: 30s
41 changes: 41 additions & 0 deletions kube/deploy/core/monitoring/alertmanager/ks.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
---
# Flux Kustomization for the Alertmanager app manifests; waits for the PVC below.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: 1-core-monitoring-alertmanager-app
  namespace: flux-system
  labels: &l
    app.kubernetes.io/name: "alertmanager"
spec:
  targetNamespace: "monitoring"
  path: ./kube/deploy/core/monitoring/alertmanager/app
  commonMetadata:
    labels: *l
  dependsOn:
    - name: 1-core-monitoring-alertmanager-pvc
---
# Flux Kustomization that instantiates the shared VolSync template to create
# the `alertmanager-data` PVC.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: 1-core-monitoring-alertmanager-pvc
  namespace: flux-system
  labels: &l
    app.kubernetes.io/name: "alertmanager"
spec:
  targetNamespace: "monitoring"
  path: ./kube/deploy/core/storage/volsync/template
  commonMetadata:
    labels: *l
  dependsOn:
    - name: 1-core-storage-volsync-app
    - name: 1-core-storage-rook-ceph-cluster
  postBuild:
    # Variables substituted into the VolSync template.
    substitute:
      PVC: "alertmanager-data"
      SIZE: "2Gi"
      SC: &sc "file"
      # Snapshot class reuses the storage class value.
      SNAP: *sc
      ACCESSMODE: "ReadWriteMany"
      # NOTE(review): the HelmRelease runs pods as 1000:1000 with fsGroup 1000, while
      # RGID/RFSG here are 2000 - confirm the group mismatch is intentional.
      RUID: "1000"
      RGID: "2000"
      RFSG: "2000"
5 changes: 5 additions & 0 deletions kube/deploy/core/monitoring/alertmanager/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
# Entry point for the alertmanager directory: only registers the Flux
# Kustomizations defined in ks.yaml.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ks.yaml
2 changes: 1 addition & 1 deletion kube/deploy/core/monitoring/karma/app/hr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ spec:
env:
TZ: "${CONFIG_TZ}"
# TODO: use full config mapping to all AM instances and 1 cluster, including different deployments for authentik, cloudflared and Tailscale's forward auth headers
# Previously "http://kps-alertmanager.monitoring.svc:9093". Now targets the app-template
# `local` Service; its http port is 9093 (no Alertmanager Service exposes 8080).
ALERTMANAGER_URI: "http://alertmanager-local.monitoring.svc:9093"
ALERTMANAGER_EXTERNAL_URI: "https://${APP_DNS_ALERTMANAGER}"
ALERTMANAGER_PROXY: "true"
securityContext: &sc
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
alertmanager:
# Changed from `true` by this commit; the duplicate `enabled` key is collapsed to the new value.
enabled: false
ingress:
enabled: true
pathType: "Prefix"
Expand Down
Loading

0 comments on commit 4a14ed3

Please sign in to comment.