From ba05dbd34c8f04e8b5074dc39d2127242f794121 Mon Sep 17 00:00:00 2001
From: JJGadgets
Date: Tue, 23 Apr 2024 03:06:11 +0800
Subject: [PATCH] feat(alertmanager): move to app-template, add VolSync jank

---
 .../biohazard/flux/kustomization.yaml         |   1 +
 .../alertmanager/app/config/alertmanager.yaml |  65 ++++++
 .../app/config/kustomization.yaml             |   9 +
 .../core/monitoring/alertmanager/app/es.yaml  |  19 ++
 .../core/monitoring/alertmanager/app/hr.yaml  | 220 ++++++++++++++++++
 .../core/monitoring/alertmanager/ks.yaml      |  41 ++++
 .../alertmanager/kustomization.yaml           |   5 +
 kube/deploy/core/monitoring/karma/app/hr.yaml |   2 +-
 .../kps/app/helm-values/alertmanager.yaml     |   2 +-
 .../monitoring/kps/app/helm-values/prom.yaml  |  10 +
 kube/deploy/core/monitoring/kps/app/hr.yaml   |   4 +-
 11 files changed, 375 insertions(+), 3 deletions(-)
 create mode 100644 kube/deploy/core/monitoring/alertmanager/app/config/alertmanager.yaml
 create mode 100644 kube/deploy/core/monitoring/alertmanager/app/config/kustomization.yaml
 create mode 100644 kube/deploy/core/monitoring/alertmanager/app/es.yaml
 create mode 100644 kube/deploy/core/monitoring/alertmanager/app/hr.yaml
 create mode 100644 kube/deploy/core/monitoring/alertmanager/ks.yaml
 create mode 100644 kube/deploy/core/monitoring/alertmanager/kustomization.yaml

diff --git a/kube/clusters/biohazard/flux/kustomization.yaml b/kube/clusters/biohazard/flux/kustomization.yaml
index 4869559907..56faea0c7f 100644
--- a/kube/clusters/biohazard/flux/kustomization.yaml
+++ b/kube/clusters/biohazard/flux/kustomization.yaml
@@ -46,6 +46,7 @@ resources:
   - ../../../deploy/core/monitoring/metrics-server/
   - ../../../deploy/core/monitoring/kps/
   - ../../../deploy/core/monitoring/grafana/
+  - ../../../deploy/core/monitoring/alertmanager/
   - ../../../deploy/core/monitoring/karma/
   - ../../../deploy/core/monitoring/node-exporter/
   - ../../../deploy/core/monitoring/smartctl-exporter/
diff --git a/kube/deploy/core/monitoring/alertmanager/app/config/alertmanager.yaml b/kube/deploy/core/monitoring/alertmanager/app/config/alertmanager.yaml
new file mode 100644
index 0000000000..fffd8884ed
--- /dev/null
+++ b/kube/deploy/core/monitoring/alertmanager/app/config/alertmanager.yaml
@@ -0,0 +1,65 @@
+---
+global:
+  resolve_timeout: 5m
+route:
+  group_by: ["alertname", "job"]
+  group_interval: 10m
+  group_wait: 1m
+  receiver: discord
+  repeat_interval: 12h
+  routes:
+    # - receiver: heartbeat
+    #   group_interval: 5m
+    #   group_wait: 0s
+    #   matchers:
+    #     - alertname =~ "Watchdog"
+    #   repeat_interval: 5m
+    - receiver: "null"
+      matchers:
+        - alertname =~ "InfoInhibitor"
+    - receiver: discord
+      continue: true
+      matchers:
+        # gotta try it out to know what's what
+        - severity =~ "critical|warning"
+inhibit_rules:
+  - equal: ["alertname", "namespace"]
+    source_matchers:
+      - severity = "critical"
+    target_matchers:
+      - severity = "warning"
+receivers:
+  # - name: heartbeat
+  #   webhook_configs:
+  #     - send_resolved: true
+  #       url: "${CLOUD_GATUS_HEARTBEAT_URL}"
+  - name: "null"
+  - name: discord
+    discord_configs:
+      - send_resolved: true
+        webhook_url_file: "/secrets/discord"
+        # yoinked below from onedr0p, blame him if something doesn't work
+        title: >-
+          {{ .CommonLabels.alertname }}
+          [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
+        message: |-
+          {{- range .Alerts }}
+            {{- if ne .Annotations.description "" }}
+              {{ .Annotations.description }}
+            {{- else if ne .Annotations.summary "" }}
+              {{ .Annotations.summary }}
+            {{- else if ne .Annotations.message "" }}
+              {{ .Annotations.message }}
+            {{- else }}
+              Alert description not available
+            {{- end }}
+            {{- if gt (len .Labels.SortedPairs) 0 }}
+
+              {{- range .Labels.SortedPairs }}
+                {{ .Name }}: {{ .Value }}
+              {{- end }}
+
+            {{- end }}
+            {{ .Fingerprint }}
+            {{ .GeneratorURL }}
+          {{- end }}
diff --git a/kube/deploy/core/monitoring/alertmanager/app/config/kustomization.yaml b/kube/deploy/core/monitoring/alertmanager/app/config/kustomization.yaml
new file mode 100644
index 0000000000..185caa39fc
--- /dev/null
+++ b/kube/deploy/core/monitoring/alertmanager/app/config/kustomization.yaml
@@ -0,0 +1,9 @@
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+configMapGenerator:
+  - name: alertmanager-config
+    files:
+      - ./alertmanager.yaml
+generatorOptions:
+  disableNameSuffixHash: true
diff --git a/kube/deploy/core/monitoring/alertmanager/app/es.yaml b/kube/deploy/core/monitoring/alertmanager/app/es.yaml
new file mode 100644
index 0000000000..b1f50a627d
--- /dev/null
+++ b/kube/deploy/core/monitoring/alertmanager/app/es.yaml
@@ -0,0 +1,19 @@
+---
+# yaml-language-server: $schema=https://crds.jank.ing/external-secrets.io/externalsecret_v1beta1.json
+apiVersion: external-secrets.io/v1beta1
+kind: ExternalSecret
+metadata:
+  name: &name alertmanager-secrets
+  namespace: alertmanager
+spec:
+  refreshInterval: 1m
+  secretStoreRef:
+    kind: ClusterSecretStore
+    name: 1p
+  dataFrom:
+    - extract:
+        key: "AlertManager - ${CLUSTER_NAME}"
+  target:
+    creationPolicy: Owner
+    deletionPolicy: Retain
+    name: *name
diff --git a/kube/deploy/core/monitoring/alertmanager/app/hr.yaml b/kube/deploy/core/monitoring/alertmanager/app/hr.yaml
new file mode 100644
index 0000000000..fdb99b4068
--- /dev/null
+++ b/kube/deploy/core/monitoring/alertmanager/app/hr.yaml
@@ -0,0 +1,220 @@
+---
+apiVersion: helm.toolkit.fluxcd.io/v2beta2
+kind: HelmRelease
+metadata:
+  name: &app alertmanager
+spec:
+  interval: 5m
+  chart:
+    spec:
+      chart: app-template
+      version: "2.6.0"
+      sourceRef:
+        name: bjw-s
+        kind: HelmRepository
+        namespace: flux-system
+  values:
+    controllers:
+      main:
+        type: deployment
+        replicas: 1
+        pod:
+          labels:
+            ingress.home.arpa/nginx-internal: "allow"
+            egress.home.arpa/discord: "allow"
+        containers: &ct
+          main:
+            image:
+              repository: "quay.io/prometheus/alertmanager"
+              tag: "v0.27.0@sha256:e13b6ed5cb929eeaee733479dce55e10eb3bc2e9c4586c705a4e8da41e5eacf5"
+            args:
+              - --config.file=/etc/alertmanager/alertmanager.yaml
+              - --storage.path=/alertmanager
+              - --data.retention=336h
+              - "--web.external-url=https://${APP_DNS_ALERTMANAGER}/"
+              - --web.route-prefix=/
+              - --web.listen-address=:9093
+              - --cluster.listen-address=[$(POD_IP)]:9094
+              - "--cluster.label=${CLUSTER_NAME}"
+              - --cluster.peer=alertmanager.monitoring.svc.cluster.local:9094
+              - --cluster.peer=alertmanager-local-0.monitoring.svc.cluster.local:9094
+              - --cluster.peer=alertmanager-local-1.monitoring.svc.cluster.local:9094
+              - --cluster.peer=alertmanager-local-2.monitoring.svc.cluster.local:9094
+              - --cluster.reconnect-timeout=5m
+            env:
+              TZ: "${CONFIG_TZ}"
+              POD_IP:
+                valueFrom:
+                  fieldRef:
+                    apiVersion: v1
+                    fieldPath: status.podIP
+            ports:
+              - name: http
+                containerPort: 9093
+            securityContext: &sc
+              readOnlyRootFilesystem: true
+              allowPrivilegeEscalation: false
+              capabilities:
+                drop: ["ALL"]
+            resources:
+              requests:
+                cpu: "10m"
+                memory: "128Mi"
+              limits:
+                cpu: "3000m"
+                memory: "1Gi"
+            probes:
+              readiness:
+                enabled: true
+                custom: true
+                spec: &ready
+                  httpGet:
+                    path: "/-/ready"
+                    port: http
+                    scheme: HTTP
+                  initialDelaySeconds: 3
+                  periodSeconds: 5
+                  timeoutSeconds: 3
+                  successThreshold: 1
+                  failureThreshold: 10
+              startup:
+                enabled: true
+                custom: true
+                spec:
+                  <<: *ready
+                  periodSeconds: 1
+                  failureThreshold: 60
+              liveness:
+                enabled: true
+                custom: true
+                spec:
+                  httpGet:
+                    path: "/-/healthy"
+                    port: http
+                    scheme: HTTP
+                  periodSeconds: 10
+                  timeoutSeconds: 3
+                  successThreshold: 1
+                  failureThreshold: 10
+      local:
+        type: statefulset
+        replicas: 3
+        pod:
+          labels:
+            ingress.home.arpa/nginx-internal: allow
+            egress.home.arpa/discord: allow
+        containers: *ct
+        statefulset:
+          volumeClaimTemplates:
+            - name: data
+              size: 2Gi
+              storageClass: local
+              accessMode: ReadWriteOnce
+              advancedMounts:
+                main:
+                  - path: /alertmanager
+    service:
+      main:
+        controller: main
+        ports: &ports
+          http:
+            port: 9093
+          cluster:
+            port: 9094
+      local: &svc
+        controller: local
+        ports: *ports
+      local-0:
+        <<: *svc
+        extraSelectorLabels:
+          apps.kubernetes.io/pod-index: "0"
+      local-1:
+        <<: *svc
+        extraSelectorLabels:
+          apps.kubernetes.io/pod-index: "1"
+      local-2:
+        <<: *svc
+        extraSelectorLabels:
+          apps.kubernetes.io/pod-index: "2"
+    ingress:
+      main:
+        enabled: true
+        primary: true
+        className: "nginx-internal"
+        annotations:
+          external-dns.alpha.kubernetes.io/target: "${DNS_CF}"
+          external-dns.alpha.kubernetes.io/cloudflare-proxied: "true"
+        hosts:
+          - host: &host "${APP_DNS_ALERTMANAGER}"
+            paths: &paths
+              - path: /
+                pathType: Prefix
+                service:
+                  name: local
+                  port: http
+        tls:
+          - hosts: [*host]
+      tailscale:
+        enabled: true
+        primary: false
+        className: "tailscale"
+        hosts:
+          - host: &host "${APP_DNS_ALERTMANAGER_TS}"
+            paths: *paths
+        tls:
+          - hosts: [*host]
+    persistence:
+      data:
+        existingClaim: alertmanager-data
+        advancedMounts:
+          main:
+            main:
+              - subPath: data
+                path: /alertmanager
+      config:
+        type: configMap
+        name: alertmanager-config
+        globalMounts:
+          - path: /etc/alertmanager
+      secrets:
+        type: secret
+        name: alertmanager-secrets
+        defaultMode: 0400
+        globalMounts:
+          - path: /secrets
+    defaultPodOptions:
+      automountServiceAccountToken: false
+      enableServiceLinks: false
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1000
+        runAsGroup: 2000
+        fsGroup: 2000
+        fsGroupChangePolicy: "Always"
+        seccompProfile: { type: "RuntimeDefault" }
+      topologySpreadConstraints:
+        - maxSkew: 1
+          topologyKey: "kubernetes.io/hostname"
+          whenUnsatisfiable: "DoNotSchedule"
+          labelSelector:
+            matchLabels:
+              app.kubernetes.io/name: *app
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: "fuckoff.home.arpa/alertmanager"
+                    operator: "DoesNotExist"
+    serviceMonitor:
+      main:
+        enabled: true
+        selector:
+          app.kubernetes.io/name: alertmanager
+        endpoints:
+          - port: http
+            scheme: http
+            enableHttp2: true
+            path: "/metrics"
+            interval: 1m
+            scrapeTimeout: 30s
diff --git a/kube/deploy/core/monitoring/alertmanager/ks.yaml b/kube/deploy/core/monitoring/alertmanager/ks.yaml
new file mode 100644
index 0000000000..6f5640b99b
--- /dev/null
+++ b/kube/deploy/core/monitoring/alertmanager/ks.yaml
@@ -0,0 +1,41 @@
+---
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  name: 1-core-monitoring-alertmanager-app
+  namespace: flux-system
+  labels: &l
+    app.kubernetes.io/name: "alertmanager"
+spec:
+  commonMetadata:
+    labels: *l
+  path: ./kube/deploy/core/monitoring/alertmanager/app
+  targetNamespace: "monitoring"
+  dependsOn:
+    - name: 1-core-monitoring-alertmanager-pvc
+---
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  name: 1-core-monitoring-alertmanager-pvc
+  namespace: flux-system
+  labels: &l
+    app.kubernetes.io/name: "alertmanager"
+spec:
+  commonMetadata:
+    labels: *l
+  path: ./kube/deploy/core/storage/volsync/template
+  targetNamespace: "monitoring"
+  dependsOn:
+    - name: 1-core-storage-volsync-app
+    - name: 1-core-storage-rook-ceph-cluster
+  postBuild:
+    substitute:
+      PVC: "alertmanager-data"
+      SIZE: "2Gi"
+      SC: &sc "file"
+      SNAP: *sc
+      ACCESSMODE: "ReadWriteMany"
+      RUID: "1000"
+      RGID: "2000"
+      RFSG: "2000"
diff --git a/kube/deploy/core/monitoring/alertmanager/kustomization.yaml b/kube/deploy/core/monitoring/alertmanager/kustomization.yaml
new file mode 100644
index 0000000000..70a7702900
--- /dev/null
+++ b/kube/deploy/core/monitoring/alertmanager/kustomization.yaml
@@ -0,0 +1,5 @@
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - ks.yaml
diff --git a/kube/deploy/core/monitoring/karma/app/hr.yaml b/kube/deploy/core/monitoring/karma/app/hr.yaml
index 5a8e46340a..88130d99a7 100644
--- a/kube/deploy/core/monitoring/karma/app/hr.yaml
+++ b/kube/deploy/core/monitoring/karma/app/hr.yaml
@@ -31,7 +31,7 @@ spec:
             env:
               TZ: "${CONFIG_TZ}"
               # TODO: use full config mapping to all AM instances and 1 cluster, including different deployments for authentik, cloudflared and Tailscale's forward auth headers
-              ALERTMANAGER_URI: "http://kps-alertmanager.monitoring.svc:9093"
+              ALERTMANAGER_URI: "http://alertmanager-local.monitoring.svc:9093"
               ALERTMANAGER_EXTERNAL_URI: "https://${APP_DNS_ALERTMANAGER}"
               ALERTMANAGER_PROXY: "true"
             securityContext: &sc
diff --git a/kube/deploy/core/monitoring/kps/app/helm-values/alertmanager.yaml b/kube/deploy/core/monitoring/kps/app/helm-values/alertmanager.yaml
index c8fe02b269..cd26c5dd16 100644
--- a/kube/deploy/core/monitoring/kps/app/helm-values/alertmanager.yaml
+++ b/kube/deploy/core/monitoring/kps/app/helm-values/alertmanager.yaml
@@ -1,6 +1,6 @@
 ---
 alertmanager:
-  enabled: true
+  enabled: false
   ingress:
     enabled: true
     pathType: "Prefix"
diff --git a/kube/deploy/core/monitoring/kps/app/helm-values/prom.yaml b/kube/deploy/core/monitoring/kps/app/helm-values/prom.yaml
index 10913dc913..68a93b1a07 100644
--- a/kube/deploy/core/monitoring/kps/app/helm-values/prom.yaml
+++ b/kube/deploy/core/monitoring/kps/app/helm-values/prom.yaml
@@ -59,3 +59,13 @@ prometheus:
       registry: quay.io
       repository: prometheus/prometheus
       tag: v2.51.0-rc.0-dedupelabels
+    # external AlertManager (non-KPS)
+    alertingEndpoints:
+      - apiVersion: v2
+        name: alertmanager
+        namespace: monitoring
+        port: 9093
+      - apiVersion: v2
+        name: alertmanager-local
+        namespace: monitoring
+        port: 9093
diff --git a/kube/deploy/core/monitoring/kps/app/hr.yaml b/kube/deploy/core/monitoring/kps/app/hr.yaml
index c34f8c3e57..74c36feabf 100644
--- a/kube/deploy/core/monitoring/kps/app/hr.yaml
+++ b/kube/deploy/core/monitoring/kps/app/hr.yaml
@@ -32,6 +32,8 @@ spec:
       enabled: false
     fullnameOverride: "kps"
    cleanPrometheusOperatorObjectNames: true
+    alertmanager:
+      enabled: false
     nodeExporter:
       enabled: false
     grafana:
@@ -70,4 +72,4 @@ spec:
           name: not-used
           namespace: not-used
           annotations:
-            grafana_folder: Kubernetes
\ No newline at end of file
+            grafana_folder: Kubernetes