diff --git a/.gitguardian.yaml b/.gitguardian.yaml new file mode 100644 index 0000000..3e334db --- /dev/null +++ b/.gitguardian.yaml @@ -0,0 +1,4 @@ +version: 2 +secret: + ignored-paths: + - '2024-03-alerts-observability/*' diff --git a/2024-03-alerts-observability/README.md b/2024-03-alerts-observability/README.md new file mode 100644 index 0000000..560ccce --- /dev/null +++ b/2024-03-alerts-observability/README.md @@ -0,0 +1,33 @@ +# DIY alerts observability + +This demo uses `vector.dev` to collect data from different sources and write it to the datastore. +We use one `http_server` Vector source to receive Alertmanager webhook notifications, +two `http_client` sources to query Alertmanager's alerts and silences API endpoints, and +two `sinks` to write all the state logs into the ClickHouse `alerts` and `silences` tables. + +The docker-compose will bring up several containers: + +* `Cadvisor` is used to generate system metrics for monitoring. +* `Prometheus` is used to monitor and generate alerts. +* `Alertmanager` is used to route alerts and provide the alert events via webhook and API. +* `alertmanager_silence` is used to create an Alertmanager silence. +* `blackbox_exporter` is used to monitor the sites and generate alerts. +* `ClickHouse` is the datastore into which the Alertmanager alert events are written for alerts observability. +* `Vector.dev` is used to collect data from the Alertmanager webhook and the alerts and silences APIs, transform the data and write it into ClickHouse. +* `Grafana` is used to visualize the logs. + +## Prerequisite: +`docker` + +## Getting started: + +* Set up a non-empty password for ClickHouse and bring up the containers using docker compose: +```console +foo@bar:~$ export CLICKHOUSE_PASSWORD="<your-password>" +foo@bar:~$ docker compose up +``` + +Please wait about 5 minutes for the alerts to be triggered, then +visit http://localhost:3000/ to explore the `Alerts and silences overview` dashboard and play around. 
+ +![alerts and silences overview](images/alerts-silences-overview.png "alerts and silences overview") diff --git a/2024-03-alerts-observability/alertmanager/amtool b/2024-03-alerts-observability/alertmanager/amtool new file mode 100755 index 0000000..563a75a Binary files /dev/null and b/2024-03-alerts-observability/alertmanager/amtool differ diff --git a/2024-03-alerts-observability/alertmanager/config.yml b/2024-03-alerts-observability/alertmanager/config.yml new file mode 100644 index 0000000..bb8e471 --- /dev/null +++ b/2024-03-alerts-observability/alertmanager/config.yml @@ -0,0 +1,10 @@ +route: + receiver: 'webhook' + +receivers: + - name: 'webhook' + webhook_configs: + - url: http://vector:8888 + +templates: + - template/*.tmpl diff --git a/2024-03-alerts-observability/alertmanager/silence_entrypoint.sh b/2024-03-alerts-observability/alertmanager/silence_entrypoint.sh new file mode 100755 index 0000000..9010038 --- /dev/null +++ b/2024-03-alerts-observability/alertmanager/silence_entrypoint.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +echo "{ + \"matchers\": [ + { + \"name\": \"alertname\", + \"value\": \"service_down\", + \"isRegex\": false + } + ], + \"startsAt\": \"$(date -Ins | sed 's/+00:00/Z/' | sed 's/,/./')\", + \"endsAt\": \"$(TZ='UTC-1:00' date -Ins | sed 's/+01:00/Z/' | sed 's/,/./')\", + \"createdBy\": \"demouser\", + \"comment\": \"Silence\" +}" > post-data +sleep 90 +# Create the silence through API v2: API v1 is deprecated and removed in Alertmanager v0.27.0; v2 accepts the same payload. +wget 'http://alertmanager:9093/api/v2/silences' --header='Content-Type: application/json' --post-file=post-data diff --git a/2024-03-alerts-observability/blackboxexporter/config.yml b/2024-03-alerts-observability/blackboxexporter/config.yml new file mode 100644 index 0000000..7c79a77 --- /dev/null +++ b/2024-03-alerts-observability/blackboxexporter/config.yml @@ -0,0 +1,15 @@ +modules: + http_2xx: + prober: http + timeout: 5s + http: + valid_http_versions: ["HTTP/1.1", "HTTP/2"] + valid_status_codes: [] + method: GET + no_follow_redirects: false + fail_if_ssl: false + fail_if_not_ssl: false + 
tls_config: + insecure_skip_verify: false + preferred_ip_protocol: "ip4" + diff --git a/2024-03-alerts-observability/clickhouse/init-defaults.sh b/2024-03-alerts-observability/clickhouse/init-defaults.sh new file mode 100755 index 0000000..b1645f7 --- /dev/null +++ b/2024-03-alerts-observability/clickhouse/init-defaults.sh @@ -0,0 +1,89 @@ +#!/bin/sh + +CLICKHOUSE_DB="${CLICKHOUSE_DB:-database}"; +CLICKHOUSE_USER="${CLICKHOUSE_USER:-user}"; +CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-password}"; +# Append the demo user's definition (profile, allowed networks, password, quota) to the server's users.d configuration. +cat <<EOT >> /etc/clickhouse-server/users.d/user.xml +<clickhouse> + <!-- Demo user definition; see the ClickHouse "User Settings" documentation for the available elements. --> + <users> + <${CLICKHOUSE_USER}> + <profile>default</profile> + <networks> + <ip>::/0</ip> + </networks> + <password>${CLICKHOUSE_PASSWORD}</password> + <quota>default</quota> + </${CLICKHOUSE_USER}> + </users> +</clickhouse> +EOT +#cat /etc/clickhouse-server/users.d/user.xml; + +clickhouse-client --query "CREATE DATABASE IF NOT EXISTS ${CLICKHOUSE_DB}"; + +echo -n ' +SET input_format_import_nested_json = 1; +' | clickhouse-client + +echo -n ' +SET output_format_json_array_of_rows = 1; +' | clickhouse-client + +echo -n "SET date_time_input_format='best_effort';" | clickhouse-client + +echo -n ' +CREATE TABLE r0.alerts +( + `date` Date DEFAULT toDate(now()), + `datetime` DateTime DEFAULT now(), + `timestamp` DateTime64(3) DEFAULT now() CODEC(Delta(4), ZSTD(1)), + `startsAt` DateTime64(3), + `endsAt` DateTime64(3), + `updatedAt` DateTime64(3), + `status.inhibitedBy` Array(String), + `status.silencedBy` String, + `status.state` LowCardinality(String), + `annotations.summary` String, + `annotations.dashboard` String, + `annotations.link` String, + `fingerprint` String, + `receivers` Array(String), + `labelsmap` Map(String, String), + `labels.alertname` String, + `labels.component` String, + `labels.service` String, + `labels.instance` String, + `labels.job` String, + `labels.metal` String, + `labels.notify` String, + `labels.priority` String, + `labels.prometheus` String, + `labels.region` String, + `labels.severity` String +) +ENGINE = MergeTree +PARTITION BY toStartOfHour(datetime) +ORDER BY labels.alertname +SETTINGS index_granularity = 8192;' | 
clickhouse-client + +echo -n ' +CREATE TABLE r0.silences +( + `date` Date DEFAULT toDate(now()), + `datetime` DateTime DEFAULT now(), + `id` String, + `status.state` LowCardinality(String), + `updatedAt` DateTime64(3), + `startsAt` DateTime64(3), + `createdBy` LowCardinality(String), + `endsAt` DateTime64(3), + `matchers` Map(String, String), + `comment` String +) +ENGINE = ReplacingMergeTree +PARTITION BY toStartOfHour(datetime) +ORDER BY (id, startsAt, endsAt) +SETTINGS index_granularity = 8192; +' | clickhouse-client diff --git a/2024-03-alerts-observability/docker-compose.yml b/2024-03-alerts-observability/docker-compose.yml new file mode 100644 index 0000000..a240292 --- /dev/null +++ b/2024-03-alerts-observability/docker-compose.yml @@ -0,0 +1,164 @@ +version: '3.1' + +volumes: + prometheus_data: {} + grafana_data: {} + +services: + + cadvisor: + image: gcr.io/cadvisor/cadvisor + volumes: + - /:/rootfs:ro + - /var/run:/var/run:rw + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + ports: + - 8080:8080 + restart: always + deploy: + mode: global + + prometheus: + image: prom/prometheus:v2.49.1 + volumes: + - ./prometheus/:/etc/prometheus/ + - prometheus_data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + - '--web.enable-lifecycle' + ports: + - 9090:9090 + links: + - cadvisor:cadvisor + - alertmanager:alertmanager + depends_on: + - cadvisor + restart: always + deploy: + mode: global + + alertmanager: + image: prom/alertmanager:v0.26.0 + ports: + - 9093:9093 + volumes: + - ./alertmanager/:/etc/alertmanager/ + - ./alertmanager/amtool:/bin/amtool + restart: always + command: + - '--config.file=/etc/alertmanager/config.yml' + deploy: + mode: global + + alertmanager_silence: + image: prom/alertmanager:v0.26.0 + volumes: + - ./alertmanager/:/etc/alertmanager/ + - 
./alertmanager/amtool:/bin/amtool + - ./alertmanager/silence_entrypoint.sh:/silence_entrypoint.sh + depends_on: + alertmanager: + condition: service_started + restart: 'no' + entrypoint: '/silence_entrypoint.sh' + + blackbox_exporter: + image: prom/blackbox-exporter:v0.24.0 + ports: + - 9115:9115 + volumes: + - ./blackboxexporter/:/etc/blackboxexporter/ + command: + - '--config.file=/etc/blackboxexporter/config.yml' + restart: always + + clickhouse: + image: clickhouse/clickhouse-server:23.3.19.32 + ports: + - "8123:8123" + - "9008:9008" + - "9009:9009" + environment: + # Default user and database will be created using `init-defaults.sh` script + CLICKHOUSE_DB: r0 + CLICKHOUSE_USER: demouser + CLICKHOUSE_PASSWORD: ${CLICKHOUSE_PASSWORD} + hostname: clickhouse-0.localhost + ulimits: + nproc: 65535 + nofile: + soft: 262144 + hard: 262144 + volumes: + - ./clickhouse/init-defaults.sh:/docker-entrypoint-initdb.d/init-defaults.sh:ro + healthcheck: + test: + [ + "CMD-SHELL", + "http_proxy='' wget -nv -t1 --spider 'http://localhost:8123/' || exit 1", + ] + interval: 10s + timeout: 10s + retries: 30 + + vector: + image: timberio/vector:0.27.0-debian + container_name: vector + ports: + - 8888:8888 + environment: + CLICKHOUSE_PASSWORD: ${CLICKHOUSE_PASSWORD} + volumes: + - ./vector/vector.toml:/etc/vector/vector.toml:ro + depends_on: + clickhouse: + condition: service_healthy + + grafana: + build: ./grafana/ + environment: + - GF_PATHS_PROVISIONING=/etc/grafana/provisioning + - GF_AUTH_ANONYMOUS_ENABLED=true + - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin + - GF_PLUGINS_ALLOW_LOADING_UNSIGNED_PLUGINS=grafana-clickhouse-datasource + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + entrypoint: + - sh + - -euc + - | + mkdir -p /etc/grafana/provisioning/datasources + cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml + apiVersion: 1 + datasources: + - name: 'ClickHouse' + type: 'grafana-clickhouse-datasource' + isDefault: true + jsonData: + defaultDatabase: r0 + port: 
9000 + server: clickhouse-0.localhost + username: demouser + tlsSkipVerify: false + secureJsonData: + password: ${CLICKHOUSE_PASSWORD} + editable: true + EOF + mkdir -p /etc/grafana/provisioning/dashboards + cat <<EOF > /etc/grafana/provisioning/dashboards/dashboard.yaml + apiVersion: 1 + providers: + - name: demo + type: file + updateIntervalSeconds: 30 + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: true + EOF + /run.sh + ports: + - "3000:3000" diff --git a/2024-03-alerts-observability/grafana/Dockerfile b/2024-03-alerts-observability/grafana/Dockerfile new file mode 100644 index 0000000..24cf624 --- /dev/null +++ b/2024-03-alerts-observability/grafana/Dockerfile @@ -0,0 +1,10 @@ +FROM grafana/grafana:10.2.3-ubuntu + +WORKDIR /tmp +USER root + +RUN grafana cli plugins install grafana-clickhouse-datasource + +RUN mkdir -p /var/lib/grafana/dashboards + +COPY dashboards/*.json /var/lib/grafana/dashboards/ diff --git a/2024-03-alerts-observability/grafana/dashboards/alerts-silences-overview.json b/2024-03-alerts-observability/grafana/dashboards/alerts-silences-overview.json new file mode 100644 index 0000000..b62e358 --- /dev/null +++ b/2024-03-alerts-observability/grafana/dashboards/alerts-silences-overview.json @@ -0,0 +1,1535 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 14, + "panels": [], + "title": "alerts", + "type": "row" + }, + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + 
}, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "builderOptions": { + "fields": [], + "filters": [], + "limit": 100, + "mode": "list", + "orderBy": [], + "table": "alerts" + }, + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "expand": false, + "format": 1, + "meta": { + "builderOptions": { + "fields": [], + "filters": [], + "limit": 100, + "mode": "list", + "orderBy": [], + "table": "alerts" + } + }, + "queryType": "sql", + "rawSql": "SELECT\n status,\n count() as count\nFROM\n(\n SELECT\n status.state as status,\n fingerprint\n FROM r0.alerts\n WHERE\n date >= toDate(1686684884)\n AND status != ''\n GROUP BY\n status,\n fingerprint\n)\nGROUP BY status", + "refId": "A" + } + ], + "title": "alert status pie", + "type": "piechart" + }, + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "count", + "site_down" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + 
"viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 12, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.0.5", + "targets": [ + { + "builderOptions": { + "fields": [], + "filters": [], + "limit": 100, + "mode": "list", + "orderBy": [], + "table": "alerts" + }, + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "format": 1, + "meta": { + "builderOptions": { + "fields": [], + "filters": [], + "limit": 100, + "mode": "list", + "orderBy": [], + "table": "alerts" + } + }, + "queryType": "sql", + "rawSql": "SELECT labels.alertname,\n count() as count\n FROM (\n SELECT labels.alertname,\n fingerprint\n FROM r0.alerts\n WHERE date >= toDate(1686684884)\n AND status.state = 'firing'\n AND annotations.dashboard = ''\n GROUP BY labels.alertname,\n fingerprint\n )\n GROUP BY \n labels.alertname\n ORDER BY count DESC", + "refId": "A" + } + ], + "title": "Firing alerts by instance", + "type": "piechart" + }, + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 2, + "x": 8, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": true + }, + "textMode": "auto", + "wideLayout": true + }, + 
"pluginVersion": "10.2.3", + "targets": [ + { + "builderOptions": { + "fields": [], + "filters": [], + "limit": 100, + "mode": "list", + "orderBy": [], + "table": "alerts" + }, + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "expand": false, + "format": 1, + "meta": { + "builderOptions": { + "fields": [], + "filters": [], + "limit": 100, + "mode": "list", + "orderBy": [], + "table": "alerts" + } + }, + "queryType": "sql", + "rawSql": "SELECT\n count() as count\nFROM\n(\n SELECT\n labels.alertname,\n fingerprint\n FROM r0.alerts\n WHERE\n date >= toDate(1686684884)\n AND status.state = 'firing'\n GROUP BY\n labels.alertname,\n fingerprint\n)\nORDER BY count DESC", + "refId": "A" + } + ], + "title": "Total # of unique firing alerts", + "type": "stat" + }, + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 5, + "x": 10, + "y": 1 + }, + "id": 5, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "pieType": "donut", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.0.5", + "targets": [ + { + "builderOptions": { + "fields": [], + "filters": [], + "limit": 100, + "mode": "list", + "orderBy": [], + "table": "alerts" + }, + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "format": 1, + "meta": { + "builderOptions": { + "fields": [], + "filters": [], + "limit": 100, + "mode": "list", + "orderBy": [], + "table": "alerts" + } + }, + "queryType": "sql", + "rawSql": "SELECT\n 
labels.instance,\n count() as count\nFROM\n(\n SELECT\n labels.instance,\n status.state,\n fingerprint\n FROM r0.alerts\n WHERE\n date >= toDate(1686684884)\n AND status.state = 'firing'\n AND labels.instance != ''\n GROUP BY\n labels.instance,\n status.state,\n fingerprint\n)\nGROUP BY\n status.state,\n labels.instance\nORDER BY count DESC", + "refId": "A" + } + ], + "title": "Firing alerts by instance", + "type": "piechart" + }, + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "count" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "gauge" + } + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 5, + "x": 15, + "y": 1 + }, + "id": 11, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "builderOptions": { + "fields": [], + "filters": [], + "limit": 100, + "mode": "list", + "orderBy": [], + "table": "alerts" + }, + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "format": 1, + "meta": { + "builderOptions": { + "fields": [], + "filters": [], + "limit": 100, + "mode": "list", + "orderBy": [], + "table": "alerts" + } + }, + "queryType": "sql", + "rawSql": "SELECT labels.alertname,\n count() as count\n FROM (\n SELECT labels.alertname,\n fingerprint\n FROM r0.alerts\n WHERE date >= toDate(1686684884)\n AND status.state = 'firing'\n AND annotations.dashboard = 
''\n GROUP BY labels.alertname,\n fingerprint\n )\n GROUP BY \n labels.alertname\n ORDER BY count DESC", + "refId": "A" + } + ], + "title": "Firing alerts by instance", + "type": "table" + }, + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "count" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "gauge" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 6, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "builderOptions": { + "fields": [], + "filters": [], + "limit": 100, + "mode": "list", + "orderBy": [], + "table": "alerts" + }, + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "format": 1, + "meta": { + "builderOptions": { + "fields": [], + "filters": [], + "limit": 100, + "mode": "list", + "orderBy": [], + "table": "alerts" + } + }, + "queryType": "sql", + "rawSql": "SELECT\n labels.instance,\n count() as count\nFROM\n(\n SELECT\n labels.instance,\n status.state,\n fingerprint\n FROM r0.alerts\n WHERE\n date >= toDate(1686684884)\n AND status.state = 'firing'\n AND labels.instance != ''\n GROUP BY\n labels.instance,\n status.state,\n fingerprint\n)\nGROUP BY\n status.state,\n labels.instance\nORDER BY count DESC", + "refId": "A" + } + ], + "title": "Firing alerts by instance", + "type": "table" + }, + { + 
"datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 56, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 15, + "x": 0, + "y": 6 + }, + "id": 7, + "options": { + "barRadius": 0, + "barWidth": 0.97, + "colorByField": "count", + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": false + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "pluginVersion": "9.0.5", + "targets": [ + { + "builderOptions": { + "fields": [], + "filters": [], + "limit": 100, + "mode": "list", + "orderBy": [], + "table": "alerts" + }, + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "expand": false, + "format": 1, + "meta": { + "builderOptions": { + "fields": [], + "filters": [], + "limit": 100, + "mode": "list", + "orderBy": [], + "table": "alerts" + } + }, + "queryType": "sql", + "rawSql": "SELECT\n toStartOfFifteenMinutes(timestamp) as t,\n count() as count\nFROM r0.alerts\nWHERE\n date >= toDate(1686684884)\n AND status.state = 'firing'\nGROUP BY t\nORDER BY t", + "refId": "A" + } + ], + "title": "Total # of unique firing alerts", + "type": "barchart" + }, + { + "datasource": { + "type": 
"grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "count" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "gauge" + } + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 5, + "x": 15, + "y": 6 + }, + "id": 13, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "builderOptions": { + "fields": [], + "filters": [], + "limit": 100, + "mode": "list", + "orderBy": [], + "table": "alerts" + }, + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "format": 1, + "meta": { + "builderOptions": { + "fields": [], + "filters": [], + "limit": 100, + "mode": "list", + "orderBy": [], + "table": "alerts" + } + }, + "queryType": "sql", + "rawSql": "SELECT labels.alertname,\n count() as count\n FROM (\n SELECT labels.alertname,\n fingerprint\n FROM r0.alerts\n WHERE date >= toDate(1686684884)\n AND status.state = 'firing'\n AND annotations.link = ''\n GROUP BY labels.alertname,\n fingerprint\n )\n GROUP BY \n labels.alertname\n ORDER BY count DESC", + "refId": "A" + } + ], + "title": "Alertnames with missing runbook", + "type": "table" + }, + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "fillOpacity": 70, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": 
false + }, + "insertNulls": false, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [ + { + "options": { + "''": { + "color": "transparent", + "index": 0 + }, + "active": { + "color": "orange", + "index": 2 + }, + "firing": { + "color": "red", + "index": 1 + }, + "resolved": { + "color": "green", + "index": 4 + }, + "suppressed": { + "color": "yellow", + "index": 3 + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 10, + "x": 0, + "y": 11 + }, + "id": 10, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "editorType": "sql", + "format": 1, + "hide": false, + "meta": { + "builderOptions": { + "columns": [], + "database": "", + "filters": [], + "limit": 100, + "mode": "list", + "orderBy": [], + "queryType": "table", + "table": "alerts" + } + }, + "pluginVersion": "4.0.0-beta", + "queryType": "table", + "rawSql": "SELECT\n toStartOfFifteenMinutes(timestamp) as time,\n labels.instance,\n status.state,\n fingerprint\n FROM r0.alerts\n WHERE\n date >= toDate(1686684884)\n AND labels.instance != ''\n GROUP BY\n time,\n status.state,\n labels.instance,\n fingerprint", + "refId": "A" + } + ], + "title": "Instance status timeline", + "transformations": [ + { + "id": "groupingToMatrix", + "options": { + "columnField": "labels.instance", + "rowField": "time", + "valueField": "status.state" + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "time", + "targetField": "time\\labels.instance" + } + ], + "fields": {} + } + } + ], + "type": "state-timeline" + }, + 
{ + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "count" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "mode": "lcd", + "type": "gauge" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "labels.alertname" + }, + "properties": [ + { + "id": "custom.width", + "value": 131 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "id" + }, + "properties": [ + { + "id": "custom.width", + "value": 103 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "createdBy" + }, + "properties": [ + { + "id": "custom.width", + "value": 94 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "silences.startsAt" + }, + "properties": [ + { + "id": "custom.width", + "value": 126 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "silence_matchers" + }, + "properties": [ + { + "id": "custom.width", + "value": 466 + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 14, + "x": 10, + "y": 11 + }, + "id": 20, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "editorType": "sql", + "format": 1, + "meta": { + "builderOptions": { + "columns": [], + "database": "", + "limit": 1000, + "mode": "list", + "queryType": "table", + "table": "" + } + }, + "pluginVersion": "4.0.0-beta", + "queryType": 
"table", + "rawSql": "SELECT (labelsmap) as labelKeys\n FROM r0.alerts\n GROUP BY labelKeys", + "refId": "A" + } + ], + "title": "Alert label set", + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 15, + "panels": [], + "title": "silences", + "type": "row" + }, + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 2, + "x": 0, + "y": 18 + }, + "id": 16, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "editorType": "sql", + "format": 1, + "meta": { + "builderOptions": { + "columns": [], + "database": "", + "limit": 1000, + "mode": "list", + "queryType": "table", + "table": "" + } + }, + "pluginVersion": "4.0.0-beta", + "queryType": "table", + "rawSql": "SELECT\n uniq( id)\nFROM r0.silences\nWHERE status.state = 'active'", + "refId": "A" + } + ], + "title": "# of silences", + "type": "stat" + }, + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 2, + "y": 18 + }, + "id": 17, + "options": { + "displayLabels": [ + 
"percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "editorType": "sql", + "format": 1, + "meta": { + "builderOptions": { + "columns": [], + "database": "", + "limit": 1000, + "mode": "list", + "queryType": "table", + "table": "" + } + }, + "pluginVersion": "4.0.0-beta", + "queryType": "table", + "rawSql": "SELECT\n createdBy,\n uniq(id) as count\nFROM r0.silences\nGROUP BY createdBy\nORDER BY count DESC", + "refId": "A" + } + ], + "title": "Silences createdBy", + "type": "piechart" + }, + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 6, + "y": 18 + }, + "id": 19, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "editorType": "sql", + "format": 1, + "meta": { + "builderOptions": { + "columns": [], + "database": "", + "limit": 1000, + "mode": "list", + "queryType": "table", + "table": "" + } + }, + "pluginVersion": "4.0.0-beta", + "queryType": "table", + "rawSql": "SELECT\n status,\n 
count() as count\nFROM\n(\n SELECT\n status.state as status,\n id\n FROM r0.silences\n WHERE\n status != ''\n GROUP BY\n status,\n id\n)\nGROUP BY status", + "refId": "A" + } + ], + "title": "Silence active vs expired", + "type": "piechart" + }, + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "count" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "mode": "lcd", + "type": "gauge" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "labels.alertname" + }, + "properties": [ + { + "id": "custom.width", + "value": 131 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "id" + }, + "properties": [ + { + "id": "custom.width", + "value": 103 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "createdBy" + }, + "properties": [ + { + "id": "custom.width", + "value": 94 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "silences.startsAt" + }, + "properties": [ + { + "id": "custom.width", + "value": 126 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "silence_matchers" + }, + "properties": [ + { + "id": "custom.width", + "value": 466 + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 14, + "x": 10, + "y": 18 + }, + "id": 18, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "10.2.3", + "targets": [ + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": 
"PDEE91DDB90597936" + }, + "editorType": "sql", + "format": 1, + "meta": { + "builderOptions": { + "columns": [], + "database": "", + "limit": 1000, + "mode": "list", + "queryType": "table", + "table": "" + } + }, + "pluginVersion": "4.0.0-beta", + "queryType": "table", + "rawSql": " SELECT\n alerts.labels.alertname,\n alerts.status.silencedBy as id,\n silences.createdBy as createdBy,\n silences.matchers as silence_matchers,\n silences.startsAt, silences.comment\n FROM r0.alerts as alerts\n INNER JOIN (SELECT * FROM r0.silences) AS silences USING (id)\n GROUP BY\n alerts.labels.alertname,\n alerts.status.silencedBy,\n silences.matchers, silences.createdBy, silences.startsAt, silences.comment,\n alerts.fingerprint\n ORDER BY silences.startsAt DESC\n LIMIT 100", + "refId": "A" + } + ], + "title": "Silenced alerts", + "type": "table" + } + ], + "refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-18h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Alerts and silences overview", + "uid": "vcsM8q_VzG", + "version": 1, + "weekStart": "" +} diff --git a/2024-03-alerts-observability/grafana/dashboards/alerts-state-timeline.json b/2024-03-alerts-observability/grafana/dashboards/alerts-state-timeline.json new file mode 100644 index 0000000..fdf8467 --- /dev/null +++ b/2024-03-alerts-observability/grafana/dashboards/alerts-state-timeline.json @@ -0,0 +1,178 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 3, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "fieldConfig": { + "defaults": { + "color": { 
+ "mode": "thresholds" + }, + "custom": { + "fillOpacity": 70, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [ + { + "options": { + "": { + "color": "transparent", + "index": 4 + }, + "active": { + "color": "orange", + "index": 1, + "text": "active" + }, + "firing": { + "color": "red", + "index": 0, + "text": "firing" + }, + "resolved": { + "color": "green", + "index": 3, + "text": "resolved" + }, + "suppressed": { + "color": "yellow", + "index": 2, + "text": "suppressed" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 17, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "grafana-clickhouse-datasource", + "uid": "PDEE91DDB90597936" + }, + "editorType": "sql", + "format": 1, + "meta": { + "builderOptions": { + "columns": [], + "database": "", + "limit": 1000, + "mode": "list", + "queryType": "table", + "table": "" + } + }, + "pluginVersion": "4.0.0-beta", + "queryType": "table", + "rawSql": "SELECT timestamp,\n status,\n alert_identifier\n from (\n SELECT toStartOfFifteenMinutes(timestamp) as timestamp,\n concat(labels.alertname, ', ', labels.instance) as alert_identifier,\n status.state as status,\n fingerprint\n FROM r0.alerts\n GROUP BY timestamp,\n alert_identifier,\n fingerprint,\n status\n Order by timestamp\n )", + "refId": "A" + } + ], + "title": "alertname, instance state timeline (rollup: toStartOfFifteenMinutes)", + "transformations": [ + { + "id": "groupingToMatrix", + "options": { + "columnField": 
"alert_identifier", + "emptyValue": "empty", + "rowField": "timestamp", + "valueField": "status" + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "time", + "targetField": "timestamp\\alert_identifier" + } + ], + "fields": {} + } + } + ], + "type": "state-timeline" + } + ], + "refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Alerts state timeline", + "uid": "d71c6f86-ca18-4ddf-b8b8-cdb9833cc9a1", + "version": 3, + "weekStart": "" +} diff --git a/2024-03-alerts-observability/grafana/provisioning/dashboards/all.yml b/2024-03-alerts-observability/grafana/provisioning/dashboards/all.yml new file mode 100644 index 0000000..3b978e6 --- /dev/null +++ b/2024-03-alerts-observability/grafana/provisioning/dashboards/all.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +providers: +- name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 #how often Grafana will scan for changed dashboards + options: + path: /var/lib/grafana/dashboards diff --git a/2024-03-alerts-observability/images/alerts-silences-overview.png b/2024-03-alerts-observability/images/alerts-silences-overview.png new file mode 100644 index 0000000..cd8bbf1 Binary files /dev/null and b/2024-03-alerts-observability/images/alerts-silences-overview.png differ diff --git a/2024-03-alerts-observability/prometheus/alert.rules b/2024-03-alerts-observability/prometheus/alert.rules new file mode 100644 index 0000000..cf2a0f1 --- /dev/null +++ b/2024-03-alerts-observability/prometheus/alert.rules @@ -0,0 +1,30 @@ +groups: +- name: example + rules: + + - alert: service_down + expr: up == 0 + for: 30s + labels: + severity: page + annotations: + summary: "Instance {{ $labels.instance }} down" + description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 30 seconds." 
+ + - alert: high_load + expr: (sum(rate(container_cpu_usage_seconds_total[3m])) BY (instance, name) * 100) > 80 + for: 30s + labels: + severity: page + annotations: + summary: "Instance {{ $labels.instance }} under high load" + description: "{{ $labels.instance }} is under high load." + + - alert: site_down + expr: probe_success < 1 + for: 30s + labels: + severity: page + annotations: + summary: "Site Down: {{$labels.instance}}" + description: "Site Down: {{$labels.instance}} for more than 30 seconds" diff --git a/2024-03-alerts-observability/prometheus/prometheus.yml b/2024-03-alerts-observability/prometheus/prometheus.yml new file mode 100644 index 0000000..2488fea --- /dev/null +++ b/2024-03-alerts-observability/prometheus/prometheus.yml @@ -0,0 +1,63 @@ +# my global config +global: + scrape_interval: 15s # By default, scrape targets every 15 seconds. + evaluation_interval: 15s # By default, evaluate rules every 15 seconds. + # scrape_timeout is set to the global default (10s). + + # Attach these labels to any time series or alerts when communicating with + # external systems (federation, remote storage, Alertmanager). + external_labels: + monitor: 'my-project' + +# Load and evaluate rules in this file every 'evaluation_interval' seconds. +rule_files: + - 'alert.rules' + +# alert +alerting: + alertmanagers: + - scheme: http + static_configs: + - targets: + - "alertmanager:9093" + +# Scrape configurations, one entry per job. +# Targets: Prometheus itself, cAdvisor, node-exporter and the blackbox probes. +scrape_configs: + # The job name is added as a label `job=` to any timeseries scraped from this config. + - job_name: 'prometheus' + - job_name: 'node' + + # Override the global default and scrape targets from this job every 5 seconds. + scrape_interval: 5s + + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. 
+ + static_configs: + - targets: + - localhost:9090 + - cadvisor:8080 + - node-exporter:9100 + - job_name: 'blackbox' + metrics_path: /probe + params: + module: [http_2xx] # Look for a HTTP 200 response. + static_configs: + - targets: + - https://am2ch.com + - https://xyzdoodle.com + - https://prometheus.io + - localhost:9090 + - localhost:3000 + - cadvisor:8080 + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: "blackbox_exporter:9115" # Blackbox exporter. + - target_label: region + replacement: "local" + diff --git a/2024-03-alerts-observability/vector/vector.toml b/2024-03-alerts-observability/vector/vector.toml new file mode 100644 index 0000000..725b1f7 --- /dev/null +++ b/2024-03-alerts-observability/vector/vector.toml @@ -0,0 +1,179 @@ +[sources.webhook] +type = "http_server" +address = "0.0.0.0:8888" +headers = [ "User-Agent" ] +query_parameters = [ "application" ] +path = "/" +strict_path = true +path_key = "path" +method = "POST" +encoding = "json" + +[transforms.webhook_transform] +inputs = [ "webhook" ] +type = "remap" +source = """ +. = unnest!(.alerts) +. = map_values(.) 
-> |v| { + value = v.alerts + del(v.alerts) + value.endsAt = parse_timestamp!(value.endsAt, format: "%Y-%m-%dT%H:%M:%S%.fZ") + value.endsAt = to_unix_timestamp(value.endsAt, "milliseconds") + value.startsAt = parse_timestamp!(value.startsAt, format: "%Y-%m-%dT%H:%M:%S%.fZ") + value.startsAt = to_unix_timestamp(value.startsAt, "milliseconds") + value.user_agent = del(value."User-Agent") + value.status.state = del(value.status) + + value.labelskv = encode_key_value!(value.labels) + + labelsmap = value.labels + labelsmap_encoded = encode_json(labelsmap) + value.md5 = md5(labelsmap_encoded) + + value.source = "webhook" + flatten(value) +} +""" + +[transforms.webhook_transform2] +inputs = [ "webhook_transform" ] +type = "remap" +source = """ +.labelsmap = parse_key_value!(.labelskv) +""" + +[sources.alerts] +type = "http_client" +endpoint = "http://alertmanager:9093/api/v2/alerts" +method = "GET" +decoding.codec = "json" +scrape_interval_secs = 60 + + [sources.alerts.headers] + Accept = [ "application/json", "text/plain" ] + +[transforms.vrl] +inputs = [ "alerts" ] +type = "remap" +source = """ +# to split the multiline event to small events +""" + +[transforms.vrl2] +inputs = [ "vrl" ] +type = "remap" +source = """ +.timestamp = parse_timestamp!(.timestamp, format: "%Y-%m-%dT%H:%M:%S%.fZ") +.timestamp = to_unix_timestamp(.timestamp, "milliseconds") +.endsAt = to_timestamp!(.endsAt, unit: "milliseconds") +.endsAt = to_unix_timestamp(.endsAt, "milliseconds") +.updatedAt = to_timestamp!(.updatedAt, unit: "milliseconds") +.updatedAt = to_unix_timestamp(.updatedAt, "milliseconds") +.startsAt = to_timestamp!(.startsAt, unit: "milliseconds") +.startsAt = to_unix_timestamp(.startsAt, "milliseconds") + +.rec = del(.receivers) +.receivers = [] +for_each(array!(.rec)) -> |_index, r| { + .rid = get!(.rec, [to_int(_index)]) + .receivers = push(.receivers, .rid.name ) +} +del(.rec) +del(.rid) + +.labelskv = encode_key_value!(.labels) +labelsmap = .labels +. = flatten(.) 
+labelsmap_encoded = encode_json(labelsmap) +.md5 = md5(labelsmap_encoded) + +if is_empty(array!(."status.silencedBy")) { + del(.status.silencedBy) + ."status.silencedBy" = "" +} else { + . = unnest!(."status.silencedBy") +} + +""" + +[sources.silences] +type = "http_client" +endpoint = "http://alertmanager:9093/api/v2/silences" +method = "GET" +decoding.codec = "json" +scrape_interval_secs = 60 + + [sources.silences.headers] + Accept = [ "application/json", "text/plain" ] + +[transforms.silences_remap] +inputs = [ "silences" ] +type = "remap" +source = """ +ts_now = to_unix_timestamp(now(), unit: "milliseconds") +.endsAt = to_timestamp!(.endsAt, unit: "milliseconds") +.endsAt = to_unix_timestamp(.endsAt, "milliseconds") +.startsAt = to_timestamp!(.startsAt, unit: "milliseconds") +.startsAt = to_unix_timestamp(.startsAt, "milliseconds") +.updatedAt = to_timestamp!(.updatedAt, unit: "milliseconds") +.updatedAt = to_unix_timestamp(.updatedAt, "milliseconds") + +.drop = false +obj = {} +.match = {} +for_each(array!(.matchers)) -> |_index, value| { + .mid = get!(.matchers, [to_int(_index)]) + eq = "" + reg = "" + if to_bool!(.mid.isEqual) { + eq = "=" + } else { + eq = "!=" + } + if to_bool!(.mid.isRegex) { + reg = "~" + } + if eq == "!=" && reg == "~" { eq = "!" } # negated regex is !~ in Alertmanager, not !=~ + v, err = eq + reg + .mid.value + .match |= object(set!(obj, [.mid.name], v)) + del(.mid) +} +del( .matchers ) +matchers = del(.match ) +. = flatten(.) 
+.matchers = matchers +.matchers_string = encode_key_value(.matchers) +""" + +[transforms.silences_filter] +type = "filter" +inputs = [ "silences_remap" ] +condition = ".drop == false" + + +[sinks.silences_clickhouse] +inputs = ["silences_filter"] +type = "clickhouse" +database = "r0" +endpoint = "http://clickhouse-0.localhost:8123" +table = "silences" +compression = "gzip" +tls.verify_hostname = false +auth.user = "demouser" +auth.password = "${CLICKHOUSE_PASSWORD}" +auth.strategy = "basic" +batch.timeout_secs = 30 +skip_unknown_fields = true + +[sinks.clickhouse] +inputs = ["webhook_transform2", "vrl2"] +type = "clickhouse" +database = "r0" +endpoint = "http://clickhouse-0.localhost:8123" +table = "alerts" +compression = "gzip" +tls.verify_hostname = false +auth.user = "demouser" +auth.password = "${CLICKHOUSE_PASSWORD}" +auth.strategy = "basic" +batch.timeout_secs = 10