Skip to content

Commit

Permalink
2024-03: Minimizing on-call burnout through alerts observability
Browse files Browse the repository at this point in the history
  • Loading branch information
m0nikasingh authored and dknecht committed Mar 28, 2024
1 parent dba3407 commit df75cae
Show file tree
Hide file tree
Showing 16 changed files with 2,339 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitguardian.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# ggshield (GitGuardian) configuration, schema version 2.
version: 2
secret:
  # Paths excluded from secret scanning.
  ignored-paths:
    - '2024-03-alerts-observability/*'
33 changes: 33 additions & 0 deletions 2024-03-alerts-observability/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# DIY alerts observability

This demo uses `vector.dev` to collect data from different sources and write it to the datastore.
A single Vector instance uses one `http_server` source to receive Alertmanager webhook notifications,
two `http_client` sources to query Alertmanager's alerts and silences API endpoints, and
two `sinks` to write all the state logs into the ClickHouse `alerts` and `silences` tables.

The docker-compose will bring up several containers:

* `Cadvisor` is used to generate system metrics for monitoring.
* `Prometheus` is used to monitor and generate alerts.
* `Alertmanager` is to route alerts and provide the alert events via webhook and API.
* `alertmanager_silence` is to create an Alertmanager silence.
* `blackbox_exporter` is for monitoring the sites and generating alerts.
* `ClickHouse` is used to write the Alertmanager alert events into the datastore for alerts observability.
* `Vector.dev` - to collect data from Alertmanager webhook, alerts and silences API, transform the data and write into ClickHouse.
* `Grafana` is used to visualize the logs.

## Pre-requisite:
`docker`

## Getting started:

* Set up a password for ClickHouse and bring up the containers using Docker Compose:
```console
foo@bar:~$ export CLICKHOUSE_PASSWORD="<PASSWORD here>"
foo@bar:~$ docker compose up
```

Please wait for about 5 minutes for the alerts to be triggered and
visit http://localhost:3000/ to explore the `Alerts and silences overview` dashboard and play around.

![alerts and silences overview](images/alerts-silences-overview.png "alerts and silences overview")
Binary file added 2024-03-alerts-observability/alertmanager/amtool
Binary file not shown.
10 changes: 10 additions & 0 deletions 2024-03-alerts-observability/alertmanager/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Alertmanager configuration: every alert is routed to a single webhook
# receiver.
route:
  receiver: 'webhook'

receivers:
  - name: 'webhook'
    webhook_configs:
      # Notifications are POSTed to this URL.
      - url: 'http://vector:8888'

# Notification template files, matched by glob.
templates:
  - template/*.tmpl
18 changes: 18 additions & 0 deletions 2024-03-alerts-observability/alertmanager/silence_entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/sh
# Creates a one-hour Alertmanager silence for alerts named `service_down`.
#
# The request body is written to ./post-data immediately (so startsAt/endsAt
# are taken at container start) and POSTed to Alertmanager after a fixed delay.
set -eu

# startsAt: current time in ISO 8601 with nanoseconds. The sed calls rewrite a
# "+00:00" offset to "Z" and the "," decimal separator to ".".
starts_at="$(date -Ins | sed s/+00:00/Z/ | sed s/,/./)"

# endsAt: one hour from now. TZ='UTC-1:00' uses the POSIX sign convention (one
# hour EAST of UTC), so `date` prints the wall clock one hour ahead with a
# "+01:00" offset; relabelling that offset as "Z" yields "now + 1h" in UTC.
ends_at="$(TZ='UTC-1:00' date -Ins | sed s/+01:00/Z/ | sed s/,/./)"

cat > post-data <<EOF
{
  "matchers": [
    {
      "name": "alertname",
      "value": "service_down",
      "isRegex": false
    }
  ],
  "startsAt": "${starts_at}",
  "endsAt": "${ends_at}",
  "createdBy": "demouser",
  "comment": "Silence"
}
EOF

# NOTE(review): fixed delay before creating the silence - presumably to let
# Alertmanager start and the alert fire first; confirm the intended timing.
sleep 90

# API v2 accepts the same silence payload; API v1 is deprecated and no longer
# served by Alertmanager 0.27 and later.
wget 'http://alertmanager:9093/api/v2/silences' --header='Content-Type: application/json' --post-file=post-data
15 changes: 15 additions & 0 deletions 2024-03-alerts-observability/blackboxexporter/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# blackbox_exporter probe modules.
modules:
  # HTTP GET probe with a 5 second timeout.
  http_2xx:
    prober: http
    timeout: 5s
    http:
      # NOTE(review): the exporter compares this list against the protocol
      # string of the response, which for HTTP/2 is normally "HTTP/2.0" -
      # confirm whether "HTTP/2" is intended here.
      valid_http_versions: ["HTTP/1.1", "HTTP/2"]
      # Empty list: the exporter's default accepted status codes apply.
      valid_status_codes: []
      method: GET
      no_follow_redirects: false
      fail_if_ssl: false
      fail_if_not_ssl: false
      tls_config:
        insecure_skip_verify: false
      preferred_ip_protocol: "ip4"

89 changes: 89 additions & 0 deletions 2024-03-alerts-observability/clickhouse/init-defaults.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/bin/sh
# ClickHouse initialisation script: defines the demo user, creates the demo
# database and creates the `alerts` and `silences` tables inside it.
#
# Inputs (environment, each with a fallback):
#   CLICKHOUSE_DB        database to create and use   (default: database)
#   CLICKHOUSE_USER      user to define               (default: user)
#   CLICKHOUSE_PASSWORD  password for that user       (default: password)

CLICKHOUSE_DB="${CLICKHOUSE_DB:-database}";
CLICKHOUSE_USER="${CLICKHOUSE_USER:-user}";
CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-password}";

# Written with `>` so that a re-run replaces the file instead of appending a
# second XML document to it.
cat <<EOT > /etc/clickhouse-server/users.d/user.xml
<yandex>
  <!-- Docs: <https://clickhouse.tech/docs/en/operations/settings/settings_users/> -->
  <users>
    <${CLICKHOUSE_USER}>
      <profile>default</profile>
      <networks>
        <ip>::/0</ip>
      </networks>
      <password>${CLICKHOUSE_PASSWORD}</password>
      <quota>default</quota>
    </${CLICKHOUSE_USER}>
  </users>
</yandex>
EOT
#cat /etc/clickhouse-server/users.d/user.xml;

clickhouse-client --query "CREATE DATABASE IF NOT EXISTS ${CLICKHOUSE_DB}";

# Session settings (input_format_import_nested_json,
# output_format_json_array_of_rows, date_time_input_format) are deliberately
# not SET here: a SET only lasts for the clickhouse-client session that runs
# it, so clients that need them must send them with their own queries.

# The heredocs below are quoted ('SQL') so the backticks around column names
# reach ClickHouse verbatim instead of being run as shell command
# substitutions. The target database is selected with --database so the tables
# always land in the database created above.

# One row per observed alert state; labels are stored both as a map and as
# individual `labels.*` columns.
clickhouse-client --database "${CLICKHOUSE_DB}" <<'SQL'
CREATE TABLE IF NOT EXISTS alerts
(
    `date` Date DEFAULT toDate(now()),
    `datetime` DateTime DEFAULT now(),
    `timestamp` DateTime64(3) DEFAULT now() CODEC(Delta(4), ZSTD(1)),
    `startsAt` DateTime64(3),
    `endsAt` DateTime64(3),
    `updatedAt` DateTime64(3),
    `status.inhibitedBy` Array(String),
    `status.silencedBy` String,
    `status.state` LowCardinality(String),
    `annotations.summary` String,
    `annotations.dashboard` String,
    `annotations.link` String,
    `fingerprint` String,
    `receivers` Array(String),
    `labelsmap` Map(String, String),
    `labels.alertname` String,
    `labels.component` String,
    `labels.service` String,
    `labels.instance` String,
    `labels.job` String,
    `labels.metal` String,
    `labels.notify` String,
    `labels.priority` String,
    `labels.prometheus` String,
    `labels.region` String,
    `labels.severity` String
)
ENGINE = MergeTree
PARTITION BY toStartOfHour(datetime)
ORDER BY labels.alertname
SETTINGS index_granularity = 8192;
SQL

# One row per observed silence state, keyed by (id, startsAt, endsAt).
clickhouse-client --database "${CLICKHOUSE_DB}" <<'SQL'
CREATE TABLE IF NOT EXISTS silences
(
    `date` Date DEFAULT toDate(now()),
    `datetime` DateTime DEFAULT now(),
    `id` String,
    `status.state` LowCardinality(String),
    `updatedAt` DateTime64(3),
    `startsAt` DateTime64(3),
    `createdBy` LowCardinality(String),
    `endsAt` DateTime64(3),
    `matchers` Map(String, String),
    `comment` String
)
ENGINE = ReplacingMergeTree
PARTITION BY toStartOfHour(datetime)
ORDER BY (id, startsAt, endsAt)
SETTINGS index_granularity = 8192;
SQL
164 changes: 164 additions & 0 deletions 2024-03-alerts-observability/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# Demo stack for alerts observability: cAdvisor, Prometheus, Alertmanager (plus
# a one-shot container that creates a silence), blackbox_exporter, ClickHouse,
# Vector and Grafana.
#
# CLICKHOUSE_PASSWORD must be exported in the shell before `docker compose up`;
# Compose aborts with an error message when it is missing or empty.
version: '3.1'

volumes:
  prometheus_data: {}
  # NOTE(review): declared but not mounted by any service in this file.
  grafana_data: {}

services:

  cadvisor:
    # NOTE(review): no tag, so this tracks `latest`, unlike the other images
    # here which are pinned - consider pinning a version.
    image: gcr.io/cadvisor/cadvisor
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
      - "8080:8080"
    restart: always
    deploy:
      mode: global

  prometheus:
    image: prom/prometheus:v2.49.1
    volumes:
      - ./prometheus/:/etc/prometheus/
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      - '--web.enable-lifecycle'
    ports:
      - "9090:9090"
    links:
      - cadvisor:cadvisor
      - alertmanager:alertmanager
    depends_on:
      - cadvisor
    restart: always
    deploy:
      mode: global

  alertmanager:
    image: prom/alertmanager:v0.26.0
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager/:/etc/alertmanager/
      - ./alertmanager/amtool:/bin/amtool
    restart: always
    command:
      - '--config.file=/etc/alertmanager/config.yml'
    deploy:
      mode: global

  # One-shot helper: runs silence_entrypoint.sh once and is not restarted.
  alertmanager_silence:
    image: prom/alertmanager:v0.26.0
    volumes:
      - ./alertmanager/:/etc/alertmanager/
      - ./alertmanager/amtool:/bin/amtool
      - ./alertmanager/silence_entrypoint.sh:/silence_entrypoint.sh
    depends_on:
      alertmanager:
        condition: service_started
    restart: 'no'
    entrypoint: '/silence_entrypoint.sh'

  blackbox_exporter:
    image: prom/blackbox-exporter:v0.24.0
    ports:
      - "9115:9115"
    volumes:
      - ./blackboxexporter/:/etc/blackboxexporter/
    command:
      - '--config.file=/etc/blackboxexporter/config.yml'
    restart: always

  clickhouse:
    image: clickhouse/clickhouse-server:23.3.19.32
    # NOTE(review): Grafana below connects on port 9000 over the Compose
    # network, which needs no published port; confirm that 9008 is intended
    # here and is not a typo for 9000.
    ports:
      - "8123:8123"
      - "9008:9008"
      - "9009:9009"
    environment:
      # Default user and database will be created using `init-defaults.sh` script
      CLICKHOUSE_DB: r0
      CLICKHOUSE_USER: demouser
      # Fail fast when the password is missing: init-defaults.sh would fall
      # back to its own default while Vector and Grafana would send an empty
      # password.
      CLICKHOUSE_PASSWORD: "${CLICKHOUSE_PASSWORD:?CLICKHOUSE_PASSWORD must be set}"
    hostname: clickhouse-0.localhost
    ulimits:
      nproc: 65535
      nofile:
        soft: 262144
        hard: 262144
    volumes:
      - ./clickhouse/init-defaults.sh:/docker-entrypoint-initdb.d/init-defaults.sh:ro
    healthcheck:
      test:
        - CMD-SHELL
        - "http_proxy='' wget -nv -t1 --spider 'http://localhost:8123/' || exit 1"
      interval: 10s
      timeout: 10s
      retries: 30

  vector:
    image: timberio/vector:0.27.0-debian
    container_name: vector
    ports:
      - "8888:8888"
    environment:
      CLICKHOUSE_PASSWORD: "${CLICKHOUSE_PASSWORD:?CLICKHOUSE_PASSWORD must be set}"
    volumes:
      - ./vector/vector.toml:/etc/vector/vector.toml:ro
    depends_on:
      clickhouse:
        condition: service_healthy

  grafana:
    build: ./grafana/
    # Demo-only access settings: anonymous users get the Admin role and the
    # admin login is admin/admin.
    environment:
      - GF_PATHS_PROVISIONING=/etc/grafana/provisioning
      - GF_AUTH_ANONYMOUS_ENABLED=true
      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
      - GF_PLUGINS_ALLOW_LOADING_UNSIGNED_PLUGINS=grafana-clickhouse-datasource
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=admin
    # Writes the datasource and dashboard provisioning files, then hands over
    # to Grafana's /run.sh. The CLICKHOUSE_PASSWORD reference inside the script
    # is substituted by Compose from the host environment when this file is
    # loaded.
    entrypoint:
      - sh
      - '-euc'
      - |
        mkdir -p /etc/grafana/provisioning/datasources
        cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
        apiVersion: 1
        datasources:
          - name: 'ClickHouse'
            type: 'grafana-clickhouse-datasource'
            isDefault: true
            jsonData:
              defaultDatabase: r0
              port: 9000
              server: clickhouse-0.localhost
              username: demouser
              tlsSkipVerify: false
            secureJsonData:
              password: ${CLICKHOUSE_PASSWORD}
            editable: true
        EOF
        mkdir -p /etc/grafana/provisioning/dashboards
        cat <<EOF > /etc/grafana/provisioning/dashboards/dashboard.yaml
        apiVersion: 1
        providers:
          - name: demo
            type: file
            updateIntervalSeconds: 30
            options:
              path: /var/lib/grafana/dashboards
              foldersFromFilesStructure: true
        EOF
        /run.sh
    ports:
      - "3000:3000"
10 changes: 10 additions & 0 deletions 2024-03-alerts-observability/grafana/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Grafana image for the demo: adds the ClickHouse datasource plugin and copies
# the dashboard JSON files into /var/lib/grafana/dashboards.
FROM grafana/grafana:10.2.3-ubuntu

USER root
WORKDIR /tmp

# Install the datasource plugin and create the dashboards directory.
RUN grafana cli plugins install grafana-clickhouse-datasource \
    && mkdir -p /var/lib/grafana/dashboards

COPY dashboards/*.json /var/lib/grafana/dashboards/
Loading

0 comments on commit df75cae

Please sign in to comment.