diff --git a/machines/andesite/comin.nix b/machines/andesite/comin.nix index 123aec0..13cb4d6 100644 --- a/machines/andesite/comin.nix +++ b/machines/andesite/comin.nix @@ -1,4 +1,4 @@ -{ inputs, ... }: +{ config, inputs, ... }: { imports = [ inputs.comin.nixosModules.comin ]; @@ -12,4 +12,16 @@ } ]; }; + + services.prometheus.scrapeConfigs = [ + { + job_name = "comin"; + static_configs = [ + { + labels.role = "andesite"; + targets = [ "localhost:${toString config.services.comin.exporter.port}" ]; + } + ]; + } + ]; } diff --git a/machines/andesite/configuration.nix b/machines/andesite/configuration.nix index 5c8d19b..e3aa7cf 100644 --- a/machines/andesite/configuration.nix +++ b/machines/andesite/configuration.nix @@ -12,6 +12,7 @@ ./blockgame-meta.nix ./comin.nix ./disks.nix + ./prometheus ./refraction.nix ]; diff --git a/machines/andesite/prometheus/alertmanager.env.age b/machines/andesite/prometheus/alertmanager.env.age new file mode 100644 index 0000000..98111d1 --- /dev/null +++ b/machines/andesite/prometheus/alertmanager.env.age @@ -0,0 +1,8 @@ +age-encryption.org/v1 +-> ssh-ed25519 ywGURw Y/oQ/enDLVsYWPXrsPkDt6TiFRHZvVchC+o0qHlboQ0 +HMPO08IhqG8dSrbDtrgAai/pz2BqlItiu5eNY6S22Ok +-> ssh-ed25519 zsVv8w ENZzXRBfRZS5HVguzm3LFc2EK8oPe5J6tx78R2pyUDs +AOiB6gZdi3tiVKjcuwJcFZkvTcn/BobkSNKDgquJNlA +--- 9XqYCvSUAgp0Jpaig/5bqbLib0XNv6LfPkC7dWbP9Oc +`P i4SU CPD+L +VN&eи-B|aʀ,8AIB:R% dTGnq6[]؞zgLQ> I"t \ No newline at end of file diff --git a/machines/andesite/prometheus/default.nix b/machines/andesite/prometheus/default.nix new file mode 100644 index 0000000..5f4aa19 --- /dev/null +++ b/machines/andesite/prometheus/default.nix @@ -0,0 +1,61 @@ +{ config, ... 
 }:
+{
+  imports = [
+    ./exporters/node.nix
+  ];
+
+  age.secrets."alertmanager.env".file = ./alertmanager.env.age; # its .path is passed to alertmanager.environmentFile below
+
+  services.prometheus = {
+    enable = true;
+
+    extraFlags = [
+      "--storage.tsdb.retention.time=${toString (720 * 24)}h" # renders as 17280h (720 * 24 hours); NOTE(review): the module may offer a dedicated retention option — confirm
+    ];
+
+    globalConfig.scrape_interval = "15s";
+
+    alertmanagers = [ # Prometheus sends alerts to localhost on the port configured for services.prometheus.alertmanager
+      {
+        scheme = "http";
+        static_configs = [
+          { targets = [ "localhost:${toString config.services.prometheus.alertmanager.port}" ]; }
+        ];
+      }
+    ];
+
+    alertmanager = {
+      enable = true;
+
+      extraFlags = [ "--cluster.listen-address=''" ]; # NOTE(review): presumably turns off Alertmanager clustering (empty listen address) — confirm
+
+      environmentFile = config.age.secrets."alertmanager.env".path; # NOTE(review): presumably the source of $DISCORD_WEBHOOK_SUFFIX used in webhook_url — confirm
+      configuration = {
+        receivers = [
+          {
+            name = "discord";
+            discord_configs = [
+              { webhook_url = "https://discord.com/api/webhooks/1315429375981781003/$DISCORD_WEBHOOK_SUFFIX"; }
+            ];
+          }
+        ];
+        route = { # root route: receiver is "discord"; no child routes are defined here
+          receiver = "discord";
+          group_wait = "30s";
+          group_interval = "5m";
+          repeat_interval = "24h";
+          group_by = [ "alertname" ];
+        };
+      };
+    };
+  };
+
+  # TODO: reverse proxy
+  networking.firewall.allowedTCPPorts = [
+    config.services.prometheus.port # adds the configured Prometheus port to the firewall's allowed TCP ports (see TODO above)
+  ];
+
+  environment.persistence."/nix/persistence".directories = [
+    "/var/lib/prometheus2" # NOTE(review): presumably the Prometheus data directory — confirm
+  ];
+}
diff --git a/machines/andesite/prometheus/exporters/node.nix b/machines/andesite/prometheus/exporters/node.nix
new file mode 100644
index 0000000..f6b97b6
--- /dev/null
+++ b/machines/andesite/prometheus/exporters/node.nix
@@ -0,0 +1,52 @@
+{ ...
 }:
+
+{
+  services.prometheus.exporters.node = {
+    enable = true;
+    enabledCollectors = [ "systemd" ]; # NOTE(review): presumably provides node_systemd_unit_state, which the SystemdUnitFailed rule below queries — confirm
+  };
+
+  services.prometheus = {
+    scrapeConfigs = [
+      {
+        job_name = "node";
+        static_configs = [
+          {
+            labels.role = "andesite";
+            targets = [ "localhost:9100" ]; # NOTE(review): port is hard-coded; assumes the node exporter enabled above listens on 9100 — confirm
+          }
+        ];
+      }
+    ];
+    rules = [
+      (builtins.toJSON { # the rule group attrset is serialised with builtins.toJSON, so this list entry is a JSON string
+        groups = [
+          {
+            name = "node";
+            rules = [
+              {
+                alert = "SystemdUnitFailed"; # matches series with state="failed" equal to 1, held for 5m
+                expr = ''node_systemd_unit_state{state="failed"} == 1'';
+                for = "5m";
+                labels.severity = "warning";
+                annotations.summary = "systemd unit {{ $labels.name }} on {{ $labels.instance }} has been down for more than 5 minutes.";
+              }
+              {
+                alert = "RootfsLowSpace"; # available/size * 100 <= 10 on mountpoints /nix or /boot, held for 10m
+                # /nix is the primary filesystem on impermanent installations
+                expr = ''
+                  node_filesystem_avail_bytes{mountpoint=~"(/nix|/boot)"} / node_filesystem_size_bytes * 100 <= 10
+                '';
+                for = "10m";
+                labels.severity = "warning";
+                annotations.summary = ''
+                  {{ $labels.device }} mounted at {{ $labels.mountpoint }} ({{ $labels.fstype }}) on {{ $labels.instance }} has {{ $value }}% space left.
+                '';
+              }
+            ];
+          }
+        ];
+      })
+    ];
+  };
+}
diff --git a/secrets.nix b/secrets.nix
index 7cf3328..7697b05 100644
--- a/secrets.nix
+++ b/secrets.nix
@@ -8,6 +8,10 @@ in
     scrumplex
     andesite
   ];
+  "machines/andesite/prometheus/alertmanager.env.age".publicKeys = [ # same key list (scrumplex, andesite) as the refraction.env.age entry below
+    scrumplex
+    andesite
+  ];
   "machines/andesite/refraction.env.age".publicKeys = [
     scrumplex
     andesite