Skip to content

Commit

Permalink
andesite: add monitoring
Browse files Browse the repository at this point in the history
Signed-off-by: Sefa Eyeoglu <[email protected]>
  • Loading branch information
Scrumplex committed Dec 8, 2024
1 parent 6c1c7c0 commit 52e9ad1
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 1 deletion.
14 changes: 13 additions & 1 deletion machines/andesite/comin.nix
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{ inputs, ... }:
{ config, inputs, ... }:
{
imports = [ inputs.comin.nixosModules.comin ];

Expand All @@ -12,4 +12,16 @@
}
];
};

# Register comin's metrics exporter as a Prometheus scrape job. The port is
# read from the comin module's own option, so it is not duplicated here.
services.prometheus.scrapeConfigs = [
  {
    job_name = "comin";
    static_configs = [
      {
        targets = [ "localhost:${toString config.services.comin.exporter.port}" ];
        labels = {
          role = "andesite";
        };
      }
    ];
  }
];
}
1 change: 1 addition & 0 deletions machines/andesite/configuration.nix
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
./blockgame-meta.nix
./comin.nix
./disks.nix
./prometheus
./refraction.nix
];

Expand Down
8 changes: 8 additions & 0 deletions machines/andesite/prometheus/alertmanager.env.age
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
age-encryption.org/v1
-> ssh-ed25519 ywGURw Y/oQ/enDLVsYWPXrsPkDt6TiFRHZvVchC+o0qHlboQ0
HMPO08IhqG8dSrbDtrgAai/pz2BqlItiu5eNY6S22Ok
-> ssh-ed25519 zsVv8w ENZzXRBfRZS5HVguzm3LFc2EK8oPe5J6tx78R2pyUDs
AOiB6gZdi3tiVKjcuwJcFZkvTcn/BobkSNKDgquJNlA
--- 9XqYCvSUAgp0Jpaig/5bqbLib0XNv6LfPkC7dWbP9Oc
��`��P��� i���4S��U� C�P�D�+L�
VN&��eи-B�|aʀ,8�A����I�B���:��R�%�� dTG�n���q6��[]؞zg�L���Q���>� �I"���t�
61 changes: 61 additions & 0 deletions machines/andesite/prometheus/default.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Prometheus server and Alertmanager for andesite.
#
# Prometheus scrapes the jobs contributed by the imported exporter modules
# (and any other module that appends to services.prometheus.scrapeConfigs)
# and forwards alerts to the local Alertmanager, which notifies Discord.
{ config, ... }:
let
  promCfg = config.services.prometheus;
in
{
  imports = [
    ./exporters/node.nix
  ];

  # Environment file for Alertmanager, decrypted by agenix at activation.
  age.secrets."alertmanager.env".file = ./alertmanager.env.age;

  services.prometheus = {
    enable = true;

    # Keep 720 days of samples (identical to the former 17280h raw flag).
    # Using the module option instead of extraFlags keeps the setting
    # discoverable and avoids passing the same flag twice if the option is
    # ever set elsewhere.
    retentionTime = "720d";

    globalConfig.scrape_interval = "15s";

    # Send alerts to the Alertmanager instance configured below.
    alertmanagers = [
      {
        scheme = "http";
        static_configs = [
          { targets = [ "localhost:${toString promCfg.alertmanager.port}" ]; }
        ];
      }
    ];

    alertmanager = {
      enable = true;

      # An empty cluster listen address; presumably this disables
      # Alertmanager's HA gossip listener on this single-node setup.
      extraFlags = [ "--cluster.listen-address=''" ];

      # NOTE(review): $DISCORD_WEBHOOK_SUFFIX below is presumably substituted
      # from this environment file when the service starts - confirm against
      # the NixOS alertmanager module.
      environmentFile = config.age.secrets."alertmanager.env".path;
      configuration = {
        receivers = [
          {
            name = "discord";
            discord_configs = [
              { webhook_url = "https://discord.com/api/webhooks/1315429375981781003/$DISCORD_WEBHOOK_SUFFIX"; }
            ];
          }
        ];
        # Single catch-all route: everything goes to Discord, grouped by
        # alert name.
        route = {
          receiver = "discord";
          group_wait = "30s";
          group_interval = "5m";
          repeat_interval = "24h";
          group_by = [ "alertname" ];
        };
      };
    };
  };

  # TODO: reverse proxy
  # NOTE(review): until then the Prometheus web port is opened directly in
  # the firewall; no authentication is configured in this module.
  networking.firewall.allowedTCPPorts = [
    promCfg.port
  ];

  # Persist the TSDB across reboots. The path is derived from the module's
  # stateDir option (default "prometheus2") so the two cannot drift apart.
  environment.persistence."/nix/persistence".directories = [
    "/var/lib/${promCfg.stateDir}"
  ];
}
52 changes: 52 additions & 0 deletions machines/andesite/prometheus/exporters/node.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Node exporter for andesite: enables the exporter, registers it as a
# Prometheus scrape job and defines the host-level alerting rules.
{ config, ... }:

let
  nodeExporter = config.services.prometheus.exporters.node;
in
{
  services.prometheus.exporters.node = {
    enable = true;
    # The SystemdUnitFailed rule below queries node_systemd_unit_state, which
    # is exported by the systemd collector.
    enabledCollectors = [ "systemd" ];
  };

  services.prometheus = {
    scrapeConfigs = [
      {
        job_name = "node";
        static_configs = [
          {
            labels.role = "andesite";
            # Follow the exporter's configured port instead of hard-coding
            # 9100, matching how the other scrape jobs derive their targets.
            targets = [ "localhost:${toString nodeExporter.port}" ];
          }
        ];
      }
    ];

    # Rule files are passed to Prometheus as strings; the attrset is
    # serialised to JSON here.
    rules = [
      (builtins.toJSON {
        groups = [
          {
            name = "node";
            rules = [
              # Fires when any systemd unit stays in the "failed" state.
              {
                alert = "SystemdUnitFailed";
                expr = ''node_systemd_unit_state{state="failed"} == 1'';
                for = "5m";
                labels.severity = "warning";
                annotations.summary = "systemd unit {{ $labels.name }} on {{ $labels.instance }} has been down for more than 5 minutes.";
              }
              # Fires when /nix or /boot has 10% or less space available.
              {
                alert = "RootfsLowSpace";
                # /nix is the primary filesystem on impermanent installations
                expr = ''
                  node_filesystem_avail_bytes{mountpoint=~"(/nix|/boot)"} / node_filesystem_size_bytes * 100 <= 10
                '';
                for = "10m";
                labels.severity = "warning";
                annotations.summary = ''
                  {{ $labels.device }} mounted at {{ $labels.mountpoint }} ({{ $labels.fstype }}) on {{ $labels.instance }} has {{ $value }}% space left.
                '';
              }
            ];
          }
        ];
      })
    ];
  };
}
4 changes: 4 additions & 0 deletions secrets.nix
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ in
scrumplex
andesite
];
"machines/andesite/prometheus/alertmanager.env.age".publicKeys = [
scrumplex
andesite
];
"machines/andesite/refraction.env.age".publicKeys = [
scrumplex
andesite
Expand Down

0 comments on commit 52e9ad1

Please sign in to comment.