Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sane Opsgenie alerting #487

Merged
merged 8 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions component/slos.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ local newSLO(name, group, sloParams) =
name: name,
objective: sloParams.objective,
alerting: {
labels: params.slos.alerting.labels,
page_alert: {
labels: params.slos.alerting.page_labels,
annotations: {
Expand Down Expand Up @@ -86,7 +85,6 @@ local generateSlothInput(name, uptime) =
},
labels+: {
service: 'VSHN' + name,
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end }}',
},
},
},
Expand All @@ -104,7 +102,6 @@ local generateSlothInput(name, uptime) =
},
labels+: {
service: 'VSHN' + name,
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end }}',
},
},
},
Expand Down
63 changes: 63 additions & 0 deletions component/vshn_alerting.jsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
local kap = import 'lib/kapitan.libjsonnet';

local inv = kap.inventory();
local params = inv.parameters.appcat;


// Builds a PrometheusRule manifest holding two critical SLA alerts for one service:
// one for non-HA instances (ha="false") and one for HA instances (ha="true").
// serviceName is used verbatim in the `service` matcher and `service` label, and
// lowercased (std.asciiLower) for the resource, rule group and alert names.
local genGenericAlertingRule(serviceName) = {
apiVersion: 'monitoring.coreos.com/v1',
kind: 'PrometheusRule',
metadata: {
name: 'vshn-' + std.asciiLower(serviceName) + '-sla',
namespace: params.slos.namespace,
labels: {
syn_team: 'schedar',
syn_component: 'appcat',
syn: 'true',
},
},
spec: {
groups: [
{
name: 'appcat-' + std.asciiLower(serviceName) + '-sla-target',
rules: [
{
alert: 'vshn-' + std.asciiLower(serviceName) + '-sla',
// Read as: alert when the per-second rate of unsuccessful probes is above 0.2 over the last 5 minutes
// AND above 0.75 over the last minute. Only non-HA series outside of maintenance are selected.
// rate() yields a per-second value, so - assuming one probe per second, TODO confirm - 0.2 over 5 minutes
// corresponds to about 1 minute of failed probes and 0.75 over 1 minute to about 45 seconds.
expr: 'rate(appcat_probes_seconds_count{reason!="success", service="' + serviceName + '", ha="false", maintenance="false"}[5m]) > 0.2 and rate(appcat_probes_seconds_count{reason!="success", service="' + serviceName + '", ha="false", maintenance="false"}[1m]) > 0.75',
labels: {
// Templated label: 'true' when the series' `sla` label equals "guaranteed", otherwise 'false'.
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end }}',
runbook: 'https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html',
service: serviceName,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You missed the syn: 'true' label.

severity: 'critical',
syn: 'true',
syn_team: 'schedar',
syn_component: 'appcat',
},
},
{
alert: 'vshn-' + std.asciiLower(serviceName) + '-sla-ha',
// Same thresholds as the non-HA rule above (rate > 0.2 over 5 minutes AND rate > 0.75 over 1 minute),
// but selecting ha="true" series. Unlike the non-HA rule there is no maintenance="false" matcher,
// so this rule also evaluates series recorded during maintenance.
// NOTE(review): presumably HA instances are expected to stay available during maintenance - confirm.
expr: 'rate(appcat_probes_seconds_count{reason!="success", service="' + serviceName + '", ha="true"}[5m]) > 0.2 and rate(appcat_probes_seconds_count{reason!="success", service="' + serviceName + '", ha="true"}[1m]) > 0.75',
labels: {
// Templated label: 'true' when the series' `sla` label equals "guaranteed", otherwise 'false'.
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end }}',
runbook: 'https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html',
service: serviceName,
severity: 'critical',
syn: 'true',
syn_team: 'schedar',
syn_component: 'appcat',
},
},
],
},
],
},
};


// Value of this file when imported: exposes the rule generator under the
// field name GenGenericAlertingRule.
{
GenGenericAlertingRule: genGenericAlertingRule,
}
4 changes: 4 additions & 0 deletions component/vshn_appcat_services.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ local prom = import 'prometheus.libsonnet';
local xrds = import 'xrds.libsonnet';

local slos = import 'slos.libsonnet';
local opsgenieRules = import 'vshn_alerting.jsonnet';


local inv = kap.inventory();
local params = inv.parameters.appcat;
Expand Down Expand Up @@ -188,6 +190,8 @@ local vshn_appcat_service(name, serviceParams) =
[if isOpenshift && std.objectHas(serviceParams, 'openshiftTemplate') then '21_openshift_template_%s_vshn' % name]: osTemplate,
[if params.services.vshn.enabled && serviceParams.enabled then 'sli_exporter/90_slo_vshn_%s' % name]: slos.Get('vshn-' + name),
[if params.services.vshn.enabled && serviceParams.enabled then 'sli_exporter/90_slo_vshn_%s_ha' % name]: slos.Get('vshn-' + name + '-ha'),
['sli_exporter/90_%s_Opsgenie' % name]: opsgenieRules.GenGenericAlertingRule(name),

} else {}
;

Expand Down
4 changes: 3 additions & 1 deletion component/vshn_minio.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ local common = import 'common.libsonnet';
local inv = kap.inventory();
local params = inv.parameters.appcat;
local minioParams = params.services.vshn.minio;

local opsgenieRules = import 'vshn_alerting.jsonnet';

local instances = [
kube._Object('vshn.appcat.vshn.io/v1', 'VSHNMinio', instance.name) +
Expand All @@ -29,4 +29,6 @@ local instances = [

if params.services.vshn.enabled && minioParams.enabled && std.length(instances) != 0 then {
'22_minio_instances': instances,
'sli_exporter/90_VSHNMinio_Opsgenie': opsgenieRules.GenGenericAlertingRule('VSHNMinio'),

} else {}
3 changes: 3 additions & 0 deletions component/vshn_postgres.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ local xrds = import 'xrds.libsonnet';
local inv = kap.inventory();
local params = inv.parameters.appcat;
local pgParams = params.services.vshn.postgres;
local opsgenieRules = import 'vshn_alerting.jsonnet';

local defaultDB = 'postgres';
local defaultUser = 'postgres';
Expand Down Expand Up @@ -307,4 +308,6 @@ if params.services.vshn.enabled && pgParams.enabled then
[if isOpenshift then '12_stackgres_openshift_operator_netpol']: stackgresNetworkPolicy,
[if params.slos.enabled && params.services.vshn.enabled && params.services.vshn.postgres.enabled then 'sli_exporter/90_slo_vshn_postgresql']: slos.Get('vshn-postgresql'),
[if params.slos.enabled && params.services.vshn.enabled && params.services.vshn.postgres.enabled then 'sli_exporter/90_slo_vshn_postgresql_ha']: slos.Get('vshn-postgresql-ha'),
[if params.services.vshn.enabled && params.services.vshn.postgres.enabled then 'sli_exporter/90_VSHNPostgreSQL_Opsgenie']: opsgenieRules.GenGenericAlertingRule('VSHNPostgreSQL'),

} else {}
2 changes: 2 additions & 0 deletions component/vshn_redis.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ local xrds = import 'xrds.libsonnet';
local inv = kap.inventory();
local params = inv.parameters.appcat;
local redisParams = params.services.vshn.redis;
local opsgenieRules = import 'vshn_alerting.jsonnet';

local defaultUser = 'default';
local defaultPort = '6379';
Expand Down Expand Up @@ -566,4 +567,5 @@ if params.services.vshn.enabled && redisParams.enabled then {
[if isOpenshift then '21_openshift_template_redis_vshn']: osTemplate,
[if params.services.vshn.enabled && params.services.vshn.redis.enabled then 'sli_exporter/90_slo_vshn_redis']: slos.Get('vshn-redis'),
[if params.services.vshn.enabled && params.services.vshn.redis.enabled then 'sli_exporter/90_slo_vshn_redis_ha']: slos.Get('vshn-redis-ha'),
[if params.services.vshn.enabled && params.services.vshn.redis.enabled then 'sli_exporter/90_VSHNRedis_Opsgenie']: opsgenieRules.GenGenericAlertingRule('VSHNRedis'),
} else {}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
syn: 'true'
syn_component: appcat
syn_team: schedar
name: vshn-minio-sla
namespace: appcat-slos
spec:
groups:
- name: appcat-minio-sla-target
rules:
- alert: vshn-minio-sla
expr: rate(appcat_probes_seconds_count{reason!="success", service="minio",
ha="false", maintenance="false"}[5m]) > 0.2 and rate(appcat_probes_seconds_count{reason!="success",
service="minio", ha="false", maintenance="false"}[1m]) > 0.75
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
runbook: https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html
service: minio
severity: critical
syn: 'true'
syn_component: appcat
syn_team: schedar
- alert: vshn-minio-sla-ha
expr: rate(appcat_probes_seconds_count{reason!="success", service="minio",
ha="true"}[5m]) > 0.2 and rate(appcat_probes_seconds_count{reason!="success",
service="minio", ha="true"}[1m]) > 0.75
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
runbook: https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html
service: minio
severity: critical
syn: 'true'
syn_component: appcat
syn_team: schedar
Original file line number Diff line number Diff line change
Expand Up @@ -166,15 +166,9 @@ spec:
)
for: 6m
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
service: VSHNMinio
severity: critical
slo: 'true'
sloth_severity: page
syn: 'true'
syn_component: appcat
syn_team: schedar
- alert: SLO_AppCat_VSHNMinioUptime
annotations:
runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-minio.html#uptime
Expand All @@ -194,12 +188,6 @@ spec:
max(slo:sli_error:ratio_rate3d{sloth_id="appcat-vshn-minio-uptime", sloth_service="appcat-vshn-minio", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window)
)
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
service: VSHNMinio
severity: warning
slo: 'true'
sloth_severity: ticket
syn: 'true'
syn_component: appcat
syn_team: schedar
Original file line number Diff line number Diff line change
Expand Up @@ -166,15 +166,9 @@ spec:
)
for: 6m
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
service: VSHNMinio
severity: critical
slo: 'true'
sloth_severity: page
syn: 'true'
syn_component: appcat
syn_team: schedar
- alert: SLO_AppCat_HAVSHNMinioUptime
annotations:
runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-minio.html#uptime
Expand All @@ -194,12 +188,6 @@ spec:
max(slo:sli_error:ratio_rate3d{sloth_id="appcat-vshn-minio-ha-uptime", sloth_service="appcat-vshn-minio-ha", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window)
)
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
service: VSHNMinio
severity: warning
slo: 'true'
sloth_severity: ticket
syn: 'true'
syn_component: appcat
syn_team: schedar
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
syn: 'true'
syn_component: appcat
syn_team: schedar
name: vshn-vshnpostgresql-sla
namespace: appcat-slos
spec:
groups:
- name: appcat-vshnpostgresql-sla-target
rules:
- alert: vshn-vshnpostgresql-sla
expr: rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL",
ha="false", maintenance="false"}[5m]) > 0.2 and rate(appcat_probes_seconds_count{reason!="success",
service="VSHNPostgreSQL", ha="false", maintenance="false"}[1m]) > 0.75
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
runbook: https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html
service: VSHNPostgreSQL
severity: critical
syn: 'true'
syn_component: appcat
syn_team: schedar
- alert: vshn-vshnpostgresql-sla-ha
expr: rate(appcat_probes_seconds_count{reason!="success", service="VSHNPostgreSQL",
ha="true"}[5m]) > 0.2 and rate(appcat_probes_seconds_count{reason!="success",
service="VSHNPostgreSQL", ha="true"}[1m]) > 0.75
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
runbook: https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html
service: VSHNPostgreSQL
severity: critical
syn: 'true'
syn_component: appcat
syn_team: schedar
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
syn: 'true'
syn_component: appcat
syn_team: schedar
name: vshn-vshnredis-sla
namespace: appcat-slos
spec:
groups:
- name: appcat-vshnredis-sla-target
rules:
- alert: vshn-vshnredis-sla
expr: rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis",
ha="false", maintenance="false"}[5m]) > 0.2 and rate(appcat_probes_seconds_count{reason!="success",
service="VSHNRedis", ha="false", maintenance="false"}[1m]) > 0.75
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
runbook: https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html
service: VSHNRedis
severity: critical
syn: 'true'
syn_component: appcat
syn_team: schedar
- alert: vshn-vshnredis-sla-ha
expr: rate(appcat_probes_seconds_count{reason!="success", service="VSHNRedis",
ha="true"}[5m]) > 0.2 and rate(appcat_probes_seconds_count{reason!="success",
service="VSHNRedis", ha="true"}[1m]) > 0.75
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
runbook: https://kb.vshn.ch/app-catalog/how-tos/appcat/GuaranteedUptimeTarget.html
service: VSHNRedis
severity: critical
syn: 'true'
syn_component: appcat
syn_team: schedar
Original file line number Diff line number Diff line change
Expand Up @@ -166,15 +166,9 @@ spec:
)
for: 6m
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
service: VSHNPostgreSQL
severity: critical
slo: 'true'
sloth_severity: page
syn: 'true'
syn_component: appcat
syn_team: schedar
- alert: SLO_AppCat_VSHNPostgreSQLUptime
annotations:
runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#uptime
Expand All @@ -194,12 +188,6 @@ spec:
max(slo:sli_error:ratio_rate3d{sloth_id="appcat-vshn-postgresql-uptime", sloth_service="appcat-vshn-postgresql", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window)
)
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
service: VSHNPostgreSQL
severity: warning
slo: 'true'
sloth_severity: ticket
syn: 'true'
syn_component: appcat
syn_team: schedar
Original file line number Diff line number Diff line change
Expand Up @@ -166,15 +166,9 @@ spec:
)
for: 6m
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
service: VSHNPostgreSQL
severity: critical
slo: 'true'
sloth_severity: page
syn: 'true'
syn_component: appcat
syn_team: schedar
- alert: SLO_AppCat_HAVSHNPostgreSQLUptime
annotations:
runbook_url: https://hub.syn.tools/appcat/runbooks/vshn-postgresql.html#uptime
Expand All @@ -194,12 +188,6 @@ spec:
max(slo:sli_error:ratio_rate3d{sloth_id="appcat-vshn-postgresql-ha-uptime", sloth_service="appcat-vshn-postgresql-ha", sloth_slo="uptime"} > (1 * 0.0009999999999999432)) without (sloth_window)
)
labels:
OnCall: '{{ if eq $labels.sla "guaranteed" }}true{{ else }}false{{ end
}}'
service: VSHNPostgreSQL
severity: warning
slo: 'true'
sloth_severity: ticket
syn: 'true'
syn_component: appcat
syn_team: schedar
Loading