From fab03b635b3bc212a9d52202ffb514558de139aa Mon Sep 17 00:00:00 2001 From: jackdelahunt Date: Wed, 27 Nov 2024 14:42:01 +0000 Subject: [PATCH] refactor: unique names for odh-controller alerts --- .../prometheus/apps/prometheus-configs.yaml | 12 ++++----- ...-model-controller-alerting.unit-tests.yaml | 26 ++++++++++++------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/config/monitoring/prometheus/apps/prometheus-configs.yaml b/config/monitoring/prometheus/apps/prometheus-configs.yaml index ce91e5db118..d7bf59423d9 100644 --- a/config/monitoring/prometheus/apps/prometheus-configs.yaml +++ b/config/monitoring/prometheus/apps/prometheus-configs.yaml @@ -1180,11 +1180,11 @@ data: groups: - name: SLOs-probe_success_model_controller rules: - - alert: ODH Model Controller Probe Success Burn Rate + - alert: ODH Model Controller Probe Success 5m and 1h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md" - summary: ODH Model Controller Probe Success Burn Rate + summary: ODH Model Controller Probe Success 5m and 1h Burn Rate high expr: | sum(probe_success:burnrate5m{instance=~"odh-model-controller"}) by (instance) > (14.40 * (1-0.98000)) and @@ -1193,11 +1193,11 @@ data: labels: severity: critical namespace: redhat-ods-applications - - alert: ODH Model Controller Probe Success Burn Rate + - alert: ODH Model Controller Probe Success 30m and 6h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md" - summary: ODH Model Controller Probe Success Burn Rate + summary: ODH Model Controller Probe Success 30m and 6h Burn Rate high expr: | sum(probe_success:burnrate30m{instance=~"odh-model-controller"}) by (instance) > (6.00 * (1-0.98000)) and @@ -1206,11 +1206,11 @@ data: labels: severity: critical namespace: redhat-ods-applications - - alert: ODH Model Controller Probe Success Burn Rate + - alert: ODH Model Controller Probe Success 2h and 1d Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md" - summary: ODH Model Controller Probe Success Burn Rate + summary: ODH Model Controller Probe Success 2h and 1d Burn Rate high expr: | sum(probe_success:burnrate2h{instance=~"odh-model-controller"}) by (instance) > (3.00 * (1-0.98000)) and diff --git a/tests/prometheus_unit_tests/odh-model-controller-alerting.unit-tests.yaml b/tests/prometheus_unit_tests/odh-model-controller-alerting.unit-tests.yaml index 0b37ebac9f8..5580120e0f7 100644 --- a/tests/prometheus_unit_tests/odh-model-controller-alerting.unit-tests.yaml +++ b/tests/prometheus_unit_tests/odh-model-controller-alerting.unit-tests.yaml @@ -20,7 +20,13 @@ tests: values: "0x60" alert_rule_test: - eval_time: 1h - alertname: ODH Model Controller Probe Success Burn Rate + alertname: ODH Model Controller Probe Success 5m and 1h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: ODH Model Controller Probe Success 30m and 6h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: ODH Model Controller Probe Success 2h and 1d Burn Rate high exp_alerts: [] - interval: 1m @@ -31,15 +37,15 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 2m - alertname: ODH Model Controller Probe Success Burn Rate + alertname: ODH Model Controller Probe Success 5m and 1h Burn Rate high exp_alerts: - exp_labels: - alertname: ODH Model Controller Probe Success Burn Rate + alertname: ODH Model Controller Probe Success 5m and 1h Burn Rate high namespace: "redhat-ods-applications" instance: "odh-model-controller" severity: critical exp_annotations: - summary: "ODH Model Controller Probe Success Burn Rate" + summary: "ODH Model Controller Probe Success 5m and 1h Burn Rate high" message: "High error budget burn for odh-model-controller (current value: 3)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md' @@ -51,15 +57,15 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 15m - alertname: ODH Model Controller Probe Success Burn Rate + alertname: ODH Model Controller Probe Success 30m and 6h Burn Rate high exp_alerts: - exp_labels: - alertname: ODH Model Controller Probe Success Burn Rate + alertname: ODH Model Controller Probe Success 30m and 6h Burn Rate high namespace: "redhat-ods-applications" instance: "odh-model-controller" severity: critical exp_annotations: - summary: "ODH Model Controller Probe Success Burn Rate" + summary: "ODH Model Controller Probe Success 30m and 6h Burn Rate high" message: "High error budget burn for odh-model-controller (current value: 16)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md' @@ -71,14 +77,14 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 1h - alertname: ODH Model Controller Probe Success Burn Rate + alertname: ODH Model Controller Probe Success 2h and 1d Burn Rate high exp_alerts: - exp_labels: - alertname: ODH Model Controller Probe Success Burn Rate + alertname: ODH Model Controller Probe Success 2h and 1d Burn Rate high namespace: "redhat-ods-applications" instance: "odh-model-controller" severity: warning exp_annotations: - summary: "ODH Model Controller Probe Success Burn Rate" + summary: "ODH Model Controller Probe Success 2h and 1d Burn Rate high" message: "High error budget burn for odh-model-controller (current value: 61)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md'