diff --git a/config/monitoring/prometheus/apps/prometheus-configs.yaml b/config/monitoring/prometheus/apps/prometheus-configs.yaml index d7bf59423d9..0f77cfbd537 100644 --- a/config/monitoring/prometheus/apps/prometheus-configs.yaml +++ b/config/monitoring/prometheus/apps/prometheus-configs.yaml @@ -1563,11 +1563,11 @@ data: groups: - name: SLOs-probe_success_trustyai rules: - - alert: TrustyAI Controller Probe Success Burn Rate + - alert: TrustyAI Controller Probe Success 5m and 1h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md" - summary: TrustyAI Controller Probe Success Burn Rate + summary: TrustyAI Controller Probe Success 5m and 1h Burn Rate high expr: | sum(probe_success:burnrate5m{instance=~"trustyai-service-operator-controller-manager"}) by (instance) > (14.40 * (1-0.98000)) and @@ -1576,11 +1576,11 @@ data: labels: severity: critical instance: trustyai-service-operator-controller-manager - - alert: TrustyAI Controller Probe Success Burn Rate + - alert: TrustyAI Controller Probe Success 30m and 6h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md" - summary: TrustyAI Controller Probe Success Burn Rate + summary: TrustyAI Controller Probe Success 30m and 6h Burn Rate high expr: | sum(probe_success:burnrate30m{instance=~"trustyai-service-operator-controller-manager"}) by (instance) > (6.00 * (1-0.98000)) and @@ -1589,11 +1589,11 @@ data: labels: severity: critical instance: trustyai-service-operator-controller-manager - - alert: TrustyAI Controller Probe Success Burn Rate + - alert: TrustyAI Controller Probe Success 2h and 1d Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md" - summary: TrustyAI Controller Probe Success Burn Rate + summary: TrustyAI Controller Probe Success 2h and 1d Burn Rate high expr: | sum(probe_success:burnrate2h{instance=~"trustyai-service-operator-controller-manager"}) by (instance) > (3.00 * (1-0.98000)) and diff --git a/tests/prometheus_unit_tests/trustyai-alerting.unit-tests.yaml b/tests/prometheus_unit_tests/trustyai-alerting.unit-tests.yaml index 2fd1594cbe7..be37407cbaa 100644 --- a/tests/prometheus_unit_tests/trustyai-alerting.unit-tests.yaml +++ b/tests/prometheus_unit_tests/trustyai-alerting.unit-tests.yaml @@ -20,7 +20,13 @@ tests: values: "0x60" alert_rule_test: - eval_time: 1h - alertname: TrustyAI Controller Probe Success Burn Rate + alertname: TrustyAI Controller Probe Success 5m and 1h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: TrustyAI Controller Probe Success 30m and 6h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: TrustyAI Controller Probe Success 2h and 1d Burn Rate high exp_alerts: [] - interval: 1m @@ -31,14 +37,14 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 2m - alertname: TrustyAI Controller Probe Success Burn Rate + alertname: TrustyAI Controller Probe Success 5m and 1h Burn Rate high exp_alerts: - exp_labels: - alertname: TrustyAI Controller Probe Success Burn Rate + alertname: TrustyAI Controller Probe Success 5m and 1h Burn Rate high instance: "trustyai-service-operator-controller-manager" severity: critical exp_annotations: - summary: "TrustyAI Controller Probe Success Burn Rate" + summary: "TrustyAI Controller Probe Success 5m and 1h Burn Rate high" message: "High error budget burn for trustyai-service-operator-controller-manager (current value: 3)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md' @@ -50,14 +56,14 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 15m - alertname: TrustyAI Controller Probe Success Burn Rate + alertname: TrustyAI Controller Probe Success 30m and 6h Burn Rate high exp_alerts: - exp_labels: - alertname: TrustyAI Controller Probe Success Burn Rate + alertname: TrustyAI Controller Probe Success 30m and 6h Burn Rate high instance: "trustyai-service-operator-controller-manager" severity: critical exp_annotations: - summary: "TrustyAI Controller Probe Success Burn Rate" + summary: "TrustyAI Controller Probe Success 30m and 6h Burn Rate high" message: "High error budget burn for trustyai-service-operator-controller-manager (current value: 16)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md' @@ -69,13 +75,13 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 1h - alertname: TrustyAI Controller Probe Success Burn Rate + alertname: TrustyAI Controller Probe Success 2h and 1d Burn Rate high exp_alerts: - exp_labels: - alertname: TrustyAI Controller Probe Success Burn Rate + alertname: TrustyAI Controller Probe Success 2h and 1d Burn Rate high instance: "trustyai-service-operator-controller-manager" severity: warning exp_annotations: - summary: "TrustyAI Controller Probe Success Burn Rate" + summary: "TrustyAI Controller Probe Success 2h and 1d Burn Rate high" message: "High error budget burn for trustyai-service-operator-controller-manager (current value: 61)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md'