Skip to content

Commit

Permalink
refactor: unique names for trustyai alerts
Browse files (browse the repository at this point in the history)
  • Loading branch information
jackdelahunt committed Nov 27, 2024
1 parent fab03b6 commit b2a21a2
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 16 deletions.
12 changes: 6 additions & 6 deletions config/monitoring/prometheus/apps/prometheus-configs.yaml
Original file line number | Diff line number | Diff line change
Expand Up @@ -1563,11 +1563,11 @@ data:
groups:
- name: SLOs-probe_success_trustyai
rules:
- alert: TrustyAI Controller Probe Success Burn Rate
- alert: TrustyAI Controller Probe Success 5m and 1h Burn Rate high
annotations:
message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).'
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md"
summary: TrustyAI Controller Probe Success Burn Rate
summary: TrustyAI Controller Probe Success 5m and 1h Burn Rate high
expr: |
sum(probe_success:burnrate5m{instance=~"trustyai-service-operator-controller-manager"}) by (instance) > (14.40 * (1-0.98000))
and
Expand All @@ -1576,11 +1576,11 @@ data:
labels:
severity: critical
instance: trustyai-service-operator-controller-manager
- alert: TrustyAI Controller Probe Success Burn Rate
- alert: TrustyAI Controller Probe Success 30m and 6h Burn Rate high
annotations:
message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).'
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md"
summary: TrustyAI Controller Probe Success Burn Rate
summary: TrustyAI Controller Probe Success 30m and 6h Burn Rate high
expr: |
sum(probe_success:burnrate30m{instance=~"trustyai-service-operator-controller-manager"}) by (instance) > (6.00 * (1-0.98000))
and
Expand All @@ -1589,11 +1589,11 @@ data:
labels:
severity: critical
instance: trustyai-service-operator-controller-manager
- alert: TrustyAI Controller Probe Success Burn Rate
- alert: TrustyAI Controller Probe Success 2h and 1d Burn Rate high
annotations:
message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).'
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md"
summary: TrustyAI Controller Probe Success Burn Rate
summary: TrustyAI Controller Probe Success 2h and 1d Burn Rate high
expr: |
sum(probe_success:burnrate2h{instance=~"trustyai-service-operator-controller-manager"}) by (instance) > (3.00 * (1-0.98000))
and
Expand Down
26 changes: 16 additions & 10 deletions tests/prometheus_unit_tests/trustyai-alerting.unit-tests.yaml
Original file line number | Diff line number | Diff line change
Expand Up @@ -20,7 +20,13 @@ tests:
values: "0x60"
alert_rule_test:
- eval_time: 1h
alertname: TrustyAI Controller Probe Success Burn Rate
alertname: TrustyAI Controller Probe Success 5m and 1h Burn Rate high
exp_alerts: []
- eval_time: 1h
alertname: TrustyAI Controller Probe Success 30m and 6h Burn Rate high
exp_alerts: []
- eval_time: 1h
alertname: TrustyAI Controller Probe Success 2h and 1d Burn Rate high
exp_alerts: []

- interval: 1m
Expand All @@ -31,14 +37,14 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 2m
alertname: TrustyAI Controller Probe Success Burn Rate
alertname: TrustyAI Controller Probe Success 5m and 1h Burn Rate high
exp_alerts:
- exp_labels:
alertname: TrustyAI Controller Probe Success Burn Rate
alertname: TrustyAI Controller Probe Success 5m and 1h Burn Rate high
instance: "trustyai-service-operator-controller-manager"
severity: critical
exp_annotations:
summary: "TrustyAI Controller Probe Success Burn Rate"
summary: "TrustyAI Controller Probe Success 5m and 1h Burn Rate high"
message: "High error budget burn for trustyai-service-operator-controller-manager (current value: 3)."
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md'

Expand All @@ -50,14 +56,14 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 15m
alertname: TrustyAI Controller Probe Success Burn Rate
alertname: TrustyAI Controller Probe Success 30m and 6h Burn Rate high
exp_alerts:
- exp_labels:
alertname: TrustyAI Controller Probe Success Burn Rate
alertname: TrustyAI Controller Probe Success 30m and 6h Burn Rate high
instance: "trustyai-service-operator-controller-manager"
severity: critical
exp_annotations:
summary: "TrustyAI Controller Probe Success Burn Rate"
summary: "TrustyAI Controller Probe Success 30m and 6h Burn Rate high"
message: "High error budget burn for trustyai-service-operator-controller-manager (current value: 16)."
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md'

Expand All @@ -69,13 +75,13 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 1h
alertname: TrustyAI Controller Probe Success Burn Rate
alertname: TrustyAI Controller Probe Success 2h and 1d Burn Rate high
exp_alerts:
- exp_labels:
alertname: TrustyAI Controller Probe Success Burn Rate
alertname: TrustyAI Controller Probe Success 2h and 1d Burn Rate high
instance: "trustyai-service-operator-controller-manager"
severity: warning
exp_annotations:
summary: "TrustyAI Controller Probe Success Burn Rate"
summary: "TrustyAI Controller Probe Success 2h and 1d Burn Rate high"
message: "High error budget burn for trustyai-service-operator-controller-manager (current value: 61)."
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md'

0 comments on commit b2a21a2

Please sign in to comment.