Skip to content

Commit

Permalink
refactor: unique names for odh-controller alerts
Browse files (browse the repository at this point in the history)
  • Loading branch information
jackdelahunt committed Nov 27, 2024
1 parent 928b1c8 commit fab03b6
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 16 deletions.
12 changes: 6 additions & 6 deletions config/monitoring/prometheus/apps/prometheus-configs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1180,11 +1180,11 @@ data:
groups:
- name: SLOs-probe_success_model_controller
rules:
- alert: ODH Model Controller Probe Success Burn Rate
- alert: ODH Model Controller Probe Success 5m and 1h Burn Rate high
annotations:
message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).'
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md"
summary: ODH Model Controller Probe Success Burn Rate
summary: ODH Model Controller Probe Success 5m and 1h Burn Rate high
expr: |
sum(probe_success:burnrate5m{instance=~"odh-model-controller"}) by (instance) > (14.40 * (1-0.98000))
and
Expand All @@ -1193,11 +1193,11 @@ data:
labels:
severity: critical
namespace: redhat-ods-applications
- alert: ODH Model Controller Probe Success Burn Rate
- alert: ODH Model Controller Probe Success 30m and 6h Burn Rate high
annotations:
message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).'
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md"
summary: ODH Model Controller Probe Success Burn Rate
summary: ODH Model Controller Probe Success 30m and 6h Burn Rate high
expr: |
sum(probe_success:burnrate30m{instance=~"odh-model-controller"}) by (instance) > (6.00 * (1-0.98000))
and
Expand All @@ -1206,11 +1206,11 @@ data:
labels:
severity: critical
namespace: redhat-ods-applications
- alert: ODH Model Controller Probe Success Burn Rate
- alert: ODH Model Controller Probe Success 2h and 1d Burn Rate high
annotations:
message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).'
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md"
summary: ODH Model Controller Probe Success Burn Rate
summary: ODH Model Controller Probe Success 2h and 1d Burn Rate high
expr: |
sum(probe_success:burnrate2h{instance=~"odh-model-controller"}) by (instance) > (3.00 * (1-0.98000))
and
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@ tests:
values: "0x60"
alert_rule_test:
- eval_time: 1h
alertname: ODH Model Controller Probe Success Burn Rate
alertname: ODH Model Controller Probe Success 5m and 1h Burn Rate high
exp_alerts: []
- eval_time: 1h
alertname: ODH Model Controller Probe Success 30m and 6h Burn Rate high
exp_alerts: []
- eval_time: 1h
alertname: ODH Model Controller Probe Success 2h and 1d Burn Rate high
exp_alerts: []

- interval: 1m
Expand All @@ -31,15 +37,15 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 2m
alertname: ODH Model Controller Probe Success Burn Rate
alertname: ODH Model Controller Probe Success 5m and 1h Burn Rate high
exp_alerts:
- exp_labels:
alertname: ODH Model Controller Probe Success Burn Rate
alertname: ODH Model Controller Probe Success 5m and 1h Burn Rate high
namespace: "redhat-ods-applications"
instance: "odh-model-controller"
severity: critical
exp_annotations:
summary: "ODH Model Controller Probe Success Burn Rate"
summary: "ODH Model Controller Probe Success 5m and 1h Burn Rate high"
message: "High error budget burn for odh-model-controller (current value: 3)."
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md'

Expand All @@ -51,15 +57,15 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 15m
alertname: ODH Model Controller Probe Success Burn Rate
alertname: ODH Model Controller Probe Success 30m and 6h Burn Rate high
exp_alerts:
- exp_labels:
alertname: ODH Model Controller Probe Success Burn Rate
alertname: ODH Model Controller Probe Success 30m and 6h Burn Rate high
namespace: "redhat-ods-applications"
instance: "odh-model-controller"
severity: critical
exp_annotations:
summary: "ODH Model Controller Probe Success Burn Rate"
summary: "ODH Model Controller Probe Success 30m and 6h Burn Rate high"
message: "High error budget burn for odh-model-controller (current value: 16)."
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md'

Expand All @@ -71,14 +77,14 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 1h
alertname: ODH Model Controller Probe Success Burn Rate
alertname: ODH Model Controller Probe Success 2h and 1d Burn Rate high
exp_alerts:
- exp_labels:
alertname: ODH Model Controller Probe Success Burn Rate
alertname: ODH Model Controller Probe Success 2h and 1d Burn Rate high
namespace: "redhat-ods-applications"
instance: "odh-model-controller"
severity: warning
exp_annotations:
summary: "ODH Model Controller Probe Success Burn Rate"
summary: "ODH Model Controller Probe Success 2h and 1d Burn Rate high"
message: "High error budget burn for odh-model-controller (current value: 61)."
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md'

0 comments on commit fab03b6

Please sign in to comment.