From feb58381b9cdcb3f2863f3311594276aca96cd4f Mon Sep 17 00:00:00 2001 From: jackdelahunt Date: Thu, 28 Nov 2024 11:37:39 +0000 Subject: [PATCH] refactor: unique alert names --- .../prometheus/apps/prometheus-configs.yaml | 136 +++++++++--------- .../codeflare-alerting.unit-tests.yaml | 26 ++-- ...ipelines-operator-alerting.unit-tests.yaml | 63 +++++--- .../kserve-alerting.unit-tests.yaml | 26 ++-- .../model-mesh-alerting.unit-tests.yaml | 26 ++-- ...-model-controller-alerting.unit-tests.yaml | 26 ++-- .../rhods-dashboard-alerting.unit-tests.yaml | 61 +++++--- .../trustyai-alerting.unit-tests.yaml | 26 ++-- .../workbenches-alerting.unit-tests.yaml | 35 +++-- 9 files changed, 248 insertions(+), 177 deletions(-) diff --git a/config/monitoring/prometheus/apps/prometheus-configs.yaml b/config/monitoring/prometheus/apps/prometheus-configs.yaml index a9d8f089536..8c265e84df2 100644 --- a/config/monitoring/prometheus/apps/prometheus-configs.yaml +++ b/config/monitoring/prometheus/apps/prometheus-configs.yaml @@ -534,11 +534,11 @@ data: groups: - name: SLOs-probe_success_codeflare rules: - - alert: CodeFlare Operator Probe Success Burn Rate + - alert: CodeFlare Operator Probe Success 5m and 1h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-availability.md' - summary: CodeFlare Operator Probe Success Burn Rate + summary: CodeFlare Operator Probe Success 5m and 1h Burn Rate high expr: | sum(probe_success:burnrate5m{instance=~"codeflare-operator"}) by (instance) > (14.40 * (1-0.99950)) and @@ -547,11 +547,11 @@ data: labels: severity: info namespace: redhat-ods-applications - - alert: CodeFlare Operator Probe Success Burn Rate + - alert: CodeFlare Operator Probe Success 30m and 6h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-probe-success-burn-rate.md' - summary: CodeFlare Operator Probe Success Burn Rate + summary: CodeFlare Operator Probe Success 30m and 6h Burn Rate high expr: | sum(probe_success:burnrate30m{instance=~"codeflare-operator"}) by (instance) > (6.00 * (1-0.99950)) and @@ -560,11 +560,11 @@ data: labels: severity: info namespace: redhat-ods-applications - - alert: CodeFlare Operator Probe Success Burn Rate + - alert: CodeFlare Operator Probe Success 2h and 1d Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-probe-success-burn-rate.md' - summary: CodeFlare Operator Probe Success Burn Rate + summary: CodeFlare Operator Probe Success 2h and 1d Burn Rate high expr: | sum(probe_success:burnrate2h{instance=~"codeflare-operator"}) by (instance) > (3.00 * (1-0.99950)) and @@ -703,11 +703,11 @@ data: groups: - name: SLOs-haproxy_backend_http_responses_dashboard rules: - - alert: RHODS Dashboard Route Error Burn Rate + - alert: RHODS Dashboard Route Error 5m and 1h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md' - summary: RHODS Dashboard Route Error Burn Rate + summary: RHODS Dashboard Route Error 5m and 1h Burn Rate high expr: | sum(haproxy_backend_http_responses_total:burnrate5m{route=~"rhods-dashboard"}) by (route) > (14.40 * (1-0.99950)) and @@ -716,11 +716,11 @@ data: labels: severity: critical namespace: redhat-ods-applications - - alert: RHODS Dashboard Route Error Burn Rate + - alert: RHODS Dashboard Route Error 30m and 6h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md' - summary: RHODS Dashboard Route Error Burn Rate + summary: RHODS Dashboard Route Error 30m and 6h Burn Rate high expr: | sum(haproxy_backend_http_responses_total:burnrate30m{route=~"rhods-dashboard"}) by (route) > (6.00 * (1-0.99950)) and @@ -729,11 +729,11 @@ data: labels: severity: critical namespace: redhat-ods-applications - - alert: RHODS Dashboard Route Error Burn Rate + - alert: RHODS Dashboard Route Error 2h and 1d Burn Rate high annotations: message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md' - summary: RHODS Dashboard Route Error Burn Rate + summary: RHODS Dashboard Route Error 2h and 1d Burn Rate high expr: | sum(haproxy_backend_http_responses_total:burnrate2h{route=~"rhods-dashboard"}) by (route) > (3.00 * (1-0.99950)) and @@ -742,11 +742,11 @@ data: labels: severity: warning namespace: redhat-ods-applications - - alert: RHODS Dashboard Route Error Burn Rate + - alert: RHODS Dashboard Route Error 6h and 3d Burn Rate high annotations: message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md' - summary: RHODS Dashboard Route Error Burn Rate + summary: RHODS Dashboard Route Error 6h and 3d Burn Rate high expr: | sum(haproxy_backend_http_responses_total:burnrate6h{route=~"rhods-dashboard"}) by (route) > (1.00 * (1-0.99950)) and @@ -757,11 +757,11 @@ data: namespace: redhat-ods-applications - name: SLOs-probe_success_dashboard rules: - - alert: RHODS Dashboard Probe Success Burn Rate + - alert: RHODS Dashboard Probe Success 5m and 1h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.name }} (current value: {{ $value }}).' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md" - summary: RHODS Dashboard Probe Success Burn Rate + summary: RHODS Dashboard Probe Success 5m and 1h Burn Rate high expr: | sum(probe_success:burnrate5m{name=~"rhods-dashboard"}) by (name) > (14.40 * (1-0.98)) and @@ -770,11 +770,11 @@ data: labels: severity: critical namespace: redhat-ods-applications - - alert: RHODS Dashboard Probe Success Burn Rate + - alert: RHODS Dashboard Probe Success 30m and 6h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.name }} (current value: {{ $value }}).' 
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md" - summary: RHODS Dashboard Probe Success Burn Rate + summary: RHODS Dashboard Probe Success 30m and 6h Burn Rate high expr: | sum(probe_success:burnrate30m{name=~"rhods-dashboard"}) by (name) > (6.00 * (1-0.98)) and @@ -783,11 +783,11 @@ data: labels: severity: critical namespace: redhat-ods-applications - - alert: RHODS Dashboard Probe Success Burn Rate + - alert: RHODS Dashboard Probe Success 2h and 1d Burn Rate high annotations: message: 'High error budget burn for {{ $labels.name }} (current value: {{ $value }}).' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md" - summary: RHODS Dashboard Probe Success Burn Rate + summary: RHODS Dashboard Probe Success 2h and 1d Burn Rate high expr: | sum(probe_success:burnrate2h{name=~"rhods-dashboard"}) by (name) > (3.00 * (1-0.98)) and @@ -796,11 +796,11 @@ data: labels: severity: warning namespace: redhat-ods-applications - - alert: RHODS Dashboard Probe Success Burn Rate + - alert: RHODS Dashboard Probe Success 6h and 3d Burn Rate high annotations: message: 'High error budget burn for {{ $labels.name }} (current value: {{ $value }}).' 
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md" - summary: RHODS Dashboard Probe Success Burn Rate + summary: RHODS Dashboard Probe Success 6h and 3d Burn Rate high expr: | sum(probe_success:burnrate6h{name=~"rhods-dashboard"}) by (name) > (1.00 * (1-0.98)) and @@ -906,11 +906,11 @@ data: groups: - name: SLOs-haproxy_backend_http_responses_dsp rules: - - alert: Data Science Pipelines Application Route Error Burn Rate + - alert: Data Science Pipelines Application Route Error 5m and 1h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md' - summary: Data Science Pipelines Application Route Error Burn Rate + summary: Data Science Pipelines Application Route Error 5m and 1h Burn Rate high expr: | sum(haproxy_backend_http_responses_total:burnrate5m{component="dsp"}) by (exported_namespace) > (14.40 * (1-0.99950)) and @@ -919,11 +919,11 @@ data: labels: severity: info namespace: redhat-ods-applications - - alert: Data Science Pipelines Application Route Error Burn Rate + - alert: Data Science Pipelines Application Route Error 30m and 6h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md' - summary: Data Science Pipelines Application Route Error Burn Rate + summary: Data Science Pipelines Application Route Error 30m and 6h Burn Rate high expr: | sum(haproxy_backend_http_responses_total:burnrate30m{component="dsp"}) by (exported_namespace) > (6.00 * (1-0.99950)) and @@ -932,11 +932,11 @@ data: labels: severity: info namespace: redhat-ods-applications - - alert: Data Science Pipelines Application Route Error Burn Rate + - alert: Data Science Pipelines Application Route Error 2h and 1d Burn Rate high annotations: message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md' - summary: Data Science Pipelines Application Route Error Burn Rate + summary: Data Science Pipelines Application Route Error 2h and 1d Burn Rate high expr: | sum(haproxy_backend_http_responses_total:burnrate2h{component="dsp"}) by (exported_namespace) > (3.00 * (1-0.99950)) and @@ -945,11 +945,11 @@ data: labels: severity: info namespace: redhat-ods-applications - - alert: Data Science Pipelines Application Route Error Burn Rate + - alert: Data Science Pipelines Application Route Error 6h and 3d Burn Rate high annotations: message: 'High error budget burn for {{ $labels.route }} (current value: {{ $value }}).' 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md' - summary: Data Science Pipelines Application Route Error Burn Rate + summary: Data Science Pipelines Application Route Error 6h and 3d Burn Rate high expr: | sum(haproxy_backend_http_responses_total:burnrate6h{component="dsp"}) by (exported_namespace) > (1.00 * (1-0.99950)) and @@ -960,11 +960,11 @@ data: namespace: redhat-ods-applications - name: SLOs-probe_success_dsp rules: - - alert: Data Science Pipelines Operator Probe Success Burn Rate + - alert: Data Science Pipelines Operator Probe Success 5m and 1h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md" - summary: Data Science Pipelines Operator Probe Success Burn Rate + summary: Data Science Pipelines Operator Probe Success 5m and 1h Burn Rate high expr: | sum(probe_success:burnrate5m{instance=~"data-science-pipelines-operator"}) by (instance) > (14.40 * (1-0.98000)) and @@ -973,11 +973,11 @@ data: labels: severity: critical namespace: redhat-ods-applications - - alert: Data Science Pipelines Operator Probe Success Burn Rate + - alert: Data Science Pipelines Operator Probe Success 30m and 6h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md" - summary: Data Science Pipelines Operator Probe Success Burn Rate + summary: Data Science Pipelines Operator Probe Success 30m and 6h Burn Rate high expr: | sum(probe_success:burnrate30m{instance=~"data-science-pipelines-operator"}) by (instance) > (6.00 * (1-0.98000)) and @@ -986,11 +986,11 @@ data: labels: severity: critical namespace: redhat-ods-applications - - alert: Data Science Pipelines Operator Probe Success Burn Rate + - alert: Data Science Pipelines Operator Probe Success 2h and 1d Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md" - summary: Data Science Pipelines Operator Probe Success Burn Rate + summary: Data Science Pipelines Operator Probe Success 2h and 1d Burn Rate high expr: | sum(probe_success:burnrate2h{instance=~"data-science-pipelines-operator"}) by (instance) > (3.00 * (1-0.98000)) and @@ -1091,11 +1091,11 @@ data: groups: - name: SLOs-probe_success_modelmesh rules: - - alert: Modelmesh Controller Probe Success Burn Rate + - alert: Modelmesh Controller Probe Success 5m and 1h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-modelmesh-controller-probe-success-burn-rate.md" - summary: Modelmesh Controller Probe Success Burn Rate + summary: Modelmesh Controller Probe Success 5m and 1h Burn Rate high expr: | sum(probe_success:burnrate5m{instance=~"modelmesh-controller"}) by (instance) > (14.40 * (1-0.98000)) and @@ -1104,11 +1104,11 @@ data: labels: severity: critical namespace: redhat-ods-applications - - alert: Modelmesh Controller Probe Success Burn Rate + - alert: Modelmesh Controller Probe Success 30m and 6h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-modelmesh-controller-probe-success-burn-rate.md" - summary: Modelmesh Controller Probe Success Burn Rate + summary: Modelmesh Controller Probe Success 30m and 6h Burn Rate high expr: | sum(probe_success:burnrate30m{instance=~"modelmesh-controller"}) by (instance) > (6.00 * (1-0.98000)) and @@ -1117,11 +1117,11 @@ data: labels: severity: critical namespace: redhat-ods-applications - - alert: Modelmesh Controller Probe Success Burn Rate + - alert: Modelmesh Controller Probe Success 2h and 1d Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-modelmesh-controller-probe-success-burn-rate.md" - summary: Modelmesh Controller Probe Success Burn Rate + summary: Modelmesh Controller Probe Success 2h and 1d Burn Rate high expr: | sum(probe_success:burnrate2h{instance=~"modelmesh-controller"}) by (instance) > (3.00 * (1-0.98000)) and @@ -1180,11 +1180,11 @@ data: groups: - name: SLOs-probe_success_model_controller rules: - - alert: ODH Model Controller Probe Success Burn Rate + - alert: ODH Model Controller Probe Success 5m and 1h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md" - summary: ODH Model Controller Probe Success Burn Rate + summary: ODH Model Controller Probe Success 5m and 1h Burn Rate high expr: | sum(probe_success:burnrate5m{instance=~"odh-model-controller"}) by (instance) > (14.40 * (1-0.98000)) and @@ -1193,11 +1193,11 @@ data: labels: severity: critical namespace: redhat-ods-applications - - alert: ODH Model Controller Probe Success Burn Rate + - alert: ODH Model Controller Probe Success 30m and 6h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md" - summary: ODH Model Controller Probe Success Burn Rate + summary: ODH Model Controller Probe Success 30m and 6h Burn Rate high expr: | sum(probe_success:burnrate30m{instance=~"odh-model-controller"}) by (instance) > (6.00 * (1-0.98000)) and @@ -1206,11 +1206,11 @@ data: labels: severity: critical namespace: redhat-ods-applications - - alert: ODH Model Controller Probe Success Burn Rate + - alert: ODH Model Controller Probe Success 2h and 1d Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md" - summary: ODH Model Controller Probe Success Burn Rate + summary: ODH Model Controller Probe Success 2h and 1d Burn Rate high expr: | sum(probe_success:burnrate2h{instance=~"odh-model-controller"}) by (instance) > (3.00 * (1-0.98000)) and @@ -1269,11 +1269,11 @@ data: groups: - name: SLOs-probe_success_kserve rules: - - alert: Kserve Controller Probe Success Burn Rate + - alert: Kserve Controller Probe Success 5m and 1h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-kserve-controller-probe-success-burn-rate.md" - summary: Kserve Controller Probe Success Burn Rate + summary: Kserve Controller Probe Success 5m and 1h Burn Rate high expr: | sum(probe_success:burnrate5m{instance=~"kserve-controller-manager"}) by (instance) > (14.40 * (1-0.98000)) and @@ -1281,11 +1281,11 @@ data: for: 2m labels: severity: critical - - alert: Kserve Controller Probe Success Burn Rate + - alert: Kserve Controller Probe Success 30m and 6h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-kserve-controller-probe-success-burn-rate.md" - summary: Kserve Controller Probe Success Burn Rate + summary: Kserve Controller Probe Success 30m and 6h Burn Rate high expr: | sum(probe_success:burnrate30m{instance=~"kserve-controller-manager"}) by (instance) > (6.00 * (1-0.98000)) and @@ -1293,11 +1293,11 @@ data: for: 15m labels: severity: critical - - alert: Kserve Controller Probe Success Burn Rate + - alert: Kserve Controller Probe Success 2h and 1d Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-kserve-controller-probe-success-burn-rate.md" - summary: Kserve Controller Probe Success Burn Rate + summary: Kserve Controller Probe Success 2h and 1d Burn Rate high expr: | sum(probe_success:burnrate2h{instance=~"kserve-controller-manager"}) by (instance) > (3.00 * (1-0.98000)) and @@ -1462,11 +1462,11 @@ data: - name: SLOs-probe_success_workbench rules: - - alert: RHODS Jupyter Probe Success Burn Rate + - alert: RHODS Jupyter Probe Success 5m and 1h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-jupyter-probe-success-burn-rate.md" - summary: RHODS Jupyter Probe Success Burn Rate + summary: RHODS Jupyter Probe Success 5m and 1h Burn Rate high expr: | sum(probe_success:burnrate5m{instance=~"notebook-spawner"}) by (instance) > (14.40 * (1-0.98000)) and @@ -1475,11 +1475,11 @@ data: labels: severity: critical instance: notebook-spawner - - alert: RHODS Jupyter Probe Success Burn Rate + - alert: RHODS Jupyter Probe Success 30m and 6h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-jupyter-probe-success-burn-rate.md" - summary: RHODS Jupyter Probe Success Burn Rate + summary: RHODS Jupyter Probe Success 30m and 6h Burn Rate high expr: | sum(probe_success:burnrate30m{instance=~"notebook-spawner"}) by (instance) > (6.00 * (1-0.98000)) and @@ -1488,11 +1488,11 @@ data: labels: severity: critical instance: notebook-spawner - - alert: RHODS Jupyter Probe Success Burn Rate + - alert: RHODS Jupyter Probe Success 2h and 1d Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-jupyter-probe-success-burn-rate.md" - summary: RHODS Jupyter Probe Success Burn Rate + summary: RHODS Jupyter Probe Success 2h and 1d Burn Rate high expr: | sum(probe_success:burnrate2h{instance=~"notebook-spawner"}) by (instance) > (3.00 * (1-0.98000)) and @@ -1501,11 +1501,11 @@ data: labels: severity: warning instance: notebook-spawner - - alert: RHODS Jupyter Probe Success Burn Rate + - alert: RHODS Jupyter Probe Success 6h and 3d Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-jupyter-probe-success-burn-rate.md" - summary: RHODS Jupyter Probe Success Burn Rate + summary: RHODS Jupyter Probe Success 6h and 3d Burn Rate high expr: | sum(probe_success:burnrate6h{instance=~"notebook-spawner"}) by (instance) > (1.00 * (1-0.98000)) and @@ -1563,11 +1563,11 @@ data: groups: - name: SLOs-probe_success_trustyai rules: - - alert: TrustyAI Controller Probe Success Burn Rate + - alert: TrustyAI Controller Probe Success 5m and 1h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md" - summary: TrustyAI Controller Probe Success Burn Rate + summary: TrustyAI Controller Probe Success 5m and 1h Burn Rate high expr: | sum(probe_success:burnrate5m{instance=~"trustyai-service-operator-controller-manager"}) by (instance) > (14.40 * (1-0.98000)) and @@ -1576,11 +1576,11 @@ data: labels: severity: critical instance: trustyai-service-operator-controller-manager - - alert: TrustyAI Controller Probe Success Burn Rate + - alert: TrustyAI Controller Probe Success 30m and 6h Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md" - summary: TrustyAI Controller Probe Success Burn Rate + summary: TrustyAI Controller Probe Success 30m and 6h Burn Rate high expr: | sum(probe_success:burnrate30m{instance=~"trustyai-service-operator-controller-manager"}) by (instance) > (6.00 * (1-0.98000)) and @@ -1589,11 +1589,11 @@ data: labels: severity: critical instance: trustyai-service-operator-controller-manager - - alert: TrustyAI Controller Probe Success Burn Rate + - alert: TrustyAI Controller Probe Success 2h and 1d Burn Rate high annotations: message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' 
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md" - summary: TrustyAI Controller Probe Success Burn Rate + summary: TrustyAI Controller Probe Success 2h and 1d Burn Rate high expr: | sum(probe_success:burnrate2h{instance=~"trustyai-service-operator-controller-manager"}) by (instance) > (3.00 * (1-0.98000)) and diff --git a/tests/prometheus_unit_tests/codeflare-alerting.unit-tests.yaml b/tests/prometheus_unit_tests/codeflare-alerting.unit-tests.yaml index 5b4571c8da6..de08cc93433 100644 --- a/tests/prometheus_unit_tests/codeflare-alerting.unit-tests.yaml +++ b/tests/prometheus_unit_tests/codeflare-alerting.unit-tests.yaml @@ -21,7 +21,13 @@ tests: values: "0x60" alert_rule_test: - eval_time: 1h - alertname: CodeFlare Operator Probe Success Burn Rate + alertname: CodeFlare Operator Probe Success 5m and 1h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: CodeFlare Operator Probe Success 30m and 6h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: CodeFlare Operator Probe Success 2h and 1d Burn Rate high exp_alerts: [] - interval: 1m @@ -32,16 +38,16 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 2m - alertname: CodeFlare Operator Probe Success Burn Rate + alertname: CodeFlare Operator Probe Success 5m and 1h Burn Rate high exp_alerts: - exp_labels: - alertname: CodeFlare Operator Probe Success Burn Rate + alertname: CodeFlare Operator Probe Success 5m and 1h Burn Rate high instance: "codeflare-operator" namespace: "redhat-ods-applications" severity: info exp_annotations: message: "High error budget burn for codeflare-operator (current value: 3)." 
- summary: CodeFlare Operator Probe Success Burn Rate + summary: CodeFlare Operator Probe Success 5m and 1h Burn Rate high triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-availability.md' - interval: 1m @@ -52,16 +58,16 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 15m - alertname: CodeFlare Operator Probe Success Burn Rate + alertname: CodeFlare Operator Probe Success 30m and 6h Burn Rate high exp_alerts: - exp_labels: - alertname: CodeFlare Operator Probe Success Burn Rate + alertname: CodeFlare Operator Probe Success 30m and 6h Burn Rate high instance: "codeflare-operator" namespace: "redhat-ods-applications" severity: info exp_annotations: message: "High error budget burn for codeflare-operator (current value: 16)." - summary: CodeFlare Operator Probe Success Burn Rate + summary: CodeFlare Operator Probe Success 30m and 6h Burn Rate high triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-probe-success-burn-rate.md' - interval: 1m @@ -72,16 +78,16 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 1h - alertname: CodeFlare Operator Probe Success Burn Rate + alertname: CodeFlare Operator Probe Success 2h and 1d Burn Rate high exp_alerts: - exp_labels: - alertname: CodeFlare Operator Probe Success Burn Rate + alertname: CodeFlare Operator Probe Success 2h and 1d Burn Rate high instance: "codeflare-operator" namespace: "redhat-ods-applications" severity: info exp_annotations: message: "High error budget burn for codeflare-operator (current value: 61)." 
- summary: CodeFlare Operator Probe Success Burn Rate + summary: CodeFlare Operator Probe Success 2h and 1d Burn Rate high triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-probe-success-burn-rate.md' # operator running diff --git a/tests/prometheus_unit_tests/data-science-pipelines-operator-alerting.unit-tests.yaml b/tests/prometheus_unit_tests/data-science-pipelines-operator-alerting.unit-tests.yaml index 41218063851..a3ed6391534 100644 --- a/tests/prometheus_unit_tests/data-science-pipelines-operator-alerting.unit-tests.yaml +++ b/tests/prometheus_unit_tests/data-science-pipelines-operator-alerting.unit-tests.yaml @@ -18,9 +18,20 @@ tests: values: "0x60" - series: haproxy_backend_http_responses_total:burnrate1d{component="dsp"} values: "0x60" + - series: haproxy_backend_http_responses_total:burnrate3d{component="dsp"} + values: "0x60" alert_rule_test: - eval_time: 1h - alertname: Data Science Pipelines Application Route Error Burn Rate + alertname: Data Science Pipelines Application Route Error 5m and 1h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: Data Science Pipelines Application Route Error 30m and 6h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: Data Science Pipelines Application Route Error 2h and 1d Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: Data Science Pipelines Application Route Error 6h and 3d Burn Rate high exp_alerts: [] - interval: 1m @@ -31,14 +42,14 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 2m - alertname: Data Science Pipelines Application Route Error Burn Rate + alertname: Data Science Pipelines Application Route Error 5m and 1h Burn Rate high exp_alerts: - exp_labels: - alertname: Data Science Pipelines Application Route Error Burn Rate + alertname: Data Science Pipelines Application Route Error 5m and 1h Burn Rate high namespace: "redhat-ods-applications" severity: info exp_annotations: - summary: 
"Data Science Pipelines Application Route Error Burn Rate" + summary: "Data Science Pipelines Application Route Error 5m and 1h Burn Rate high" message: "High error budget burn for (current value: 3)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md' @@ -50,14 +61,14 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 15m - alertname: Data Science Pipelines Application Route Error Burn Rate + alertname: Data Science Pipelines Application Route Error 30m and 6h Burn Rate high exp_alerts: - exp_labels: - alertname: Data Science Pipelines Application Route Error Burn Rate + alertname: Data Science Pipelines Application Route Error 30m and 6h Burn Rate high namespace: "redhat-ods-applications" severity: info exp_annotations: - summary: "Data Science Pipelines Application Route Error Burn Rate" + summary: "Data Science Pipelines Application Route Error 30m and 6h Burn Rate high" message: "High error budget burn for (current value: 16)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md' @@ -69,14 +80,14 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 1h - alertname: Data Science Pipelines Application Route Error Burn Rate + alertname: Data Science Pipelines Application Route Error 2h and 1d Burn Rate high exp_alerts: - exp_labels: - alertname: Data Science Pipelines Application Route Error Burn Rate + alertname: Data Science Pipelines Application Route Error 2h and 1d Burn Rate high namespace: "redhat-ods-applications" severity: info exp_annotations: - summary: "Data Science Pipelines Application Route Error Burn Rate" + summary: "Data Science Pipelines Application Route Error 2h and 1d Burn Rate high" message: "High error budget burn for (current value: 61)." 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md' @@ -88,14 +99,14 @@ tests: values: "1+1x200" alert_rule_test: - eval_time: 3h - alertname: Data Science Pipelines Application Route Error Burn Rate + alertname: Data Science Pipelines Application Route Error 6h and 3d Burn Rate high exp_alerts: - exp_labels: - alertname: Data Science Pipelines Application Route Error Burn Rate + alertname: Data Science Pipelines Application Route Error 6h and 3d Burn Rate high namespace: "redhat-ods-applications" severity: info exp_annotations: - summary: "Data Science Pipelines Application Route Error Burn Rate" + summary: "Data Science Pipelines Application Route Error 6h and 3d Burn Rate high" message: "High error budget burn for (current value: 181)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md' @@ -116,7 +127,13 @@ tests: values: "0x60" alert_rule_test: - eval_time: 3h - alertname: Data Science Pipelines Operator Probe Success Burn Rate + alertname: Data Science Pipelines Operator Probe Success 5m and 1h Burn Rate high + exp_alerts: [] + - eval_time: 3h + alertname: Data Science Pipelines Operator Probe Success 30m and 6h Burn Rate high + exp_alerts: [] + - eval_time: 3h + alertname: Data Science Pipelines Operator Probe Success 2h and 1d Burn Rate high exp_alerts: [] @@ -128,15 +145,15 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 2m - alertname: Data Science Pipelines Operator Probe Success Burn Rate + alertname: Data Science Pipelines Operator Probe Success 5m and 1h Burn Rate high exp_alerts: - exp_labels: - alertname: Data Science Pipelines Operator Probe Success Burn Rate + alertname: Data Science Pipelines Operator Probe Success 5m and 1h Burn Rate high instance: "data-science-pipelines-operator" namespace: 
"redhat-ods-applications" severity: critical exp_annotations: - summary: "Data Science Pipelines Operator Probe Success Burn Rate" + summary: "Data Science Pipelines Operator Probe Success 5m and 1h Burn Rate high" message: "High error budget burn for data-science-pipelines-operator (current value: 3)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md' @@ -148,15 +165,15 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 15m - alertname: Data Science Pipelines Operator Probe Success Burn Rate + alertname: Data Science Pipelines Operator Probe Success 30m and 6h Burn Rate high exp_alerts: - exp_labels: - alertname: Data Science Pipelines Operator Probe Success Burn Rate + alertname: Data Science Pipelines Operator Probe Success 30m and 6h Burn Rate high instance: "data-science-pipelines-operator" namespace: "redhat-ods-applications" severity: critical exp_annotations: - summary: "Data Science Pipelines Operator Probe Success Burn Rate" + summary: "Data Science Pipelines Operator Probe Success 30m and 6h Burn Rate high" message: "High error budget burn for data-science-pipelines-operator (current value: 16)." 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md' @@ -168,15 +185,15 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 1h - alertname: Data Science Pipelines Operator Probe Success Burn Rate + alertname: Data Science Pipelines Operator Probe Success 2h and 1d Burn Rate high exp_alerts: - exp_labels: - alertname: Data Science Pipelines Operator Probe Success Burn Rate + alertname: Data Science Pipelines Operator Probe Success 2h and 1d Burn Rate high instance: "data-science-pipelines-operator" namespace: "redhat-ods-applications" severity: warning exp_annotations: - summary: "Data Science Pipelines Operator Probe Success Burn Rate" + summary: "Data Science Pipelines Operator Probe Success 2h and 1d Burn Rate high" message: "High error budget burn for data-science-pipelines-operator (current value: 61)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md' diff --git a/tests/prometheus_unit_tests/kserve-alerting.unit-tests.yaml b/tests/prometheus_unit_tests/kserve-alerting.unit-tests.yaml index 1d409c82831..1f483593687 100644 --- a/tests/prometheus_unit_tests/kserve-alerting.unit-tests.yaml +++ b/tests/prometheus_unit_tests/kserve-alerting.unit-tests.yaml @@ -21,8 +21,14 @@ tests: values: "0x60" alert_rule_test: - eval_time: 1h - alertname: Kserve Controller Probe Success Burn Rate + alertname: Kserve Controller Probe Success 5m and 1h Burn Rate high exp_alerts: [] + - eval_time: 1h + alertname: Kserve Controller Probe Success 30m and 6h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: Kserve Controller Probe Success 2h and 1d Burn Rate high + exp_alerts: [] - interval: 1m input_series: @@ -32,15 +38,15 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 2m - alertname: Kserve Controller Probe Success Burn Rate 
+ alertname: Kserve Controller Probe Success 5m and 1h Burn Rate high exp_alerts: - exp_labels: - alertname: Kserve Controller Probe Success Burn Rate + alertname: Kserve Controller Probe Success 5m and 1h Burn Rate high instance: "kserve-controller-manager" severity: critical exp_annotations: message: "High error budget burn for kserve-controller-manager (current value: 3)." - summary: Kserve Controller Probe Success Burn Rate + summary: Kserve Controller Probe Success 5m and 1h Burn Rate high triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-kserve-controller-probe-success-burn-rate.md" - interval: 1m @@ -51,15 +57,15 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 15m - alertname: Kserve Controller Probe Success Burn Rate + alertname: Kserve Controller Probe Success 30m and 6h Burn Rate high exp_alerts: - exp_labels: - alertname: Kserve Controller Probe Success Burn Rate + alertname: Kserve Controller Probe Success 30m and 6h Burn Rate high instance: "kserve-controller-manager" severity: critical exp_annotations: message: "High error budget burn for kserve-controller-manager (current value: 16)." - summary: Kserve Controller Probe Success Burn Rate + summary: Kserve Controller Probe Success 30m and 6h Burn Rate high triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-kserve-controller-probe-success-burn-rate.md" - interval: 1m @@ -70,13 +76,13 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 1h - alertname: Kserve Controller Probe Success Burn Rate + alertname: Kserve Controller Probe Success 2h and 1d Burn Rate high exp_alerts: - exp_labels: - alertname: Kserve Controller Probe Success Burn Rate + alertname: Kserve Controller Probe Success 2h and 1d Burn Rate high instance: "kserve-controller-manager" severity: warning exp_annotations: message: "High error budget burn for kserve-controller-manager (current value: 61)." 
- summary: Kserve Controller Probe Success Burn Rate + summary: Kserve Controller Probe Success 2h and 1d Burn Rate high triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-kserve-controller-probe-success-burn-rate.md" diff --git a/tests/prometheus_unit_tests/model-mesh-alerting.unit-tests.yaml b/tests/prometheus_unit_tests/model-mesh-alerting.unit-tests.yaml index 681a102afec..86a7678acd4 100644 --- a/tests/prometheus_unit_tests/model-mesh-alerting.unit-tests.yaml +++ b/tests/prometheus_unit_tests/model-mesh-alerting.unit-tests.yaml @@ -20,7 +20,13 @@ tests: values: "0x60" alert_rule_test: - eval_time: 1h - alertname: Modelmesh Controller Probe Success Burn Rate + alertname: Modelmesh Controller Probe Success 5m and 1h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: Modelmesh Controller Probe Success 30m and 6h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: Modelmesh Controller Probe Success 2h and 1d Burn Rate high exp_alerts: [] - interval: 1m @@ -31,15 +37,15 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 2m - alertname: Modelmesh Controller Probe Success Burn Rate + alertname: Modelmesh Controller Probe Success 5m and 1h Burn Rate high exp_alerts: - exp_labels: - alertname: Modelmesh Controller Probe Success Burn Rate + alertname: Modelmesh Controller Probe Success 5m and 1h Burn Rate high namespace: "redhat-ods-applications" instance: "modelmesh-controller" severity: critical exp_annotations: - summary: "Modelmesh Controller Probe Success Burn Rate" + summary: "Modelmesh Controller Probe Success 5m and 1h Burn Rate high" message: "High error budget burn for modelmesh-controller (current value: 3)." 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-modelmesh-controller-probe-success-burn-rate.md' @@ -51,15 +57,15 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 15m - alertname: Modelmesh Controller Probe Success Burn Rate + alertname: Modelmesh Controller Probe Success 30m and 6h Burn Rate high exp_alerts: - exp_labels: - alertname: Modelmesh Controller Probe Success Burn Rate + alertname: Modelmesh Controller Probe Success 30m and 6h Burn Rate high namespace: "redhat-ods-applications" instance: "modelmesh-controller" severity: critical exp_annotations: - summary: "Modelmesh Controller Probe Success Burn Rate" + summary: "Modelmesh Controller Probe Success 30m and 6h Burn Rate high" message: "High error budget burn for modelmesh-controller (current value: 16)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-modelmesh-controller-probe-success-burn-rate.md' @@ -71,14 +77,14 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 1h - alertname: Modelmesh Controller Probe Success Burn Rate + alertname: Modelmesh Controller Probe Success 2h and 1d Burn Rate high exp_alerts: - exp_labels: - alertname: Modelmesh Controller Probe Success Burn Rate + alertname: Modelmesh Controller Probe Success 2h and 1d Burn Rate high namespace: "redhat-ods-applications" instance: "modelmesh-controller" severity: warning exp_annotations: - summary: "Modelmesh Controller Probe Success Burn Rate" + summary: "Modelmesh Controller Probe Success 2h and 1d Burn Rate high" message: "High error budget burn for modelmesh-controller (current value: 61)." 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-modelmesh-controller-probe-success-burn-rate.md' diff --git a/tests/prometheus_unit_tests/odh-model-controller-alerting.unit-tests.yaml b/tests/prometheus_unit_tests/odh-model-controller-alerting.unit-tests.yaml index 0b37ebac9f8..5580120e0f7 100644 --- a/tests/prometheus_unit_tests/odh-model-controller-alerting.unit-tests.yaml +++ b/tests/prometheus_unit_tests/odh-model-controller-alerting.unit-tests.yaml @@ -20,7 +20,13 @@ tests: values: "0x60" alert_rule_test: - eval_time: 1h - alertname: ODH Model Controller Probe Success Burn Rate + alertname: ODH Model Controller Probe Success 5m and 1h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: ODH Model Controller Probe Success 30m and 6h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: ODH Model Controller Probe Success 2h and 1d Burn Rate high exp_alerts: [] - interval: 1m @@ -31,15 +37,15 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 2m - alertname: ODH Model Controller Probe Success Burn Rate + alertname: ODH Model Controller Probe Success 5m and 1h Burn Rate high exp_alerts: - exp_labels: - alertname: ODH Model Controller Probe Success Burn Rate + alertname: ODH Model Controller Probe Success 5m and 1h Burn Rate high namespace: "redhat-ods-applications" instance: "odh-model-controller" severity: critical exp_annotations: - summary: "ODH Model Controller Probe Success Burn Rate" + summary: "ODH Model Controller Probe Success 5m and 1h Burn Rate high" message: "High error budget burn for odh-model-controller (current value: 3)." 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md' @@ -51,15 +57,15 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 15m - alertname: ODH Model Controller Probe Success Burn Rate + alertname: ODH Model Controller Probe Success 30m and 6h Burn Rate high exp_alerts: - exp_labels: - alertname: ODH Model Controller Probe Success Burn Rate + alertname: ODH Model Controller Probe Success 30m and 6h Burn Rate high namespace: "redhat-ods-applications" instance: "odh-model-controller" severity: critical exp_annotations: - summary: "ODH Model Controller Probe Success Burn Rate" + summary: "ODH Model Controller Probe Success 30m and 6h Burn Rate high" message: "High error budget burn for odh-model-controller (current value: 16)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md' @@ -71,14 +77,14 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 1h - alertname: ODH Model Controller Probe Success Burn Rate + alertname: ODH Model Controller Probe Success 2h and 1d Burn Rate high exp_alerts: - exp_labels: - alertname: ODH Model Controller Probe Success Burn Rate + alertname: ODH Model Controller Probe Success 2h and 1d Burn Rate high namespace: "redhat-ods-applications" instance: "odh-model-controller" severity: warning exp_annotations: - summary: "ODH Model Controller Probe Success Burn Rate" + summary: "ODH Model Controller Probe Success 2h and 1d Burn Rate high" message: "High error budget burn for odh-model-controller (current value: 61)." 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md' diff --git a/tests/prometheus_unit_tests/rhods-dashboard-alerting.unit-tests.yaml b/tests/prometheus_unit_tests/rhods-dashboard-alerting.unit-tests.yaml index 18f4500d4a3..178dc698b92 100644 --- a/tests/prometheus_unit_tests/rhods-dashboard-alerting.unit-tests.yaml +++ b/tests/prometheus_unit_tests/rhods-dashboard-alerting.unit-tests.yaml @@ -20,7 +20,13 @@ tests: values: "0x60" alert_rule_test: - eval_time: 1h - alertname: RHODS Dashboard Route Error Burn Rate + alertname: RHODS Dashboard Route Error 5m and 1h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: RHODS Dashboard Route Error 30m and 6h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: RHODS Dashboard Route Error 2h and 1d Burn Rate high exp_alerts: [] - interval: 1m @@ -31,15 +37,15 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 2m - alertname: RHODS Dashboard Route Error Burn Rate + alertname: RHODS Dashboard Route Error 5m and 1h Burn Rate high exp_alerts: - exp_labels: - alertname: RHODS Dashboard Route Error Burn Rate + alertname: RHODS Dashboard Route Error 5m and 1h Burn Rate high namespace: "redhat-ods-applications" route: "rhods-dashboard" severity: critical exp_annotations: - summary: "RHODS Dashboard Route Error Burn Rate" + summary: "RHODS Dashboard Route Error 5m and 1h Burn Rate high" message: "High error budget burn for rhods-dashboard (current value: 3)." 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md' @@ -51,15 +57,15 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 15m - alertname: RHODS Dashboard Route Error Burn Rate + alertname: RHODS Dashboard Route Error 30m and 6h Burn Rate high exp_alerts: - exp_labels: - alertname: RHODS Dashboard Route Error Burn Rate + alertname: RHODS Dashboard Route Error 30m and 6h Burn Rate high namespace: "redhat-ods-applications" route: "rhods-dashboard" severity: critical exp_annotations: - summary: "RHODS Dashboard Route Error Burn Rate" + summary: "RHODS Dashboard Route Error 30m and 6h Burn Rate high" message: "High error budget burn for rhods-dashboard (current value: 16)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md' @@ -71,15 +77,15 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 1h - alertname: RHODS Dashboard Route Error Burn Rate + alertname: RHODS Dashboard Route Error 2h and 1d Burn Rate high exp_alerts: - exp_labels: - alertname: RHODS Dashboard Route Error Burn Rate + alertname: RHODS Dashboard Route Error 2h and 1d Burn Rate high namespace: "redhat-ods-applications" route: "rhods-dashboard" severity: warning exp_annotations: - summary: "RHODS Dashboard Route Error Burn Rate" + summary: "RHODS Dashboard Route Error 2h and 1d Burn Rate high" message: "High error budget burn for rhods-dashboard (current value: 61)." 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md' @@ -100,7 +106,16 @@ tests: values: "0x60" alert_rule_test: - eval_time: 3h - alertname: RHODS Dashboard Probe Success Burn Rate + alertname: RHODS Dashboard Probe Success 5m and 1h Burn Rate high + exp_alerts: [] + - eval_time: 3h + alertname: RHODS Dashboard Probe Success 30m and 6h Burn Rate high + exp_alerts: [] + - eval_time: 3h + alertname: RHODS Dashboard Probe Success 2h and 1d Burn Rate high + exp_alerts: [] + - eval_time: 3h + alertname: RHODS Dashboard Probe Success 6h and 3d Burn Rate high exp_alerts: [] - interval: 1m @@ -111,15 +126,15 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 2m - alertname: RHODS Dashboard Probe Success Burn Rate + alertname: RHODS Dashboard Probe Success 5m and 1h Burn Rate high exp_alerts: - exp_labels: - alertname: RHODS Dashboard Probe Success Burn Rate + alertname: RHODS Dashboard Probe Success 5m and 1h Burn Rate high name: "rhods-dashboard" namespace: "redhat-ods-applications" severity: critical exp_annotations: - summary: "RHODS Dashboard Probe Success Burn Rate" + summary: "RHODS Dashboard Probe Success 5m and 1h Burn Rate high" message: "High error budget burn for rhods-dashboard (current value: 3)." 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md' @@ -131,15 +146,15 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 15m - alertname: RHODS Dashboard Probe Success Burn Rate + alertname: RHODS Dashboard Probe Success 30m and 6h Burn Rate high exp_alerts: - exp_labels: - alertname: RHODS Dashboard Probe Success Burn Rate + alertname: RHODS Dashboard Probe Success 30m and 6h Burn Rate high name: "rhods-dashboard" namespace: "redhat-ods-applications" severity: critical exp_annotations: - summary: "RHODS Dashboard Probe Success Burn Rate" + summary: "RHODS Dashboard Probe Success 30m and 6h Burn Rate high" message: "High error budget burn for rhods-dashboard (current value: 16)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md' @@ -151,15 +166,15 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 1h - alertname: RHODS Dashboard Probe Success Burn Rate + alertname: RHODS Dashboard Probe Success 2h and 1d Burn Rate high exp_alerts: - exp_labels: - alertname: RHODS Dashboard Probe Success Burn Rate + alertname: RHODS Dashboard Probe Success 2h and 1d Burn Rate high name: "rhods-dashboard" namespace: "redhat-ods-applications" severity: warning exp_annotations: - summary: "RHODS Dashboard Probe Success Burn Rate" + summary: "RHODS Dashboard Probe Success 2h and 1d Burn Rate high" message: "High error budget burn for rhods-dashboard (current value: 61)." 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md' @@ -171,14 +186,14 @@ tests: values: "1+1x200" alert_rule_test: - eval_time: 3h - alertname: RHODS Dashboard Probe Success Burn Rate + alertname: RHODS Dashboard Probe Success 6h and 3d Burn Rate high exp_alerts: - exp_labels: - alertname: RHODS Dashboard Probe Success Burn Rate + alertname: RHODS Dashboard Probe Success 6h and 3d Burn Rate high name: "rhods-dashboard" namespace: "redhat-ods-applications" severity: warning exp_annotations: - summary: "RHODS Dashboard Probe Success Burn Rate" + summary: "RHODS Dashboard Probe Success 6h and 3d Burn Rate high" message: "High error budget burn for rhods-dashboard (current value: 181)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md' diff --git a/tests/prometheus_unit_tests/trustyai-alerting.unit-tests.yaml b/tests/prometheus_unit_tests/trustyai-alerting.unit-tests.yaml index 2fd1594cbe7..be37407cbaa 100644 --- a/tests/prometheus_unit_tests/trustyai-alerting.unit-tests.yaml +++ b/tests/prometheus_unit_tests/trustyai-alerting.unit-tests.yaml @@ -20,7 +20,13 @@ tests: values: "0x60" alert_rule_test: - eval_time: 1h - alertname: TrustyAI Controller Probe Success Burn Rate + alertname: TrustyAI Controller Probe Success 5m and 1h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: TrustyAI Controller Probe Success 30m and 6h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: TrustyAI Controller Probe Success 2h and 1d Burn Rate high exp_alerts: [] - interval: 1m @@ -31,14 +37,14 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 2m - alertname: TrustyAI Controller Probe Success Burn Rate + alertname: TrustyAI Controller Probe Success 5m and 1h Burn Rate high exp_alerts: - exp_labels: - alertname: TrustyAI Controller Probe Success Burn Rate + alertname: 
TrustyAI Controller Probe Success 5m and 1h Burn Rate high instance: "trustyai-service-operator-controller-manager" severity: critical exp_annotations: - summary: "TrustyAI Controller Probe Success Burn Rate" + summary: "TrustyAI Controller Probe Success 5m and 1h Burn Rate high" message: "High error budget burn for trustyai-service-operator-controller-manager (current value: 3)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md' @@ -50,14 +56,14 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 15m - alertname: TrustyAI Controller Probe Success Burn Rate + alertname: TrustyAI Controller Probe Success 30m and 6h Burn Rate high exp_alerts: - exp_labels: - alertname: TrustyAI Controller Probe Success Burn Rate + alertname: TrustyAI Controller Probe Success 30m and 6h Burn Rate high instance: "trustyai-service-operator-controller-manager" severity: critical exp_annotations: - summary: "TrustyAI Controller Probe Success Burn Rate" + summary: "TrustyAI Controller Probe Success 30m and 6h Burn Rate high" message: "High error budget burn for trustyai-service-operator-controller-manager (current value: 16)." 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md' @@ -69,13 +75,13 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 1h - alertname: TrustyAI Controller Probe Success Burn Rate + alertname: TrustyAI Controller Probe Success 2h and 1d Burn Rate high exp_alerts: - exp_labels: - alertname: TrustyAI Controller Probe Success Burn Rate + alertname: TrustyAI Controller Probe Success 2h and 1d Burn Rate high instance: "trustyai-service-operator-controller-manager" severity: warning exp_annotations: - summary: "TrustyAI Controller Probe Success Burn Rate" + summary: "TrustyAI Controller Probe Success 2h and 1d Burn Rate high" message: "High error budget burn for trustyai-service-operator-controller-manager (current value: 61)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md' diff --git a/tests/prometheus_unit_tests/workbenches-alerting.unit-tests.yaml b/tests/prometheus_unit_tests/workbenches-alerting.unit-tests.yaml index a8b8e3e2369..d613ef2302f 100644 --- a/tests/prometheus_unit_tests/workbenches-alerting.unit-tests.yaml +++ b/tests/prometheus_unit_tests/workbenches-alerting.unit-tests.yaml @@ -73,7 +73,16 @@ tests: values: "0x60" alert_rule_test: - eval_time: 1h - alertname: RHODS Jupyter Probe Success Burn Rate + alertname: RHODS Jupyter Probe Success 5m and 1h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: RHODS Jupyter Probe Success 30m and 6h Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: RHODS Jupyter Probe Success 2h and 1d Burn Rate high + exp_alerts: [] + - eval_time: 1h + alertname: RHODS Jupyter Probe Success 6h and 3d Burn Rate high exp_alerts: [] - interval: 1m @@ -84,14 +93,14 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 2m - alertname: RHODS Jupyter Probe Success Burn Rate + alertname: RHODS Jupyter 
Probe Success 5m and 1h Burn Rate high exp_alerts: - exp_labels: - alertname: RHODS Jupyter Probe Success Burn Rate + alertname: RHODS Jupyter Probe Success 5m and 1h Burn Rate high instance: "notebook-spawner" severity: critical exp_annotations: - summary: "RHODS Jupyter Probe Success Burn Rate" + summary: "RHODS Jupyter Probe Success 5m and 1h Burn Rate high" message: "High error budget burn for notebook-spawner (current value: 3)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-jupyter-probe-success-burn-rate.md' @@ -103,14 +112,14 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 15m - alertname: RHODS Jupyter Probe Success Burn Rate + alertname: RHODS Jupyter Probe Success 30m and 6h Burn Rate high exp_alerts: - exp_labels: - alertname: RHODS Jupyter Probe Success Burn Rate + alertname: RHODS Jupyter Probe Success 30m and 6h Burn Rate high instance: "notebook-spawner" severity: critical exp_annotations: - summary: "RHODS Jupyter Probe Success Burn Rate" + summary: "RHODS Jupyter Probe Success 30m and 6h Burn Rate high" message: "High error budget burn for notebook-spawner (current value: 16)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-jupyter-probe-success-burn-rate.md' @@ -122,14 +131,14 @@ tests: values: "1+1x60" alert_rule_test: - eval_time: 1h - alertname: RHODS Jupyter Probe Success Burn Rate + alertname: RHODS Jupyter Probe Success 2h and 1d Burn Rate high exp_alerts: - exp_labels: - alertname: RHODS Jupyter Probe Success Burn Rate + alertname: RHODS Jupyter Probe Success 2h and 1d Burn Rate high instance: "notebook-spawner" severity: warning exp_annotations: - summary: "RHODS Jupyter Probe Success Burn Rate" + summary: "RHODS Jupyter Probe Success 2h and 1d Burn Rate high" message: "High error budget burn for notebook-spawner (current value: 61)." 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-jupyter-probe-success-burn-rate.md' @@ -141,14 +150,14 @@ tests: values: "1+1x200" alert_rule_test: - eval_time: 3h - alertname: RHODS Jupyter Probe Success Burn Rate + alertname: RHODS Jupyter Probe Success 6h and 3d Burn Rate high exp_alerts: - exp_labels: - alertname: RHODS Jupyter Probe Success Burn Rate + alertname: RHODS Jupyter Probe Success 6h and 3d Burn Rate high instance: "notebook-spawner" severity: warning exp_annotations: - summary: "RHODS Jupyter Probe Success Burn Rate" + summary: "RHODS Jupyter Probe Success 6h and 3d Burn Rate high" message: "High error budget burn for notebook-spawner (current value: 181)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-jupyter-probe-success-burn-rate.md'