Skip to content

Commit

Permalink
add type label (#359)
Browse files: browse the repository at this point in the history
  • Loading branch information
cbartz authored Sep 2, 2024
1 parent 43c9ab2 commit 01ff64d
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 0 deletions.
3 changes: 3 additions & 0 deletions src/loki_alert_rules/capacity.rules
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,6 +6,7 @@ groups:
quantile_over_time(0.5, {filename="/var/log/github-runner-metrics.log"} | json event="event",duration="queue_duration",flavor="flavor" | __error__="" | event="runner_start" | unwrap duration[1h]) by(flavor) > 1800
labels:
severity: high
type: runner-capacity
for: 1h
annotations:
description: Job queue duration is higher than 30 minutes for half of the runners of application {{$labels.flavor}}
Expand All @@ -15,6 +16,7 @@ groups:
sum by(flavor)(last_over_time({filename="/var/log/github-runner-metrics.log"} | json event="event",idle_runners="idle_runners",flavor="flavor" | event="reconciliation" | unwrap idle_runners[1h])) == 0
labels:
severity: high
type: runner-capacity
for: 2h
annotations:
summary: No idle runners for application "{{$labels.flavor}}"
Expand All @@ -23,6 +25,7 @@ groups:
avg_over_time({filename="/var/log/github-runner-metrics.log"} | json event="event",idle="idle",flavor="flavor" | event="runner_start" | unwrap idle[1h]) by(flavor) < 300
labels:
severity: high
type: runner-capacity
for: 2h
annotations:
description: Idle time for application "{{$labels.flavor}}" is on average smaller than 5 minutes.
Expand Down
3 changes: 3 additions & 0 deletions src/loki_alert_rules/failure.rules
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,6 +6,7 @@ groups:
(sum_over_time({filename="/var/log/github-runner-metrics.log"} | json event="event", crashed_runners="crashed_runners" | event = `reconciliation` | unwrap crashed_runners [1h])) > 0
labels:
severity: high
type: runner-failure
for: 0s
annotations:
summary: A runner in unit {{ $labels.juju_unit }} crashed.
Expand All @@ -14,6 +15,7 @@ groups:
sum by (filename) (count_over_time({filename="/var/log/github-runner-metrics.log"} | json event="event" | event="runner_stop" | json status="status" | status="repo-policy-check-failure" | json http_code="status_info.code" | http_code=~"4.." [10m])) / sum by (filename) (count_over_time({filename="/var/log/github-runner-metrics.log"} | json event="event" | event="runner_stop" [10m])) > 0.5
labels:
severity: high
type: runner-failure
for: 3h
annotations:
description: More than 50 % of jobs have failed the repo-policy check (4xx status code)
Expand All @@ -23,6 +25,7 @@ groups:
count_over_time({filename="/var/log/github-runner-metrics.log"} | json event="event" | event="runner_stop" | json status="status",repo="repo" | status="repo-policy-check-failure" | json http_code="status_info.code" | http_code=~"5.." [1h]) > 0
labels:
severity: high
type: runner-failure
for: 0s
annotations:
description: A repo-policy server error ({{ $labels.http_code }}) was encountered in a runner in unit {{ $labels.juju_unit }} and repository {{ $labels.repo }}.
Expand Down

0 comments on commit 01ff64d

Please sign in to comment.