diff --git a/Makefile b/Makefile index 0df77484041..4c885a19af3 100644 --- a/Makefile +++ b/Makefile @@ -260,7 +260,7 @@ mimir-build-image/$(UPTODATE): mimir-build-image/* # All the boiler plate for building golang follows: SUDO := $(shell docker info >/dev/null 2>&1 || echo "sudo -E") BUILD_IN_CONTAINER ?= true -LATEST_BUILD_IMAGE_TAG ?= pr8030-cf783bf13c +LATEST_BUILD_IMAGE_TAG ?= pr8032-d6540935d8 # TTY is parameterized to allow Google Cloud Builder to run builds, # as it currently disallows TTY devices. This value needs to be overridden diff --git a/mimir-build-image/Dockerfile b/mimir-build-image/Dockerfile index f6528da4de6..db7cd36849e 100644 --- a/mimir-build-image/Dockerfile +++ b/mimir-build-image/Dockerfile @@ -50,7 +50,7 @@ RUN GO111MODULE=on \ go install github.com/fatih/faillint@v1.12.0 && \ go install github.com/campoy/embedmd@v1.0.0 && \ go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@v0.4.0 && \ - go install github.com/monitoring-mixins/mixtool/cmd/mixtool@bca3066 && \ + go install github.com/monitoring-mixins/mixtool/cmd/mixtool@779c8b3 && \ go install github.com/mikefarah/yq/v4@v4.13.4 && \ go install github.com/google/go-jsonnet/cmd/jsonnet@v0.19.1 && \ go install github.com/google/go-jsonnet/cmd/jsonnetfmt@v0.19.1 && \ diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml index 230d38f4df3..485fde177cf 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml @@ -11,1067 +11,1017 @@ metadata: namespace: "citestns" spec: groups: - - name: mimir_alerts - rules: - - alert: MimirIngesterUnhealthy - annotations: - message: Mimir cluster {{ $labels.cluster }}/{{ $labels.namespace }} has {{ - printf "%f" $value }} unhealthy ingester(s). - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterunhealthy - expr: | - min by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0 - for: 15m - labels: - severity: critical - - alert: MimirRequestErrors - annotations: - message: | - The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors - expr: | - # The following 5xx errors considered as non-error: - # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry) - # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body - ( - sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",status_code!~"529|598",route!~"ready|debug_pprof"}[1m])) - / - sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m])) - ) * 100 > 1 - for: 15m - labels: - severity: critical - - alert: MimirRequestLatency - annotations: - message: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequestlatency - expr: | - cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop|debug_pprof"} - > - 2.5 - for: 15m - labels: - severity: warning - - alert: MimirInconsistentRuntimeConfig - annotations: - message: | - An inconsistent runtime config file is used across cluster {{ $labels.cluster }}/{{ $labels.namespace }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirinconsistentruntimeconfig - expr: | - count(count by(cluster, namespace, job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1 - for: 1h - labels: - severity: critical - - alert: MimirBadRuntimeConfig - annotations: - message: | - {{ $labels.job }} failed to reload runtime config. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbadruntimeconfig - expr: | - # The metric value is reset to 0 on error while reloading the config at runtime. - cortex_runtime_config_last_reload_successful == 0 - for: 5m - labels: - severity: critical - - alert: MimirFrontendQueriesStuck - annotations: - message: | - There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirfrontendqueriesstuck - expr: | - sum by (cluster, namespace, job) (min_over_time(cortex_query_frontend_queue_length[1m])) > 0 - for: 5m - labels: - severity: critical - - alert: MimirSchedulerQueriesStuck - annotations: - message: | - There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirschedulerqueriesstuck - expr: | - sum by (cluster, namespace, job) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0 - for: 7m - labels: - severity: critical - - alert: MimirCacheRequestErrors - annotations: - message: | - The cache {{ $labels.name }} used by Mimir {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircacherequesterrors - expr: | - ( - sum by(cluster, namespace, name, operation) ( - rate(thanos_memcached_operation_failures_total[1m]) - or - rate(thanos_cache_operation_failures_total[1m]) - ) - / - sum by(cluster, namespace, name, operation) ( - rate(thanos_memcached_operations_total[1m]) - or - rate(thanos_cache_operations_total[1m]) - ) - ) * 100 > 5 - for: 5m - labels: - severity: warning - - alert: MimirIngesterRestarts - annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterrestarts - expr: | - ( - sum by(cluster, namespace, pod) ( - increase(kube_pod_container_status_restarts_total{container=~"(ingester|mimir-write)"}[30m]) - ) - >= 2 - ) - and - ( - count by(cluster, namespace, pod) (cortex_build_info) > 0 - ) - labels: - severity: warning - - alert: MimirKVStoreFailure - annotations: - message: | - Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to talk to the KV store {{ $labels.kv_name }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirkvstorefailure - expr: | - ( - sum by(cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m])) - / - sum by(cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m])) - ) - # We want to get alerted only in case there's a constant failure. - == 1 - for: 5m - labels: - severity: critical - - alert: MimirMemoryMapAreasTooHigh - annotations: - message: '{{ $labels.job }}/{{ $labels.pod }} has a number of mmap-ed areas - close to the limit.' - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirmemorymapareastoohigh - expr: | - process_memory_map_areas{job=~".*/(ingester.*|cortex|mimir|mimir-write.*|store-gateway.*|cortex|mimir|mimir-backend.*)"} / process_memory_map_areas_limit{job=~".*/(ingester.*|cortex|mimir|mimir-write.*|store-gateway.*|cortex|mimir|mimir-backend.*)"} > 0.8 - for: 5m - labels: - severity: critical - - alert: MimirIngesterInstanceHasNoTenants - annotations: - message: Mimir ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has no tenants assigned. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterinstancehasnotenants - expr: | - (min by(cluster, namespace, pod) (cortex_ingester_memory_users) == 0) - and on (cluster, namespace) - # Only if there are more time-series than would be expected due to continuous testing load - ( - sum by(cluster, namespace) (cortex_ingester_memory_series) - / - max by(cluster, namespace) (cortex_distributor_replication_factor) - ) > 100000 - for: 1h - labels: - severity: warning - - alert: MimirRulerInstanceHasNoRuleGroups - annotations: - message: Mimir ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has no rule groups assigned. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerinstancehasnorulegroups - expr: | - # Alert on ruler instances in microservices mode that have no rule groups assigned, - min by(cluster, namespace, pod) (cortex_ruler_managers_total{pod=~"(.*mimir-)?ruler.*"}) == 0 - # but only if other ruler instances of the same cell do have rule groups assigned - and on (cluster, namespace) - (max by(cluster, namespace) (cortex_ruler_managers_total) > 0) - # and there are more than two instances overall - and on (cluster, namespace) - (count by (cluster, namespace) (cortex_ruler_managers_total) > 2) - for: 1h - labels: - severity: warning - - alert: MimirIngestedDataTooFarInTheFuture - annotations: - message: Mimir ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has ingested samples with timestamps more than 1h in the future. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesteddatatoofarinthefuture - expr: | - max by(cluster, namespace, pod) ( - cortex_ingester_tsdb_head_max_timestamp_seconds - time() - and - cortex_ingester_tsdb_head_max_timestamp_seconds > 0 - ) > 60*60 - for: 5m - labels: - severity: warning - - alert: MimirStoreGatewayTooManyFailedOperations - annotations: - message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is experiencing {{ $value | humanizePercentage }} errors while doing {{ - $labels.operation }} on the object storage. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaytoomanyfailedoperations - expr: | - sum by(cluster, namespace, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[1m])) > 0 - for: 5m - labels: - severity: warning - - alert: MimirRingMembersMismatch - annotations: - message: | - Number of members in Mimir ingester hash ring does not match the expected number in {{ $labels.cluster }}/{{ $labels.namespace }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirringmembersmismatch - expr: | - ( - avg by(cluster, namespace) (sum by(cluster, namespace, pod) (cortex_ring_members{name="ingester",job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"})) - != sum by(cluster, namespace) (up{job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"}) - ) - and - ( - count by(cluster, namespace) (cortex_build_info) > 0 - ) - for: 15m - labels: - component: ingester - severity: warning - - name: mimir_instance_limits_alerts - rules: - - alert: MimirIngesterReachingSeriesLimit - annotations: - message: | - Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its series limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingserieslimit - expr: | - ( - (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) - and ignoring (limit) - (cortex_ingester_instance_limits{limit="max_series"} > 0) - ) > 0.8 - for: 3h - labels: - severity: warning - - alert: MimirIngesterReachingSeriesLimit - annotations: - message: | - Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its series limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingserieslimit - expr: | - ( - (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) - and ignoring (limit) - (cortex_ingester_instance_limits{limit="max_series"} > 0) - ) > 0.9 - for: 5m - labels: - severity: critical - - alert: MimirIngesterReachingTenantsLimit - annotations: - message: | - Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its tenant limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingtenantslimit - expr: | - ( - (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) - and ignoring (limit) - (cortex_ingester_instance_limits{limit="max_tenants"} > 0) - ) > 0.7 - for: 5m - labels: - severity: warning - - alert: MimirIngesterReachingTenantsLimit - annotations: - message: | - Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its tenant limit. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingtenantslimit - expr: | - ( - (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) - and ignoring (limit) - (cortex_ingester_instance_limits{limit="max_tenants"} > 0) - ) > 0.8 - for: 5m - labels: - severity: critical - - alert: MimirReachingTCPConnectionsLimit - annotations: - message: | - Mimir instance {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its TCP connections limit for {{ $labels.protocol }} protocol. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirreachingtcpconnectionslimit - expr: | - cortex_tcp_connections / cortex_tcp_connections_limit > 0.8 and - cortex_tcp_connections_limit > 0 - for: 5m - labels: - severity: critical - - alert: MimirDistributorReachingInflightPushRequestLimit - annotations: - message: | - Distributor {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its inflight push request limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirdistributorreachinginflightpushrequestlimit - expr: | - ( - (cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"}) - and ignoring (limit) - (cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0) - ) > 0.8 - for: 5m - labels: - severity: critical - - name: mimir-rollout-alerts - rules: - - alert: MimirRolloutStuck - annotations: - message: | - The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck - expr: | - ( - max without (revision) ( - sum without(statefulset) (label_replace(kube_statefulset_status_current_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) - unless - sum without(statefulset) (label_replace(kube_statefulset_status_update_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) - ) - * - ( - sum without(statefulset) (label_replace(kube_statefulset_replicas, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) - != - sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) - ) - ) and ( - changes(sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) - == - 0 - ) - * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) - for: 30m - labels: - severity: warning - workload_type: statefulset - - alert: MimirRolloutStuck - annotations: - message: | - The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck - expr: | - ( - sum without(deployment) (label_replace(kube_deployment_spec_replicas, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) - != - sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) - ) and ( - changes(sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) - == - 0 - ) - * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) - for: 30m - labels: - severity: warning - workload_type: deployment - - alert: RolloutOperatorNotReconciling - annotations: - message: | - Rollout operator is not reconciling the rollout group {{ $labels.rollout_group }} in {{ $labels.cluster }}/{{ $labels.namespace }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#rolloutoperatornotreconciling - expr: | - max by(cluster, namespace, rollout_group) (time() - rollout_operator_last_successful_group_reconcile_timestamp_seconds) > 600 - for: 5m - labels: - severity: critical - - name: mimir-provisioning - rules: - - alert: MimirAllocatingTooMuchMemory - annotations: - message: | - Instance {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirallocatingtoomuchmemory - expr: | - ( - # We use RSS instead of working set memory because of the ingester's extensive usage of mmap. - # See: https://github.com/grafana/mimir/issues/2466 - container_memory_rss{container=~"(ingester|mimir-write|mimir-backend)"} - / - ( container_spec_memory_limit_bytes{container=~"(ingester|mimir-write|mimir-backend)"} > 0 ) - ) - # Match only Mimir namespaces. - * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) - > 0.65 - for: 15m - labels: - severity: warning - - alert: MimirAllocatingTooMuchMemory - annotations: - message: | - Instance {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirallocatingtoomuchmemory - expr: | - ( - # We use RSS instead of working set memory because of the ingester's extensive usage of mmap. - # See: https://github.com/grafana/mimir/issues/2466 - container_memory_rss{container=~"(ingester|mimir-write|mimir-backend)"} - / - ( container_spec_memory_limit_bytes{container=~"(ingester|mimir-write|mimir-backend)"} > 0 ) - ) - # Match only Mimir namespaces. - * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) - > 0.8 - for: 15m - labels: - severity: critical - - name: ruler_alerts - rules: - - alert: MimirRulerTooManyFailedPushes - annotations: - message: | - Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% write (push) errors. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulertoomanyfailedpushes - expr: | - 100 * ( - sum by (cluster, namespace, pod) (rate(cortex_ruler_write_requests_failed_total[1m])) - / - sum by (cluster, namespace, pod) (rate(cortex_ruler_write_requests_total[1m])) - ) > 1 - for: 5m - labels: - severity: critical - - alert: MimirRulerTooManyFailedQueries - annotations: - message: | - Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulertoomanyfailedqueries - expr: | - 100 * ( - sum by (cluster, namespace, pod) (rate(cortex_ruler_queries_failed_total[1m])) - / - sum by (cluster, namespace, pod) (rate(cortex_ruler_queries_total[1m])) - ) > 1 - for: 5m - labels: - severity: critical - - alert: MimirRulerMissedEvaluations - annotations: - message: | - Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulermissedevaluations - expr: | - 100 * ( - sum by (cluster, namespace, pod, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) - / - sum by (cluster, namespace, pod, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m])) - ) > 1 - for: 5m - labels: - severity: warning - - alert: MimirRulerFailedRingCheck - annotations: - message: | - Mimir Rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are experiencing errors when checking the ring for rule group ownership. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerfailedringcheck - expr: | - sum by (cluster, namespace, job) (rate(cortex_ruler_ring_check_errors_total[1m])) - > 0 - for: 5m - labels: - severity: critical - - alert: MimirRulerRemoteEvaluationFailing - annotations: - message: | - Mimir rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are failing to perform {{ printf "%.2f" $value }}% of remote evaluations through the ruler-query-frontend. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerremoteevaluationfailing - expr: | - 100 * ( - sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", job=~".*/(ruler-query-frontend.*)"}[5m])) - / - sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])) - ) > 1 - for: 5m - labels: - severity: warning - - name: gossip_alerts - rules: - - alert: MimirGossipMembersTooHigh - annotations: - message: One or more Mimir instances in {{ $labels.cluster }}/{{ $labels.namespace - }} consistently sees a higher than expected number of gossip members. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmemberstoohigh - expr: | - max by (cluster, namespace) (memberlist_client_cluster_members_count) - > - (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) - for: 20m - labels: - severity: warning - - alert: MimirGossipMembersTooLow - annotations: - message: One or more Mimir instances in {{ $labels.cluster }}/{{ $labels.namespace - }} consistently sees a lower than expected number of gossip members. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmemberstoolow - expr: | - min by (cluster, namespace) (memberlist_client_cluster_members_count) - < - (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) * 0.5) - for: 20m - labels: - severity: warning - - name: etcd_alerts - rules: - - alert: EtcdAllocatingTooMuchMemory - annotations: - message: | - Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#etcdallocatingtoomuchmemory - expr: | - ( - container_memory_working_set_bytes{container="etcd"} - / - ( container_spec_memory_limit_bytes{container="etcd"} > 0 ) - ) > 0.65 - for: 15m - labels: - severity: warning - - alert: EtcdAllocatingTooMuchMemory - annotations: - message: | - Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#etcdallocatingtoomuchmemory - expr: | - ( - container_memory_working_set_bytes{container="etcd"} - / - ( container_spec_memory_limit_bytes{container="etcd"} > 0 ) - ) > 0.8 - for: 15m - labels: - severity: critical - - name: alertmanager_alerts - rules: - - alert: MimirAlertmanagerSyncConfigsFailing - annotations: - message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to read tenant configurations from storage. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagersyncconfigsfailing - expr: | - rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0 - for: 30m - labels: - severity: critical - - alert: MimirAlertmanagerRingCheckFailing - annotations: - message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is unable to check tenants ownership via the ring. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerringcheckfailing - expr: | - rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0 - for: 10m - labels: - severity: critical - - alert: MimirAlertmanagerPartialStateMergeFailing - annotations: - message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to merge partial state changes received from a replica. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerpartialstatemergefailing - expr: | - rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0 - for: 10m - labels: - severity: critical - - alert: MimirAlertmanagerReplicationFailing - annotations: - message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to replicating partial state to its replicas. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerreplicationfailing - expr: | - rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0 - for: 10m - labels: - severity: critical - - alert: MimirAlertmanagerPersistStateFailing - annotations: - message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is unable to persist full state snaphots to remote storage. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerpersiststatefailing - expr: | - rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0 - for: 1h - labels: - severity: critical - - alert: MimirAlertmanagerInitialSyncFailed - annotations: - message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} was unable to obtain some initial state when starting up. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerinitialsyncfailed - expr: | - increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 - labels: - severity: critical - - alert: MimirAlertmanagerAllocatingTooMuchMemory - annotations: - message: | - Alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerallocatingtoomuchmemory - expr: | - (container_memory_working_set_bytes{container="alertmanager"} / container_spec_memory_limit_bytes{container="alertmanager"}) > 0.80 - and - (container_spec_memory_limit_bytes{container="alertmanager"} > 0) - for: 15m - labels: - severity: warning - - alert: MimirAlertmanagerAllocatingTooMuchMemory - annotations: - message: | - Alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerallocatingtoomuchmemory - expr: | - (container_memory_working_set_bytes{container="alertmanager"} / container_spec_memory_limit_bytes{container="alertmanager"}) > 0.90 - and - (container_spec_memory_limit_bytes{container="alertmanager"} > 0) - for: 15m - labels: - severity: critical - - alert: MimirAlertmanagerInstanceHasNoTenants - annotations: - message: Mimir alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} owns no tenants. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerinstancehasnotenants - expr: | - # Alert on alertmanager instances in microservices mode that own no tenants, - min by(cluster, namespace, pod) (cortex_alertmanager_tenants_owned{pod=~"(.*mimir-)?alertmanager.*"}) == 0 - # but only if other instances of the same cell do have tenants assigned. 
- and on (cluster, namespace) - max by(cluster, namespace) (cortex_alertmanager_tenants_owned) > 0 - for: 1h - labels: - severity: warning - - name: mimir_blocks_alerts - rules: - - alert: MimirIngesterHasNotShippedBlocks - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has not shipped any block in the last 4 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblocks - expr: | - (min by(cluster, namespace, pod) (time() - cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 60 * 60 * 4) - and - (max by(cluster, namespace, pod) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 0) - and - # Only if the ingester has ingested samples over the last 4h. - (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) - and - # Only if the ingester was ingesting samples 4h ago. This protects against the case where the ingester replica - # had ingested samples in the past, then no traffic was received for a long period and then it starts - # receiving samples again. Without this check, the alert would fire as soon as it gets back receiving - # samples, while the a block shipping is expected within the next 4h. - (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0) - for: 15m - labels: - severity: critical - - alert: MimirIngesterHasNotShippedBlocksSinceStart - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has not shipped any block in the last 4 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblockssincestart - expr: | - (max by(cluster, namespace, pod) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) == 0) - and - (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) - for: 4h - labels: - severity: critical - - alert: MimirIngesterHasUnshippedBlocks - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't - been successfully uploaded to the storage yet. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasunshippedblocks - expr: | - (time() - cortex_ingester_oldest_unshipped_block_timestamp_seconds > 3600) - and - (cortex_ingester_oldest_unshipped_block_timestamp_seconds > 0) - for: 15m - labels: - severity: critical - - alert: MimirIngesterTSDBHeadCompactionFailed - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to compact TSDB head. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbheadcompactionfailed - expr: | - rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0 - for: 15m - labels: - severity: critical - - alert: MimirIngesterTSDBHeadTruncationFailed - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to truncate TSDB head. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbheadtruncationfailed - expr: | - rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0 - labels: - severity: critical - - alert: MimirIngesterTSDBCheckpointCreationFailed - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to create TSDB checkpoint. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbcheckpointcreationfailed - expr: | - rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0 - labels: - severity: critical - - alert: MimirIngesterTSDBCheckpointDeletionFailed - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to delete TSDB checkpoint. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbcheckpointdeletionfailed - expr: | - rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0 - labels: - severity: critical - - alert: MimirIngesterTSDBWALTruncationFailed - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to truncate TSDB WAL. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwaltruncationfailed - expr: | - rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0 - labels: - severity: warning - - alert: MimirIngesterTSDBWALCorrupted - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} got a corrupted TSDB WAL. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted - expr: | - # alert when there are more than one corruptions - count by (cluster, namespace) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0) > 1 - and - # and there is only one zone - count by (cluster, namespace) (group by (cluster, namespace, job) (cortex_ingester_tsdb_wal_corruptions_total)) == 1 - labels: - deployment: single-zone - severity: critical - - alert: MimirIngesterTSDBWALCorrupted - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} got a corrupted TSDB WAL. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted - expr: | - # alert when there are more than one corruptions - count by (cluster, namespace) (sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0)) > 1 - and - # and there are multiple zones - count by (cluster, namespace) (group by (cluster, namespace, job) (cortex_ingester_tsdb_wal_corruptions_total)) > 1 - labels: - deployment: multi-zone - severity: critical - - alert: MimirIngesterTSDBWALWritesFailed - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to write to TSDB WAL. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalwritesfailed - expr: | - rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0 - for: 3m - labels: - severity: critical - - alert: MimirStoreGatewayHasNotSyncTheBucket - annotations: - message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has not successfully synched the bucket since {{ $value | humanizeDuration - }}. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewayhasnotsyncthebucket - expr: | - (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30) - and - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0 - for: 5m - labels: - severity: critical - - alert: MimirStoreGatewayNoSyncedTenants - annotations: - message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is not syncing any blocks for any tenant. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaynosyncedtenants - expr: | - min by(cluster, namespace, pod) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0 - for: 1h - labels: - severity: warning - - alert: MimirBucketIndexNotUpdated - annotations: - message: Mimir bucket index for tenant {{ $labels.user }} in {{ $labels.cluster - }}/{{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration - }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbucketindexnotupdated - expr: | - min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100 - labels: - severity: critical - - name: mimir_compactor_alerts - rules: - - alert: MimirCompactorHasNotSuccessfullyCleanedUpBlocks - annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has not successfully cleaned up blocks in the last 6 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullycleanedupblocks - expr: | - # The "last successful run" metric is updated even if the compactor owns no tenants, - # so this alert correctly doesn't fire if compactor has nothing to do. - (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 6) - for: 1h - labels: - severity: critical - - alert: MimirCompactorHasNotSuccessfullyRunCompaction - annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has not run compaction in the last 24 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction - expr: | - # The "last successful run" metric is updated even if the compactor owns no tenants, - # so this alert correctly doesn't fire if compactor has nothing to do. - (time() - cortex_compactor_last_successful_run_timestamp_seconds > 60 * 60 * 24) - and - (cortex_compactor_last_successful_run_timestamp_seconds > 0) - for: 1h - labels: - reason: in-last-24h - severity: critical - - alert: MimirCompactorHasNotSuccessfullyRunCompaction - annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has not run compaction in the last 24 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction - expr: | - # The "last successful run" metric is updated even if the compactor owns no tenants, - # so this alert correctly doesn't fire if compactor has nothing to do. 
- cortex_compactor_last_successful_run_timestamp_seconds == 0 - for: 24h - labels: - reason: since-startup - severity: critical - - alert: MimirCompactorHasNotSuccessfullyRunCompaction - annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} failed to run 2 consecutive compactions. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction - expr: | - increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h]) >= 2 - labels: - reason: consecutive-failures - severity: critical - - alert: MimirCompactorHasNotUploadedBlocks - annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has not uploaded any block in the last 24 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotuploadedblocks - expr: | - (time() - (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"})) > 60 * 60 * 24) - and - (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"}) > 0) - and - # Only if some compactions have started. We don't want to fire this alert if the compactor has nothing to do - # (e.g. there are more replicas than required because running as part of mimir-backend). - (sum by(cluster, namespace, pod) (rate(cortex_compactor_group_compaction_runs_started_total[24h])) > 0) - for: 15m - labels: - severity: critical - time_period: 24h - - alert: MimirCompactorHasNotUploadedBlocks - annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has not uploaded any block since its start. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotuploadedblocks - expr: | - (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"}) == 0) - and - # Only if some compactions have started. We don't want to fire this alert if the compactor has nothing to do - # (e.g. there are more replicas than required because running as part of mimir-backend). - (sum by(cluster, namespace, pod) (rate(cortex_compactor_group_compaction_runs_started_total[24h])) > 0) - for: 24h - labels: - severity: critical - time_period: since-start - - alert: MimirCompactorSkippedUnhealthyBlocks - annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has found and ignored unhealthy blocks. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorskippedunhealthyblocks - expr: | - increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 0 - for: 1m - labels: - severity: warning - - alert: MimirCompactorSkippedUnhealthyBlocks - annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has found and ignored unhealthy blocks. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorskippedunhealthyblocks - expr: | - increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 1 - for: 30m - labels: - severity: critical - - name: mimir_autoscaling - rules: - - alert: MimirAutoscalerNotActive - annotations: - message: The Horizontal Pod Autoscaler (HPA) {{ $labels.horizontalpodautoscaler - }} in {{ $labels.namespace }} is not active. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirautoscalernotactive - expr: | - ( - label_replace(( - kube_horizontalpodautoscaler_status_condition{condition="ScalingActive",status="false"} + - name: mimir_alerts + rules: + - alert: MimirIngesterUnhealthy + annotations: + message: Mimir cluster {{ $labels.cluster }}/{{ $labels.namespace }} has {{ printf "%f" $value }} unhealthy ingester(s). + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterunhealthy + expr: | + min by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0 + for: 15m + labels: + severity: critical + - alert: MimirRequestErrors + annotations: + message: | + The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors + expr: | + # The following 5xx errors considered as non-error: + # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry) + # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body + ( + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",status_code!~"529|598",route!~"ready|debug_pprof"}[1m])) + / + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m])) + ) * 100 > 1 + for: 15m + labels: + severity: critical + - alert: MimirRequestLatency + annotations: + message: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequestlatency + expr: | + cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop|debug_pprof"} + > + 2.5 + for: 15m + labels: + severity: warning + - alert: MimirInconsistentRuntimeConfig + annotations: + message: | + An inconsistent runtime config file is used across cluster {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirinconsistentruntimeconfig + expr: | + count(count by(cluster, namespace, job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1 + for: 1h + labels: + severity: critical + - alert: MimirBadRuntimeConfig + annotations: + message: | + {{ $labels.job }} failed to reload runtime config. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbadruntimeconfig + expr: | + # The metric value is reset to 0 on error while reloading the config at runtime. + cortex_runtime_config_last_reload_successful == 0 + for: 5m + labels: + severity: critical + - alert: MimirFrontendQueriesStuck + annotations: + message: | + There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirfrontendqueriesstuck + expr: | + sum by (cluster, namespace, job) (min_over_time(cortex_query_frontend_queue_length[1m])) > 0 + for: 5m + labels: + severity: critical + - alert: MimirSchedulerQueriesStuck + annotations: + message: | + There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirschedulerqueriesstuck + expr: | + sum by (cluster, namespace, job) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0 + for: 7m + labels: + severity: critical + - alert: MimirCacheRequestErrors + annotations: + message: | + The cache {{ $labels.name }} used by Mimir {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircacherequesterrors + expr: | + ( + sum by(cluster, namespace, name, operation) ( + rate(thanos_memcached_operation_failures_total[1m]) + or + rate(thanos_cache_operation_failures_total[1m]) + ) + / + sum by(cluster, namespace, name, operation) ( + rate(thanos_memcached_operations_total[1m]) + or + rate(thanos_cache_operations_total[1m]) + ) + ) * 100 > 5 + for: 5m + labels: + severity: warning + - alert: MimirIngesterRestarts + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterrestarts + expr: | + ( + sum by(cluster, namespace, pod) ( + increase(kube_pod_container_status_restarts_total{container=~"(ingester|mimir-write)"}[30m]) + ) + >= 2 + ) + and + ( + count by(cluster, namespace, pod) (cortex_build_info) > 0 + ) + labels: + severity: warning + - alert: MimirKVStoreFailure + annotations: + message: | + Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to talk to the KV store {{ $labels.kv_name }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirkvstorefailure + expr: | + ( + sum by(cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m])) + / + sum by(cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m])) + ) + # We want to get alerted only in case there's a constant failure. + == 1 + for: 5m + labels: + severity: critical + - alert: MimirMemoryMapAreasTooHigh + annotations: + message: '{{ $labels.job }}/{{ $labels.pod }} has a number of mmap-ed areas close to the limit.' + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirmemorymapareastoohigh + expr: | + process_memory_map_areas{job=~".*/(ingester.*|cortex|mimir|mimir-write.*|store-gateway.*|cortex|mimir|mimir-backend.*)"} / process_memory_map_areas_limit{job=~".*/(ingester.*|cortex|mimir|mimir-write.*|store-gateway.*|cortex|mimir|mimir-backend.*)"} > 0.8 + for: 5m + labels: + severity: critical + - alert: MimirIngesterInstanceHasNoTenants + annotations: + message: Mimir ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has no tenants assigned. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterinstancehasnotenants + expr: | + (min by(cluster, namespace, pod) (cortex_ingester_memory_users) == 0) + and on (cluster, namespace) + # Only if there are more time-series than would be expected due to continuous testing load + ( + sum by(cluster, namespace) (cortex_ingester_memory_series) + / + max by(cluster, namespace) (cortex_distributor_replication_factor) + ) > 100000 + for: 1h + labels: + severity: warning + - alert: MimirRulerInstanceHasNoRuleGroups + annotations: + message: Mimir ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has no rule groups assigned. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerinstancehasnorulegroups + expr: | + # Alert on ruler instances in microservices mode that have no rule groups assigned, + min by(cluster, namespace, pod) (cortex_ruler_managers_total{pod=~"(.*mimir-)?ruler.*"}) == 0 + # but only if other ruler instances of the same cell do have rule groups assigned + and on (cluster, namespace) + (max by(cluster, namespace) (cortex_ruler_managers_total) > 0) + # and there are more than two instances overall + and on (cluster, namespace) + (count by (cluster, namespace) (cortex_ruler_managers_total) > 2) + for: 1h + labels: + severity: warning + - alert: MimirIngestedDataTooFarInTheFuture + annotations: + message: Mimir ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has ingested samples with timestamps more than 1h in the future. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesteddatatoofarinthefuture + expr: | + max by(cluster, namespace, pod) ( + cortex_ingester_tsdb_head_max_timestamp_seconds - time() + and + cortex_ingester_tsdb_head_max_timestamp_seconds > 0 + ) > 60*60 + for: 5m + labels: + severity: warning + - alert: MimirStoreGatewayTooManyFailedOperations + annotations: + message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ $value | humanizePercentage }} errors while doing {{ $labels.operation }} on the object storage. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaytoomanyfailedoperations + expr: | + sum by(cluster, namespace, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[1m])) > 0 + for: 5m + labels: + severity: warning + - alert: MimirRingMembersMismatch + annotations: + message: | + Number of members in Mimir ingester hash ring does not match the expected number in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirringmembersmismatch + expr: | + ( + avg by(cluster, namespace) (sum by(cluster, namespace, pod) (cortex_ring_members{name="ingester",job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"})) + != sum by(cluster, namespace) (up{job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"}) + ) + and + ( + count by(cluster, namespace) (cortex_build_info) > 0 + ) + for: 15m + labels: + component: ingester + severity: warning + - name: mimir_instance_limits_alerts + rules: + - alert: MimirIngesterReachingSeriesLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its series limit. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingserieslimit + expr: | + ( + (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_series"} > 0) + ) > 0.8 + for: 3h + labels: + severity: warning + - alert: MimirIngesterReachingSeriesLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its series limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingserieslimit + expr: | + ( + (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_series"} > 0) + ) > 0.9 + for: 5m + labels: + severity: critical + - alert: MimirIngesterReachingTenantsLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its tenant limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingtenantslimit + expr: | + ( + (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_tenants"} > 0) + ) > 0.7 + for: 5m + labels: + severity: warning + - alert: MimirIngesterReachingTenantsLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its tenant limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingtenantslimit + expr: | + ( + (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_tenants"} > 0) + ) > 0.8 + for: 5m + labels: + severity: critical + - alert: MimirReachingTCPConnectionsLimit + annotations: + message: | + Mimir instance {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its TCP connections limit for {{ $labels.protocol }} protocol. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirreachingtcpconnectionslimit + expr: | + cortex_tcp_connections / cortex_tcp_connections_limit > 0.8 and + cortex_tcp_connections_limit > 0 + for: 5m + labels: + severity: critical + - alert: MimirDistributorReachingInflightPushRequestLimit + annotations: + message: | + Distributor {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its inflight push request limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirdistributorreachinginflightpushrequestlimit + expr: | + ( + (cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"}) + and ignoring (limit) + (cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0) + ) > 0.8 + for: 5m + labels: + severity: critical + - name: mimir-rollout-alerts + rules: + - alert: MimirRolloutStuck + annotations: + message: | + The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck + expr: | + ( + max without (revision) ( + sum without(statefulset) (label_replace(kube_statefulset_status_current_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + unless + sum without(statefulset) (label_replace(kube_statefulset_status_update_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + ) + * + ( + sum without(statefulset) (label_replace(kube_statefulset_replicas, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + != + sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + ) + ) and ( + changes(sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) + == + 0 + ) + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + for: 30m + labels: + severity: warning + workload_type: statefulset + - alert: MimirRolloutStuck + annotations: + message: | + The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck + expr: | + ( + sum without(deployment) (label_replace(kube_deployment_spec_replicas, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) + != + sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) + ) and ( + changes(sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) + == + 0 + ) + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + for: 30m + labels: + severity: warning + workload_type: deployment + - alert: RolloutOperatorNotReconciling + annotations: + message: | + Rollout operator is not reconciling the rollout group {{ $labels.rollout_group }} in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#rolloutoperatornotreconciling + expr: | + max by(cluster, namespace, rollout_group) (time() - rollout_operator_last_successful_group_reconcile_timestamp_seconds) > 600 + for: 5m + labels: + severity: critical + - name: mimir-provisioning + rules: + - alert: MimirAllocatingTooMuchMemory + annotations: + message: | + Instance {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirallocatingtoomuchmemory + expr: | + ( + # We use RSS instead of working set memory because of the ingester's extensive usage of mmap. + # See: https://github.com/grafana/mimir/issues/2466 + container_memory_rss{container=~"(ingester|mimir-write|mimir-backend)"} + / + ( container_spec_memory_limit_bytes{container=~"(ingester|mimir-write|mimir-backend)"} > 0 ) + ) + # Match only Mimir namespaces. + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + > 0.65 + for: 15m + labels: + severity: warning + - alert: MimirAllocatingTooMuchMemory + annotations: + message: | + Instance {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirallocatingtoomuchmemory + expr: | + ( + # We use RSS instead of working set memory because of the ingester's extensive usage of mmap. + # See: https://github.com/grafana/mimir/issues/2466 + container_memory_rss{container=~"(ingester|mimir-write|mimir-backend)"} + / + ( container_spec_memory_limit_bytes{container=~"(ingester|mimir-write|mimir-backend)"} > 0 ) + ) # Match only Mimir namespaces. * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) - # Add "metric" label. - + on(cluster, namespace, horizontalpodautoscaler) group_right label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") - > 0), - "scaledObject", "$1", "horizontalpodautoscaler", "keda-hpa-(.*)" - ) - ) - # Alert only if the scaling metric exists and is > 0. If the KEDA ScaledObject is configured to scale down 0, - # then HPA ScalingActive may be false when expected to run 0 replicas. In this case, the scaling metric exported - # by KEDA could not exist at all or being exposed with a value of 0. - and on (cluster, namespace, metric, scaledObject) - (label_replace(keda_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") > 0) - for: 1h - labels: - severity: critical - - alert: MimirAutoscalerKedaFailing - annotations: - message: The Keda ScaledObject {{ $labels.scaledObject }} in {{ $labels.namespace - }} is experiencing errors. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirautoscalerkedafailing - expr: | - ( - # Find KEDA scalers reporting errors. - label_replace(rate(keda_scaler_errors[5m]), "namespace", "$1", "exported_namespace", "(.*)") - # Match only Mimir namespaces. - * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) - ) - > 0 - for: 1h - labels: - severity: critical - - name: mimir_ingest_storage_alerts - rules: - - alert: MimirIngesterLastConsumedOffsetCommitFailed - annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to commit the last consumed offset. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterlastconsumedoffsetcommitfailed - expr: | - sum by(cluster, namespace, pod) (rate(cortex_ingest_storage_reader_offset_commit_failures_total[5m])) - / - sum by(cluster, namespace, pod) (rate(cortex_ingest_storage_reader_offset_commit_requests_total[5m])) - > 0.2 - for: 15m - labels: - severity: critical - - alert: MimirIngesterFailedToReadRecordsFromKafka - annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to read records from Kafka. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailedtoreadrecordsfromkafka - expr: | - sum by(cluster, namespace, pod, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m])) - > 0 - for: 5m - labels: - severity: critical - - alert: MimirIngesterKafkaFetchErrorsRateTooHigh - annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is receiving fetch errors when reading records from Kafka. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh - expr: | - sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m])) - / - sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetches_total[5m])) - > 0.1 - for: 15m - labels: - severity: critical - - alert: MimirStartingIngesterKafkaReceiveDelayIncreasing - annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} in "starting" phase is not reducing consumption lag of write requests read - from Kafka. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing - expr: | - deriv(( - sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m])) - / - sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m])) - )[5m:1m]) > 0 - for: 5m - labels: - severity: warning - - alert: MimirRunningIngesterReceiveDelayTooHigh - annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} in "running" phase is too far behind in its consumption of write requests - from Kafka. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh - expr: | - ( - sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) - / - sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) - ) > (10 * 60) - for: 5m - labels: - severity: critical - - alert: MimirIngesterFailsToProcessRecordsFromKafka - annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} fails to consume write requests read from Kafka due to internal errors. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka - expr: | - sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0 - for: 5m - labels: - severity: critical - - alert: MimirIngesterFailsEnforceStrongConsistencyOnReadPath - annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} fails to enforce strong-consistency on read-path. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath - expr: | - sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0 - for: 5m - labels: - severity: critical - - name: mimir_continuous_test - rules: - - alert: MimirContinuousTestNotRunningOnWrites - annotations: - message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ - $labels.namespace }} is not effectively running because writes are failing. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestnotrunningonwrites - expr: | - sum by(cluster, namespace, test) (rate(mimir_continuous_test_writes_failed_total[5m])) > 0 - for: 1h - labels: - severity: warning - - alert: MimirContinuousTestNotRunningOnReads - annotations: - message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ - $labels.namespace }} is not effectively running because queries are failing. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestnotrunningonreads - expr: | - sum by(cluster, namespace, test) (rate(mimir_continuous_test_queries_failed_total[5m])) > 0 - for: 1h - labels: - severity: warning - - alert: MimirContinuousTestFailed - annotations: - message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ - $labels.namespace }} failed when asserting query results. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestfailed - expr: | - sum by(cluster, namespace, test) (rate(mimir_continuous_test_query_result_checks_failed_total[10m])) > 0 - labels: - severity: warning + > 0.8 + for: 15m + labels: + severity: critical + - name: ruler_alerts + rules: + - alert: MimirRulerTooManyFailedPushes + annotations: + message: | + Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% write (push) errors. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulertoomanyfailedpushes + expr: | + 100 * ( + sum by (cluster, namespace, pod) (rate(cortex_ruler_write_requests_failed_total[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ruler_write_requests_total[1m])) + ) > 1 + for: 5m + labels: + severity: critical + - alert: MimirRulerTooManyFailedQueries + annotations: + message: | + Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulertoomanyfailedqueries + expr: | + 100 * ( + sum by (cluster, namespace, pod) (rate(cortex_ruler_queries_failed_total[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ruler_queries_total[1m])) + ) > 1 + for: 5m + labels: + severity: critical + - alert: MimirRulerMissedEvaluations + annotations: + message: | + Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulermissedevaluations + expr: | + 100 * ( + sum by (cluster, namespace, pod, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) + / + sum by (cluster, namespace, pod, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m])) + ) > 1 + for: 5m + labels: + severity: warning + - alert: MimirRulerFailedRingCheck + annotations: + message: | + Mimir Rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are experiencing errors when checking the ring for rule group ownership. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerfailedringcheck + expr: | + sum by (cluster, namespace, job) (rate(cortex_ruler_ring_check_errors_total[1m])) + > 0 + for: 5m + labels: + severity: critical + - alert: MimirRulerRemoteEvaluationFailing + annotations: + message: | + Mimir rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are failing to perform {{ printf "%.2f" $value }}% of remote evaluations through the ruler-query-frontend. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerremoteevaluationfailing + expr: | + 100 * ( + sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", job=~".*/(ruler-query-frontend.*)"}[5m])) + / + sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])) + ) > 1 + for: 5m + labels: + severity: warning + - name: gossip_alerts + rules: + - alert: MimirGossipMembersTooHigh + annotations: + message: One or more Mimir instances in {{ $labels.cluster }}/{{ $labels.namespace }} consistently sees a higher than expected number of gossip members. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmemberstoohigh + expr: | + max by (cluster, namespace) (memberlist_client_cluster_members_count) + > + (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) + for: 20m + labels: + severity: warning + - alert: MimirGossipMembersTooLow + annotations: + message: One or more Mimir instances in {{ $labels.cluster }}/{{ $labels.namespace }} consistently sees a lower than expected number of gossip members. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmemberstoolow + expr: | + min by (cluster, namespace) (memberlist_client_cluster_members_count) + < + (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) * 0.5) + for: 20m + labels: + severity: warning + - name: etcd_alerts + rules: + - alert: EtcdAllocatingTooMuchMemory + annotations: + message: | + Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#etcdallocatingtoomuchmemory + expr: | + ( + container_memory_working_set_bytes{container="etcd"} + / + ( container_spec_memory_limit_bytes{container="etcd"} > 0 ) + ) > 0.65 + for: 15m + labels: + severity: warning + - alert: EtcdAllocatingTooMuchMemory + annotations: + message: | + Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#etcdallocatingtoomuchmemory + expr: | + ( + container_memory_working_set_bytes{container="etcd"} + / + ( container_spec_memory_limit_bytes{container="etcd"} > 0 ) + ) > 0.8 + for: 15m + labels: + severity: critical + - name: alertmanager_alerts + rules: + - alert: MimirAlertmanagerSyncConfigsFailing + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to read tenant configurations from storage. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagersyncconfigsfailing + expr: | + rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0 + for: 30m + labels: + severity: critical + - alert: MimirAlertmanagerRingCheckFailing + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is unable to check tenants ownership via the ring. 
+          runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerringcheckfailing +        expr: | +          rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0 +        for: 10m +        labels: +          severity: critical +      - alert: MimirAlertmanagerPartialStateMergeFailing +        annotations: +          message: | +            Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to merge partial state changes received from a replica. +          runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerpartialstatemergefailing +        expr: | +          rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0 +        for: 10m +        labels: +          severity: critical +      - alert: MimirAlertmanagerReplicationFailing +        annotations: +          message: | +            Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to replicate partial state to its replicas. +          runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerreplicationfailing +        expr: | +          rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0 +        for: 10m +        labels: +          severity: critical +      - alert: MimirAlertmanagerPersistStateFailing +        annotations: +          message: | +            Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is unable to persist full state snapshots to remote storage. +          runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerpersiststatefailing +        expr: | +          rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0 +        for: 1h +        labels: +          severity: critical +      - alert: MimirAlertmanagerInitialSyncFailed +        annotations: +          message: | +            Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} was unable to obtain some initial state when starting up. +          runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerinitialsyncfailed +        expr: | +          increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 +        labels: +          severity: critical +      - alert: MimirAlertmanagerAllocatingTooMuchMemory +        annotations: +          message: | +            Alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. +          runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerallocatingtoomuchmemory +        expr: | +          (container_memory_working_set_bytes{container="alertmanager"} / container_spec_memory_limit_bytes{container="alertmanager"}) > 0.80 +          and +          (container_spec_memory_limit_bytes{container="alertmanager"} > 0) +        for: 15m +        labels: +          severity: warning +      - alert: MimirAlertmanagerAllocatingTooMuchMemory +        annotations: +          message: | +            Alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. +          runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerallocatingtoomuchmemory +        expr: | +          (container_memory_working_set_bytes{container="alertmanager"} / container_spec_memory_limit_bytes{container="alertmanager"}) > 0.90 +          and +          (container_spec_memory_limit_bytes{container="alertmanager"} > 0) +        for: 15m +        labels: +          severity: critical +      - alert: MimirAlertmanagerInstanceHasNoTenants +        annotations: +          message: Mimir alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} owns no tenants. 
+          runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerinstancehasnotenants +        expr: | +          # Alert on alertmanager instances in microservices mode that own no tenants, +          min by(cluster, namespace, pod) (cortex_alertmanager_tenants_owned{pod=~"(.*mimir-)?alertmanager.*"}) == 0 +          # but only if other instances of the same cell do have tenants assigned. +          and on (cluster, namespace) +          max by(cluster, namespace) (cortex_alertmanager_tenants_owned) > 0 +        for: 1h +        labels: +          severity: warning +    - name: mimir_blocks_alerts +      rules: +      - alert: MimirIngesterHasNotShippedBlocks +        annotations: +          message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not shipped any block in the last 4 hours. +          runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblocks +        expr: | +          (min by(cluster, namespace, pod) (time() - cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 60 * 60 * 4) +          and +          (max by(cluster, namespace, pod) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 0) +          and +          # Only if the ingester has ingested samples over the last 4h. +          (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) +          and +          # Only if the ingester was ingesting samples 4h ago. This protects against the case where the ingester replica +          # had ingested samples in the past, then no traffic was received for a long period and then it starts +          # receiving samples again. Without this check, the alert would fire as soon as it gets back to receiving +          # samples, while a block shipping is expected within the next 4h. +          (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0) +        for: 15m +        labels: +          severity: critical +      - alert: MimirIngesterHasNotShippedBlocksSinceStart +        annotations: +          message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not shipped any block in the last 4 hours. +          runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblockssincestart +        expr: | +          (max by(cluster, namespace, pod) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) == 0) +          and +          (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) +        for: 4h +        labels: +          severity: critical +      - alert: MimirIngesterHasUnshippedBlocks +        annotations: +          message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet. +          runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasunshippedblocks +        expr: | +          (time() - cortex_ingester_oldest_unshipped_block_timestamp_seconds > 3600) +          and +          (cortex_ingester_oldest_unshipped_block_timestamp_seconds > 0) +        for: 15m +        labels: +          severity: critical +      - alert: MimirIngesterTSDBHeadCompactionFailed +        annotations: +          message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to compact TSDB head. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbheadcompactionfailed + expr: | + rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0 + for: 15m + labels: + severity: critical + - alert: MimirIngesterTSDBHeadTruncationFailed + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to truncate TSDB head. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbheadtruncationfailed + expr: | + rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0 + labels: + severity: critical + - alert: MimirIngesterTSDBCheckpointCreationFailed + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to create TSDB checkpoint. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbcheckpointcreationfailed + expr: | + rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0 + labels: + severity: critical + - alert: MimirIngesterTSDBCheckpointDeletionFailed + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to delete TSDB checkpoint. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbcheckpointdeletionfailed + expr: | + rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0 + labels: + severity: critical + - alert: MimirIngesterTSDBWALTruncationFailed + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to truncate TSDB WAL. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwaltruncationfailed + expr: | + rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0 + labels: + severity: warning + - alert: MimirIngesterTSDBWALCorrupted + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} got a corrupted TSDB WAL. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted + expr: | + # alert when there are more than one corruptions + count by (cluster, namespace) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0) > 1 + and + # and there is only one zone + count by (cluster, namespace) (group by (cluster, namespace, job) (cortex_ingester_tsdb_wal_corruptions_total)) == 1 + labels: + deployment: single-zone + severity: critical + - alert: MimirIngesterTSDBWALCorrupted + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} got a corrupted TSDB WAL. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted + expr: | + # alert when there are more than one corruptions + count by (cluster, namespace) (sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0)) > 1 + and + # and there are multiple zones + count by (cluster, namespace) (group by (cluster, namespace, job) (cortex_ingester_tsdb_wal_corruptions_total)) > 1 + labels: + deployment: multi-zone + severity: critical + - alert: MimirIngesterTSDBWALWritesFailed + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to write to TSDB WAL. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalwritesfailed + expr: | + rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0 + for: 3m + labels: + severity: critical + - alert: MimirStoreGatewayHasNotSyncTheBucket + annotations: + message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not successfully synched the bucket since {{ $value | humanizeDuration }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewayhasnotsyncthebucket + expr: | + (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30) + and + cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0 + for: 5m + labels: + severity: critical + - alert: MimirStoreGatewayNoSyncedTenants + annotations: + message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is not syncing any blocks for any tenant. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaynosyncedtenants + expr: | + min by(cluster, namespace, pod) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0 + for: 1h + labels: + severity: warning + - alert: MimirBucketIndexNotUpdated + annotations: + message: Mimir bucket index for tenant {{ $labels.user }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbucketindexnotupdated + expr: | + min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100 + labels: + severity: critical + - name: mimir_compactor_alerts + rules: + - alert: MimirCompactorHasNotSuccessfullyCleanedUpBlocks + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not successfully cleaned up blocks in the last 6 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullycleanedupblocks + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. + (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 6) + for: 1h + labels: + severity: critical + - alert: MimirCompactorHasNotSuccessfullyRunCompaction + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not run compaction in the last 24 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. + (time() - cortex_compactor_last_successful_run_timestamp_seconds > 60 * 60 * 24) + and + (cortex_compactor_last_successful_run_timestamp_seconds > 0) + for: 1h + labels: + reason: in-last-24h + severity: critical + - alert: MimirCompactorHasNotSuccessfullyRunCompaction + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not run compaction in the last 24 hours. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. + cortex_compactor_last_successful_run_timestamp_seconds == 0 + for: 24h + labels: + reason: since-startup + severity: critical + - alert: MimirCompactorHasNotSuccessfullyRunCompaction + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} failed to run 2 consecutive compactions. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction + expr: | + increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h]) >= 2 + labels: + reason: consecutive-failures + severity: critical + - alert: MimirCompactorHasNotUploadedBlocks + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not uploaded any block in the last 24 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotuploadedblocks + expr: | + (time() - (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"})) > 60 * 60 * 24) + and + (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"}) > 0) + and + # Only if some compactions have started. We don't want to fire this alert if the compactor has nothing to do + # (e.g. there are more replicas than required because running as part of mimir-backend). + (sum by(cluster, namespace, pod) (rate(cortex_compactor_group_compaction_runs_started_total[24h])) > 0) + for: 15m + labels: + severity: critical + time_period: 24h + - alert: MimirCompactorHasNotUploadedBlocks + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not uploaded any block since its start. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotuploadedblocks + expr: | + (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"}) == 0) + and + # Only if some compactions have started. We don't want to fire this alert if the compactor has nothing to do + # (e.g. there are more replicas than required because running as part of mimir-backend). + (sum by(cluster, namespace, pod) (rate(cortex_compactor_group_compaction_runs_started_total[24h])) > 0) + for: 24h + labels: + severity: critical + time_period: since-start + - alert: MimirCompactorSkippedUnhealthyBlocks + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has found and ignored unhealthy blocks. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorskippedunhealthyblocks + expr: | + increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 0 + for: 1m + labels: + severity: warning + - alert: MimirCompactorSkippedUnhealthyBlocks + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has found and ignored unhealthy blocks. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorskippedunhealthyblocks + expr: | + increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 1 + for: 30m + labels: + severity: critical + - name: mimir_autoscaling + rules: + - alert: MimirAutoscalerNotActive + annotations: + message: The Horizontal Pod Autoscaler (HPA) {{ $labels.horizontalpodautoscaler }} in {{ $labels.namespace }} is not active. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirautoscalernotactive + expr: | + ( + label_replace(( + kube_horizontalpodautoscaler_status_condition{condition="ScalingActive",status="false"} + # Match only Mimir namespaces. + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + # Add "metric" label. + + on(cluster, namespace, horizontalpodautoscaler) group_right label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") + > 0), + "scaledObject", "$1", "horizontalpodautoscaler", "keda-hpa-(.*)" + ) + ) + # Alert only if the scaling metric exists and is > 0. If the KEDA ScaledObject is configured to scale down 0, + # then HPA ScalingActive may be false when expected to run 0 replicas. In this case, the scaling metric exported + # by KEDA could not exist at all or being exposed with a value of 0. + and on (cluster, namespace, metric, scaledObject) + (label_replace(keda_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") > 0) + for: 1h + labels: + severity: critical + - alert: MimirAutoscalerKedaFailing + annotations: + message: The Keda ScaledObject {{ $labels.scaledObject }} in {{ $labels.namespace }} is experiencing errors. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirautoscalerkedafailing + expr: | + ( + # Find KEDA scalers reporting errors. + label_replace(rate(keda_scaler_errors[5m]), "namespace", "$1", "exported_namespace", "(.*)") + # Match only Mimir namespaces. + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + ) + > 0 + for: 1h + labels: + severity: critical + - name: mimir_ingest_storage_alerts + rules: + - alert: MimirIngesterLastConsumedOffsetCommitFailed + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to commit the last consumed offset. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterlastconsumedoffsetcommitfailed + expr: | + sum by(cluster, namespace, pod) (rate(cortex_ingest_storage_reader_offset_commit_failures_total[5m])) + / + sum by(cluster, namespace, pod) (rate(cortex_ingest_storage_reader_offset_commit_requests_total[5m])) + > 0.2 + for: 15m + labels: + severity: critical + - alert: MimirIngesterFailedToReadRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to read records from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailedtoreadrecordsfromkafka + expr: | + sum by(cluster, namespace, pod, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m])) + > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterKafkaFetchErrorsRateTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is receiving fetch errors when reading records from Kafka. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh + expr: | + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m])) + / + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetches_total[5m])) + > 0.1 + for: 15m + labels: + severity: critical + - alert: MimirStartingIngesterKafkaReceiveDelayIncreasing + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "starting" phase is not reducing consumption lag of write requests read from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing + expr: | + deriv(( + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m])) + )[5m:1m]) > 0 + for: 5m + labels: + severity: warning + - alert: MimirRunningIngesterReceiveDelayTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "running" phase is too far behind in its consumption of write requests from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh + expr: | + ( + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) + ) > (10 * 60) + for: 5m + labels: + severity: critical + - alert: MimirIngesterFailsToProcessRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to consume write requests read from Kafka due to internal errors. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka + expr: | + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterFailsEnforceStrongConsistencyOnReadPath + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to enforce strong-consistency on read-path. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath + expr: | + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0 + for: 5m + labels: + severity: critical + - name: mimir_continuous_test + rules: + - alert: MimirContinuousTestNotRunningOnWrites + annotations: + message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ $labels.namespace }} is not effectively running because writes are failing. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestnotrunningonwrites + expr: | + sum by(cluster, namespace, test) (rate(mimir_continuous_test_writes_failed_total[5m])) > 0 + for: 1h + labels: + severity: warning + - alert: MimirContinuousTestNotRunningOnReads + annotations: + message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ $labels.namespace }} is not effectively running because queries are failing. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestnotrunningonreads + expr: | + sum by(cluster, namespace, test) (rate(mimir_continuous_test_queries_failed_total[5m])) > 0 + for: 1h + labels: + severity: warning + - alert: MimirContinuousTestFailed + annotations: + message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ $labels.namespace }} failed when asserting query results. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestfailed + expr: | + sum by(cluster, namespace, test) (rate(mimir_continuous_test_query_result_checks_failed_total[10m])) > 0 + labels: + severity: warning diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/recording-rules.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/recording-rules.yaml index 304e695fd6b..b33794c9cad 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/recording-rules.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/recording-rules.yaml @@ -11,523 +11,463 @@ metadata: namespace: "citestns" spec: groups: - - name: mimir_api_1 - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_request_duration_seconds:50quantile - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m])) - by (cluster, job) - record: cluster_job:cortex_request_duration_seconds:avg - - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job) - record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) - record: cluster_job:cortex_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:cortex_request_duration_seconds_count:sum_rate - - expr: sum(rate(cortex_request_duration_seconds[1m])) by (cluster, job) - record: cluster_job:cortex_request_duration_seconds:sum_rate - - name: mimir_api_2 - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:cortex_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:cortex_request_duration_seconds:50quantile - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) - / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:cortex_request_duration_seconds:avg - - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, - route) - record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) - record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by 
(cluster, job, route) - record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate - - expr: sum(rate(cortex_request_duration_seconds[1m])) by (cluster, job, route) - record: cluster_job_route:cortex_request_duration_seconds:sum_rate - - name: mimir_api_3 - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, - namespace, job, route) - record: cluster_namespace_job_route:cortex_request_duration_seconds:avg - - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, - job, route) - record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate - - expr: sum(rate(cortex_request_duration_seconds[1m])) by (cluster, namespace, job, - route) - record: cluster_namespace_job_route:cortex_request_duration_seconds:sum_rate - - name: mimir_querier_api - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_querier_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_querier_request_duration_seconds:50quantile - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - job) - record: cluster_job:cortex_querier_request_duration_seconds:avg - - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, - job) - record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - job) - record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - job) - record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by - (cluster, job, route) - record: cluster_job_route:cortex_querier_request_duration_seconds:avg - - 
expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, - job, route) - record: cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - job, route) - record: cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - job, route) - record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) - by (cluster, namespace, job, route) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg - - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, - namespace, job, route) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - namespace, job, route) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - namespace, job, route) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate - - name: mimir_storage - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_kv_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_kv_request_duration_seconds:50quantile - - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) - / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:cortex_kv_request_duration_seconds:avg - - expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, - job) - record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) - record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate - - name: mimir_queries - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_query_frontend_retries:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_query_frontend_retries:50quantile - - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m])) - by (cluster, job) - record: 
cluster_job:cortex_query_frontend_retries:avg - - expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job) - record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate - - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) - record: cluster_job:cortex_query_frontend_retries_sum:sum_rate - - expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job) - record: cluster_job:cortex_query_frontend_retries_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile - - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, - job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by - (cluster, job) - record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg - - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, - cluster, job) - record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, - job) - record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, - job) - record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate - - name: mimir_ingester_queries - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_series:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_series:50quantile - - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m])) - by (cluster, job) - record: cluster_job:cortex_ingester_queried_series:avg - - expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job) - record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate - - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_series_sum:sum_rate - - expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_series_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_samples:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_samples:50quantile - - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m])) - by (cluster, job) - record: cluster_job:cortex_ingester_queried_samples:avg - - expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job) - record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate - - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) - record: 
cluster_job:cortex_ingester_queried_samples_sum:sum_rate - - expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_samples_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_exemplars:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_exemplars:50quantile - - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) / - sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_exemplars:avg - - expr: sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, - job) - record: cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate - - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate - - expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate - - name: mimir_received_samples - rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m])) - record: cluster_namespace_job:cortex_distributor_received_samples:rate5m - - name: mimir_exemplars_in - rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m])) - record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m - - name: mimir_received_exemplars - rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m])) - record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m - - name: mimir_exemplars_ingested - rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m])) - record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m - - name: mimir_exemplars_appended - rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m])) - record: cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m - - name: mimir_scaling_rules - rules: - - expr: | - # Convenience rule to get the number of replicas for both a deployment and a statefulset. - # Multi-zone deployments are grouped together removing the "zone-X" suffix. - sum by (cluster, namespace, deployment) ( - label_replace( - kube_deployment_spec_replicas, - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it - # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" 
- ) - ) - or - sum by (cluster, namespace, deployment) ( - label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?") - ) - record: cluster_namespace_deployment:actual_replicas:count - - expr: | - ceil( - quantile_over_time(0.99, - sum by (cluster, namespace) ( - cluster_namespace_job:cortex_distributor_received_samples:rate5m - )[24h:] - ) - / 240000 - ) - labels: - deployment: distributor - reason: sample_rate - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) - * 0.59999999999999998 / 240000 - ) - labels: - deployment: distributor - reason: sample_rate_limits - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - ceil( - quantile_over_time(0.99, - sum by (cluster, namespace) ( - cluster_namespace_job:cortex_distributor_received_samples:rate5m - )[24h:] - ) - * 3 / 80000 - ) - labels: - deployment: ingester - reason: sample_rate - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - ceil( - quantile_over_time(0.99, - sum by(cluster, namespace) ( - cortex_ingester_memory_series - )[24h:] - ) - / 1500000 - ) - labels: - deployment: ingester - reason: active_series - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"}) - * 3 * 0.59999999999999998 / 1500000 - ) - labels: - deployment: ingester - reason: active_series_limits - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) - * 0.59999999999999998 / 80000 - ) - labels: - deployment: ingester - reason: sample_rate_limits - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - ceil( - (sum by (cluster, namespace) ( - cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"} - ) / 4) - / - avg by (cluster, namespace) ( - memcached_limit_bytes{job=~".+/memcached"} - ) - ) - labels: - deployment: memcached - reason: active_series - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - sum by (cluster, namespace, deployment) ( - label_replace( - label_replace( - sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[1m])), - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" - ), - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it - # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" - ) - ) - record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate - - expr: | - # Convenience rule to get the CPU request for both a deployment and a statefulset. - # Multi-zone deployments are grouped together removing the "zone-X" suffix. 
- # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 - # that remove resource metrics, ref: - # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16 - # - https://github.com/kubernetes/kube-state-metrics/pull/1004 - # - # This is the old expression, compatible with kube-state-metrics < v2.0.0, - # where kube_pod_container_resource_requests_cpu_cores was removed: - ( - sum by (cluster, namespace, deployment) ( - label_replace( - label_replace( - kube_pod_container_resource_requests_cpu_cores, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" - ), - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it - # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" - ) - ) - ) - or - # This expression is compatible with kube-state-metrics >= v1.4.0, - # where kube_pod_container_resource_requests was introduced. - ( - sum by (cluster, namespace, deployment) ( - label_replace( - label_replace( - kube_pod_container_resource_requests{resource="cpu"}, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" - ), - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it - # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" - ) - ) - ) - record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum - - expr: | - # Jobs should be sized to their CPU usage. - # We do this by comparing 99th percentile usage over the last 24hrs to - # their current provisioned #replicas and resource requests. - ceil( - cluster_namespace_deployment:actual_replicas:count - * - quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h]) - / - cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum - ) - labels: - reason: cpu_usage - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - # Convenience rule to get the Memory utilization for both a deployment and a statefulset. - # Multi-zone deployments are grouped together removing the "zone-X" suffix. - sum by (cluster, namespace, deployment) ( - label_replace( - label_replace( - container_memory_usage_bytes{image!=""}, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" - ), - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it - # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" - ) - ) - record: cluster_namespace_deployment:container_memory_usage_bytes:sum - - expr: | - # Convenience rule to get the Memory request for both a deployment and a statefulset. - # Multi-zone deployments are grouped together removing the "zone-X" suffix. 
- # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 - # that remove resource metrics, ref: - # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16 - # - https://github.com/kubernetes/kube-state-metrics/pull/1004 - # - # This is the old expression, compatible with kube-state-metrics < v2.0.0, - # where kube_pod_container_resource_requests_memory_bytes was removed: - ( - sum by (cluster, namespace, deployment) ( - label_replace( - label_replace( - kube_pod_container_resource_requests_memory_bytes, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" - ), - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it - # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" - ) - ) - ) - or - # This expression is compatible with kube-state-metrics >= v1.4.0, - # where kube_pod_container_resource_requests was introduced. - ( - sum by (cluster, namespace, deployment) ( - label_replace( - label_replace( - kube_pod_container_resource_requests{resource="memory"}, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" - ), - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it - # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" - ) - ) - ) - record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum - - expr: | - # Jobs should be sized to their Memory usage. - # We do this by comparing 99th percentile usage over the last 24hrs to - # their current provisioned #replicas and resource requests. - ceil( - cluster_namespace_deployment:actual_replicas:count - * - quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h]) - / - cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum - ) - labels: - reason: memory_usage - record: cluster_namespace_deployment_reason:required_replicas:count - - name: mimir_alertmanager_rules - rules: - - expr: | - sum by (cluster, job, pod) (cortex_alertmanager_alerts) - record: cluster_job_pod:cortex_alertmanager_alerts:sum - - expr: | - sum by (cluster, job, pod) (cortex_alertmanager_silences) - record: cluster_job_pod:cortex_alertmanager_silences:sum - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m])) - record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m])) - record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m - - expr: | - sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m])) - record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m - - expr: | - sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m])) - record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m])) - record: cluster_job:cortex_alertmanager_state_replication_total:rate5m - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m])) - record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m - - expr: | - sum by (cluster, job) 
(rate(cortex_alertmanager_partial_state_merges_total[5m])) - record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m])) - record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m - - name: mimir_ingester_rules - rules: - - expr: | - sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m])) - record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m + - name: mimir_api_1 + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_request_duration_seconds:50quantile + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_request_duration_seconds:avg + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_request_duration_seconds_count:sum_rate + - expr: sum(rate(cortex_request_duration_seconds[1m])) by (cluster, job) + record: cluster_job:cortex_request_duration_seconds:sum_rate + - name: mimir_api_2 + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:cortex_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:cortex_request_duration_seconds:50quantile + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds:avg + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate + - expr: sum(rate(cortex_request_duration_seconds[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds:sum_rate + - name: mimir_api_3 + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / 
sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds:avg + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate + - expr: sum(rate(cortex_request_duration_seconds[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds:sum_rate + - name: mimir_querier_api + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_querier_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_querier_request_duration_seconds:50quantile + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_querier_request_duration_seconds:avg + - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds:avg + - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, 
sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg + - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate + - name: mimir_storage + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_kv_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_kv_request_duration_seconds:50quantile + - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_kv_request_duration_seconds:avg + - expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate + - name: mimir_queries + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_query_frontend_retries:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_query_frontend_retries:50quantile + - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_retries:avg + - expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate + - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_retries_sum:sum_rate + - expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_retries_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: 
cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate + - name: mimir_ingester_queries + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_series:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_series:50quantile + - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_series:avg + - expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate + - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_series_sum:sum_rate + - expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_series_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_samples:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_samples:50quantile + - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_samples:avg + - expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate + - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate + - expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_samples_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_exemplars:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_exemplars:50quantile + - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_exemplars:avg + - expr: sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by 
(le, cluster, job) + record: cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate + - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate + - expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate + - name: mimir_received_samples + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m])) + record: cluster_namespace_job:cortex_distributor_received_samples:rate5m + - name: mimir_exemplars_in + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m])) + record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m + - name: mimir_received_exemplars + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m])) + record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m + - name: mimir_exemplars_ingested + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m])) + record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m + - name: mimir_exemplars_appended + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m])) + record: cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m + - name: mimir_scaling_rules + rules: + - expr: | + # Convenience rule to get the number of replicas for both a deployment and a statefulset. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. + sum by (cluster, namespace, deployment) ( + label_replace( + kube_deployment_spec_replicas, + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" 
+ ) + ) + or + sum by (cluster, namespace, deployment) ( + label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?") + ) + record: cluster_namespace_deployment:actual_replicas:count + - expr: | + ceil( + quantile_over_time(0.99, + sum by (cluster, namespace) ( + cluster_namespace_job:cortex_distributor_received_samples:rate5m + )[24h:] + ) + / 240000 + ) + labels: + deployment: distributor + reason: sample_rate + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) + * 0.59999999999999998 / 240000 + ) + labels: + deployment: distributor + reason: sample_rate_limits + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + quantile_over_time(0.99, + sum by (cluster, namespace) ( + cluster_namespace_job:cortex_distributor_received_samples:rate5m + )[24h:] + ) + * 3 / 80000 + ) + labels: + deployment: ingester + reason: sample_rate + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + quantile_over_time(0.99, + sum by(cluster, namespace) ( + cortex_ingester_memory_series + )[24h:] + ) + / 1500000 + ) + labels: + deployment: ingester + reason: active_series + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"}) + * 3 * 0.59999999999999998 / 1500000 + ) + labels: + deployment: ingester + reason: active_series_limits + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) + * 0.59999999999999998 / 80000 + ) + labels: + deployment: ingester + reason: sample_rate_limits + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + (sum by (cluster, namespace) ( + cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"} + ) / 4) + / + avg by (cluster, namespace) ( + memcached_limit_bytes{job=~".+/memcached"} + ) + ) + labels: + deployment: memcached + reason: active_series + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[1m])), + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate + - expr: | + # Convenience rule to get the CPU request for both a deployment and a statefulset. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. 
+ # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 + # that remove resource metrics, ref: + # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16 + # - https://github.com/kubernetes/kube-state-metrics/pull/1004 + # + # This is the old expression, compatible with kube-state-metrics < v2.0.0, + # where kube_pod_container_resource_requests_cpu_cores was removed: + ( + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + kube_pod_container_resource_requests_cpu_cores, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + ) + or + # This expression is compatible with kube-state-metrics >= v1.4.0, + # where kube_pod_container_resource_requests was introduced. + ( + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + kube_pod_container_resource_requests{resource="cpu"}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + ) + record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum + - expr: | + # Jobs should be sized to their CPU usage. + # We do this by comparing 99th percentile usage over the last 24hrs to + # their current provisioned #replicas and resource requests. + ceil( + cluster_namespace_deployment:actual_replicas:count + * + quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h]) + / + cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum + ) + labels: + reason: cpu_usage + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + # Convenience rule to get the Memory utilization for both a deployment and a statefulset. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + container_memory_usage_bytes{image!=""}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + record: cluster_namespace_deployment:container_memory_usage_bytes:sum + - expr: | + # Convenience rule to get the Memory request for both a deployment and a statefulset. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. 
+ # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 + # that remove resource metrics, ref: + # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16 + # - https://github.com/kubernetes/kube-state-metrics/pull/1004 + # + # This is the old expression, compatible with kube-state-metrics < v2.0.0, + # where kube_pod_container_resource_requests_memory_bytes was removed: + ( + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + kube_pod_container_resource_requests_memory_bytes, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + ) + or + # This expression is compatible with kube-state-metrics >= v1.4.0, + # where kube_pod_container_resource_requests was introduced. + ( + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + kube_pod_container_resource_requests{resource="memory"}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + ) + record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum + - expr: | + # Jobs should be sized to their Memory usage. + # We do this by comparing 99th percentile usage over the last 24hrs to + # their current provisioned #replicas and resource requests. + ceil( + cluster_namespace_deployment:actual_replicas:count + * + quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h]) + / + cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum + ) + labels: + reason: memory_usage + record: cluster_namespace_deployment_reason:required_replicas:count + - name: mimir_alertmanager_rules + rules: + - expr: | + sum by (cluster, job, pod) (cortex_alertmanager_alerts) + record: cluster_job_pod:cortex_alertmanager_alerts:sum + - expr: | + sum by (cluster, job, pod) (cortex_alertmanager_silences) + record: cluster_job_pod:cortex_alertmanager_silences:sum + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m])) + record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m])) + record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m + - expr: | + sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m])) + record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m + - expr: | + sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m])) + record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m])) + record: cluster_job:cortex_alertmanager_state_replication_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m])) + record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m + - expr: | + sum by (cluster, job) 
(rate(cortex_alertmanager_partial_state_merges_total[5m])) + record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m])) + record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m + - name: mimir_ingester_rules + rules: + - expr: | + sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m])) + record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index 7ff2ffe1cd7..0ae5537fed8 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -1,1052 +1,1001 @@ groups: -- name: mimir_alerts - rules: - - alert: MimirIngesterUnhealthy - annotations: - message: Mimir cluster {{ $labels.cluster }}/{{ $labels.namespace }} has {{ - printf "%f" $value }} unhealthy ingester(s). - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterunhealthy - expr: | - min by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0 - for: 15m - labels: - severity: critical - - alert: MimirRequestErrors - annotations: - message: | - The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors - expr: | - # The following 5xx errors considered as non-error: - # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry) - # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body - ( - sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",status_code!~"529|598",route!~"ready|debug_pprof"}[1m])) - / - sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m])) - ) * 100 > 1 - for: 15m - labels: - severity: critical - - alert: MimirRequestLatency - annotations: - message: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequestlatency - expr: | - cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop|debug_pprof"} - > - 2.5 - for: 15m - labels: - severity: warning - - alert: MimirInconsistentRuntimeConfig - annotations: - message: | - An inconsistent runtime config file is used across cluster {{ $labels.cluster }}/{{ $labels.namespace }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirinconsistentruntimeconfig - expr: | - count(count by(cluster, namespace, job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1 - for: 1h - labels: - severity: critical - - alert: MimirBadRuntimeConfig - annotations: - message: | - {{ $labels.job }} failed to reload runtime config. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbadruntimeconfig - expr: | - # The metric value is reset to 0 on error while reloading the config at runtime. - cortex_runtime_config_last_reload_successful == 0 - for: 5m - labels: - severity: critical - - alert: MimirFrontendQueriesStuck - annotations: - message: | - There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirfrontendqueriesstuck - expr: | - sum by (cluster, namespace, job) (min_over_time(cortex_query_frontend_queue_length[1m])) > 0 - for: 5m - labels: - severity: critical - - alert: MimirSchedulerQueriesStuck - annotations: - message: | - There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirschedulerqueriesstuck - expr: | - sum by (cluster, namespace, job) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0 - for: 7m - labels: - severity: critical - - alert: MimirCacheRequestErrors - annotations: - message: | - The cache {{ $labels.name }} used by Mimir {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircacherequesterrors - expr: | - ( - sum by(cluster, namespace, name, operation) ( - rate(thanos_memcached_operation_failures_total[1m]) - or - rate(thanos_cache_operation_failures_total[1m]) - ) - / - sum by(cluster, namespace, name, operation) ( - rate(thanos_memcached_operations_total[1m]) - or - rate(thanos_cache_operations_total[1m]) - ) - ) * 100 > 5 - for: 5m - labels: - severity: warning - - alert: MimirIngesterRestarts - annotations: - message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterrestarts - expr: | - ( - sum by(cluster, namespace, instance) ( - increase(kube_pod_container_status_restarts_total{container=~"(ingester|mimir-write)"}[30m]) - ) - >= 2 - ) - and - ( - count by(cluster, namespace, instance) (cortex_build_info) > 0 - ) - labels: - severity: warning - - alert: MimirKVStoreFailure - annotations: - message: | - Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to talk to the KV store {{ $labels.kv_name }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirkvstorefailure - expr: | - ( - sum by(cluster, namespace, instance, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m])) - / - sum by(cluster, namespace, instance, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m])) - ) - # We want to get alerted only in case there's a constant failure. - == 1 - for: 5m - labels: - severity: critical - - alert: MimirMemoryMapAreasTooHigh - annotations: - message: '{{ $labels.job }}/{{ $labels.instance }} has a number of mmap-ed areas - close to the limit.' 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirmemorymapareastoohigh - expr: | - process_memory_map_areas{job=~".*/(ingester.*|cortex|mimir|mimir-write.*|store-gateway.*|cortex|mimir|mimir-backend.*)"} / process_memory_map_areas_limit{job=~".*/(ingester.*|cortex|mimir|mimir-write.*|store-gateway.*|cortex|mimir|mimir-backend.*)"} > 0.8 - for: 5m - labels: - severity: critical - - alert: MimirIngesterInstanceHasNoTenants - annotations: - message: Mimir ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has no tenants assigned. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterinstancehasnotenants - expr: | - (min by(cluster, namespace, instance) (cortex_ingester_memory_users) == 0) - and on (cluster, namespace) - # Only if there are more time-series than would be expected due to continuous testing load - ( - sum by(cluster, namespace) (cortex_ingester_memory_series) - / - max by(cluster, namespace) (cortex_distributor_replication_factor) - ) > 100000 - for: 1h - labels: - severity: warning - - alert: MimirRulerInstanceHasNoRuleGroups - annotations: - message: Mimir ruler {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has no rule groups assigned. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerinstancehasnorulegroups - expr: | - # Alert on ruler instances in microservices mode that have no rule groups assigned, - min by(cluster, namespace, instance) (cortex_ruler_managers_total{instance=~".*ruler.*"}) == 0 - # but only if other ruler instances of the same cell do have rule groups assigned - and on (cluster, namespace) - (max by(cluster, namespace) (cortex_ruler_managers_total) > 0) - # and there are more than two instances overall - and on (cluster, namespace) - (count by (cluster, namespace) (cortex_ruler_managers_total) > 2) - for: 1h - labels: - severity: warning - - alert: MimirIngestedDataTooFarInTheFuture - annotations: - message: Mimir ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has ingested samples with timestamps more than 1h in the future. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesteddatatoofarinthefuture - expr: | - max by(cluster, namespace, instance) ( - cortex_ingester_tsdb_head_max_timestamp_seconds - time() - and - cortex_ingester_tsdb_head_max_timestamp_seconds > 0 - ) > 60*60 - for: 5m - labels: - severity: warning - - alert: MimirStoreGatewayTooManyFailedOperations - annotations: - message: Mimir store-gateway {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} is experiencing {{ $value | humanizePercentage }} errors - while doing {{ $labels.operation }} on the object storage. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaytoomanyfailedoperations - expr: | - sum by(cluster, namespace, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[1m])) > 0 - for: 5m - labels: - severity: warning - - alert: MimirRingMembersMismatch - annotations: - message: | - Number of members in Mimir ingester hash ring does not match the expected number in {{ $labels.cluster }}/{{ $labels.namespace }}. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirringmembersmismatch - expr: | - ( - avg by(cluster, namespace) (sum by(cluster, namespace, instance) (cortex_ring_members{name="ingester",job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"})) - != sum by(cluster, namespace) (up{job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"}) - ) - and - ( - count by(cluster, namespace) (cortex_build_info) > 0 - ) - for: 15m - labels: - component: ingester - severity: warning -- name: mimir_instance_limits_alerts - rules: - - alert: MimirIngesterReachingSeriesLimit - annotations: - message: | - Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its series limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingserieslimit - expr: | - ( - (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) - and ignoring (limit) - (cortex_ingester_instance_limits{limit="max_series"} > 0) - ) > 0.8 - for: 3h - labels: - severity: warning - - alert: MimirIngesterReachingSeriesLimit - annotations: - message: | - Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its series limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingserieslimit - expr: | - ( - (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) - and ignoring (limit) - (cortex_ingester_instance_limits{limit="max_series"} > 0) - ) > 0.9 - for: 5m - labels: - severity: critical - - alert: MimirIngesterReachingTenantsLimit - annotations: - message: | - Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its tenant limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingtenantslimit - expr: | - ( - (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) - and ignoring (limit) - (cortex_ingester_instance_limits{limit="max_tenants"} > 0) - ) > 0.7 - for: 5m - labels: - severity: warning - - alert: MimirIngesterReachingTenantsLimit - annotations: - message: | - Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its tenant limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingtenantslimit - expr: | - ( - (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) - and ignoring (limit) - (cortex_ingester_instance_limits{limit="max_tenants"} > 0) - ) > 0.8 - for: 5m - labels: - severity: critical - - alert: MimirReachingTCPConnectionsLimit - annotations: - message: | - Mimir instance {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its TCP connections limit for {{ $labels.protocol }} protocol. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirreachingtcpconnectionslimit - expr: | - cortex_tcp_connections / cortex_tcp_connections_limit > 0.8 and - cortex_tcp_connections_limit > 0 - for: 5m - labels: - severity: critical - - alert: MimirDistributorReachingInflightPushRequestLimit - annotations: - message: | - Distributor {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its inflight push request limit. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirdistributorreachinginflightpushrequestlimit - expr: | - ( - (cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"}) - and ignoring (limit) - (cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0) - ) > 0.8 - for: 5m - labels: - severity: critical -- name: mimir-rollout-alerts - rules: - - alert: MimirRolloutStuck - annotations: - message: | - The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck - expr: | - ( - max without (revision) ( - sum without(statefulset) (label_replace(kube_statefulset_status_current_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) - unless - sum without(statefulset) (label_replace(kube_statefulset_status_update_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) - ) - * - ( - sum without(statefulset) (label_replace(kube_statefulset_replicas, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) - != - sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) - ) - ) and ( - changes(sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) - == - 0 - ) - * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) - for: 30m - labels: - severity: warning - workload_type: statefulset - - alert: MimirRolloutStuck - annotations: - message: | - The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck - expr: | - ( - sum without(deployment) (label_replace(kube_deployment_spec_replicas, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) - != - sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) - ) and ( - changes(sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) - == - 0 - ) - * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) - for: 30m - labels: - severity: warning - workload_type: deployment - - alert: RolloutOperatorNotReconciling - annotations: - message: | - Rollout operator is not reconciling the rollout group {{ $labels.rollout_group }} in {{ $labels.cluster }}/{{ $labels.namespace }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#rolloutoperatornotreconciling - expr: | - max by(cluster, namespace, rollout_group) (time() - rollout_operator_last_successful_group_reconcile_timestamp_seconds) > 600 - for: 5m - labels: - severity: critical -- name: mimir-provisioning - rules: - - alert: MimirAllocatingTooMuchMemory - annotations: - message: | - Instance {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirallocatingtoomuchmemory - expr: | - ( - process_resident_memory_bytes{job=~".*/(ingester|mimir-write|mimir-backend)"} - / - on(instance) node_memory_MemTotal_bytes{} - ) > 0.65 - for: 15m - labels: - severity: warning - - alert: MimirAllocatingTooMuchMemory - annotations: - message: | - Instance {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirallocatingtoomuchmemory - expr: | - ( - process_resident_memory_bytes{job=~".*/(ingester|mimir-write|mimir-backend)"} - / - on(instance) node_memory_MemTotal_bytes{} - ) > 0.8 - for: 15m - labels: - severity: critical -- name: ruler_alerts - rules: - - alert: MimirRulerTooManyFailedPushes - annotations: - message: | - Mimir Ruler {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% write (push) errors. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulertoomanyfailedpushes - expr: | - 100 * ( - sum by (cluster, namespace, instance) (rate(cortex_ruler_write_requests_failed_total[1m])) - / - sum by (cluster, namespace, instance) (rate(cortex_ruler_write_requests_total[1m])) - ) > 1 - for: 5m - labels: - severity: critical - - alert: MimirRulerTooManyFailedQueries - annotations: - message: | - Mimir Ruler {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulertoomanyfailedqueries - expr: | - 100 * ( - sum by (cluster, namespace, instance) (rate(cortex_ruler_queries_failed_total[1m])) - / - sum by (cluster, namespace, instance) (rate(cortex_ruler_queries_total[1m])) - ) > 1 - for: 5m - labels: - severity: critical - - alert: MimirRulerMissedEvaluations - annotations: - message: | - Mimir Ruler {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulermissedevaluations - expr: | - 100 * ( - sum by (cluster, namespace, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) - / - sum by (cluster, namespace, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m])) - ) > 1 - for: 5m - labels: - severity: warning - - alert: MimirRulerFailedRingCheck - annotations: - message: | - Mimir Rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are experiencing errors when checking the ring for rule group ownership. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerfailedringcheck - expr: | - sum by (cluster, namespace, job) (rate(cortex_ruler_ring_check_errors_total[1m])) - > 0 - for: 5m - labels: - severity: critical - - alert: MimirRulerRemoteEvaluationFailing - annotations: - message: | - Mimir rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are failing to perform {{ printf "%.2f" $value }}% of remote evaluations through the ruler-query-frontend. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerremoteevaluationfailing - expr: | - 100 * ( - sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", job=~".*/(ruler-query-frontend.*)"}[5m])) - / - sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])) - ) > 1 - for: 5m - labels: - severity: warning -- name: gossip_alerts - rules: - - alert: MimirGossipMembersTooHigh - annotations: - message: One or more Mimir instances in {{ $labels.cluster }}/{{ $labels.namespace - }} consistently sees a higher than expected number of gossip members. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmemberstoohigh - expr: | - max by (cluster, namespace) (memberlist_client_cluster_members_count) - > - (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) - for: 20m - labels: - severity: warning - - alert: MimirGossipMembersTooLow - annotations: - message: One or more Mimir instances in {{ $labels.cluster }}/{{ $labels.namespace - }} consistently sees a lower than expected number of gossip members. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmemberstoolow - expr: | - min by (cluster, namespace) (memberlist_client_cluster_members_count) - < - (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) * 0.5) - for: 20m - labels: - severity: warning -- name: etcd_alerts - rules: - - alert: EtcdAllocatingTooMuchMemory - annotations: - message: | - Too much memory being used by {{ $labels.namespace }}/{{ $labels.instance }} - bump memory limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#etcdallocatingtoomuchmemory - expr: | - ( - container_memory_working_set_bytes{container="etcd"} - / - ( container_spec_memory_limit_bytes{container="etcd"} > 0 ) - ) > 0.65 - for: 15m - labels: - severity: warning - - alert: EtcdAllocatingTooMuchMemory - annotations: - message: | - Too much memory being used by {{ $labels.namespace }}/{{ $labels.instance }} - bump memory limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#etcdallocatingtoomuchmemory - expr: | - ( - container_memory_working_set_bytes{container="etcd"} - / - ( container_spec_memory_limit_bytes{container="etcd"} > 0 ) - ) > 0.8 - for: 15m - labels: - severity: critical -- name: alertmanager_alerts - rules: - - alert: MimirAlertmanagerSyncConfigsFailing - annotations: - message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to read tenant configurations from storage. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagersyncconfigsfailing - expr: | - rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0 - for: 30m - labels: - severity: critical - - alert: MimirAlertmanagerRingCheckFailing - annotations: - message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to check tenants ownership via the ring. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerringcheckfailing - expr: | - rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0 - for: 10m - labels: - severity: critical - - alert: MimirAlertmanagerPartialStateMergeFailing - annotations: - message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to merge partial state changes received from a replica. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerpartialstatemergefailing - expr: | - rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0 - for: 10m - labels: - severity: critical - - alert: MimirAlertmanagerReplicationFailing - annotations: - message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to replicating partial state to its replicas. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerreplicationfailing - expr: | - rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0 - for: 10m - labels: - severity: critical - - alert: MimirAlertmanagerPersistStateFailing - annotations: - message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to persist full state snaphots to remote storage. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerpersiststatefailing - expr: | - rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0 - for: 1h - labels: - severity: critical - - alert: MimirAlertmanagerInitialSyncFailed - annotations: - message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.instance }} was unable to obtain some initial state when starting up. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerinitialsyncfailed - expr: | - increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 - labels: - severity: critical - - alert: MimirAlertmanagerAllocatingTooMuchMemory - annotations: - message: | - Alertmanager {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerallocatingtoomuchmemory - expr: | - (process_resident_memory_bytes{job=~".*/alertmanager"} / on(instance) node_memory_MemTotal_bytes{}) > 0.80 - for: 15m - labels: - severity: warning - - alert: MimirAlertmanagerAllocatingTooMuchMemory - annotations: - message: | - Alertmanager {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerallocatingtoomuchmemory - expr: | - (process_resident_memory_bytes{job=~".*/alertmanager"} / on(instance) node_memory_MemTotal_bytes{}) > 0.90 - for: 15m - labels: - severity: critical - - alert: MimirAlertmanagerInstanceHasNoTenants - annotations: - message: Mimir alertmanager {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} owns no tenants. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerinstancehasnotenants - expr: | - # Alert on alertmanager instances in microservices mode that own no tenants, - min by(cluster, namespace, instance) (cortex_alertmanager_tenants_owned{instance=~".*alertmanager.*"}) == 0 - # but only if other instances of the same cell do have tenants assigned. 
- and on (cluster, namespace) - max by(cluster, namespace) (cortex_alertmanager_tenants_owned) > 0 - for: 1h - labels: - severity: warning -- name: mimir_blocks_alerts - rules: - - alert: MimirIngesterHasNotShippedBlocks - annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has not shipped any block in the last 4 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblocks - expr: | - (min by(cluster, namespace, instance) (time() - cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 60 * 60 * 4) - and - (max by(cluster, namespace, instance) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 0) - and - # Only if the ingester has ingested samples over the last 4h. - (max by(cluster, namespace, instance) (max_over_time(cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) - and - # Only if the ingester was ingesting samples 4h ago. This protects against the case where the ingester replica - # had ingested samples in the past, then no traffic was received for a long period and then it starts - # receiving samples again. Without this check, the alert would fire as soon as it gets back receiving - # samples, while the a block shipping is expected within the next 4h. - (max by(cluster, namespace, instance) (max_over_time(cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0) - for: 15m - labels: - severity: critical - - alert: MimirIngesterHasNotShippedBlocksSinceStart - annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has not shipped any block in the last 4 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblockssincestart - expr: | - (max by(cluster, namespace, instance) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) == 0) - and - (max by(cluster, namespace, instance) (max_over_time(cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) - for: 4h - labels: - severity: critical - - alert: MimirIngesterHasUnshippedBlocks - annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't - been successfully uploaded to the storage yet. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasunshippedblocks - expr: | - (time() - cortex_ingester_oldest_unshipped_block_timestamp_seconds > 3600) - and - (cortex_ingester_oldest_unshipped_block_timestamp_seconds > 0) - for: 15m - labels: - severity: critical - - alert: MimirIngesterTSDBHeadCompactionFailed - annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to compact TSDB head. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbheadcompactionfailed - expr: | - rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0 - for: 15m - labels: - severity: critical - - alert: MimirIngesterTSDBHeadTruncationFailed - annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to truncate TSDB head. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbheadtruncationfailed - expr: | - rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0 - labels: - severity: critical - - alert: MimirIngesterTSDBCheckpointCreationFailed - annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to create TSDB checkpoint. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbcheckpointcreationfailed - expr: | - rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0 - labels: - severity: critical - - alert: MimirIngesterTSDBCheckpointDeletionFailed - annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to delete TSDB checkpoint. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbcheckpointdeletionfailed - expr: | - rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0 - labels: - severity: critical - - alert: MimirIngesterTSDBWALTruncationFailed - annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to truncate TSDB WAL. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwaltruncationfailed - expr: | - rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0 - labels: - severity: warning - - alert: MimirIngesterTSDBWALCorrupted - annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} got a corrupted TSDB WAL. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted - expr: | - # alert when there are more than one corruptions - count by (cluster, namespace) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0) > 1 - and - # and there is only one zone - count by (cluster, namespace) (group by (cluster, namespace, job) (cortex_ingester_tsdb_wal_corruptions_total)) == 1 - labels: - deployment: single-zone - severity: critical - - alert: MimirIngesterTSDBWALCorrupted - annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} got a corrupted TSDB WAL. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted - expr: | - # alert when there are more than one corruptions - count by (cluster, namespace) (sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0)) > 1 - and - # and there are multiple zones - count by (cluster, namespace) (group by (cluster, namespace, job) (cortex_ingester_tsdb_wal_corruptions_total)) > 1 - labels: - deployment: multi-zone - severity: critical - - alert: MimirIngesterTSDBWALWritesFailed - annotations: - message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to write to TSDB WAL. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalwritesfailed - expr: | - rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0 - for: 3m - labels: - severity: critical - - alert: MimirStoreGatewayHasNotSyncTheBucket - annotations: - message: Mimir store-gateway {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} has not successfully synched the bucket since {{ $value - | humanizeDuration }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewayhasnotsyncthebucket - expr: | - (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30) - and - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0 - for: 5m - labels: - severity: critical - - alert: MimirStoreGatewayNoSyncedTenants - annotations: - message: Mimir store-gateway {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} is not syncing any blocks for any tenant. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaynosyncedtenants - expr: | - min by(cluster, namespace, instance) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0 - for: 1h - labels: - severity: warning - - alert: MimirBucketIndexNotUpdated - annotations: - message: Mimir bucket index for tenant {{ $labels.user }} in {{ $labels.cluster - }}/{{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration - }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbucketindexnotupdated - expr: | - min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100 - labels: - severity: critical -- name: mimir_compactor_alerts - rules: - - alert: MimirCompactorHasNotSuccessfullyCleanedUpBlocks - annotations: - message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} has not successfully cleaned up blocks in the last 6 - hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullycleanedupblocks - expr: | - # The "last successful run" metric is updated even if the compactor owns no tenants, - # so this alert correctly doesn't fire if compactor has nothing to do. - (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 6) - for: 1h - labels: - severity: critical - - alert: MimirCompactorHasNotSuccessfullyRunCompaction - annotations: - message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} has not run compaction in the last 24 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction - expr: | - # The "last successful run" metric is updated even if the compactor owns no tenants, - # so this alert correctly doesn't fire if compactor has nothing to do. - (time() - cortex_compactor_last_successful_run_timestamp_seconds > 60 * 60 * 24) - and - (cortex_compactor_last_successful_run_timestamp_seconds > 0) - for: 1h - labels: - reason: in-last-24h - severity: critical - - alert: MimirCompactorHasNotSuccessfullyRunCompaction - annotations: - message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} has not run compaction in the last 24 hours. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction - expr: | - # The "last successful run" metric is updated even if the compactor owns no tenants, - # so this alert correctly doesn't fire if compactor has nothing to do. - cortex_compactor_last_successful_run_timestamp_seconds == 0 - for: 24h - labels: - reason: since-startup - severity: critical - - alert: MimirCompactorHasNotSuccessfullyRunCompaction - annotations: - message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} failed to run 2 consecutive compactions. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction - expr: | - increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h]) >= 2 - labels: - reason: consecutive-failures - severity: critical - - alert: MimirCompactorHasNotUploadedBlocks - annotations: - message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} has not uploaded any block in the last 24 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotuploadedblocks - expr: | - (time() - (max by(cluster, namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"})) > 60 * 60 * 24) - and - (max by(cluster, namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"}) > 0) - and - # Only if some compactions have started. We don't want to fire this alert if the compactor has nothing to do - # (e.g. there are more replicas than required because running as part of mimir-backend). - (sum by(cluster, namespace, instance) (rate(cortex_compactor_group_compaction_runs_started_total[24h])) > 0) - for: 15m - labels: - severity: critical - time_period: 24h - - alert: MimirCompactorHasNotUploadedBlocks - annotations: - message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} has not uploaded any block since its start. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotuploadedblocks - expr: | - (max by(cluster, namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"}) == 0) - and - # Only if some compactions have started. We don't want to fire this alert if the compactor has nothing to do - # (e.g. there are more replicas than required because running as part of mimir-backend). - (sum by(cluster, namespace, instance) (rate(cortex_compactor_group_compaction_runs_started_total[24h])) > 0) - for: 24h - labels: - severity: critical - time_period: since-start - - alert: MimirCompactorSkippedUnhealthyBlocks - annotations: - message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} has found and ignored unhealthy blocks. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorskippedunhealthyblocks - expr: | - increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 0 - for: 1m - labels: - severity: warning - - alert: MimirCompactorSkippedUnhealthyBlocks - annotations: - message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ - $labels.namespace }} has found and ignored unhealthy blocks. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorskippedunhealthyblocks - expr: | - increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 1 - for: 30m - labels: - severity: critical -- name: mimir_autoscaling - rules: - - alert: MimirAutoscalerNotActive - annotations: - message: The Horizontal Pod Autoscaler (HPA) {{ $labels.horizontalpodautoscaler - }} in {{ $labels.namespace }} is not active. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirautoscalernotactive - expr: | - ( - label_replace(( - kube_horizontalpodautoscaler_status_condition{condition="ScalingActive",status="false"} - # Match only Mimir namespaces. + - name: mimir_alerts + rules: + - alert: MimirIngesterUnhealthy + annotations: + message: Mimir cluster {{ $labels.cluster }}/{{ $labels.namespace }} has {{ printf "%f" $value }} unhealthy ingester(s). + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterunhealthy + expr: | + min by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0 + for: 15m + labels: + severity: critical + - alert: MimirRequestErrors + annotations: + message: | + The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors + expr: | + # The following 5xx errors considered as non-error: + # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry) + # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body + ( + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",status_code!~"529|598",route!~"ready|debug_pprof"}[1m])) + / + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m])) + ) * 100 > 1 + for: 15m + labels: + severity: critical + - alert: MimirRequestLatency + annotations: + message: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequestlatency + expr: | + cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop|debug_pprof"} + > + 2.5 + for: 15m + labels: + severity: warning + - alert: MimirInconsistentRuntimeConfig + annotations: + message: | + An inconsistent runtime config file is used across cluster {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirinconsistentruntimeconfig + expr: | + count(count by(cluster, namespace, job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1 + for: 1h + labels: + severity: critical + - alert: MimirBadRuntimeConfig + annotations: + message: | + {{ $labels.job }} failed to reload runtime config. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbadruntimeconfig + expr: | + # The metric value is reset to 0 on error while reloading the config at runtime. 
+ cortex_runtime_config_last_reload_successful == 0 + for: 5m + labels: + severity: critical + - alert: MimirFrontendQueriesStuck + annotations: + message: | + There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirfrontendqueriesstuck + expr: | + sum by (cluster, namespace, job) (min_over_time(cortex_query_frontend_queue_length[1m])) > 0 + for: 5m + labels: + severity: critical + - alert: MimirSchedulerQueriesStuck + annotations: + message: | + There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirschedulerqueriesstuck + expr: | + sum by (cluster, namespace, job) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0 + for: 7m + labels: + severity: critical + - alert: MimirCacheRequestErrors + annotations: + message: | + The cache {{ $labels.name }} used by Mimir {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircacherequesterrors + expr: | + ( + sum by(cluster, namespace, name, operation) ( + rate(thanos_memcached_operation_failures_total[1m]) + or + rate(thanos_cache_operation_failures_total[1m]) + ) + / + sum by(cluster, namespace, name, operation) ( + rate(thanos_memcached_operations_total[1m]) + or + rate(thanos_cache_operations_total[1m]) + ) + ) * 100 > 5 + for: 5m + labels: + severity: warning + - alert: MimirIngesterRestarts + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterrestarts + expr: | + ( + sum by(cluster, namespace, instance) ( + increase(kube_pod_container_status_restarts_total{container=~"(ingester|mimir-write)"}[30m]) + ) + >= 2 + ) + and + ( + count by(cluster, namespace, instance) (cortex_build_info) > 0 + ) + labels: + severity: warning + - alert: MimirKVStoreFailure + annotations: + message: | + Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to talk to the KV store {{ $labels.kv_name }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirkvstorefailure + expr: | + ( + sum by(cluster, namespace, instance, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m])) + / + sum by(cluster, namespace, instance, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m])) + ) + # We want to get alerted only in case there's a constant failure. + == 1 + for: 5m + labels: + severity: critical + - alert: MimirMemoryMapAreasTooHigh + annotations: + message: '{{ $labels.job }}/{{ $labels.instance }} has a number of mmap-ed areas close to the limit.' 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirmemorymapareastoohigh + expr: | + process_memory_map_areas{job=~".*/(ingester.*|cortex|mimir|mimir-write.*|store-gateway.*|cortex|mimir|mimir-backend.*)"} / process_memory_map_areas_limit{job=~".*/(ingester.*|cortex|mimir|mimir-write.*|store-gateway.*|cortex|mimir|mimir-backend.*)"} > 0.8 + for: 5m + labels: + severity: critical + - alert: MimirIngesterInstanceHasNoTenants + annotations: + message: Mimir ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has no tenants assigned. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterinstancehasnotenants + expr: | + (min by(cluster, namespace, instance) (cortex_ingester_memory_users) == 0) + and on (cluster, namespace) + # Only if there are more time-series than would be expected due to continuous testing load + ( + sum by(cluster, namespace) (cortex_ingester_memory_series) + / + max by(cluster, namespace) (cortex_distributor_replication_factor) + ) > 100000 + for: 1h + labels: + severity: warning + - alert: MimirRulerInstanceHasNoRuleGroups + annotations: + message: Mimir ruler {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has no rule groups assigned. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerinstancehasnorulegroups + expr: | + # Alert on ruler instances in microservices mode that have no rule groups assigned, + min by(cluster, namespace, instance) (cortex_ruler_managers_total{instance=~".*ruler.*"}) == 0 + # but only if other ruler instances of the same cell do have rule groups assigned + and on (cluster, namespace) + (max by(cluster, namespace) (cortex_ruler_managers_total) > 0) + # and there are more than two instances overall + and on (cluster, namespace) + (count by (cluster, namespace) (cortex_ruler_managers_total) > 2) + for: 1h + labels: + severity: warning + - alert: MimirIngestedDataTooFarInTheFuture + annotations: + message: Mimir ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has ingested samples with timestamps more than 1h in the future. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesteddatatoofarinthefuture + expr: | + max by(cluster, namespace, instance) ( + cortex_ingester_tsdb_head_max_timestamp_seconds - time() + and + cortex_ingester_tsdb_head_max_timestamp_seconds > 0 + ) > 60*60 + for: 5m + labels: + severity: warning + - alert: MimirStoreGatewayTooManyFailedOperations + annotations: + message: Mimir store-gateway {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ $value | humanizePercentage }} errors while doing {{ $labels.operation }} on the object storage. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaytoomanyfailedoperations + expr: | + sum by(cluster, namespace, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[1m])) > 0 + for: 5m + labels: + severity: warning + - alert: MimirRingMembersMismatch + annotations: + message: | + Number of members in Mimir ingester hash ring does not match the expected number in {{ $labels.cluster }}/{{ $labels.namespace }}. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirringmembersmismatch + expr: | + ( + avg by(cluster, namespace) (sum by(cluster, namespace, instance) (cortex_ring_members{name="ingester",job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"})) + != sum by(cluster, namespace) (up{job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"}) + ) + and + ( + count by(cluster, namespace) (cortex_build_info) > 0 + ) + for: 15m + labels: + component: ingester + severity: warning + - name: mimir_instance_limits_alerts + rules: + - alert: MimirIngesterReachingSeriesLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its series limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingserieslimit + expr: | + ( + (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_series"} > 0) + ) > 0.8 + for: 3h + labels: + severity: warning + - alert: MimirIngesterReachingSeriesLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its series limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingserieslimit + expr: | + ( + (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_series"} > 0) + ) > 0.9 + for: 5m + labels: + severity: critical + - alert: MimirIngesterReachingTenantsLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its tenant limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingtenantslimit + expr: | + ( + (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_tenants"} > 0) + ) > 0.7 + for: 5m + labels: + severity: warning + - alert: MimirIngesterReachingTenantsLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its tenant limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingtenantslimit + expr: | + ( + (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_tenants"} > 0) + ) > 0.8 + for: 5m + labels: + severity: critical + - alert: MimirReachingTCPConnectionsLimit + annotations: + message: | + Mimir instance {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its TCP connections limit for {{ $labels.protocol }} protocol. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirreachingtcpconnectionslimit + expr: | + cortex_tcp_connections / cortex_tcp_connections_limit > 0.8 and + cortex_tcp_connections_limit > 0 + for: 5m + labels: + severity: critical + - alert: MimirDistributorReachingInflightPushRequestLimit + annotations: + message: | + Distributor {{ $labels.job }}/{{ $labels.instance }} has reached {{ $value | humanizePercentage }} of its inflight push request limit. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirdistributorreachinginflightpushrequestlimit + expr: | + ( + (cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"}) + and ignoring (limit) + (cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0) + ) > 0.8 + for: 5m + labels: + severity: critical + - name: mimir-rollout-alerts + rules: + - alert: MimirRolloutStuck + annotations: + message: | + The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck + expr: | + ( + max without (revision) ( + sum without(statefulset) (label_replace(kube_statefulset_status_current_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + unless + sum without(statefulset) (label_replace(kube_statefulset_status_update_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + ) + * + ( + sum without(statefulset) (label_replace(kube_statefulset_replicas, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + != + sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + ) + ) and ( + changes(sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) + == + 0 + ) * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) - # Add "metric" label. - + on(cluster, namespace, horizontalpodautoscaler) group_right label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") - > 0), - "scaledObject", "$1", "horizontalpodautoscaler", "keda-hpa-(.*)" - ) - ) - # Alert only if the scaling metric exists and is > 0. If the KEDA ScaledObject is configured to scale down 0, - # then HPA ScalingActive may be false when expected to run 0 replicas. In this case, the scaling metric exported - # by KEDA could not exist at all or being exposed with a value of 0. - and on (cluster, namespace, metric, scaledObject) - (label_replace(keda_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") > 0) - for: 1h - labels: - severity: critical - - alert: MimirAutoscalerKedaFailing - annotations: - message: The Keda ScaledObject {{ $labels.scaledObject }} in {{ $labels.namespace - }} is experiencing errors. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirautoscalerkedafailing - expr: | - ( - # Find KEDA scalers reporting errors. - label_replace(rate(keda_scaler_errors[5m]), "namespace", "$1", "exported_namespace", "(.*)") - # Match only Mimir namespaces. - * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) - ) - > 0 - for: 1h - labels: - severity: critical -- name: mimir_ingest_storage_alerts - rules: - - alert: MimirIngesterLastConsumedOffsetCommitFailed - annotations: - message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to commit the last consumed offset. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterlastconsumedoffsetcommitfailed - expr: | - sum by(cluster, namespace, instance) (rate(cortex_ingest_storage_reader_offset_commit_failures_total[5m])) - / - sum by(cluster, namespace, instance) (rate(cortex_ingest_storage_reader_offset_commit_requests_total[5m])) - > 0.2 - for: 15m - labels: - severity: critical - - alert: MimirIngesterFailedToReadRecordsFromKafka - annotations: - message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to read records from Kafka. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailedtoreadrecordsfromkafka - expr: | - sum by(cluster, namespace, instance, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m])) - > 0 - for: 5m - labels: - severity: critical - - alert: MimirIngesterKafkaFetchErrorsRateTooHigh - annotations: - message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is receiving fetch errors when reading records from Kafka. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh - expr: | - sum by (cluster, namespace, instance) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m])) - / - sum by (cluster, namespace, instance) (rate (cortex_ingest_storage_reader_fetches_total[5m])) - > 0.1 - for: 15m - labels: - severity: critical - - alert: MimirStartingIngesterKafkaReceiveDelayIncreasing - annotations: - message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} in "starting" phase is not reducing consumption lag of write requests read - from Kafka. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing - expr: | - deriv(( - sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m])) - / - sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m])) - )[5m:1m]) > 0 - for: 5m - labels: - severity: warning - - alert: MimirRunningIngesterReceiveDelayTooHigh - annotations: - message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} in "running" phase is too far behind in its consumption of write requests - from Kafka. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh - expr: | - ( - sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) - / - sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) - ) > (10 * 60) - for: 5m - labels: - severity: critical - - alert: MimirIngesterFailsToProcessRecordsFromKafka - annotations: - message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} fails to consume write requests read from Kafka due to internal errors. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka - expr: | - sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0 - for: 5m - labels: - severity: critical - - alert: MimirIngesterFailsEnforceStrongConsistencyOnReadPath - annotations: - message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace - }} fails to enforce strong-consistency on read-path. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath - expr: | - sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0 - for: 5m - labels: - severity: critical -- name: mimir_continuous_test - rules: - - alert: MimirContinuousTestNotRunningOnWrites - annotations: - message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ - $labels.namespace }} is not effectively running because writes are failing. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestnotrunningonwrites - expr: | - sum by(cluster, namespace, test) (rate(mimir_continuous_test_writes_failed_total[5m])) > 0 - for: 1h - labels: - severity: warning - - alert: MimirContinuousTestNotRunningOnReads - annotations: - message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ - $labels.namespace }} is not effectively running because queries are failing. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestnotrunningonreads - expr: | - sum by(cluster, namespace, test) (rate(mimir_continuous_test_queries_failed_total[5m])) > 0 - for: 1h - labels: - severity: warning - - alert: MimirContinuousTestFailed - annotations: - message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ - $labels.namespace }} failed when asserting query results. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestfailed - expr: | - sum by(cluster, namespace, test) (rate(mimir_continuous_test_query_result_checks_failed_total[10m])) > 0 - labels: - severity: warning + for: 30m + labels: + severity: warning + workload_type: statefulset + - alert: MimirRolloutStuck + annotations: + message: | + The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck + expr: | + ( + sum without(deployment) (label_replace(kube_deployment_spec_replicas, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) + != + sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) + ) and ( + changes(sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) + == + 0 + ) + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + for: 30m + labels: + severity: warning + workload_type: deployment + - alert: RolloutOperatorNotReconciling + annotations: + message: | + Rollout operator is not reconciling the rollout group {{ $labels.rollout_group }} in {{ $labels.cluster }}/{{ $labels.namespace }}. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#rolloutoperatornotreconciling + expr: | + max by(cluster, namespace, rollout_group) (time() - rollout_operator_last_successful_group_reconcile_timestamp_seconds) > 600 + for: 5m + labels: + severity: critical + - name: mimir-provisioning + rules: + - alert: MimirAllocatingTooMuchMemory + annotations: + message: | + Instance {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirallocatingtoomuchmemory + expr: | + ( + process_resident_memory_bytes{job=~".*/(ingester|mimir-write|mimir-backend)"} + / + on(instance) node_memory_MemTotal_bytes{} + ) > 0.65 + for: 15m + labels: + severity: warning + - alert: MimirAllocatingTooMuchMemory + annotations: + message: | + Instance {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirallocatingtoomuchmemory + expr: | + ( + process_resident_memory_bytes{job=~".*/(ingester|mimir-write|mimir-backend)"} + / + on(instance) node_memory_MemTotal_bytes{} + ) > 0.8 + for: 15m + labels: + severity: critical + - name: ruler_alerts + rules: + - alert: MimirRulerTooManyFailedPushes + annotations: + message: | + Mimir Ruler {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% write (push) errors. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulertoomanyfailedpushes + expr: | + 100 * ( + sum by (cluster, namespace, instance) (rate(cortex_ruler_write_requests_failed_total[1m])) + / + sum by (cluster, namespace, instance) (rate(cortex_ruler_write_requests_total[1m])) + ) > 1 + for: 5m + labels: + severity: critical + - alert: MimirRulerTooManyFailedQueries + annotations: + message: | + Mimir Ruler {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulertoomanyfailedqueries + expr: | + 100 * ( + sum by (cluster, namespace, instance) (rate(cortex_ruler_queries_failed_total[1m])) + / + sum by (cluster, namespace, instance) (rate(cortex_ruler_queries_total[1m])) + ) > 1 + for: 5m + labels: + severity: critical + - alert: MimirRulerMissedEvaluations + annotations: + message: | + Mimir Ruler {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulermissedevaluations + expr: | + 100 * ( + sum by (cluster, namespace, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) + / + sum by (cluster, namespace, instance, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m])) + ) > 1 + for: 5m + labels: + severity: warning + - alert: MimirRulerFailedRingCheck + annotations: + message: | + Mimir Rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are experiencing errors when checking the ring for rule group ownership. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerfailedringcheck + expr: | + sum by (cluster, namespace, job) (rate(cortex_ruler_ring_check_errors_total[1m])) + > 0 + for: 5m + labels: + severity: critical + - alert: MimirRulerRemoteEvaluationFailing + annotations: + message: | + Mimir rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are failing to perform {{ printf "%.2f" $value }}% of remote evaluations through the ruler-query-frontend. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerremoteevaluationfailing + expr: | + 100 * ( + sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", job=~".*/(ruler-query-frontend.*)"}[5m])) + / + sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])) + ) > 1 + for: 5m + labels: + severity: warning + - name: gossip_alerts + rules: + - alert: MimirGossipMembersTooHigh + annotations: + message: One or more Mimir instances in {{ $labels.cluster }}/{{ $labels.namespace }} consistently sees a higher than expected number of gossip members. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmemberstoohigh + expr: | + max by (cluster, namespace) (memberlist_client_cluster_members_count) + > + (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) + for: 20m + labels: + severity: warning + - alert: MimirGossipMembersTooLow + annotations: + message: One or more Mimir instances in {{ $labels.cluster }}/{{ $labels.namespace }} consistently sees a lower than expected number of gossip members. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmemberstoolow + expr: | + min by (cluster, namespace) (memberlist_client_cluster_members_count) + < + (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) * 0.5) + for: 20m + labels: + severity: warning + - name: etcd_alerts + rules: + - alert: EtcdAllocatingTooMuchMemory + annotations: + message: | + Too much memory being used by {{ $labels.namespace }}/{{ $labels.instance }} - bump memory limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#etcdallocatingtoomuchmemory + expr: | + ( + container_memory_working_set_bytes{container="etcd"} + / + ( container_spec_memory_limit_bytes{container="etcd"} > 0 ) + ) > 0.65 + for: 15m + labels: + severity: warning + - alert: EtcdAllocatingTooMuchMemory + annotations: + message: | + Too much memory being used by {{ $labels.namespace }}/{{ $labels.instance }} - bump memory limit. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#etcdallocatingtoomuchmemory + expr: | + ( + container_memory_working_set_bytes{container="etcd"} + / + ( container_spec_memory_limit_bytes{container="etcd"} > 0 ) + ) > 0.8 + for: 15m + labels: + severity: critical + - name: alertmanager_alerts + rules: + - alert: MimirAlertmanagerSyncConfigsFailing + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to read tenant configurations from storage. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagersyncconfigsfailing + expr: | + rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0 + for: 30m + labels: + severity: critical + - alert: MimirAlertmanagerRingCheckFailing + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to check tenants ownership via the ring. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerringcheckfailing + expr: | + rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0 + for: 10m + labels: + severity: critical + - alert: MimirAlertmanagerPartialStateMergeFailing + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to merge partial state changes received from a replica. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerpartialstatemergefailing + expr: | + rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0 + for: 10m + labels: + severity: critical + - alert: MimirAlertmanagerReplicationFailing + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to replicating partial state to its replicas. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerreplicationfailing + expr: | + rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0 + for: 10m + labels: + severity: critical + - alert: MimirAlertmanagerPersistStateFailing + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to persist full state snaphots to remote storage. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerpersiststatefailing + expr: | + rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0 + for: 1h + labels: + severity: critical + - alert: MimirAlertmanagerInitialSyncFailed + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.instance }} was unable to obtain some initial state when starting up. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerinitialsyncfailed + expr: | + increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 + labels: + severity: critical + - alert: MimirAlertmanagerAllocatingTooMuchMemory + annotations: + message: | + Alertmanager {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerallocatingtoomuchmemory + expr: | + (process_resident_memory_bytes{job=~".*/alertmanager"} / on(instance) node_memory_MemTotal_bytes{}) > 0.80 + for: 15m + labels: + severity: warning + - alert: MimirAlertmanagerAllocatingTooMuchMemory + annotations: + message: | + Alertmanager {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerallocatingtoomuchmemory + expr: | + (process_resident_memory_bytes{job=~".*/alertmanager"} / on(instance) node_memory_MemTotal_bytes{}) > 0.90 + for: 15m + labels: + severity: critical + - alert: MimirAlertmanagerInstanceHasNoTenants + annotations: + message: Mimir alertmanager {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} owns no tenants. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerinstancehasnotenants + expr: | + # Alert on alertmanager instances in microservices mode that own no tenants, + min by(cluster, namespace, instance) (cortex_alertmanager_tenants_owned{instance=~".*alertmanager.*"}) == 0 + # but only if other instances of the same cell do have tenants assigned. + and on (cluster, namespace) + max by(cluster, namespace) (cortex_alertmanager_tenants_owned) > 0 + for: 1h + labels: + severity: warning + - name: mimir_blocks_alerts + rules: + - alert: MimirIngesterHasNotShippedBlocks + annotations: + message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not shipped any block in the last 4 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblocks + expr: | + (min by(cluster, namespace, instance) (time() - cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 60 * 60 * 4) + and + (max by(cluster, namespace, instance) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 0) + and + # Only if the ingester has ingested samples over the last 4h. + (max by(cluster, namespace, instance) (max_over_time(cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) + and + # Only if the ingester was ingesting samples 4h ago. This protects against the case where the ingester replica + # had ingested samples in the past, then no traffic was received for a long period and then it starts + # receiving samples again. Without this check, the alert would fire as soon as it gets back receiving + # samples, while the a block shipping is expected within the next 4h. + (max by(cluster, namespace, instance) (max_over_time(cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0) + for: 15m + labels: + severity: critical + - alert: MimirIngesterHasNotShippedBlocksSinceStart + annotations: + message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not shipped any block in the last 4 hours. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblockssincestart + expr: | + (max by(cluster, namespace, instance) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) == 0) + and + (max by(cluster, namespace, instance) (max_over_time(cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) + for: 4h + labels: + severity: critical + - alert: MimirIngesterHasUnshippedBlocks + annotations: + message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasunshippedblocks + expr: | + (time() - cortex_ingester_oldest_unshipped_block_timestamp_seconds > 3600) + and + (cortex_ingester_oldest_unshipped_block_timestamp_seconds > 0) + for: 15m + labels: + severity: critical + - alert: MimirIngesterTSDBHeadCompactionFailed + annotations: + message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to compact TSDB head. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbheadcompactionfailed + expr: | + rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0 + for: 15m + labels: + severity: critical + - alert: MimirIngesterTSDBHeadTruncationFailed + annotations: + message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to truncate TSDB head. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbheadtruncationfailed + expr: | + rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0 + labels: + severity: critical + - alert: MimirIngesterTSDBCheckpointCreationFailed + annotations: + message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to create TSDB checkpoint. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbcheckpointcreationfailed + expr: | + rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0 + labels: + severity: critical + - alert: MimirIngesterTSDBCheckpointDeletionFailed + annotations: + message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to delete TSDB checkpoint. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbcheckpointdeletionfailed + expr: | + rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0 + labels: + severity: critical + - alert: MimirIngesterTSDBWALTruncationFailed + annotations: + message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to truncate TSDB WAL. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwaltruncationfailed + expr: | + rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0 + labels: + severity: warning + - alert: MimirIngesterTSDBWALCorrupted + annotations: + message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} got a corrupted TSDB WAL. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted + expr: | + # alert when there are more than one corruptions + count by (cluster, namespace) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0) > 1 + and + # and there is only one zone + count by (cluster, namespace) (group by (cluster, namespace, job) (cortex_ingester_tsdb_wal_corruptions_total)) == 1 + labels: + deployment: single-zone + severity: critical + - alert: MimirIngesterTSDBWALCorrupted + annotations: + message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} got a corrupted TSDB WAL. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted + expr: | + # alert when there are more than one corruptions + count by (cluster, namespace) (sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0)) > 1 + and + # and there are multiple zones + count by (cluster, namespace) (group by (cluster, namespace, job) (cortex_ingester_tsdb_wal_corruptions_total)) > 1 + labels: + deployment: multi-zone + severity: critical + - alert: MimirIngesterTSDBWALWritesFailed + annotations: + message: Mimir Ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to write to TSDB WAL. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalwritesfailed + expr: | + rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0 + for: 3m + labels: + severity: critical + - alert: MimirStoreGatewayHasNotSyncTheBucket + annotations: + message: Mimir store-gateway {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not successfully synched the bucket since {{ $value | humanizeDuration }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewayhasnotsyncthebucket + expr: | + (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30) + and + cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0 + for: 5m + labels: + severity: critical + - alert: MimirStoreGatewayNoSyncedTenants + annotations: + message: Mimir store-gateway {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is not syncing any blocks for any tenant. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaynosyncedtenants + expr: | + min by(cluster, namespace, instance) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0 + for: 1h + labels: + severity: warning + - alert: MimirBucketIndexNotUpdated + annotations: + message: Mimir bucket index for tenant {{ $labels.user }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbucketindexnotupdated + expr: | + min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100 + labels: + severity: critical + - name: mimir_compactor_alerts + rules: + - alert: MimirCompactorHasNotSuccessfullyCleanedUpBlocks + annotations: + message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not successfully cleaned up blocks in the last 6 hours. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullycleanedupblocks + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. + (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 6) + for: 1h + labels: + severity: critical + - alert: MimirCompactorHasNotSuccessfullyRunCompaction + annotations: + message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not run compaction in the last 24 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. + (time() - cortex_compactor_last_successful_run_timestamp_seconds > 60 * 60 * 24) + and + (cortex_compactor_last_successful_run_timestamp_seconds > 0) + for: 1h + labels: + reason: in-last-24h + severity: critical + - alert: MimirCompactorHasNotSuccessfullyRunCompaction + annotations: + message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not run compaction in the last 24 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. + cortex_compactor_last_successful_run_timestamp_seconds == 0 + for: 24h + labels: + reason: since-startup + severity: critical + - alert: MimirCompactorHasNotSuccessfullyRunCompaction + annotations: + message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} failed to run 2 consecutive compactions. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction + expr: | + increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h]) >= 2 + labels: + reason: consecutive-failures + severity: critical + - alert: MimirCompactorHasNotUploadedBlocks + annotations: + message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not uploaded any block in the last 24 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotuploadedblocks + expr: | + (time() - (max by(cluster, namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"})) > 60 * 60 * 24) + and + (max by(cluster, namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"}) > 0) + and + # Only if some compactions have started. We don't want to fire this alert if the compactor has nothing to do + # (e.g. there are more replicas than required because running as part of mimir-backend). + (sum by(cluster, namespace, instance) (rate(cortex_compactor_group_compaction_runs_started_total[24h])) > 0) + for: 15m + labels: + severity: critical + time_period: 24h + - alert: MimirCompactorHasNotUploadedBlocks + annotations: + message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not uploaded any block since its start. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotuploadedblocks + expr: | + (max by(cluster, namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"}) == 0) + and + # Only if some compactions have started. We don't want to fire this alert if the compactor has nothing to do + # (e.g. there are more replicas than required because running as part of mimir-backend). + (sum by(cluster, namespace, instance) (rate(cortex_compactor_group_compaction_runs_started_total[24h])) > 0) + for: 24h + labels: + severity: critical + time_period: since-start + - alert: MimirCompactorSkippedUnhealthyBlocks + annotations: + message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has found and ignored unhealthy blocks. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorskippedunhealthyblocks + expr: | + increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 0 + for: 1m + labels: + severity: warning + - alert: MimirCompactorSkippedUnhealthyBlocks + annotations: + message: Mimir Compactor {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} has found and ignored unhealthy blocks. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorskippedunhealthyblocks + expr: | + increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 1 + for: 30m + labels: + severity: critical + - name: mimir_autoscaling + rules: + - alert: MimirAutoscalerNotActive + annotations: + message: The Horizontal Pod Autoscaler (HPA) {{ $labels.horizontalpodautoscaler }} in {{ $labels.namespace }} is not active. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirautoscalernotactive + expr: | + ( + label_replace(( + kube_horizontalpodautoscaler_status_condition{condition="ScalingActive",status="false"} + # Match only Mimir namespaces. + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + # Add "metric" label. + + on(cluster, namespace, horizontalpodautoscaler) group_right label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") + > 0), + "scaledObject", "$1", "horizontalpodautoscaler", "keda-hpa-(.*)" + ) + ) + # Alert only if the scaling metric exists and is > 0. If the KEDA ScaledObject is configured to scale down to 0, + # then HPA ScalingActive may be false when expected to run 0 replicas. In this case, the scaling metric exported + # by KEDA might not exist at all or might be exposed with a value of 0. + and on (cluster, namespace, metric, scaledObject) + (label_replace(keda_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") > 0) + for: 1h + labels: + severity: critical + - alert: MimirAutoscalerKedaFailing + annotations: + message: The Keda ScaledObject {{ $labels.scaledObject }} in {{ $labels.namespace }} is experiencing errors. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirautoscalerkedafailing + expr: | + ( + # Find KEDA scalers reporting errors. + label_replace(rate(keda_scaler_errors[5m]), "namespace", "$1", "exported_namespace", "(.*)") + # Match only Mimir namespaces.
+ * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + ) + > 0 + for: 1h + labels: + severity: critical + - name: mimir_ingest_storage_alerts + rules: + - alert: MimirIngesterLastConsumedOffsetCommitFailed + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to commit the last consumed offset. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterlastconsumedoffsetcommitfailed + expr: | + sum by(cluster, namespace, instance) (rate(cortex_ingest_storage_reader_offset_commit_failures_total[5m])) + / + sum by(cluster, namespace, instance) (rate(cortex_ingest_storage_reader_offset_commit_requests_total[5m])) + > 0.2 + for: 15m + labels: + severity: critical + - alert: MimirIngesterFailedToReadRecordsFromKafka + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to read records from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailedtoreadrecordsfromkafka + expr: | + sum by(cluster, namespace, instance, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m])) + > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterKafkaFetchErrorsRateTooHigh + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} is receiving fetch errors when reading records from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh + expr: | + sum by (cluster, namespace, instance) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m])) + / + sum by (cluster, namespace, instance) (rate (cortex_ingest_storage_reader_fetches_total[5m])) + > 0.1 + for: 15m + labels: + severity: critical + - alert: MimirStartingIngesterKafkaReceiveDelayIncreasing + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "starting" phase is not reducing consumption lag of write requests read from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing + expr: | + deriv(( + sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m])) + / + sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m])) + )[5m:1m]) > 0 + for: 5m + labels: + severity: warning + - alert: MimirRunningIngesterReceiveDelayTooHigh + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "running" phase is too far behind in its consumption of write requests from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh + expr: | + ( + sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) + / + sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) + ) > (10 * 60) + for: 5m + labels: + severity: critical + - alert: MimirIngesterFailsToProcessRecordsFromKafka + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to consume write requests read from Kafka due to internal errors. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka + expr: | + sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterFailsEnforceStrongConsistencyOnReadPath + annotations: + message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to enforce strong-consistency on read-path. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath + expr: | + sum by (cluster, namespace, instance) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0 + for: 5m + labels: + severity: critical + - name: mimir_continuous_test + rules: + - alert: MimirContinuousTestNotRunningOnWrites + annotations: + message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ $labels.namespace }} is not effectively running because writes are failing. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestnotrunningonwrites + expr: | + sum by(cluster, namespace, test) (rate(mimir_continuous_test_writes_failed_total[5m])) > 0 + for: 1h + labels: + severity: warning + - alert: MimirContinuousTestNotRunningOnReads + annotations: + message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ $labels.namespace }} is not effectively running because queries are failing. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestnotrunningonreads + expr: | + sum by(cluster, namespace, test) (rate(mimir_continuous_test_queries_failed_total[5m])) > 0 + for: 1h + labels: + severity: warning + - alert: MimirContinuousTestFailed + annotations: + message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ $labels.namespace }} failed when asserting query results. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestfailed + expr: | + sum by(cluster, namespace, test) (rate(mimir_continuous_test_query_result_checks_failed_total[10m])) > 0 + labels: + severity: warning diff --git a/operations/mimir-mixin-compiled-baremetal/rules.yaml b/operations/mimir-mixin-compiled-baremetal/rules.yaml index a88ae0d4f9a..61d3dbb27d2 100644 --- a/operations/mimir-mixin-compiled-baremetal/rules.yaml +++ b/operations/mimir-mixin-compiled-baremetal/rules.yaml @@ -1,439 +1,379 @@ groups: -- name: mimir_api_1 - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_request_duration_seconds:50quantile - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m])) - by (cluster, job) - record: cluster_job:cortex_request_duration_seconds:avg - - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job) - record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) - record: cluster_job:cortex_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:cortex_request_duration_seconds_count:sum_rate - - expr: sum(rate(cortex_request_duration_seconds[1m])) by (cluster, job) - record: cluster_job:cortex_request_duration_seconds:sum_rate -- name: mimir_api_2 - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:cortex_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:cortex_request_duration_seconds:50quantile - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) - / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:cortex_request_duration_seconds:avg - - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, - route) - record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) - record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate - - expr: sum(rate(cortex_request_duration_seconds[1m])) by (cluster, job, route) - record: cluster_job_route:cortex_request_duration_seconds:sum_rate -- name: mimir_api_3 - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile - - expr: 
sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, - namespace, job, route) - record: cluster_namespace_job_route:cortex_request_duration_seconds:avg - - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, - job, route) - record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate - - expr: sum(rate(cortex_request_duration_seconds[1m])) by (cluster, namespace, job, - route) - record: cluster_namespace_job_route:cortex_request_duration_seconds:sum_rate -- name: mimir_querier_api - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_querier_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_querier_request_duration_seconds:50quantile - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - job) - record: cluster_job:cortex_querier_request_duration_seconds:avg - - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, - job) - record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - job) - record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - job) - record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by - (cluster, job, route) - record: cluster_job_route:cortex_querier_request_duration_seconds:avg - - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, - job, route) - record: cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - job, route) - record: cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - job, route) - record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: 
cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) - by (cluster, namespace, job, route) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg - - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, - namespace, job, route) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - namespace, job, route) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - namespace, job, route) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate -- name: mimir_storage - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_kv_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_kv_request_duration_seconds:50quantile - - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) - / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:cortex_kv_request_duration_seconds:avg - - expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, - job) - record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) - record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate -- name: mimir_queries - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_query_frontend_retries:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_query_frontend_retries:50quantile - - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m])) - by (cluster, job) - record: cluster_job:cortex_query_frontend_retries:avg - - expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job) - record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate - - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) - record: cluster_job:cortex_query_frontend_retries_sum:sum_rate - - expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job) - record: cluster_job:cortex_query_frontend_retries_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile - - expr: 
histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile - - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, - job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by - (cluster, job) - record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg - - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, - cluster, job) - record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, - job) - record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, - job) - record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate -- name: mimir_ingester_queries - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_series:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_series:50quantile - - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m])) - by (cluster, job) - record: cluster_job:cortex_ingester_queried_series:avg - - expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job) - record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate - - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_series_sum:sum_rate - - expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_series_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_samples:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_samples:50quantile - - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m])) - by (cluster, job) - record: cluster_job:cortex_ingester_queried_samples:avg - - expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job) - record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate - - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate - - expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_samples_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_exemplars:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_exemplars:50quantile - - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) / - 
sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_exemplars:avg - - expr: sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, - job) - record: cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate - - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate - - expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate -- name: mimir_received_samples - rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m])) - record: cluster_namespace_job:cortex_distributor_received_samples:rate5m -- name: mimir_exemplars_in - rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m])) - record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m -- name: mimir_received_exemplars - rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m])) - record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m -- name: mimir_exemplars_ingested - rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m])) - record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m -- name: mimir_exemplars_appended - rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m])) - record: cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m -- name: mimir_scaling_rules - rules: - - expr: | - sum by (cluster, namespace, deployment) ( - label_replace( - cortex_build_info{namespace="baremetal"}, - "deployment", "$1", "job", "baremetal/(.*)" - ) - ) - record: cluster_namespace_deployment:actual_replicas:count - - expr: | - ceil( - quantile_over_time(0.99, - sum by (cluster, namespace) ( - cluster_namespace_job:cortex_distributor_received_samples:rate5m - )[24h:] - ) - / 240000 - ) - labels: - deployment: distributor - reason: sample_rate - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) - * 0.59999999999999998 / 240000 - ) - labels: - deployment: distributor - reason: sample_rate_limits - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - ceil( - quantile_over_time(0.99, - sum by (cluster, namespace) ( - cluster_namespace_job:cortex_distributor_received_samples:rate5m - )[24h:] - ) - * 3 / 80000 - ) - labels: - deployment: ingester - reason: sample_rate - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - ceil( - quantile_over_time(0.99, - sum by(cluster, namespace) ( - cortex_ingester_memory_series - )[24h:] - ) - / 1500000 - ) - labels: - deployment: ingester - reason: active_series - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"}) - * 3 * 0.59999999999999998 / 1500000 - ) - labels: - deployment: ingester - reason: active_series_limits - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) - * 0.59999999999999998 / 
80000 - ) - labels: - deployment: ingester - reason: sample_rate_limits - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - ceil( - (sum by (cluster, namespace) ( - cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"} - ) / 4) - / - avg by (cluster, namespace) ( - memcached_limit_bytes{job=~".+/memcached"} - ) - ) - labels: - deployment: memcached - reason: active_series - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - sum by (cluster, namespace, deployment) ( - irate( - label_replace( - process_cpu_seconds_total{namespace="baremetal"}, - "deployment", "$1", "job", "baremetal/(.*)" - )[5m:] - ) - ) - record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate - - expr: | - sum by (cluster, namespace, deployment) ( - count without(cpu, mode) ( - label_replace( - node_cpu_seconds_total{mode="idle"}, - "deployment", "$1", "instance", ".*(distributor|ingester|mimir-write|query-frontend|querier|ruler-query-frontend|ruler-querier|mimir-read|query-scheduler|ruler-query-scheduler|ruler|store-gateway|compactor|alertmanager|overrides-exporter|mimir-backend).*" - ) - ) - ) - record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum - - expr: | - ceil( - cluster_namespace_deployment:actual_replicas:count - * - quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h]) - / - cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum - / - 0.66000000000000003 - ) - labels: - reason: cpu_usage - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - sum by (cluster, namespace, deployment) ( - label_replace( - process_resident_memory_bytes{namespace="baremetal"}, - "deployment", "$1", "job", "baremetal/(.*)" - ) - ) - record: cluster_namespace_deployment:container_memory_usage_bytes:sum - - expr: | - sum by (cluster, namespace, deployment) ( - label_replace( - node_memory_MemTotal_bytes, - "deployment", "$1", "instance", ".*(distributor|ingester|mimir-write|query-frontend|querier|ruler-query-frontend|ruler-querier|mimir-read|query-scheduler|ruler-query-scheduler|ruler|store-gateway|compactor|alertmanager|overrides-exporter|mimir-backend).*" - ) - ) - record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum - - expr: | - ceil( - cluster_namespace_deployment:actual_replicas:count - * - quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h]) - / - cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum - / - 0.66000000000000003 - ) - labels: - reason: memory_usage - record: cluster_namespace_deployment_reason:required_replicas:count -- name: mimir_alertmanager_rules - rules: - - expr: | - sum by (cluster, job, instance) (cortex_alertmanager_alerts) - record: cluster_job_instance:cortex_alertmanager_alerts:sum - - expr: | - sum by (cluster, job, instance) (cortex_alertmanager_silences) - record: cluster_job_instance:cortex_alertmanager_silences:sum - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m])) - record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m])) - record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m - - expr: | - sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m])) - record: 
cluster_job_integration:cortex_alertmanager_notifications_total:rate5m - - expr: | - sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m])) - record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m])) - record: cluster_job:cortex_alertmanager_state_replication_total:rate5m - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m])) - record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m])) - record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m])) - record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m -- name: mimir_ingester_rules - rules: - - expr: | - sum by(cluster, namespace, instance) (rate(cortex_ingester_ingested_samples_total[1m])) - record: cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m + - name: mimir_api_1 + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_request_duration_seconds:50quantile + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_request_duration_seconds:avg + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_request_duration_seconds_count:sum_rate + - expr: sum(rate(cortex_request_duration_seconds[1m])) by (cluster, job) + record: cluster_job:cortex_request_duration_seconds:sum_rate + - name: mimir_api_2 + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:cortex_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:cortex_request_duration_seconds:50quantile + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds:avg + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate + - expr: 
sum(rate(cortex_request_duration_seconds[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds:sum_rate + - name: mimir_api_3 + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds:avg + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate + - expr: sum(rate(cortex_request_duration_seconds[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds:sum_rate + - name: mimir_querier_api + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_querier_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_querier_request_duration_seconds:50quantile + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_querier_request_duration_seconds:avg + - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds:avg + - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job, route) + record: 
cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg + - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate + - name: mimir_storage + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_kv_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_kv_request_duration_seconds:50quantile + - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_kv_request_duration_seconds:avg + - expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate + - name: mimir_queries + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_query_frontend_retries:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_query_frontend_retries:50quantile + - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_retries:avg + - expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job) + record: 
cluster_job:cortex_query_frontend_retries_bucket:sum_rate + - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_retries_sum:sum_rate + - expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_retries_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate + - name: mimir_ingester_queries + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_series:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_series:50quantile + - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_series:avg + - expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate + - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_series_sum:sum_rate + - expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_series_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_samples:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_samples:50quantile + - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_samples:avg + - expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate + - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate + - expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job) + record: 
cluster_job:cortex_ingester_queried_samples_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_exemplars:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_exemplars:50quantile + - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_exemplars:avg + - expr: sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate + - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate + - expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate + - name: mimir_received_samples + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m])) + record: cluster_namespace_job:cortex_distributor_received_samples:rate5m + - name: mimir_exemplars_in + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m])) + record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m + - name: mimir_received_exemplars + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m])) + record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m + - name: mimir_exemplars_ingested + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m])) + record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m + - name: mimir_exemplars_appended + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m])) + record: cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m + - name: mimir_scaling_rules + rules: + - expr: | + sum by (cluster, namespace, deployment) ( + label_replace( + cortex_build_info{namespace="baremetal"}, + "deployment", "$1", "job", "baremetal/(.*)" + ) + ) + record: cluster_namespace_deployment:actual_replicas:count + - expr: | + ceil( + quantile_over_time(0.99, + sum by (cluster, namespace) ( + cluster_namespace_job:cortex_distributor_received_samples:rate5m + )[24h:] + ) + / 240000 + ) + labels: + deployment: distributor + reason: sample_rate + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) + * 0.59999999999999998 / 240000 + ) + labels: + deployment: distributor + reason: sample_rate_limits + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + quantile_over_time(0.99, + sum by (cluster, namespace) ( + cluster_namespace_job:cortex_distributor_received_samples:rate5m + )[24h:] + ) + * 3 / 80000 + ) + labels: + deployment: ingester + reason: sample_rate + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + quantile_over_time(0.99, + sum by(cluster, namespace) ( + cortex_ingester_memory_series + )[24h:] + ) + / 1500000 + ) + labels: + deployment: ingester + reason: 
active_series + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"}) + * 3 * 0.59999999999999998 / 1500000 + ) + labels: + deployment: ingester + reason: active_series_limits + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) + * 0.59999999999999998 / 80000 + ) + labels: + deployment: ingester + reason: sample_rate_limits + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + (sum by (cluster, namespace) ( + cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"} + ) / 4) + / + avg by (cluster, namespace) ( + memcached_limit_bytes{job=~".+/memcached"} + ) + ) + labels: + deployment: memcached + reason: active_series + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + sum by (cluster, namespace, deployment) ( + irate( + label_replace( + process_cpu_seconds_total{namespace="baremetal"}, + "deployment", "$1", "job", "baremetal/(.*)" + )[5m:] + ) + ) + record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate + - expr: | + sum by (cluster, namespace, deployment) ( + count without(cpu, mode) ( + label_replace( + node_cpu_seconds_total{mode="idle"}, + "deployment", "$1", "instance", ".*(distributor|ingester|mimir-write|query-frontend|querier|ruler-query-frontend|ruler-querier|mimir-read|query-scheduler|ruler-query-scheduler|ruler|store-gateway|compactor|alertmanager|overrides-exporter|mimir-backend).*" + ) + ) + ) + record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum + - expr: | + ceil( + cluster_namespace_deployment:actual_replicas:count + * + quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h]) + / + cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum + / + 0.66000000000000003 + ) + labels: + reason: cpu_usage + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + sum by (cluster, namespace, deployment) ( + label_replace( + process_resident_memory_bytes{namespace="baremetal"}, + "deployment", "$1", "job", "baremetal/(.*)" + ) + ) + record: cluster_namespace_deployment:container_memory_usage_bytes:sum + - expr: | + sum by (cluster, namespace, deployment) ( + label_replace( + node_memory_MemTotal_bytes, + "deployment", "$1", "instance", ".*(distributor|ingester|mimir-write|query-frontend|querier|ruler-query-frontend|ruler-querier|mimir-read|query-scheduler|ruler-query-scheduler|ruler|store-gateway|compactor|alertmanager|overrides-exporter|mimir-backend).*" + ) + ) + record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum + - expr: | + ceil( + cluster_namespace_deployment:actual_replicas:count + * + quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h]) + / + cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum + / + 0.66000000000000003 + ) + labels: + reason: memory_usage + record: cluster_namespace_deployment_reason:required_replicas:count + - name: mimir_alertmanager_rules + rules: + - expr: | + sum by (cluster, job, instance) (cortex_alertmanager_alerts) + record: cluster_job_instance:cortex_alertmanager_alerts:sum + - expr: | + sum by (cluster, job, instance) (cortex_alertmanager_silences) + record: 
cluster_job_instance:cortex_alertmanager_silences:sum + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m])) + record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m])) + record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m + - expr: | + sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m])) + record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m + - expr: | + sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m])) + record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m])) + record: cluster_job:cortex_alertmanager_state_replication_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m])) + record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m])) + record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m])) + record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m + - name: mimir_ingester_rules + rules: + - expr: | + sum by(cluster, namespace, instance) (rate(cortex_ingester_ingested_samples_total[1m])) + record: cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index ba103799679..702968852bf 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -1,1065 +1,1015 @@ groups: -- name: mimir_alerts - rules: - - alert: MimirIngesterUnhealthy - annotations: - message: Mimir cluster {{ $labels.cluster }}/{{ $labels.namespace }} has {{ - printf "%f" $value }} unhealthy ingester(s). - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterunhealthy - expr: | - min by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0 - for: 15m - labels: - severity: critical - - alert: MimirRequestErrors - annotations: - message: | - The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors - expr: | - # The following 5xx errors considered as non-error: - # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry) - # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body - ( - sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",status_code!~"529|598",route!~"ready|debug_pprof"}[1m])) - / - sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m])) - ) * 100 > 1 - for: 15m - labels: - severity: critical - - alert: MimirRequestLatency - annotations: - message: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequestlatency - expr: | - cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop|debug_pprof"} - > - 2.5 - for: 15m - labels: - severity: warning - - alert: MimirInconsistentRuntimeConfig - annotations: - message: | - An inconsistent runtime config file is used across cluster {{ $labels.cluster }}/{{ $labels.namespace }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirinconsistentruntimeconfig - expr: | - count(count by(cluster, namespace, job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1 - for: 1h - labels: - severity: critical - - alert: MimirBadRuntimeConfig - annotations: - message: | - {{ $labels.job }} failed to reload runtime config. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbadruntimeconfig - expr: | - # The metric value is reset to 0 on error while reloading the config at runtime. - cortex_runtime_config_last_reload_successful == 0 - for: 5m - labels: - severity: critical - - alert: MimirFrontendQueriesStuck - annotations: - message: | - There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirfrontendqueriesstuck - expr: | - sum by (cluster, namespace, job) (min_over_time(cortex_query_frontend_queue_length[1m])) > 0 - for: 5m - labels: - severity: critical - - alert: MimirSchedulerQueriesStuck - annotations: - message: | - There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirschedulerqueriesstuck - expr: | - sum by (cluster, namespace, job) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0 - for: 7m - labels: - severity: critical - - alert: MimirCacheRequestErrors - annotations: - message: | - The cache {{ $labels.name }} used by Mimir {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircacherequesterrors - expr: | - ( - sum by(cluster, namespace, name, operation) ( - rate(thanos_memcached_operation_failures_total[1m]) - or - rate(thanos_cache_operation_failures_total[1m]) - ) - / - sum by(cluster, namespace, name, operation) ( - rate(thanos_memcached_operations_total[1m]) - or - rate(thanos_cache_operations_total[1m]) - ) - ) * 100 > 5 - for: 5m - labels: - severity: warning - - alert: MimirIngesterRestarts - annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterrestarts - expr: | - ( - sum by(cluster, namespace, pod) ( - increase(kube_pod_container_status_restarts_total{container=~"(ingester|mimir-write)"}[30m]) - ) - >= 2 - ) - and - ( - count by(cluster, namespace, pod) (cortex_build_info) > 0 - ) - labels: - severity: warning - - alert: MimirKVStoreFailure - annotations: - message: | - Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to talk to the KV store {{ $labels.kv_name }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirkvstorefailure - expr: | - ( - sum by(cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m])) - / - sum by(cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m])) - ) - # We want to get alerted only in case there's a constant failure. - == 1 - for: 5m - labels: - severity: critical - - alert: MimirMemoryMapAreasTooHigh - annotations: - message: '{{ $labels.job }}/{{ $labels.pod }} has a number of mmap-ed areas - close to the limit.' - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirmemorymapareastoohigh - expr: | - process_memory_map_areas{job=~".*/(ingester.*|cortex|mimir|mimir-write.*|store-gateway.*|cortex|mimir|mimir-backend.*)"} / process_memory_map_areas_limit{job=~".*/(ingester.*|cortex|mimir|mimir-write.*|store-gateway.*|cortex|mimir|mimir-backend.*)"} > 0.8 - for: 5m - labels: - severity: critical - - alert: MimirIngesterInstanceHasNoTenants - annotations: - message: Mimir ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has no tenants assigned. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterinstancehasnotenants - expr: | - (min by(cluster, namespace, pod) (cortex_ingester_memory_users) == 0) - and on (cluster, namespace) - # Only if there are more time-series than would be expected due to continuous testing load - ( - sum by(cluster, namespace) (cortex_ingester_memory_series) - / - max by(cluster, namespace) (cortex_distributor_replication_factor) - ) > 100000 - for: 1h - labels: - severity: warning - - alert: MimirRulerInstanceHasNoRuleGroups - annotations: - message: Mimir ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has no rule groups assigned. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerinstancehasnorulegroups - expr: | - # Alert on ruler instances in microservices mode that have no rule groups assigned, - min by(cluster, namespace, pod) (cortex_ruler_managers_total{pod=~"(.*mimir-)?ruler.*"}) == 0 - # but only if other ruler instances of the same cell do have rule groups assigned - and on (cluster, namespace) - (max by(cluster, namespace) (cortex_ruler_managers_total) > 0) - # and there are more than two instances overall - and on (cluster, namespace) - (count by (cluster, namespace) (cortex_ruler_managers_total) > 2) - for: 1h - labels: - severity: warning - - alert: MimirIngestedDataTooFarInTheFuture - annotations: - message: Mimir ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has ingested samples with timestamps more than 1h in the future. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesteddatatoofarinthefuture - expr: | - max by(cluster, namespace, pod) ( - cortex_ingester_tsdb_head_max_timestamp_seconds - time() - and - cortex_ingester_tsdb_head_max_timestamp_seconds > 0 - ) > 60*60 - for: 5m - labels: - severity: warning - - alert: MimirStoreGatewayTooManyFailedOperations - annotations: - message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is experiencing {{ $value | humanizePercentage }} errors while doing {{ - $labels.operation }} on the object storage. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaytoomanyfailedoperations - expr: | - sum by(cluster, namespace, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[1m])) > 0 - for: 5m - labels: - severity: warning - - alert: MimirRingMembersMismatch - annotations: - message: | - Number of members in Mimir ingester hash ring does not match the expected number in {{ $labels.cluster }}/{{ $labels.namespace }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirringmembersmismatch - expr: | - ( - avg by(cluster, namespace) (sum by(cluster, namespace, pod) (cortex_ring_members{name="ingester",job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"})) - != sum by(cluster, namespace) (up{job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"}) - ) - and - ( - count by(cluster, namespace) (cortex_build_info) > 0 - ) - for: 15m - labels: - component: ingester - severity: warning -- name: mimir_instance_limits_alerts - rules: - - alert: MimirIngesterReachingSeriesLimit - annotations: - message: | - Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its series limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingserieslimit - expr: | - ( - (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) - and ignoring (limit) - (cortex_ingester_instance_limits{limit="max_series"} > 0) - ) > 0.8 - for: 3h - labels: - severity: warning - - alert: MimirIngesterReachingSeriesLimit - annotations: - message: | - Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its series limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingserieslimit - expr: | - ( - (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) - and ignoring (limit) - (cortex_ingester_instance_limits{limit="max_series"} > 0) - ) > 0.9 - for: 5m - labels: - severity: critical - - alert: MimirIngesterReachingTenantsLimit - annotations: - message: | - Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its tenant limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingtenantslimit - expr: | - ( - (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) - and ignoring (limit) - (cortex_ingester_instance_limits{limit="max_tenants"} > 0) - ) > 0.7 - for: 5m - labels: - severity: warning - - alert: MimirIngesterReachingTenantsLimit - annotations: - message: | - Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its tenant limit. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingtenantslimit - expr: | - ( - (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) - and ignoring (limit) - (cortex_ingester_instance_limits{limit="max_tenants"} > 0) - ) > 0.8 - for: 5m - labels: - severity: critical - - alert: MimirReachingTCPConnectionsLimit - annotations: - message: | - Mimir instance {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its TCP connections limit for {{ $labels.protocol }} protocol. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirreachingtcpconnectionslimit - expr: | - cortex_tcp_connections / cortex_tcp_connections_limit > 0.8 and - cortex_tcp_connections_limit > 0 - for: 5m - labels: - severity: critical - - alert: MimirDistributorReachingInflightPushRequestLimit - annotations: - message: | - Distributor {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its inflight push request limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirdistributorreachinginflightpushrequestlimit - expr: | - ( - (cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"}) - and ignoring (limit) - (cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0) - ) > 0.8 - for: 5m - labels: - severity: critical -- name: mimir-rollout-alerts - rules: - - alert: MimirRolloutStuck - annotations: - message: | - The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck - expr: | - ( - max without (revision) ( - sum without(statefulset) (label_replace(kube_statefulset_status_current_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) - unless - sum without(statefulset) (label_replace(kube_statefulset_status_update_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) - ) - * - ( - sum without(statefulset) (label_replace(kube_statefulset_replicas, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) - != - sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) - ) - ) and ( - changes(sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) - == - 0 - ) - * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) - for: 30m - labels: - severity: warning - workload_type: statefulset - - alert: MimirRolloutStuck - annotations: - message: | - The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck - expr: | - ( - sum without(deployment) (label_replace(kube_deployment_spec_replicas, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) - != - sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) - ) and ( - changes(sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) - == - 0 - ) - * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) - for: 30m - labels: - severity: warning - workload_type: deployment - - alert: RolloutOperatorNotReconciling - annotations: - message: | - Rollout operator is not reconciling the rollout group {{ $labels.rollout_group }} in {{ $labels.cluster }}/{{ $labels.namespace }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#rolloutoperatornotreconciling - expr: | - max by(cluster, namespace, rollout_group) (time() - rollout_operator_last_successful_group_reconcile_timestamp_seconds) > 600 - for: 5m - labels: - severity: critical -- name: mimir-provisioning - rules: - - alert: MimirAllocatingTooMuchMemory - annotations: - message: | - Instance {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirallocatingtoomuchmemory - expr: | - ( - # We use RSS instead of working set memory because of the ingester's extensive usage of mmap. - # See: https://github.com/grafana/mimir/issues/2466 - container_memory_rss{container=~"(ingester|mimir-write|mimir-backend)"} - / - ( container_spec_memory_limit_bytes{container=~"(ingester|mimir-write|mimir-backend)"} > 0 ) - ) - # Match only Mimir namespaces. - * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) - > 0.65 - for: 15m - labels: - severity: warning - - alert: MimirAllocatingTooMuchMemory - annotations: - message: | - Instance {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirallocatingtoomuchmemory - expr: | - ( - # We use RSS instead of working set memory because of the ingester's extensive usage of mmap. - # See: https://github.com/grafana/mimir/issues/2466 - container_memory_rss{container=~"(ingester|mimir-write|mimir-backend)"} - / - ( container_spec_memory_limit_bytes{container=~"(ingester|mimir-write|mimir-backend)"} > 0 ) - ) - # Match only Mimir namespaces. - * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) - > 0.8 - for: 15m - labels: - severity: critical -- name: ruler_alerts - rules: - - alert: MimirRulerTooManyFailedPushes - annotations: - message: | - Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% write (push) errors. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulertoomanyfailedpushes - expr: | - 100 * ( - sum by (cluster, namespace, pod) (rate(cortex_ruler_write_requests_failed_total[1m])) - / - sum by (cluster, namespace, pod) (rate(cortex_ruler_write_requests_total[1m])) - ) > 1 - for: 5m - labels: - severity: critical - - alert: MimirRulerTooManyFailedQueries - annotations: - message: | - Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulertoomanyfailedqueries - expr: | - 100 * ( - sum by (cluster, namespace, pod) (rate(cortex_ruler_queries_failed_total[1m])) - / - sum by (cluster, namespace, pod) (rate(cortex_ruler_queries_total[1m])) - ) > 1 - for: 5m - labels: - severity: critical - - alert: MimirRulerMissedEvaluations - annotations: - message: | - Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulermissedevaluations - expr: | - 100 * ( - sum by (cluster, namespace, pod, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) - / - sum by (cluster, namespace, pod, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m])) - ) > 1 - for: 5m - labels: - severity: warning - - alert: MimirRulerFailedRingCheck - annotations: - message: | - Mimir Rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are experiencing errors when checking the ring for rule group ownership. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerfailedringcheck - expr: | - sum by (cluster, namespace, job) (rate(cortex_ruler_ring_check_errors_total[1m])) - > 0 - for: 5m - labels: - severity: critical - - alert: MimirRulerRemoteEvaluationFailing - annotations: - message: | - Mimir rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are failing to perform {{ printf "%.2f" $value }}% of remote evaluations through the ruler-query-frontend. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerremoteevaluationfailing - expr: | - 100 * ( - sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", job=~".*/(ruler-query-frontend.*)"}[5m])) - / - sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])) - ) > 1 - for: 5m - labels: - severity: warning -- name: gossip_alerts - rules: - - alert: MimirGossipMembersTooHigh - annotations: - message: One or more Mimir instances in {{ $labels.cluster }}/{{ $labels.namespace - }} consistently sees a higher than expected number of gossip members. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmemberstoohigh - expr: | - max by (cluster, namespace) (memberlist_client_cluster_members_count) - > - (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) - for: 20m - labels: - severity: warning - - alert: MimirGossipMembersTooLow - annotations: - message: One or more Mimir instances in {{ $labels.cluster }}/{{ $labels.namespace - }} consistently sees a lower than expected number of gossip members. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmemberstoolow - expr: | - min by (cluster, namespace) (memberlist_client_cluster_members_count) - < - (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) * 0.5) - for: 20m - labels: - severity: warning -- name: etcd_alerts - rules: - - alert: EtcdAllocatingTooMuchMemory - annotations: - message: | - Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#etcdallocatingtoomuchmemory - expr: | - ( - container_memory_working_set_bytes{container="etcd"} - / - ( container_spec_memory_limit_bytes{container="etcd"} > 0 ) - ) > 0.65 - for: 15m - labels: - severity: warning - - alert: EtcdAllocatingTooMuchMemory - annotations: - message: | - Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#etcdallocatingtoomuchmemory - expr: | - ( - container_memory_working_set_bytes{container="etcd"} - / - ( container_spec_memory_limit_bytes{container="etcd"} > 0 ) - ) > 0.8 - for: 15m - labels: - severity: critical -- name: alertmanager_alerts - rules: - - alert: MimirAlertmanagerSyncConfigsFailing - annotations: - message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to read tenant configurations from storage. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagersyncconfigsfailing - expr: | - rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0 - for: 30m - labels: - severity: critical - - alert: MimirAlertmanagerRingCheckFailing - annotations: - message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is unable to check tenants ownership via the ring. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerringcheckfailing - expr: | - rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0 - for: 10m - labels: - severity: critical - - alert: MimirAlertmanagerPartialStateMergeFailing - annotations: - message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to merge partial state changes received from a replica. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerpartialstatemergefailing - expr: | - rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0 - for: 10m - labels: - severity: critical - - alert: MimirAlertmanagerReplicationFailing - annotations: - message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to replicating partial state to its replicas. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerreplicationfailing - expr: | - rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0 - for: 10m - labels: - severity: critical - - alert: MimirAlertmanagerPersistStateFailing - annotations: - message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is unable to persist full state snaphots to remote storage. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerpersiststatefailing - expr: | - rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0 - for: 1h - labels: - severity: critical - - alert: MimirAlertmanagerInitialSyncFailed - annotations: - message: | - Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} was unable to obtain some initial state when starting up. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerinitialsyncfailed - expr: | - increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 - labels: - severity: critical - - alert: MimirAlertmanagerAllocatingTooMuchMemory - annotations: - message: | - Alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerallocatingtoomuchmemory - expr: | - (container_memory_working_set_bytes{container="alertmanager"} / container_spec_memory_limit_bytes{container="alertmanager"}) > 0.80 - and - (container_spec_memory_limit_bytes{container="alertmanager"} > 0) - for: 15m - labels: - severity: warning - - alert: MimirAlertmanagerAllocatingTooMuchMemory - annotations: - message: | - Alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerallocatingtoomuchmemory - expr: | - (container_memory_working_set_bytes{container="alertmanager"} / container_spec_memory_limit_bytes{container="alertmanager"}) > 0.90 - and - (container_spec_memory_limit_bytes{container="alertmanager"} > 0) - for: 15m - labels: - severity: critical - - alert: MimirAlertmanagerInstanceHasNoTenants - annotations: - message: Mimir alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} owns no tenants. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerinstancehasnotenants - expr: | - # Alert on alertmanager instances in microservices mode that own no tenants, - min by(cluster, namespace, pod) (cortex_alertmanager_tenants_owned{pod=~"(.*mimir-)?alertmanager.*"}) == 0 - # but only if other instances of the same cell do have tenants assigned. 
- and on (cluster, namespace) - max by(cluster, namespace) (cortex_alertmanager_tenants_owned) > 0 - for: 1h - labels: - severity: warning -- name: mimir_blocks_alerts - rules: - - alert: MimirIngesterHasNotShippedBlocks - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has not shipped any block in the last 4 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblocks - expr: | - (min by(cluster, namespace, pod) (time() - cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 60 * 60 * 4) - and - (max by(cluster, namespace, pod) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 0) - and - # Only if the ingester has ingested samples over the last 4h. - (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) - and - # Only if the ingester was ingesting samples 4h ago. This protects against the case where the ingester replica - # had ingested samples in the past, then no traffic was received for a long period and then it starts - # receiving samples again. Without this check, the alert would fire as soon as it gets back receiving - # samples, while the a block shipping is expected within the next 4h. - (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0) - for: 15m - labels: - severity: critical - - alert: MimirIngesterHasNotShippedBlocksSinceStart - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has not shipped any block in the last 4 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblockssincestart - expr: | - (max by(cluster, namespace, pod) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) == 0) - and - (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) - for: 4h - labels: - severity: critical - - alert: MimirIngesterHasUnshippedBlocks - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't - been successfully uploaded to the storage yet. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasunshippedblocks - expr: | - (time() - cortex_ingester_oldest_unshipped_block_timestamp_seconds > 3600) - and - (cortex_ingester_oldest_unshipped_block_timestamp_seconds > 0) - for: 15m - labels: - severity: critical - - alert: MimirIngesterTSDBHeadCompactionFailed - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to compact TSDB head. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbheadcompactionfailed - expr: | - rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0 - for: 15m - labels: - severity: critical - - alert: MimirIngesterTSDBHeadTruncationFailed - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to truncate TSDB head. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbheadtruncationfailed - expr: | - rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0 - labels: - severity: critical - - alert: MimirIngesterTSDBCheckpointCreationFailed - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to create TSDB checkpoint. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbcheckpointcreationfailed - expr: | - rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0 - labels: - severity: critical - - alert: MimirIngesterTSDBCheckpointDeletionFailed - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to delete TSDB checkpoint. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbcheckpointdeletionfailed - expr: | - rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0 - labels: - severity: critical - - alert: MimirIngesterTSDBWALTruncationFailed - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to truncate TSDB WAL. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwaltruncationfailed - expr: | - rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0 - labels: - severity: warning - - alert: MimirIngesterTSDBWALCorrupted - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} got a corrupted TSDB WAL. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted - expr: | - # alert when there are more than one corruptions - count by (cluster, namespace) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0) > 1 - and - # and there is only one zone - count by (cluster, namespace) (group by (cluster, namespace, job) (cortex_ingester_tsdb_wal_corruptions_total)) == 1 - labels: - deployment: single-zone - severity: critical - - alert: MimirIngesterTSDBWALCorrupted - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} got a corrupted TSDB WAL. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted - expr: | - # alert when there are more than one corruptions - count by (cluster, namespace) (sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0)) > 1 - and - # and there are multiple zones - count by (cluster, namespace) (group by (cluster, namespace, job) (cortex_ingester_tsdb_wal_corruptions_total)) > 1 - labels: - deployment: multi-zone - severity: critical - - alert: MimirIngesterTSDBWALWritesFailed - annotations: - message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to write to TSDB WAL. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalwritesfailed - expr: | - rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0 - for: 3m - labels: - severity: critical - - alert: MimirStoreGatewayHasNotSyncTheBucket - annotations: - message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has not successfully synched the bucket since {{ $value | humanizeDuration - }}. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewayhasnotsyncthebucket - expr: | - (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30) - and - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0 - for: 5m - labels: - severity: critical - - alert: MimirStoreGatewayNoSyncedTenants - annotations: - message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is not syncing any blocks for any tenant. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaynosyncedtenants - expr: | - min by(cluster, namespace, pod) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0 - for: 1h - labels: - severity: warning - - alert: MimirBucketIndexNotUpdated - annotations: - message: Mimir bucket index for tenant {{ $labels.user }} in {{ $labels.cluster - }}/{{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration - }}. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbucketindexnotupdated - expr: | - min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100 - labels: - severity: critical -- name: mimir_compactor_alerts - rules: - - alert: MimirCompactorHasNotSuccessfullyCleanedUpBlocks - annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has not successfully cleaned up blocks in the last 6 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullycleanedupblocks - expr: | - # The "last successful run" metric is updated even if the compactor owns no tenants, - # so this alert correctly doesn't fire if compactor has nothing to do. - (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 6) - for: 1h - labels: - severity: critical - - alert: MimirCompactorHasNotSuccessfullyRunCompaction - annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has not run compaction in the last 24 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction - expr: | - # The "last successful run" metric is updated even if the compactor owns no tenants, - # so this alert correctly doesn't fire if compactor has nothing to do. - (time() - cortex_compactor_last_successful_run_timestamp_seconds > 60 * 60 * 24) - and - (cortex_compactor_last_successful_run_timestamp_seconds > 0) - for: 1h - labels: - reason: in-last-24h - severity: critical - - alert: MimirCompactorHasNotSuccessfullyRunCompaction - annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has not run compaction in the last 24 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction - expr: | - # The "last successful run" metric is updated even if the compactor owns no tenants, - # so this alert correctly doesn't fire if compactor has nothing to do. 
- cortex_compactor_last_successful_run_timestamp_seconds == 0 - for: 24h - labels: - reason: since-startup - severity: critical - - alert: MimirCompactorHasNotSuccessfullyRunCompaction - annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} failed to run 2 consecutive compactions. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction - expr: | - increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h]) >= 2 - labels: - reason: consecutive-failures - severity: critical - - alert: MimirCompactorHasNotUploadedBlocks - annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has not uploaded any block in the last 24 hours. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotuploadedblocks - expr: | - (time() - (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"})) > 60 * 60 * 24) - and - (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"}) > 0) - and - # Only if some compactions have started. We don't want to fire this alert if the compactor has nothing to do - # (e.g. there are more replicas than required because running as part of mimir-backend). - (sum by(cluster, namespace, pod) (rate(cortex_compactor_group_compaction_runs_started_total[24h])) > 0) - for: 15m - labels: - severity: critical - time_period: 24h - - alert: MimirCompactorHasNotUploadedBlocks - annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has not uploaded any block since its start. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotuploadedblocks - expr: | - (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"}) == 0) - and - # Only if some compactions have started. We don't want to fire this alert if the compactor has nothing to do - # (e.g. there are more replicas than required because running as part of mimir-backend). - (sum by(cluster, namespace, pod) (rate(cortex_compactor_group_compaction_runs_started_total[24h])) > 0) - for: 24h - labels: - severity: critical - time_period: since-start - - alert: MimirCompactorSkippedUnhealthyBlocks - annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has found and ignored unhealthy blocks. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorskippedunhealthyblocks - expr: | - increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 0 - for: 1m - labels: - severity: warning - - alert: MimirCompactorSkippedUnhealthyBlocks - annotations: - message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} has found and ignored unhealthy blocks. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorskippedunhealthyblocks - expr: | - increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 1 - for: 30m - labels: - severity: critical -- name: mimir_autoscaling - rules: - - alert: MimirAutoscalerNotActive - annotations: - message: The Horizontal Pod Autoscaler (HPA) {{ $labels.horizontalpodautoscaler - }} in {{ $labels.namespace }} is not active. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirautoscalernotactive - expr: | - ( - label_replace(( - kube_horizontalpodautoscaler_status_condition{condition="ScalingActive",status="false"} + - name: mimir_alerts + rules: + - alert: MimirIngesterUnhealthy + annotations: + message: Mimir cluster {{ $labels.cluster }}/{{ $labels.namespace }} has {{ printf "%f" $value }} unhealthy ingester(s). + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterunhealthy + expr: | + min by (cluster, namespace) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0 + for: 15m + labels: + severity: critical + - alert: MimirRequestErrors + annotations: + message: | + The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors + expr: | + # The following 5xx errors considered as non-error: + # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry) + # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body + ( + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",status_code!~"529|598",route!~"ready|debug_pprof"}[1m])) + / + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m])) + ) * 100 > 1 + for: 15m + labels: + severity: critical + - alert: MimirRequestLatency + annotations: + message: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequestlatency + expr: | + cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop|debug_pprof"} + > + 2.5 + for: 15m + labels: + severity: warning + - alert: MimirInconsistentRuntimeConfig + annotations: + message: | + An inconsistent runtime config file is used across cluster {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirinconsistentruntimeconfig + expr: | + count(count by(cluster, namespace, job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1 + for: 1h + labels: + severity: critical + - alert: MimirBadRuntimeConfig + annotations: + message: | + {{ $labels.job }} failed to reload runtime config. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbadruntimeconfig + expr: | + # The metric value is reset to 0 on error while reloading the config at runtime. + cortex_runtime_config_last_reload_successful == 0 + for: 5m + labels: + severity: critical + - alert: MimirFrontendQueriesStuck + annotations: + message: | + There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirfrontendqueriesstuck + expr: | + sum by (cluster, namespace, job) (min_over_time(cortex_query_frontend_queue_length[1m])) > 0 + for: 5m + labels: + severity: critical + - alert: MimirSchedulerQueriesStuck + annotations: + message: | + There are {{ $value }} queued up queries in {{ $labels.cluster }}/{{ $labels.namespace }} {{ $labels.job }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirschedulerqueriesstuck + expr: | + sum by (cluster, namespace, job) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0 + for: 7m + labels: + severity: critical + - alert: MimirCacheRequestErrors + annotations: + message: | + The cache {{ $labels.name }} used by Mimir {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircacherequesterrors + expr: | + ( + sum by(cluster, namespace, name, operation) ( + rate(thanos_memcached_operation_failures_total[1m]) + or + rate(thanos_cache_operation_failures_total[1m]) + ) + / + sum by(cluster, namespace, name, operation) ( + rate(thanos_memcached_operations_total[1m]) + or + rate(thanos_cache_operations_total[1m]) + ) + ) * 100 > 5 + for: 5m + labels: + severity: warning + - alert: MimirIngesterRestarts + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterrestarts + expr: | + ( + sum by(cluster, namespace, pod) ( + increase(kube_pod_container_status_restarts_total{container=~"(ingester|mimir-write)"}[30m]) + ) + >= 2 + ) + and + ( + count by(cluster, namespace, pod) (cortex_build_info) > 0 + ) + labels: + severity: warning + - alert: MimirKVStoreFailure + annotations: + message: | + Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to talk to the KV store {{ $labels.kv_name }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirkvstorefailure + expr: | + ( + sum by(cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m])) + / + sum by(cluster, namespace, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m])) + ) + # We want to get alerted only in case there's a constant failure. + == 1 + for: 5m + labels: + severity: critical + - alert: MimirMemoryMapAreasTooHigh + annotations: + message: '{{ $labels.job }}/{{ $labels.pod }} has a number of mmap-ed areas close to the limit.' + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirmemorymapareastoohigh + expr: | + process_memory_map_areas{job=~".*/(ingester.*|cortex|mimir|mimir-write.*|store-gateway.*|cortex|mimir|mimir-backend.*)"} / process_memory_map_areas_limit{job=~".*/(ingester.*|cortex|mimir|mimir-write.*|store-gateway.*|cortex|mimir|mimir-backend.*)"} > 0.8 + for: 5m + labels: + severity: critical + - alert: MimirIngesterInstanceHasNoTenants + annotations: + message: Mimir ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has no tenants assigned. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterinstancehasnotenants + expr: | + (min by(cluster, namespace, pod) (cortex_ingester_memory_users) == 0) + and on (cluster, namespace) + # Only if there are more time-series than would be expected due to continuous testing load + ( + sum by(cluster, namespace) (cortex_ingester_memory_series) + / + max by(cluster, namespace) (cortex_distributor_replication_factor) + ) > 100000 + for: 1h + labels: + severity: warning + - alert: MimirRulerInstanceHasNoRuleGroups + annotations: + message: Mimir ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has no rule groups assigned. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerinstancehasnorulegroups + expr: | + # Alert on ruler instances in microservices mode that have no rule groups assigned, + min by(cluster, namespace, pod) (cortex_ruler_managers_total{pod=~"(.*mimir-)?ruler.*"}) == 0 + # but only if other ruler instances of the same cell do have rule groups assigned + and on (cluster, namespace) + (max by(cluster, namespace) (cortex_ruler_managers_total) > 0) + # and there are more than two instances overall + and on (cluster, namespace) + (count by (cluster, namespace) (cortex_ruler_managers_total) > 2) + for: 1h + labels: + severity: warning + - alert: MimirIngestedDataTooFarInTheFuture + annotations: + message: Mimir ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has ingested samples with timestamps more than 1h in the future. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesteddatatoofarinthefuture + expr: | + max by(cluster, namespace, pod) ( + cortex_ingester_tsdb_head_max_timestamp_seconds - time() + and + cortex_ingester_tsdb_head_max_timestamp_seconds > 0 + ) > 60*60 + for: 5m + labels: + severity: warning + - alert: MimirStoreGatewayTooManyFailedOperations + annotations: + message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ $value | humanizePercentage }} errors while doing {{ $labels.operation }} on the object storage. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaytoomanyfailedoperations + expr: | + sum by(cluster, namespace, operation) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[1m])) > 0 + for: 5m + labels: + severity: warning + - alert: MimirRingMembersMismatch + annotations: + message: | + Number of members in Mimir ingester hash ring does not match the expected number in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirringmembersmismatch + expr: | + ( + avg by(cluster, namespace) (sum by(cluster, namespace, pod) (cortex_ring_members{name="ingester",job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"})) + != sum by(cluster, namespace) (up{job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"}) + ) + and + ( + count by(cluster, namespace) (cortex_build_info) > 0 + ) + for: 15m + labels: + component: ingester + severity: warning + - name: mimir_instance_limits_alerts + rules: + - alert: MimirIngesterReachingSeriesLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its series limit. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingserieslimit + expr: | + ( + (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_series"} > 0) + ) > 0.8 + for: 3h + labels: + severity: warning + - alert: MimirIngesterReachingSeriesLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its series limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingserieslimit + expr: | + ( + (cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_series"} > 0) + ) > 0.9 + for: 5m + labels: + severity: critical + - alert: MimirIngesterReachingTenantsLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its tenant limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingtenantslimit + expr: | + ( + (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_tenants"} > 0) + ) > 0.7 + for: 5m + labels: + severity: warning + - alert: MimirIngesterReachingTenantsLimit + annotations: + message: | + Ingester {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its tenant limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterreachingtenantslimit + expr: | + ( + (cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"}) + and ignoring (limit) + (cortex_ingester_instance_limits{limit="max_tenants"} > 0) + ) > 0.8 + for: 5m + labels: + severity: critical + - alert: MimirReachingTCPConnectionsLimit + annotations: + message: | + Mimir instance {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its TCP connections limit for {{ $labels.protocol }} protocol. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirreachingtcpconnectionslimit + expr: | + cortex_tcp_connections / cortex_tcp_connections_limit > 0.8 and + cortex_tcp_connections_limit > 0 + for: 5m + labels: + severity: critical + - alert: MimirDistributorReachingInflightPushRequestLimit + annotations: + message: | + Distributor {{ $labels.job }}/{{ $labels.pod }} has reached {{ $value | humanizePercentage }} of its inflight push request limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirdistributorreachinginflightpushrequestlimit + expr: | + ( + (cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"}) + and ignoring (limit) + (cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0) + ) > 0.8 + for: 5m + labels: + severity: critical + - name: mimir-rollout-alerts + rules: + - alert: MimirRolloutStuck + annotations: + message: | + The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck + expr: | + ( + max without (revision) ( + sum without(statefulset) (label_replace(kube_statefulset_status_current_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + unless + sum without(statefulset) (label_replace(kube_statefulset_status_update_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + ) + * + ( + sum without(statefulset) (label_replace(kube_statefulset_replicas, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + != + sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + ) + ) and ( + changes(sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) + == + 0 + ) + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + for: 30m + labels: + severity: warning + workload_type: statefulset + - alert: MimirRolloutStuck + annotations: + message: | + The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck + expr: | + ( + sum without(deployment) (label_replace(kube_deployment_spec_replicas, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) + != + sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) + ) and ( + changes(sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) + == + 0 + ) + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + for: 30m + labels: + severity: warning + workload_type: deployment + - alert: RolloutOperatorNotReconciling + annotations: + message: | + Rollout operator is not reconciling the rollout group {{ $labels.rollout_group }} in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#rolloutoperatornotreconciling + expr: | + max by(cluster, namespace, rollout_group) (time() - rollout_operator_last_successful_group_reconcile_timestamp_seconds) > 600 + for: 5m + labels: + severity: critical + - name: mimir-provisioning + rules: + - alert: MimirAllocatingTooMuchMemory + annotations: + message: | + Instance {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirallocatingtoomuchmemory + expr: | + ( + # We use RSS instead of working set memory because of the ingester's extensive usage of mmap. + # See: https://github.com/grafana/mimir/issues/2466 + container_memory_rss{container=~"(ingester|mimir-write|mimir-backend)"} + / + ( container_spec_memory_limit_bytes{container=~"(ingester|mimir-write|mimir-backend)"} > 0 ) + ) + # Match only Mimir namespaces. + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + > 0.65 + for: 15m + labels: + severity: warning + - alert: MimirAllocatingTooMuchMemory + annotations: + message: | + Instance {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirallocatingtoomuchmemory + expr: | + ( + # We use RSS instead of working set memory because of the ingester's extensive usage of mmap. + # See: https://github.com/grafana/mimir/issues/2466 + container_memory_rss{container=~"(ingester|mimir-write|mimir-backend)"} + / + ( container_spec_memory_limit_bytes{container=~"(ingester|mimir-write|mimir-backend)"} > 0 ) + ) # Match only Mimir namespaces. * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) - # Add "metric" label. - + on(cluster, namespace, horizontalpodautoscaler) group_right label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") - > 0), - "scaledObject", "$1", "horizontalpodautoscaler", "keda-hpa-(.*)" - ) - ) - # Alert only if the scaling metric exists and is > 0. If the KEDA ScaledObject is configured to scale down 0, - # then HPA ScalingActive may be false when expected to run 0 replicas. In this case, the scaling metric exported - # by KEDA could not exist at all or being exposed with a value of 0. - and on (cluster, namespace, metric, scaledObject) - (label_replace(keda_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") > 0) - for: 1h - labels: - severity: critical - - alert: MimirAutoscalerKedaFailing - annotations: - message: The Keda ScaledObject {{ $labels.scaledObject }} in {{ $labels.namespace - }} is experiencing errors. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirautoscalerkedafailing - expr: | - ( - # Find KEDA scalers reporting errors. - label_replace(rate(keda_scaler_errors[5m]), "namespace", "$1", "exported_namespace", "(.*)") - # Match only Mimir namespaces. - * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) - ) - > 0 - for: 1h - labels: - severity: critical -- name: mimir_ingest_storage_alerts - rules: - - alert: MimirIngesterLastConsumedOffsetCommitFailed - annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to commit the last consumed offset. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterlastconsumedoffsetcommitfailed - expr: | - sum by(cluster, namespace, pod) (rate(cortex_ingest_storage_reader_offset_commit_failures_total[5m])) - / - sum by(cluster, namespace, pod) (rate(cortex_ingest_storage_reader_offset_commit_requests_total[5m])) - > 0.2 - for: 15m - labels: - severity: critical - - alert: MimirIngesterFailedToReadRecordsFromKafka - annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is failing to read records from Kafka. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailedtoreadrecordsfromkafka - expr: | - sum by(cluster, namespace, pod, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m])) - > 0 - for: 5m - labels: - severity: critical - - alert: MimirIngesterKafkaFetchErrorsRateTooHigh - annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} is receiving fetch errors when reading records from Kafka. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh - expr: | - sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m])) - / - sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetches_total[5m])) - > 0.1 - for: 15m - labels: - severity: critical - - alert: MimirStartingIngesterKafkaReceiveDelayIncreasing - annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} in "starting" phase is not reducing consumption lag of write requests read - from Kafka. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing - expr: | - deriv(( - sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m])) - / - sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m])) - )[5m:1m]) > 0 - for: 5m - labels: - severity: warning - - alert: MimirRunningIngesterReceiveDelayTooHigh - annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} in "running" phase is too far behind in its consumption of write requests - from Kafka. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh - expr: | - ( - sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) - / - sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) - ) > (10 * 60) - for: 5m - labels: - severity: critical - - alert: MimirIngesterFailsToProcessRecordsFromKafka - annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} fails to consume write requests read from Kafka due to internal errors. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka - expr: | - sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0 - for: 5m - labels: - severity: critical - - alert: MimirIngesterFailsEnforceStrongConsistencyOnReadPath - annotations: - message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace - }} fails to enforce strong-consistency on read-path. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath - expr: | - sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0 - for: 5m - labels: - severity: critical -- name: mimir_continuous_test - rules: - - alert: MimirContinuousTestNotRunningOnWrites - annotations: - message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ - $labels.namespace }} is not effectively running because writes are failing. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestnotrunningonwrites - expr: | - sum by(cluster, namespace, test) (rate(mimir_continuous_test_writes_failed_total[5m])) > 0 - for: 1h - labels: - severity: warning - - alert: MimirContinuousTestNotRunningOnReads - annotations: - message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ - $labels.namespace }} is not effectively running because queries are failing. 
- runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestnotrunningonreads - expr: | - sum by(cluster, namespace, test) (rate(mimir_continuous_test_queries_failed_total[5m])) > 0 - for: 1h - labels: - severity: warning - - alert: MimirContinuousTestFailed - annotations: - message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ - $labels.namespace }} failed when asserting query results. - runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestfailed - expr: | - sum by(cluster, namespace, test) (rate(mimir_continuous_test_query_result_checks_failed_total[10m])) > 0 - labels: - severity: warning + > 0.8 + for: 15m + labels: + severity: critical + - name: ruler_alerts + rules: + - alert: MimirRulerTooManyFailedPushes + annotations: + message: | + Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% write (push) errors. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulertoomanyfailedpushes + expr: | + 100 * ( + sum by (cluster, namespace, pod) (rate(cortex_ruler_write_requests_failed_total[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ruler_write_requests_total[1m])) + ) > 1 + for: 5m + labels: + severity: critical + - alert: MimirRulerTooManyFailedQueries + annotations: + message: | + Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulertoomanyfailedqueries + expr: | + 100 * ( + sum by (cluster, namespace, pod) (rate(cortex_ruler_queries_failed_total[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ruler_queries_total[1m])) + ) > 1 + for: 5m + labels: + severity: critical + - alert: MimirRulerMissedEvaluations + annotations: + message: | + Mimir Ruler {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulermissedevaluations + expr: | + 100 * ( + sum by (cluster, namespace, pod, rule_group) (rate(cortex_prometheus_rule_group_iterations_missed_total[1m])) + / + sum by (cluster, namespace, pod, rule_group) (rate(cortex_prometheus_rule_group_iterations_total[1m])) + ) > 1 + for: 5m + labels: + severity: warning + - alert: MimirRulerFailedRingCheck + annotations: + message: | + Mimir Rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are experiencing errors when checking the ring for rule group ownership. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerfailedringcheck + expr: | + sum by (cluster, namespace, job) (rate(cortex_ruler_ring_check_errors_total[1m])) + > 0 + for: 5m + labels: + severity: critical + - alert: MimirRulerRemoteEvaluationFailing + annotations: + message: | + Mimir rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are failing to perform {{ printf "%.2f" $value }}% of remote evaluations through the ruler-query-frontend. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerremoteevaluationfailing + expr: | + 100 * ( + sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", job=~".*/(ruler-query-frontend.*)"}[5m])) + / + sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])) + ) > 1 + for: 5m + labels: + severity: warning + - name: gossip_alerts + rules: + - alert: MimirGossipMembersTooHigh + annotations: + message: One or more Mimir instances in {{ $labels.cluster }}/{{ $labels.namespace }} consistently sees a higher than expected number of gossip members. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmemberstoohigh + expr: | + max by (cluster, namespace) (memberlist_client_cluster_members_count) + > + (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) + 10) + for: 20m + labels: + severity: warning + - alert: MimirGossipMembersTooLow + annotations: + message: One or more Mimir instances in {{ $labels.cluster }}/{{ $labels.namespace }} consistently sees a lower than expected number of gossip members. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmemberstoolow + expr: | + min by (cluster, namespace) (memberlist_client_cluster_members_count) + < + (sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor.*|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"}) * 0.5) + for: 20m + labels: + severity: warning + - name: etcd_alerts + rules: + - alert: EtcdAllocatingTooMuchMemory + annotations: + message: | + Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#etcdallocatingtoomuchmemory + expr: | + ( + container_memory_working_set_bytes{container="etcd"} + / + ( container_spec_memory_limit_bytes{container="etcd"} > 0 ) + ) > 0.65 + for: 15m + labels: + severity: warning + - alert: EtcdAllocatingTooMuchMemory + annotations: + message: | + Too much memory being used by {{ $labels.namespace }}/{{ $labels.pod }} - bump memory limit. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#etcdallocatingtoomuchmemory + expr: | + ( + container_memory_working_set_bytes{container="etcd"} + / + ( container_spec_memory_limit_bytes{container="etcd"} > 0 ) + ) > 0.8 + for: 15m + labels: + severity: critical + - name: alertmanager_alerts + rules: + - alert: MimirAlertmanagerSyncConfigsFailing + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to read tenant configurations from storage. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagersyncconfigsfailing + expr: | + rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0 + for: 30m + labels: + severity: critical + - alert: MimirAlertmanagerRingCheckFailing + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is unable to check tenants ownership via the ring. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerringcheckfailing + expr: | + rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0 + for: 10m + labels: + severity: critical + - alert: MimirAlertmanagerPartialStateMergeFailing + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to merge partial state changes received from a replica. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerpartialstatemergefailing + expr: | + rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0 + for: 10m + labels: + severity: critical + - alert: MimirAlertmanagerReplicationFailing + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is failing to replicate partial state to its replicas. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerreplicationfailing + expr: | + rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0 + for: 10m + labels: + severity: critical + - alert: MimirAlertmanagerPersistStateFailing + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} is unable to persist full state snapshots to remote storage. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerpersiststatefailing + expr: | + rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0 + for: 1h + labels: + severity: critical + - alert: MimirAlertmanagerInitialSyncFailed + annotations: + message: | + Mimir Alertmanager {{ $labels.job }}/{{ $labels.pod }} was unable to obtain some initial state when starting up. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerinitialsyncfailed + expr: | + increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 + labels: + severity: critical + - alert: MimirAlertmanagerAllocatingTooMuchMemory + annotations: + message: | + Alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerallocatingtoomuchmemory + expr: | + (container_memory_working_set_bytes{container="alertmanager"} / container_spec_memory_limit_bytes{container="alertmanager"}) > 0.80 + and + (container_spec_memory_limit_bytes{container="alertmanager"} > 0) + for: 15m + labels: + severity: warning + - alert: MimirAlertmanagerAllocatingTooMuchMemory + annotations: + message: | + Alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is using too much memory. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerallocatingtoomuchmemory + expr: | + (container_memory_working_set_bytes{container="alertmanager"} / container_spec_memory_limit_bytes{container="alertmanager"}) > 0.90 + and + (container_spec_memory_limit_bytes{container="alertmanager"} > 0) + for: 15m + labels: + severity: critical + - alert: MimirAlertmanagerInstanceHasNoTenants + annotations: + message: Mimir alertmanager {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} owns no tenants.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiralertmanagerinstancehasnotenants + expr: | + # Alert on alertmanager instances in microservices mode that own no tenants, + min by(cluster, namespace, pod) (cortex_alertmanager_tenants_owned{pod=~"(.*mimir-)?alertmanager.*"}) == 0 + # but only if other instances of the same cell do have tenants assigned. + and on (cluster, namespace) + max by(cluster, namespace) (cortex_alertmanager_tenants_owned) > 0 + for: 1h + labels: + severity: warning + - name: mimir_blocks_alerts + rules: + - alert: MimirIngesterHasNotShippedBlocks + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not shipped any block in the last 4 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblocks + expr: | + (min by(cluster, namespace, pod) (time() - cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 60 * 60 * 4) + and + (max by(cluster, namespace, pod) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 0) + and + # Only if the ingester has ingested samples over the last 4h. + (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) + and + # Only if the ingester was ingesting samples 4h ago. This protects against the case where the ingester replica + # had ingested samples in the past, then no traffic was received for a long period and then it starts + # receiving samples again. Without this check, the alert would fire as soon as it gets back receiving + # samples, while a block shipping is expected within the next 4h. + (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0) + for: 15m + labels: + severity: critical + - alert: MimirIngesterHasNotShippedBlocksSinceStart + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not shipped any block in the last 4 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblockssincestart + expr: | + (max by(cluster, namespace, pod) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) == 0) + and + (max by(cluster, namespace, pod) (max_over_time(cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0) + for: 4h + labels: + severity: critical + - alert: MimirIngesterHasUnshippedBlocks + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasunshippedblocks + expr: | + (time() - cortex_ingester_oldest_unshipped_block_timestamp_seconds > 3600) + and + (cortex_ingester_oldest_unshipped_block_timestamp_seconds > 0) + for: 15m + labels: + severity: critical + - alert: MimirIngesterTSDBHeadCompactionFailed + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to compact TSDB head.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbheadcompactionfailed + expr: | + rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0 + for: 15m + labels: + severity: critical + - alert: MimirIngesterTSDBHeadTruncationFailed + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to truncate TSDB head. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbheadtruncationfailed + expr: | + rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0 + labels: + severity: critical + - alert: MimirIngesterTSDBCheckpointCreationFailed + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to create TSDB checkpoint. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbcheckpointcreationfailed + expr: | + rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0 + labels: + severity: critical + - alert: MimirIngesterTSDBCheckpointDeletionFailed + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to delete TSDB checkpoint. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbcheckpointdeletionfailed + expr: | + rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0 + labels: + severity: critical + - alert: MimirIngesterTSDBWALTruncationFailed + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to truncate TSDB WAL. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwaltruncationfailed + expr: | + rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0 + labels: + severity: warning + - alert: MimirIngesterTSDBWALCorrupted + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} got a corrupted TSDB WAL. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted + expr: | + # alert when there is more than one corruption + count by (cluster, namespace) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0) > 1 + and + # and there is only one zone + count by (cluster, namespace) (group by (cluster, namespace, job) (cortex_ingester_tsdb_wal_corruptions_total)) == 1 + labels: + deployment: single-zone + severity: critical + - alert: MimirIngesterTSDBWALCorrupted + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} got a corrupted TSDB WAL. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalcorrupted + expr: | + # alert when there is more than one corruption + count by (cluster, namespace) (sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_wal_corruptions_total[5m]) > 0)) > 1 + and + # and there are multiple zones + count by (cluster, namespace) (group by (cluster, namespace, job) (cortex_ingester_tsdb_wal_corruptions_total)) > 1 + labels: + deployment: multi-zone + severity: critical + - alert: MimirIngesterTSDBWALWritesFailed + annotations: + message: Mimir Ingester {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to write to TSDB WAL.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringestertsdbwalwritesfailed + expr: | + rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0 + for: 3m + labels: + severity: critical + - alert: MimirStoreGatewayHasNotSyncTheBucket + annotations: + message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not successfully synched the bucket since {{ $value | humanizeDuration }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewayhasnotsyncthebucket + expr: | + (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 60 * 30) + and + cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0 + for: 5m + labels: + severity: critical + - alert: MimirStoreGatewayNoSyncedTenants + annotations: + message: Mimir store-gateway {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is not syncing any blocks for any tenant. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstoregatewaynosyncedtenants + expr: | + min by(cluster, namespace, pod) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0 + for: 1h + labels: + severity: warning + - alert: MimirBucketIndexNotUpdated + annotations: + message: Mimir bucket index for tenant {{ $labels.user }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirbucketindexnotupdated + expr: | + min by(cluster, namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100 + labels: + severity: critical + - name: mimir_compactor_alerts + rules: + - alert: MimirCompactorHasNotSuccessfullyCleanedUpBlocks + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not successfully cleaned up blocks in the last 6 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullycleanedupblocks + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. + (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 60 * 60 * 6) + for: 1h + labels: + severity: critical + - alert: MimirCompactorHasNotSuccessfullyRunCompaction + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not run compaction in the last 24 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. + (time() - cortex_compactor_last_successful_run_timestamp_seconds > 60 * 60 * 24) + and + (cortex_compactor_last_successful_run_timestamp_seconds > 0) + for: 1h + labels: + reason: in-last-24h + severity: critical + - alert: MimirCompactorHasNotSuccessfullyRunCompaction + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not run compaction in the last 24 hours. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. + cortex_compactor_last_successful_run_timestamp_seconds == 0 + for: 24h + labels: + reason: since-startup + severity: critical + - alert: MimirCompactorHasNotSuccessfullyRunCompaction + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} failed to run 2 consecutive compactions. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotsuccessfullyruncompaction + expr: | + increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h]) >= 2 + labels: + reason: consecutive-failures + severity: critical + - alert: MimirCompactorHasNotUploadedBlocks + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not uploaded any block in the last 24 hours. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotuploadedblocks + expr: | + (time() - (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"})) > 60 * 60 * 24) + and + (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"}) > 0) + and + # Only if some compactions have started. We don't want to fire this alert if the compactor has nothing to do + # (e.g. there are more replicas than required because running as part of mimir-backend). + (sum by(cluster, namespace, pod) (rate(cortex_compactor_group_compaction_runs_started_total[24h])) > 0) + for: 15m + labels: + severity: critical + time_period: 24h + - alert: MimirCompactorHasNotUploadedBlocks + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has not uploaded any block since its start. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorhasnotuploadedblocks + expr: | + (max by(cluster, namespace, pod) (thanos_objstore_bucket_last_successful_upload_time{component="compactor"}) == 0) + and + # Only if some compactions have started. We don't want to fire this alert if the compactor has nothing to do + # (e.g. there are more replicas than required because running as part of mimir-backend). + (sum by(cluster, namespace, pod) (rate(cortex_compactor_group_compaction_runs_started_total[24h])) > 0) + for: 24h + labels: + severity: critical + time_period: since-start + - alert: MimirCompactorSkippedUnhealthyBlocks + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has found and ignored unhealthy blocks. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorskippedunhealthyblocks + expr: | + increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 0 + for: 1m + labels: + severity: warning + - alert: MimirCompactorSkippedUnhealthyBlocks + annotations: + message: Mimir Compactor {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} has found and ignored unhealthy blocks. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircompactorskippedunhealthyblocks + expr: | + increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 1 + for: 30m + labels: + severity: critical + - name: mimir_autoscaling + rules: + - alert: MimirAutoscalerNotActive + annotations: + message: The Horizontal Pod Autoscaler (HPA) {{ $labels.horizontalpodautoscaler }} in {{ $labels.namespace }} is not active. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirautoscalernotactive + expr: | + ( + label_replace(( + kube_horizontalpodautoscaler_status_condition{condition="ScalingActive",status="false"} + # Match only Mimir namespaces. + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + # Add "metric" label. + + on(cluster, namespace, horizontalpodautoscaler) group_right label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") + > 0), + "scaledObject", "$1", "horizontalpodautoscaler", "keda-hpa-(.*)" + ) + ) + # Alert only if the scaling metric exists and is > 0. If the KEDA ScaledObject is configured to scale down to 0, + # then HPA ScalingActive may be false when expected to run 0 replicas. In this case, the scaling metric exported + # by KEDA may not exist at all or may be exposed with a value of 0. + and on (cluster, namespace, metric, scaledObject) + (label_replace(keda_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") > 0) + for: 1h + labels: + severity: critical + - alert: MimirAutoscalerKedaFailing + annotations: + message: The Keda ScaledObject {{ $labels.scaledObject }} in {{ $labels.namespace }} is experiencing errors. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirautoscalerkedafailing + expr: | + ( + # Find KEDA scalers reporting errors. + label_replace(rate(keda_scaler_errors[5m]), "namespace", "$1", "exported_namespace", "(.*)") + # Match only Mimir namespaces. + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + ) + > 0 + for: 1h + labels: + severity: critical + - name: mimir_ingest_storage_alerts + rules: + - alert: MimirIngesterLastConsumedOffsetCommitFailed + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to commit the last consumed offset. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterlastconsumedoffsetcommitfailed + expr: | + sum by(cluster, namespace, pod) (rate(cortex_ingest_storage_reader_offset_commit_failures_total[5m])) + / + sum by(cluster, namespace, pod) (rate(cortex_ingest_storage_reader_offset_commit_requests_total[5m])) + > 0.2 + for: 15m + labels: + severity: critical + - alert: MimirIngesterFailedToReadRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is failing to read records from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailedtoreadrecordsfromkafka + expr: | + sum by(cluster, namespace, pod, node_id) (rate(cortex_ingest_storage_reader_read_errors_total[1m])) + > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterKafkaFetchErrorsRateTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} is receiving fetch errors when reading records from Kafka.
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterkafkafetcherrorsratetoohigh + expr: | + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetch_errors_total[5m])) + / + sum by (cluster, namespace, pod) (rate (cortex_ingest_storage_reader_fetches_total[5m])) + > 0.1 + for: 15m + labels: + severity: critical + - alert: MimirStartingIngesterKafkaReceiveDelayIncreasing + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "starting" phase is not reducing consumption lag of write requests read from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirstartingingesterkafkareceivedelayincreasing + expr: | + deriv(( + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="starting"}[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="starting"}[1m])) + )[5m:1m]) > 0 + for: 5m + labels: + severity: warning + - alert: MimirRunningIngesterReceiveDelayTooHigh + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} in "running" phase is too far behind in its consumption of write requests from Kafka. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrunningingesterreceivedelaytoohigh + expr: | + ( + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_sum{phase="running"}[1m])) + / + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_receive_delay_seconds_count{phase="running"}[1m])) + ) > (10 * 60) + for: 5m + labels: + severity: critical + - alert: MimirIngesterFailsToProcessRecordsFromKafka + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to consume write requests read from Kafka due to internal errors. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailstoprocessrecordsfromkafka + expr: | + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_reader_records_failed_total{cause="server"}[1m])) > 0 + for: 5m + labels: + severity: critical + - alert: MimirIngesterFailsEnforceStrongConsistencyOnReadPath + annotations: + message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to enforce strong-consistency on read-path. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterfailsenforcestrongconsistencyonreadpath + expr: | + sum by (cluster, namespace, pod) (rate(cortex_ingest_storage_strong_consistency_failures_total[1m])) > 0 + for: 5m + labels: + severity: critical + - name: mimir_continuous_test + rules: + - alert: MimirContinuousTestNotRunningOnWrites + annotations: + message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ $labels.namespace }} is not effectively running because writes are failing. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestnotrunningonwrites + expr: | + sum by(cluster, namespace, test) (rate(mimir_continuous_test_writes_failed_total[5m])) > 0 + for: 1h + labels: + severity: warning + - alert: MimirContinuousTestNotRunningOnReads + annotations: + message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ $labels.namespace }} is not effectively running because queries are failing. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestnotrunningonreads + expr: | + sum by(cluster, namespace, test) (rate(mimir_continuous_test_queries_failed_total[5m])) > 0 + for: 1h + labels: + severity: warning + - alert: MimirContinuousTestFailed + annotations: + message: Mimir continuous test {{ $labels.test }} in {{ $labels.cluster }}/{{ $labels.namespace }} failed when asserting query results. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimircontinuoustestfailed + expr: | + sum by(cluster, namespace, test) (rate(mimir_continuous_test_query_result_checks_failed_total[10m])) > 0 + labels: + severity: warning diff --git a/operations/mimir-mixin-compiled/rules.yaml b/operations/mimir-mixin-compiled/rules.yaml index bd28aa9ec03..a4a60974a7c 100644 --- a/operations/mimir-mixin-compiled/rules.yaml +++ b/operations/mimir-mixin-compiled/rules.yaml @@ -1,521 +1,461 @@ groups: -- name: mimir_api_1 - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_request_duration_seconds:50quantile - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m])) - by (cluster, job) - record: cluster_job:cortex_request_duration_seconds:avg - - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job) - record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) - record: cluster_job:cortex_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:cortex_request_duration_seconds_count:sum_rate - - expr: sum(rate(cortex_request_duration_seconds[1m])) by (cluster, job) - record: cluster_job:cortex_request_duration_seconds:sum_rate -- name: mimir_api_2 - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:cortex_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:cortex_request_duration_seconds:50quantile - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) - / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:cortex_request_duration_seconds:avg - - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, - route) - record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) - record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate - - expr: sum(rate(cortex_request_duration_seconds[1m])) by (cluster, job, route) - record: cluster_job_route:cortex_request_duration_seconds:sum_rate -- name: mimir_api_3 - rules: - - expr: histogram_quantile(0.99, 
sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, - namespace, job, route) - record: cluster_namespace_job_route:cortex_request_duration_seconds:avg - - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, - job, route) - record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate - - expr: sum(rate(cortex_request_duration_seconds[1m])) by (cluster, namespace, job, - route) - record: cluster_namespace_job_route:cortex_request_duration_seconds:sum_rate -- name: mimir_querier_api - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_querier_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_querier_request_duration_seconds:50quantile - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - job) - record: cluster_job:cortex_querier_request_duration_seconds:avg - - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, - job) - record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - job) - record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - job) - record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by - (cluster, job, route) - record: cluster_job_route:cortex_querier_request_duration_seconds:avg - - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, - job, route) - record: cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - job, route) - record: 
cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - job, route) - record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) - by (cluster, namespace, job, route) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg - - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, - namespace, job, route) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - namespace, job, route) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - namespace, job, route) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate -- name: mimir_storage - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_kv_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_kv_request_duration_seconds:50quantile - - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) - / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:cortex_kv_request_duration_seconds:avg - - expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, - job) - record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) - record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate -- name: mimir_queries - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_query_frontend_retries:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_query_frontend_retries:50quantile - - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m])) - by (cluster, job) - record: cluster_job:cortex_query_frontend_retries:avg - - expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job) - record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate - - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) - record: 
cluster_job:cortex_query_frontend_retries_sum:sum_rate - - expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job) - record: cluster_job:cortex_query_frontend_retries_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile - - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, - job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by - (cluster, job) - record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg - - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, - cluster, job) - record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, - job) - record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, - job) - record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate -- name: mimir_ingester_queries - rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_series:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_series:50quantile - - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m])) - by (cluster, job) - record: cluster_job:cortex_ingester_queried_series:avg - - expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job) - record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate - - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_series_sum:sum_rate - - expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_series_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_samples:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_samples:50quantile - - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m])) - by (cluster, job) - record: cluster_job:cortex_ingester_queried_samples:avg - - expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job) - record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate - - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate - - expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_samples_count:sum_rate - - expr: histogram_quantile(0.99, 
sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_exemplars:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_exemplars:50quantile - - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) / - sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_exemplars:avg - - expr: sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, - job) - record: cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate - - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate - - expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate -- name: mimir_received_samples - rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m])) - record: cluster_namespace_job:cortex_distributor_received_samples:rate5m -- name: mimir_exemplars_in - rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m])) - record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m -- name: mimir_received_exemplars - rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m])) - record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m -- name: mimir_exemplars_ingested - rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m])) - record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m -- name: mimir_exemplars_appended - rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m])) - record: cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m -- name: mimir_scaling_rules - rules: - - expr: | - # Convenience rule to get the number of replicas for both a deployment and a statefulset. - # Multi-zone deployments are grouped together removing the "zone-X" suffix. - sum by (cluster, namespace, deployment) ( - label_replace( - kube_deployment_spec_replicas, - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it - # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" 
- ) - ) - or - sum by (cluster, namespace, deployment) ( - label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?") - ) - record: cluster_namespace_deployment:actual_replicas:count - - expr: | - ceil( - quantile_over_time(0.99, - sum by (cluster, namespace) ( - cluster_namespace_job:cortex_distributor_received_samples:rate5m - )[24h:] - ) - / 240000 - ) - labels: - deployment: distributor - reason: sample_rate - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) - * 0.59999999999999998 / 240000 - ) - labels: - deployment: distributor - reason: sample_rate_limits - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - ceil( - quantile_over_time(0.99, - sum by (cluster, namespace) ( - cluster_namespace_job:cortex_distributor_received_samples:rate5m - )[24h:] - ) - * 3 / 80000 - ) - labels: - deployment: ingester - reason: sample_rate - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - ceil( - quantile_over_time(0.99, - sum by(cluster, namespace) ( - cortex_ingester_memory_series - )[24h:] - ) - / 1500000 - ) - labels: - deployment: ingester - reason: active_series - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"}) - * 3 * 0.59999999999999998 / 1500000 - ) - labels: - deployment: ingester - reason: active_series_limits - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) - * 0.59999999999999998 / 80000 - ) - labels: - deployment: ingester - reason: sample_rate_limits - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - ceil( - (sum by (cluster, namespace) ( - cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"} - ) / 4) - / - avg by (cluster, namespace) ( - memcached_limit_bytes{job=~".+/memcached"} - ) - ) - labels: - deployment: memcached - reason: active_series - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - sum by (cluster, namespace, deployment) ( - label_replace( - label_replace( - sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[1m])), - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" - ), - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it - # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" - ) - ) - record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate - - expr: | - # Convenience rule to get the CPU request for both a deployment and a statefulset. - # Multi-zone deployments are grouped together removing the "zone-X" suffix. 
- # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 - # that remove resource metrics, ref: - # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16 - # - https://github.com/kubernetes/kube-state-metrics/pull/1004 - # - # This is the old expression, compatible with kube-state-metrics < v2.0.0, - # where kube_pod_container_resource_requests_cpu_cores was removed: - ( - sum by (cluster, namespace, deployment) ( - label_replace( - label_replace( - kube_pod_container_resource_requests_cpu_cores, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" - ), - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it - # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" - ) - ) - ) - or - # This expression is compatible with kube-state-metrics >= v1.4.0, - # where kube_pod_container_resource_requests was introduced. - ( - sum by (cluster, namespace, deployment) ( - label_replace( - label_replace( - kube_pod_container_resource_requests{resource="cpu"}, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" - ), - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it - # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" - ) - ) - ) - record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum - - expr: | - # Jobs should be sized to their CPU usage. - # We do this by comparing 99th percentile usage over the last 24hrs to - # their current provisioned #replicas and resource requests. - ceil( - cluster_namespace_deployment:actual_replicas:count - * - quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h]) - / - cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum - ) - labels: - reason: cpu_usage - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | - # Convenience rule to get the Memory utilization for both a deployment and a statefulset. - # Multi-zone deployments are grouped together removing the "zone-X" suffix. - sum by (cluster, namespace, deployment) ( - label_replace( - label_replace( - container_memory_usage_bytes{image!=""}, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" - ), - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it - # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" - ) - ) - record: cluster_namespace_deployment:container_memory_usage_bytes:sum - - expr: | - # Convenience rule to get the Memory request for both a deployment and a statefulset. - # Multi-zone deployments are grouped together removing the "zone-X" suffix. 
- # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 - # that remove resource metrics, ref: - # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16 - # - https://github.com/kubernetes/kube-state-metrics/pull/1004 - # - # This is the old expression, compatible with kube-state-metrics < v2.0.0, - # where kube_pod_container_resource_requests_memory_bytes was removed: - ( - sum by (cluster, namespace, deployment) ( - label_replace( - label_replace( - kube_pod_container_resource_requests_memory_bytes, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" - ), - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it - # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" - ) - ) - ) - or - # This expression is compatible with kube-state-metrics >= v1.4.0, - # where kube_pod_container_resource_requests was introduced. - ( - sum by (cluster, namespace, deployment) ( - label_replace( - label_replace( - kube_pod_container_resource_requests{resource="memory"}, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" - ), - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it - # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" - ) - ) - ) - record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum - - expr: | - # Jobs should be sized to their Memory usage. - # We do this by comparing 99th percentile usage over the last 24hrs to - # their current provisioned #replicas and resource requests. - ceil( - cluster_namespace_deployment:actual_replicas:count - * - quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h]) - / - cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum - ) - labels: - reason: memory_usage - record: cluster_namespace_deployment_reason:required_replicas:count -- name: mimir_alertmanager_rules - rules: - - expr: | - sum by (cluster, job, pod) (cortex_alertmanager_alerts) - record: cluster_job_pod:cortex_alertmanager_alerts:sum - - expr: | - sum by (cluster, job, pod) (cortex_alertmanager_silences) - record: cluster_job_pod:cortex_alertmanager_silences:sum - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m])) - record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m])) - record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m - - expr: | - sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m])) - record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m - - expr: | - sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m])) - record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m])) - record: cluster_job:cortex_alertmanager_state_replication_total:rate5m - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m])) - record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m - - expr: | - sum by (cluster, job) 
(rate(cortex_alertmanager_partial_state_merges_total[5m])) - record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m])) - record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m -- name: mimir_ingester_rules - rules: - - expr: | - sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m])) - record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m + - name: mimir_api_1 + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_request_duration_seconds:50quantile + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_request_duration_seconds:avg + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_request_duration_seconds_count:sum_rate + - expr: sum(rate(cortex_request_duration_seconds[1m])) by (cluster, job) + record: cluster_job:cortex_request_duration_seconds:sum_rate + - name: mimir_api_2 + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:cortex_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:cortex_request_duration_seconds:50quantile + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds:avg + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate + - expr: sum(rate(cortex_request_duration_seconds[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_request_duration_seconds:sum_rate + - name: mimir_api_3 + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / 
sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds:avg + - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate + - expr: sum(rate(cortex_request_duration_seconds[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_request_duration_seconds:sum_rate + - name: mimir_querier_api + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_querier_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_querier_request_duration_seconds:50quantile + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_querier_request_duration_seconds:avg + - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds:avg + - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, 
sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg + - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate + - name: mimir_storage + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_kv_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_kv_request_duration_seconds:50quantile + - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_kv_request_duration_seconds:avg + - expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate + - name: mimir_queries + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_query_frontend_retries:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_query_frontend_retries:50quantile + - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_retries:avg + - expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate + - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_retries_sum:sum_rate + - expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_retries_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: 
cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate + - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate + - name: mimir_ingester_queries + rules: + - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_series:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_series:50quantile + - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_series:avg + - expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate + - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_series_sum:sum_rate + - expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_series_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_samples:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_samples:50quantile + - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_samples:avg + - expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job) + record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate + - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate + - expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_samples_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_exemplars:99quantile + - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, job)) + record: cluster_job:cortex_ingester_queried_exemplars:50quantile + - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_exemplars:avg + - expr: sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by 
(le, cluster, job) + record: cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate + - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate + - expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) + record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate + - name: mimir_received_samples + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m])) + record: cluster_namespace_job:cortex_distributor_received_samples:rate5m + - name: mimir_exemplars_in + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m])) + record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m + - name: mimir_received_exemplars + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m])) + record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m + - name: mimir_exemplars_ingested + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m])) + record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m + - name: mimir_exemplars_appended + rules: + - expr: | + sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m])) + record: cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m + - name: mimir_scaling_rules + rules: + - expr: | + # Convenience rule to get the number of replicas for both a deployment and a statefulset. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. + sum by (cluster, namespace, deployment) ( + label_replace( + kube_deployment_spec_replicas, + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" 
+ ) + ) + or + sum by (cluster, namespace, deployment) ( + label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?") + ) + record: cluster_namespace_deployment:actual_replicas:count + - expr: | + ceil( + quantile_over_time(0.99, + sum by (cluster, namespace) ( + cluster_namespace_job:cortex_distributor_received_samples:rate5m + )[24h:] + ) + / 240000 + ) + labels: + deployment: distributor + reason: sample_rate + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) + * 0.59999999999999998 / 240000 + ) + labels: + deployment: distributor + reason: sample_rate_limits + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + quantile_over_time(0.99, + sum by (cluster, namespace) ( + cluster_namespace_job:cortex_distributor_received_samples:rate5m + )[24h:] + ) + * 3 / 80000 + ) + labels: + deployment: ingester + reason: sample_rate + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + quantile_over_time(0.99, + sum by(cluster, namespace) ( + cortex_ingester_memory_series + )[24h:] + ) + / 1500000 + ) + labels: + deployment: ingester + reason: active_series + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"}) + * 3 * 0.59999999999999998 / 1500000 + ) + labels: + deployment: ingester + reason: active_series_limits + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) + * 0.59999999999999998 / 80000 + ) + labels: + deployment: ingester + reason: sample_rate_limits + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + ceil( + (sum by (cluster, namespace) ( + cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"} + ) / 4) + / + avg by (cluster, namespace) ( + memcached_limit_bytes{job=~".+/memcached"} + ) + ) + labels: + deployment: memcached + reason: active_series + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[1m])), + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate + - expr: | + # Convenience rule to get the CPU request for both a deployment and a statefulset. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. 
+ # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 + # that remove resource metrics, ref: + # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16 + # - https://github.com/kubernetes/kube-state-metrics/pull/1004 + # + # This is the old expression, compatible with kube-state-metrics < v2.0.0, + # where kube_pod_container_resource_requests_cpu_cores was removed: + ( + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + kube_pod_container_resource_requests_cpu_cores, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + ) + or + # This expression is compatible with kube-state-metrics >= v1.4.0, + # where kube_pod_container_resource_requests was introduced. + ( + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + kube_pod_container_resource_requests{resource="cpu"}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + ) + record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum + - expr: | + # Jobs should be sized to their CPU usage. + # We do this by comparing 99th percentile usage over the last 24hrs to + # their current provisioned #replicas and resource requests. + ceil( + cluster_namespace_deployment:actual_replicas:count + * + quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h]) + / + cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum + ) + labels: + reason: cpu_usage + record: cluster_namespace_deployment_reason:required_replicas:count + - expr: | + # Convenience rule to get the Memory utilization for both a deployment and a statefulset. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + container_memory_usage_bytes{image!=""}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + record: cluster_namespace_deployment:container_memory_usage_bytes:sum + - expr: | + # Convenience rule to get the Memory request for both a deployment and a statefulset. + # Multi-zone deployments are grouped together removing the "zone-X" suffix. 
+ # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 + # that remove resource metrics, ref: + # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16 + # - https://github.com/kubernetes/kube-state-metrics/pull/1004 + # + # This is the old expression, compatible with kube-state-metrics < v2.0.0, + # where kube_pod_container_resource_requests_memory_bytes was removed: + ( + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + kube_pod_container_resource_requests_memory_bytes, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + ) + or + # This expression is compatible with kube-state-metrics >= v1.4.0, + # where kube_pod_container_resource_requests was introduced. + ( + sum by (cluster, namespace, deployment) ( + label_replace( + label_replace( + kube_pod_container_resource_requests{resource="memory"}, + "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + ), + # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # always matches everything and the (optional) zone is not removed. + "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + ) + ) + ) + record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum + - expr: | + # Jobs should be sized to their Memory usage. + # We do this by comparing 99th percentile usage over the last 24hrs to + # their current provisioned #replicas and resource requests. + ceil( + cluster_namespace_deployment:actual_replicas:count + * + quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h]) + / + cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum + ) + labels: + reason: memory_usage + record: cluster_namespace_deployment_reason:required_replicas:count + - name: mimir_alertmanager_rules + rules: + - expr: | + sum by (cluster, job, pod) (cortex_alertmanager_alerts) + record: cluster_job_pod:cortex_alertmanager_alerts:sum + - expr: | + sum by (cluster, job, pod) (cortex_alertmanager_silences) + record: cluster_job_pod:cortex_alertmanager_silences:sum + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m])) + record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m])) + record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m + - expr: | + sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m])) + record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m + - expr: | + sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m])) + record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m])) + record: cluster_job:cortex_alertmanager_state_replication_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m])) + record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m + - expr: | + sum by (cluster, job) 
(rate(cortex_alertmanager_partial_state_merges_total[5m])) + record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m + - expr: | + sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m])) + record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m + - name: mimir_ingester_rules + rules: + - expr: | + sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m])) + record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m
diff --git a/operations/mimir-mixin/.lint b/operations/mimir-mixin/.lint
new file mode 100644
index 00000000000..ba1a8492541
--- /dev/null
+++ b/operations/mimir-mixin/.lint
@@ -0,0 +1,39 @@
+exclusions:
+  template-instance-rule:
+    reason: We don't use an "instance" dashboard template variable.
+  target-instance-rule:
+    reason: We don't want to enforce an "instance" selector in every query.
+  template-job-rule:
+    reason: We don't want to make the "job" selector a dashboard template variable.
+  target-job-rule:
+    reason: We don't want the "job" selector to be "$job"; we want to explicitly set the actual label matcher regexp.
+  target-rate-interval-rule:
+    reason: Some panels intentionally use an explicit time window instead of $__rate_interval.
+    entries:
+      - dashboard: Mimir / Top tenants
+        panel: Top $limit users by received samples rate in last 5m
+      - dashboard: Mimir / Top tenants
+        panel: Top $limit users by discarded samples rate in last 5m
+      - dashboard: Mimir / Top tenants
+        panel: Top $limit users by received exemplars rate in last 5m
+  target-promql-rule:
+    reason: Skipped in dashboards where the linter parses a Loki query as a Prometheus one.
+    entries:
+      - dashboard: Mimir / Slow queries
+  template-datasource-rule:
+    reason: We prefer to keep calling the Prometheus datasource "datasource" for consistency across dashboards.
+    entries:
+      - dashboard: Mimir / Slow queries
+  panel-title-description-rule:
+    reason: We have many panels without a description. This exclusion can be removed once we've added a description to each panel (if we ever do).
+  template-on-time-change-reload-rule:
+    reason: We don't want to refresh "cluster" and "namespace" each time the time range changes, because we don't expect them to change frequently.
+    entries:
+      - variable: cluster
+      - variable: namespace
+  alert-summary-missing-rule:
+    reason: We don't set the summary on alerts.
+  alert-description-missing-rule:
+    reason: We don't set the description on alerts.
+  panel-units-rule:
+    reason: We have many panels without a unit defined. We should fix this, but it's low priority.
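
Note on the label_replace() regexes used in the mimir_scaling_rules group above: Prometheus anchors the regex to the full label value, and the non-greedy "(.*?)" is what lets the optional "-zone-[a-z]" suffix be dropped. Below is a minimal Python sketch to sanity-check that behaviour; the pod names in it are made-up examples, not values taken from this repository.

import re

# The two regexes used by label_replace() in the scaling and utilization rules above.
# Prometheus anchors the regex to the whole label value, which re.fullmatch() mimics.
POD_TO_DEPLOYMENT = r"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
STRIP_ZONE_SUFFIX = r"(.*?)(?:-zone-[a-z])?"

def label_replace(regex: str, value: str) -> str:
    # label_replace() keeps the original value when the regex doesn't match.
    m = re.fullmatch(regex, value)
    return m.group(1) if m else value

# StatefulSet pod "ingester-zone-a-4" -> "ingester-zone-a" -> "ingester".
assert label_replace(STRIP_ZONE_SUFFIX, label_replace(POD_TO_DEPLOYMENT, "ingester-zone-a-4")) == "ingester"

# Deployment pod "distributor-5d9f6c6d7b-abcde" -> "distributor".
assert label_replace(POD_TO_DEPLOYMENT, "distributor-5d9f6c6d7b-abcde") == "distributor"

# The non-greedy "(.*?)" matters: a greedy "(.*)" would keep "-zone-a",
# because the optional suffix group would then match the empty string.
assert label_replace(STRIP_ZONE_SUFFIX, "ingester-zone-a") == "ingester"

print("label_replace regexes behave as expected")

As a worked example of the distributor sample_rate scaling rule, a hypothetical 99th-percentile received-samples rate of 1,200,000 samples/s divided by the 240,000 samples/s budgeted per replica in the expression gives ceil(1200000 / 240000) = 5 required distributor replicas.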