Skip to content

Commit

Permalink
refactor recording rules and alerts code
Browse files Browse the repository at this point in the history
Signed-off-by: avlitman <[email protected]>
  • Loading branch information
avlitman committed Feb 1, 2024
1 parent c92dda8 commit b4891e5
Show file tree
Hide file tree
Showing 10 changed files with 422 additions and 0 deletions.
89 changes: 89 additions & 0 deletions pkg/monitoring/rules/alerts/operator.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package alerts

import (
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
)

// Label keys that every alert returned by operatorAlerts sets.
const (
// severityAlertLabelKey is the label key holding the alert severity; the
// alerts in this file use the values "critical" and "warning".
severityAlertLabelKey = "severity"
// healthImpactAlertLabelKey is the label key holding the alert's impact on
// operator health; the alerts in this file use "critical", "warning" and "none".
healthImpactAlertLabelKey = "operator_health_impact"
)

// operatorAlerts returns the alerting rules defined for the SSP operator and
// the template validator. Each rule carries a severity label, a health-impact
// label and a "summary" annotation; every call builds fresh maps, so callers
// may add further labels or annotations to the returned rules.
func operatorAlerts() []promv1.Rule {
	// labels builds the label set shared by all alerts; only the values differ.
	labels := func(severity, healthImpact string) map[string]string {
		return map[string]string{
			severityAlertLabelKey:     severity,
			healthImpactAlertLabelKey: healthImpact,
		}
	}
	// summary builds an annotation map holding only the "summary" entry.
	summary := func(text string) map[string]string {
		return map[string]string{"summary": text}
	}
	// pending turns a duration literal into the pointer form used by Rule.For.
	pending := func(d promv1.Duration) *promv1.Duration {
		return ptr.To(d)
	}

	sspDown := promv1.Rule{
		Alert:       "SSPDown",
		Expr:        intstr.FromString("kubevirt_ssp_operator_up == 0"),
		For:         pending("5m"),
		Annotations: summary("All SSP operator pods are down."),
		Labels:      labels("critical", "critical"),
	}

	templateValidatorDown := promv1.Rule{
		Alert:       "SSPTemplateValidatorDown",
		Expr:        intstr.FromString("kubevirt_ssp_template_validator_up == 0"),
		For:         pending("5m"),
		Annotations: summary("All Template Validator pods are down."),
		Labels:      labels("critical", "critical"),
	}

	failingToReconcile := promv1.Rule{
		Alert:       "SSPFailingToReconcile",
		Expr:        intstr.FromString("(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)"),
		For:         pending("5m"),
		Annotations: summary("The ssp-operator pod is up but failing to reconcile."),
		Labels:      labels("critical", "critical"),
	}

	highRateRejectedVms := promv1.Rule{
		Alert:       "SSPHighRateRejectedVms",
		Expr:        intstr.FromString("kubevirt_ssp_template_validator_rejected_increase > 5"),
		For:         pending("5m"),
		Annotations: summary("High rate of rejected Vms."),
		Labels:      labels("warning", "warning"),
	}

	templatesModificationReverted := promv1.Rule{
		Alert:       "SSPCommonTemplatesModificationReverted",
		Expr:        intstr.FromString("kubevirt_ssp_common_templates_restored_increase > 0"),
		For:         pending("0m"),
		Annotations: summary("Common Templates manual modifications were reverted by the operator."),
		Labels:      labels("warning", "none"),
	}

	// This alert sets no For duration and carries a description in addition
	// to the summary.
	vmCRCErrors := promv1.Rule{
		Alert: "VirtualMachineCRCErrors",
		Expr:  intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"),
		Annotations: map[string]string{
			"description": "{{ $value }} Virtual Machines are in risk of causing CRC errors and major service outages.",
			"summary":     "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set.",
		},
		Labels: labels("warning", "none"),
	}

	return []promv1.Rule{
		sspDown,
		templateValidatorDown,
		failingToReconcile,
		highRateRejectedVms,
		templatesModificationReverted,
		vmCRCErrors,
	}
}
52 changes: 52 additions & 0 deletions pkg/monitoring/rules/alerts/prometheus.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package alerts

import (
"errors"
"fmt"
"os"
"strings"

"github.com/machadovilaca/operator-observability/pkg/operatorrules"
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
)

// Constants used by Register and getRunbookURLTemplate to decorate alerts.
const (
// prometheusRunbookAnnotationKey is the annotation key under which Register
// stores each alert's formatted runbook URL.
prometheusRunbookAnnotationKey = "runbook_url"
// partOfAlertLabelKey / partOfAlertLabelValue form a label that Register
// adds to every alert.
partOfAlertLabelKey = "kubernetes_operator_part_of"
partOfAlertLabelValue = "kubevirt"
// componentAlertLabelKey / componentAlertLabelValue form a second label that
// Register adds to every alert.
componentAlertLabelKey = "kubernetes_operator_component"
componentAlertLabelValue = "ssp-operator"
// defaultRunbookURLTemplate is the runbook URL format used when the
// environment variable named by runbookURLTemplateEnv is not set; Register
// substitutes the alert name for "%s".
defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s"
// runbookURLTemplateEnv is the name of the environment variable that
// overrides defaultRunbookURLTemplate.
runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE"
)

// Register decorates every alert returned by operatorAlerts with the
// part-of and component labels and a runbook_url annotation, then hands the
// alert groups to operatorrules.RegisterAlerts and returns its error.
//
// The runbook URL is produced by formatting the template returned by
// getRunbookURLTemplate with the alert name. Note that getRunbookURLTemplate
// panics if the configured template is invalid.
func Register() error {
	alerts := [][]promv1.Rule{
		operatorAlerts(),
	}

	runbookURLTemplate := getRunbookURLTemplate()
	for _, alertGroup := range alerts {
		// Iterate by index so that a newly allocated map is stored in the
		// rule itself rather than in a per-iteration copy.
		for i := range alertGroup {
			alert := &alertGroup[i]

			// Writing to a nil map panics; tolerate rules declared without
			// labels or annotations.
			if alert.Labels == nil {
				alert.Labels = make(map[string]string, 2)
			}
			if alert.Annotations == nil {
				alert.Annotations = make(map[string]string, 1)
			}

			alert.Labels[partOfAlertLabelKey] = partOfAlertLabelValue
			alert.Labels[componentAlertLabelKey] = componentAlertLabelValue
			alert.Annotations[prometheusRunbookAnnotationKey] = fmt.Sprintf(runbookURLTemplate, alert.Alert)
		}
	}

	return operatorrules.RegisterAlerts(alerts...)
}

func getRunbookURLTemplate() string {
runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv)
if !exists {
runbookURLTemplate = defaultRunbookURLTemplate
}

if strings.Count(runbookURLTemplate, "%s") != 1 {
panic(errors.New("runbook URL template must have exactly 1 %s substring"))
}

return runbookURLTemplate
}
55 changes: 55 additions & 0 deletions pkg/monitoring/rules/recordingrules/operator.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package recordingrules

import (
"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
"github.com/machadovilaca/operator-observability/pkg/operatorrules"
"k8s.io/apimachinery/pkg/util/intstr"
)

// PromQL expressions used by the "*_increase" recording rules in
// operatorRecordingRules, where each is suffixed with " OR on() vector(0)".
const (
// CommonTemplatesRestoredIncreaseQuery sums the one-hour increase of
// kubevirt_ssp_common_templates_restored_total over pods whose name matches
// 'ssp-operator.*'.
CommonTemplatesRestoredIncreaseQuery = "sum(increase(kubevirt_ssp_common_templates_restored_total{pod=~'ssp-operator.*'}[1h]))"
// TemplateValidatorRejectedIncreaseQuery sums the one-hour increase of
// kubevirt_ssp_template_validator_rejected_total over pods whose name matches
// 'virt-template-validator.*'.
TemplateValidatorRejectedIncreaseQuery = "sum(increase(kubevirt_ssp_template_validator_rejected_total{pod=~'virt-template-validator.*'}[1h]))"
)

// operatorRecordingRules lists the recording rules passed to
// operatorrules.RegisterRecordingRules by Register. All of them are gauges.
var operatorRecordingRules = []operatorrules.RecordingRule{
	newGaugeRecordingRule(
		"kubevirt_ssp_operator_up",
		"The total number of running ssp-operator pods",
		"sum(up{pod=~'ssp-operator.*'}) OR on() vector(0)",
	),
	newGaugeRecordingRule(
		"kubevirt_ssp_template_validator_up",
		"The total number of running virt-template-validator pods",
		"sum(up{pod=~'virt-template-validator.*'}) OR on() vector(0)",
	),
	newGaugeRecordingRule(
		"kubevirt_ssp_operator_reconcile_succeeded_aggregated",
		"The total number of ssp-operator pods reconciling with no errors",
		"sum(kubevirt_ssp_operator_reconcile_succeeded)",
	),
	newGaugeRecordingRule(
		"kubevirt_ssp_template_validator_rejected_increase",
		"The increase in the number of rejected template validators, over the last hour",
		TemplateValidatorRejectedIncreaseQuery+" OR on() vector(0)",
	),
	newGaugeRecordingRule(
		"kubevirt_ssp_common_templates_restored_increase",
		"The increase in the number of common templates restored by the operator back to their original state, over the last hour",
		CommonTemplatesRestoredIncreaseQuery+" OR on() vector(0)",
	),
}

// newGaugeRecordingRule builds a gauge-typed recording rule with the given
// metric name, help text and PromQL expression.
func newGaugeRecordingRule(name, help, expr string) operatorrules.RecordingRule {
	return operatorrules.RecordingRule{
		MetricsOpts: operatormetrics.MetricOpts{
			Name: name,
			Help: help,
		},
		MetricType: operatormetrics.GaugeType,
		Expr:       intstr.FromString(expr),
	}
}
9 changes: 9 additions & 0 deletions pkg/monitoring/rules/recordingrules/recordingrules.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
package recordingrules

import "github.com/machadovilaca/operator-observability/pkg/operatorrules"

func Register() error {
return operatorrules.RegisterRecordingRules(
operatorRecordingRules,
)
}

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit b4891e5

Please sign in to comment.