diff --git a/pkg/monitoring/rules/alerts/operator.go b/pkg/monitoring/rules/alerts/operator.go
new file mode 100644
index 000000000..2440fca63
--- /dev/null
+++ b/pkg/monitoring/rules/alerts/operator.go
@@ -0,0 +1,92 @@
+package alerts
+
+import (
+	promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
+	"k8s.io/utils/ptr"
+)
+
+// Label keys attached to every alert defined in this file.
+const (
+	severityAlertLabelKey     = "severity"
+	healthImpactAlertLabelKey = "operator_health_impact"
+)
+
+// operatorAlerts returns a freshly built list of the SSP operator alerting
+// rules; each call allocates new rules with their own label/annotation maps.
+func operatorAlerts() []promv1.Rule {
+	return []promv1.Rule{
+		{
+			Alert: "SSPDown",
+			Expr:  intstr.FromString("kubevirt_ssp_operator_up == 0"),
+			For:   ptr.To[promv1.Duration]("5m"),
+			Annotations: map[string]string{
+				"summary": "All SSP operator pods are down.",
+			},
+			Labels: map[string]string{
+				severityAlertLabelKey:     "critical",
+				healthImpactAlertLabelKey: "critical",
+			},
+		},
+		{
+			Alert: "SSPTemplateValidatorDown",
+			Expr:  intstr.FromString("kubevirt_ssp_template_validator_up == 0"),
+			For:   ptr.To[promv1.Duration]("5m"),
+			Annotations: map[string]string{
+				"summary": "All Template Validator pods are down.",
+			},
+			Labels: map[string]string{
+				severityAlertLabelKey:     "critical",
+				healthImpactAlertLabelKey: "critical",
+			},
+		},
+		{
+			Alert: "SSPFailingToReconcile",
+			Expr:  intstr.FromString("(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)"),
+			For:   ptr.To[promv1.Duration]("5m"),
+			Annotations: map[string]string{
+				"summary": "The ssp-operator pod is up but failing to reconcile.",
+			},
+			Labels: map[string]string{
+				severityAlertLabelKey:     "critical",
+				healthImpactAlertLabelKey: "critical",
+			},
+		},
+		{
+			Alert: "SSPHighRateRejectedVms",
+			Expr:  intstr.FromString("kubevirt_ssp_template_validator_rejected_increase > 5"),
+			For:   ptr.To[promv1.Duration]("5m"),
+			Annotations: map[string]string{
+				"summary": "High rate of rejected Vms.",
+			},
+			Labels: map[string]string{
+				severityAlertLabelKey:     "warning",
+				healthImpactAlertLabelKey: "warning",
+			},
+		},
+		{
+			Alert: "SSPCommonTemplatesModificationReverted",
+			Expr:  intstr.FromString("kubevirt_ssp_common_templates_restored_increase > 0"),
+			For:   ptr.To[promv1.Duration]("0m"),
+			Annotations: map[string]string{
+				"summary": "Common Templates manual modifications were reverted by the operator.",
+			},
+			Labels: map[string]string{
+				severityAlertLabelKey:     "warning",
+				healthImpactAlertLabelKey: "none",
+			},
+		},
+		{
+			Alert: "VirtualMachineCRCErrors",
+			Expr:  intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"),
+			Annotations: map[string]string{
+				"description": "{{ $value }} Virtual Machines are in risk of causing CRC errors and major service outages.",
+				"summary":     "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', it will report bad crc/signature errors and cluster performance will be severely degraded if krbd:rxbounce is not set.",
+			},
+			Labels: map[string]string{
+				severityAlertLabelKey:     "warning",
+				healthImpactAlertLabelKey: "none",
+			},
+		},
+	}
+}
diff --git a/pkg/monitoring/rules/alerts/prometheus.go b/pkg/monitoring/rules/alerts/prometheus.go
new file mode 100644
index 000000000..a6af2f6b8
--- /dev/null
+++ b/pkg/monitoring/rules/alerts/prometheus.go
@@ -0,0 +1,72 @@
+package alerts
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"strings"
+
+	"github.com/machadovilaca/operator-observability/pkg/operatorrules"
+	promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+)
+
+const (
+	prometheusRunbookAnnotationKey = "runbook_url"
+	partOfAlertLabelKey            = "kubernetes_operator_part_of"
+	partOfAlertLabelValue          = "kubevirt"
+	componentAlertLabelKey         = "kubernetes_operator_component"
+	componentAlertLabelValue       = "ssp-operator"
+	defaultRunbookURLTemplate      = "https://kubevirt.io/monitoring/runbooks/%s"
+	runbookURLTemplateEnv          = "RUNBOOK_URL_TEMPLATE"
+)
+
+// Register adds the shared part-of and component labels and a runbook_url
+// annotation to every SSP alert, then registers the alerts with operatorrules.
+//
+// It panics if the runbook URL template is invalid; see getRunbookURLTemplate.
+func Register() error {
+	alerts := [][]promv1.Rule{
+		operatorAlerts(),
+	}
+
+	runbookURLTemplate := getRunbookURLTemplate()
+	for _, alertGroup := range alerts {
+		// Index into the slice: the range value would be a copy of the rule,
+		// so a map allocated on it would be lost.
+		for i := range alertGroup {
+			alert := &alertGroup[i]
+
+			// Writing to a nil map panics, so allocate the maps on demand for
+			// alerts declared without labels or annotations.
+			if alert.Labels == nil {
+				alert.Labels = make(map[string]string, 2)
+			}
+			if alert.Annotations == nil {
+				alert.Annotations = make(map[string]string, 1)
+			}
+
+			alert.Labels[partOfAlertLabelKey] = partOfAlertLabelValue
+			alert.Labels[componentAlertLabelKey] = componentAlertLabelValue
+			alert.Annotations[prometheusRunbookAnnotationKey] = fmt.Sprintf(runbookURLTemplate, alert.Alert)
+		}
+	}
+
+	return operatorrules.RegisterAlerts(alerts...)
+}
+
+// getRunbookURLTemplate returns the runbook URL template taken from the
+// RUNBOOK_URL_TEMPLATE environment variable, or the default template when the
+// variable is not set. It panics unless the template contains exactly one
+// "%s" placeholder, which Register fills with the alert name.
+func getRunbookURLTemplate() string {
+	runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv)
+	if !exists {
+		runbookURLTemplate = defaultRunbookURLTemplate
+	}
+
+	if strings.Count(runbookURLTemplate, "%s") != 1 {
+		panic(errors.New("runbook URL template must have exactly 1 %s substring"))
+	}
+
+	return runbookURLTemplate
+}
diff --git a/pkg/monitoring/rules/recordingrules/operator.go b/pkg/monitoring/rules/recordingrules/operator.go
new file mode 100644
index 000000000..8149889ed
--- /dev/null
+++ b/pkg/monitoring/rules/recordingrules/operator.go
@@ -0,0 +1,57 @@
+package recordingrules
+
+import (
+	"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
+	"github.com/machadovilaca/operator-observability/pkg/operatorrules"
+	"k8s.io/apimachinery/pkg/util/intstr"
+)
+
+// PromQL queries backing the two "*_increase" recording rules below.
+const (
+	CommonTemplatesRestoredIncreaseQuery   = "sum(increase(kubevirt_ssp_common_templates_restored_total{pod=~'ssp-operator.*'}[1h]))"
+	TemplateValidatorRejectedIncreaseQuery = "sum(increase(kubevirt_ssp_template_validator_rejected_total{pod=~'virt-template-validator.*'}[1h]))"
+)
+
+// operatorRecordingRules lists the SSP operator recording rules.
+var operatorRecordingRules = []operatorrules.RecordingRule{
+	{
+		MetricsOpts: operatormetrics.MetricOpts{
+			Name: "kubevirt_ssp_operator_up",
+			Help: "The total number of running ssp-operator pods",
+		},
+		MetricType: operatormetrics.GaugeType,
+		Expr:       intstr.FromString("sum(up{pod=~'ssp-operator.*'}) OR on() vector(0)"),
+	},
+	{
+		MetricsOpts: operatormetrics.MetricOpts{
+			Name: "kubevirt_ssp_template_validator_up",
+			Help: "The total number of running virt-template-validator pods",
+		},
+		MetricType: operatormetrics.GaugeType,
+		Expr:       intstr.FromString("sum(up{pod=~'virt-template-validator.*'}) OR on() vector(0)"),
+	},
+	{
+		MetricsOpts: operatormetrics.MetricOpts{
+			Name: "kubevirt_ssp_operator_reconcile_succeeded_aggregated",
+			Help: "The total number of ssp-operator pods reconciling with no errors",
+		},
+		MetricType: operatormetrics.GaugeType,
+		Expr:       intstr.FromString("sum(kubevirt_ssp_operator_reconcile_succeeded)"),
+	},
+	{
+		MetricsOpts: operatormetrics.MetricOpts{
+			Name: "kubevirt_ssp_template_validator_rejected_increase",
+			Help: "The increase in the number of rejected template validators, over the last hour",
+		},
+		MetricType: operatormetrics.GaugeType,
+		Expr:       intstr.FromString(TemplateValidatorRejectedIncreaseQuery + " OR on() vector(0)"),
+	},
+	{
+		MetricsOpts: operatormetrics.MetricOpts{
+			Name: "kubevirt_ssp_common_templates_restored_increase",
+			Help: "The increase in the number of common templates restored by the operator back to their original state, over the last hour",
+		},
+		MetricType: operatormetrics.GaugeType,
+		Expr:       intstr.FromString(CommonTemplatesRestoredIncreaseQuery + " OR on() vector(0)"),
+	},
+}
diff --git a/pkg/monitoring/rules/recordingrules/recordingrules.go b/pkg/monitoring/rules/recordingrules/recordingrules.go
new file mode 100644
index 000000000..3b95c3ae6
--- /dev/null
+++ b/pkg/monitoring/rules/recordingrules/recordingrules.go
@@ -0,0 +1,10 @@
+package recordingrules
+
+import "github.com/machadovilaca/operator-observability/pkg/operatorrules"
+
+// Register registers the SSP operator recording rules with operatorrules.
+func Register() error {
+	return operatorrules.RegisterRecordingRules(
+		operatorRecordingRules,
+	)
+}
diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/prometheusrules.go b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/prometheusrules.go
new file mode 100644
index 000000000..1f12d3e26
--- /dev/null
+++ 
b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/prometheusrules.go
@@ -0,0 +1,82 @@
+package operatorrules
+
+import (
+	"fmt"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+)
+
+// BuildPrometheusRule builds a PrometheusRule object from the registered recording rules and alerts.
+// The object is given the supplied name, namespace and labels. An error is
+// returned when neither a recording rule nor an alert has been registered.
+func BuildPrometheusRule(name, namespace string, labels map[string]string) (*promv1.PrometheusRule, error) {
+	spec, err := buildPrometheusRuleSpec()
+	if err != nil {
+		return nil, err
+	}
+
+	return &promv1.PrometheusRule{
+		TypeMeta: metav1.TypeMeta{
+			APIVersion: promv1.SchemeGroupVersion.String(),
+			Kind:       promv1.PrometheusRuleKind,
+		},
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      name,
+			Namespace: namespace,
+			Labels:    labels,
+		},
+		Spec: *spec,
+	}, nil
+}
+
+// buildPrometheusRuleSpec assembles the rule groups from the package registry:
+// a "recordingRules.rules" group and an "alerts.rules" group, each included
+// only when at least one rule of that kind is registered. It returns an error
+// when nothing is registered at all.
+func buildPrometheusRuleSpec() (*promv1.PrometheusRuleSpec, error) {
+	var groups []promv1.RuleGroup
+
+	if len(operatorRegistry.registeredRecordingRules) != 0 {
+		groups = append(groups, promv1.RuleGroup{
+			Name:  "recordingRules.rules",
+			Rules: buildRecordingRulesRules(),
+		})
+	}
+
+	if len(operatorRegistry.registeredAlerts) != 0 {
+		groups = append(groups, promv1.RuleGroup{
+			Name:  "alerts.rules",
+			Rules: buildAlertsRules(),
+		})
+	}
+
+	if len(groups) == 0 {
+		return nil, fmt.Errorf("no registered recording rule or alert")
+	}
+
+	return &promv1.PrometheusRuleSpec{Groups: groups}, nil
+}
+
+// buildRecordingRulesRules converts every registered recording rule into a
+// Prometheus rule, using the metric name as the record name.
+func buildRecordingRulesRules() []promv1.Rule {
+	var rules []promv1.Rule
+
+	for _, recordingRule := range operatorRegistry.registeredRecordingRules {
+		rules = append(rules, promv1.Rule{
+			Record: recordingRule.MetricsOpts.Name,
+			Expr:   recordingRule.Expr,
+		})
+	}
+
+	return rules
+}
+
+// buildAlertsRules returns the registered alerts appended to a new slice.
+func buildAlertsRules() []promv1.Rule {
+	var rules []promv1.Rule
+	rules = append(rules, operatorRegistry.registeredAlerts...)
+	return rules
+}
diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/rbac.go b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/rbac.go
new file mode 100644
index 000000000..da582ceec
--- /dev/null
+++ b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/rbac.go
@@ -0,0 +1,49 @@
+package operatorrules
+
+import (
+	rbacv1 "k8s.io/api/rbac/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// BuildRoleAndRoleBinding builds a Role named namePrefix+"-role" that allows
+// "get" and "list" on services, endpoints and pods in the given namespace, and
+// a RoleBinding named namePrefix+"-rolebinding" that grants that Role to the
+// promSAName service account in promSANamespace. Both objects carry labels.
+func BuildRoleAndRoleBinding(namePrefix, namespace, promSAName, promSANamespace string, labels map[string]string) (*rbacv1.Role, *rbacv1.RoleBinding) {
+	r := &rbacv1.Role{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      namePrefix + "-role",
+			Namespace: namespace,
+			Labels:    labels,
+		},
+		Rules: []rbacv1.PolicyRule{
+			{
+				APIGroups: []string{""},
+				Resources: []string{"services", "endpoints", "pods"},
+				Verbs:     []string{"get", "list"},
+			},
+		},
+	}
+
+	rb := &rbacv1.RoleBinding{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      namePrefix + "-rolebinding",
+			Namespace: namespace,
+			Labels:    labels,
+		},
+		RoleRef: rbacv1.RoleRef{
+			Kind:     "Role",
+			Name:     namePrefix + "-role",
+			APIGroup: rbacv1.GroupName,
+		},
+		Subjects: []rbacv1.Subject{
+			{
+				Kind:      "ServiceAccount",
+				Name:      promSAName,
+				Namespace: promSANamespace,
+			},
+		},
+	}
+
+	return r, rb
+}
diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/recordingrule.go b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/recordingrule.go
new file mode 100644
index 000000000..8dfc50c3e
--- /dev/null
+++ b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/recordingrule.go
@@ -0,0 +1,24 @@
+package operatorrules
+
+import (
+	"k8s.io/apimachinery/pkg/util/intstr"
+
+	"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
+)
+
+// RecordingRule is a struct that represents a Prometheus recording rule.
+type RecordingRule struct {
+	MetricsOpts operatormetrics.MetricOpts // metric options; Name is used as the record name of the rule
+	MetricType  operatormetrics.MetricType // metric type, returned by GetType
+	Expr        intstr.IntOrString         // expression of the rule
+}
+
+// GetOpts returns the metric options of the recording rule.
+func (c RecordingRule) GetOpts() operatormetrics.MetricOpts {
+	return c.MetricsOpts
+}
+
+// GetType returns the metric type of the recording rule.
+func (c RecordingRule) GetType() operatormetrics.MetricType {
+	return c.MetricType
+}
diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/registry.go b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/registry.go
new file mode 100644
index 000000000..cf75f6d01
--- /dev/null
+++ b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/registry.go
@@ -0,0 +1,57 @@
+package operatorrules
+
+import (
+	promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+)
+
+// operatorRegistry is the package-level registry of recording rules and alerts.
+var operatorRegistry = newRegistry()
+
+// operatorRegisterer holds the recording rules and alerts registered so far.
+type operatorRegisterer struct {
+	registeredRecordingRules []RecordingRule
+	registeredAlerts         []promv1.Rule
+}
+
+// newRegistry returns a registry with empty, non-nil rule and alert slices.
+func newRegistry() operatorRegisterer {
+	return operatorRegisterer{
+		registeredRecordingRules: []RecordingRule{},
+		registeredAlerts:         []promv1.Rule{},
+	}
+}
+
+// RegisterRecordingRules registers the given recording rules.
+// Rules are appended to those already registered; the returned error is always nil.
+func RegisterRecordingRules(recordingRules ...[]RecordingRule) error {
+	for _, recordingRuleList := range recordingRules {
+		operatorRegistry.registeredRecordingRules = append(operatorRegistry.registeredRecordingRules, recordingRuleList...)
+	}
+
+	return nil
+}
+
+// RegisterAlerts registers the given alerts.
+// Alerts are appended to those already registered; the returned error is always nil.
+func RegisterAlerts(alerts ...[]promv1.Rule) error {
+	for _, alertList := range alerts {
+		operatorRegistry.registeredAlerts = append(operatorRegistry.registeredAlerts, alertList...)
+	}
+
+	return nil
+}
+
+// ListRecordingRules returns the registered recording rules.
+func ListRecordingRules() []RecordingRule {
+	return operatorRegistry.registeredRecordingRules // the registry's own slice, not a copy
+}
+
+// ListAlerts returns the registered alerts. The registry's own slice is returned, not a copy.
+func ListAlerts() []promv1.Rule {
+	return operatorRegistry.registeredAlerts
+}
+
+// CleanRegistry removes all registered rules and alerts by replacing the registry with a new, empty one.
+func CleanRegistry() {
+	operatorRegistry = newRegistry()
+}
diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/schema.go b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/schema.go
new file mode 100644
index 000000000..c06a032d7
--- /dev/null
+++ b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/schema.go
@@ -0,0 +1,24 @@
+package operatorrules
+
+import (
+	rbacv1 "k8s.io/api/rbac/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+
+	promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+)
+
+// AddToScheme adds the prometheus-operator monitoring/v1 types and the RBAC v1
+// types to scheme, returning the first error encountered.
+func AddToScheme(scheme *runtime.Scheme) error {
+	err := promv1.AddToScheme(scheme)
+	if err != nil {
+		return err
+	}
+
+	err = rbacv1.AddToScheme(scheme)
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 222d80e8e..5337a9418 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -183,6 +183,7 @@ github.com/kubevirt/monitoring/pkg/metrics/parser
 # github.com/machadovilaca/operator-observability v0.0.11
 ## explicit; go 1.20
 github.com/machadovilaca/operator-observability/pkg/operatormetrics
+github.com/machadovilaca/operator-observability/pkg/operatorrules
 # github.com/mailru/easyjson v0.7.7
 ## explicit; go 1.12
 github.com/mailru/easyjson/buffer