From 4fd4022504e543e1a5cbfb72f0b5ff852d85aded Mon Sep 17 00:00:00 2001 From: avlitman Date: Sun, 14 Jan 2024 20:39:37 +0200 Subject: [PATCH] refactor recording rules and alerts code Following the work started in kubevirt/kubevirt#10044 , and according to the kubevirt/community#219 proposal, this PR refactors monitoring recording rules and alerts Signed-off-by: avlitman --- docs/metrics.md | 19 +- internal/operands/metrics/reconcile.go | 7 +- internal/operands/metrics/reconcile_test.go | 10 +- internal/operands/metrics/resources.go | 48 +---- internal/template-validator/validator/app.go | 6 +- main.go | 15 +- .../metrics/ssp-operator/metrics.go | 13 +- .../metrics/template-validator/metrics.go | 13 +- pkg/monitoring/rules/alerts/operator.go | 89 ++++++++ pkg/monitoring/rules/alerts/prometheus.go | 51 +++++ .../rules/recordingrules/operator.go | 57 +++++ .../rules/recordingrules/recordingrules.go | 9 + pkg/monitoring/rules/rules-tests.yaml | 32 +-- pkg/monitoring/rules/rules.go | 196 +++--------------- tests/metrics_test.go | 7 +- tests/monitoring_test.go | 6 +- tools/metricsdocs/metricsdocs.go | 110 +++------- .../metrics_collector.go | 31 --- .../metrics_json_generator.go | 57 ++++- tools/test-rules-writer/test_rules_writer.go | 18 +- .../operator-observability/pkg/docs/alerts.go | 96 +++++++++ .../pkg/docs/metrics.go | 106 ++++++++++ .../pkg/operatorrules/prometheusrules.go | 73 +++++++ .../pkg/operatorrules/rbac.go | 45 ++++ .../pkg/operatorrules/recordingrule.go | 24 +++ .../pkg/operatorrules/registry.go | 78 +++++++ .../pkg/operatorrules/schema.go | 22 ++ vendor/modules.txt | 2 + 28 files changed, 857 insertions(+), 383 deletions(-) create mode 100644 pkg/monitoring/rules/alerts/operator.go create mode 100644 pkg/monitoring/rules/alerts/prometheus.go create mode 100644 pkg/monitoring/rules/recordingrules/operator.go create mode 100644 pkg/monitoring/rules/recordingrules/recordingrules.go delete mode 100644 tools/prom-metrics-collector/metrics_collector.go create mode 100644 vendor/github.com/machadovilaca/operator-observability/pkg/docs/alerts.go create mode 100644 vendor/github.com/machadovilaca/operator-observability/pkg/docs/metrics.go create mode 100644 vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/prometheusrules.go create mode 100644 vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/rbac.go create mode 100644 vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/recordingrule.go create mode 100644 vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/registry.go create mode 100644 vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/schema.go diff --git a/docs/metrics.md b/docs/metrics.md index df129aa42..8566418ef 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -1,25 +1,34 @@ # SSP Operator metrics -This document aims to help users that are not familiar with metrics exposed by the SSP Operator. -All metrics documented here are auto-generated by the utility tool `tools/metricsdocs` and reflects exactly what is being exposed. -## SSP Operator Metrics List ### kubevirt_ssp_common_templates_restored_increase The increase in the number of common templates restored by the operator back to their original state, over the last hour. Type: Gauge. + ### kubevirt_ssp_common_templates_restored_total The total number of common templates restored by the operator back to their original state. Type: Counter. + ### kubevirt_ssp_operator_reconcile_succeeded Set to 1 if the reconcile process of all operands completes with no errors, and to 0 otherwise. Type: Gauge. + ### kubevirt_ssp_operator_reconcile_succeeded_aggregated The total number of ssp-operator pods reconciling with no errors. Type: Gauge. + ### kubevirt_ssp_operator_up The total number of running ssp-operator pods. Type: Gauge. + ### kubevirt_ssp_template_validator_rejected_increase The increase in the number of rejected template validators, over the last hour. Type: Gauge. + ### kubevirt_ssp_template_validator_rejected_total The total number of rejected template validators. Type: Counter. + ### kubevirt_ssp_template_validator_up The total number of running virt-template-validator pods. Type: Gauge. + ### kubevirt_ssp_vm_rbd_block_volume_without_rxbounce -VM with RBD mounted Block volume (without rxbounce option set). Type: Gauge. +[ALPHA] VM with RBD mounted Block volume (without rxbounce option set). Type: Gauge. + ## Developing new metrics -After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document. + +All metrics documented here are auto-generated and reflect exactly what is being +exposed. After developing new metrics or changing old ones please regenerate +this document. diff --git a/internal/operands/metrics/reconcile.go b/internal/operands/metrics/reconcile.go index 7b1cef3fa..48241d73b 100644 --- a/internal/operands/metrics/reconcile.go +++ b/internal/operands/metrics/reconcile.go @@ -6,6 +6,7 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" "kubevirt.io/ssp-operator/internal/common" "kubevirt.io/ssp-operator/internal/operands" + "kubevirt.io/ssp-operator/pkg/monitoring/rules" ) // Define RBAC rules needed by this operand: @@ -96,7 +97,11 @@ func reconcileMonitoringRbacRoleBinding(request *common.Request) (common.Reconci } func reconcilePrometheusRule(request *common.Request) (common.ReconcileResult, error) { - prometheusRule, err := newPrometheusRule(request.Namespace) + if err := rules.SetupRules(); err != nil { + return common.ReconcileResult{}, err + } + + prometheusRule, err := rules.BuildPrometheusRule(request.Namespace) if err != nil { return common.ReconcileResult{}, err } diff --git a/internal/operands/metrics/reconcile_test.go b/internal/operands/metrics/reconcile_test.go index e51022404..33a30f590 100644 --- a/internal/operands/metrics/reconcile_test.go +++ b/internal/operands/metrics/reconcile_test.go @@ -18,6 +18,7 @@ import ( ssp "kubevirt.io/ssp-operator/api/v1beta2" "kubevirt.io/ssp-operator/internal/common" + "kubevirt.io/ssp-operator/pkg/monitoring/rules" ) var log = logf.Log.WithName("metrics_operand") @@ -34,6 +35,8 @@ var _ = Describe("Metrics operand", func() { ) BeforeEach(func() { + Expect(rules.SetupRules()).To(Succeed()) + client := fake.NewClientBuilder().WithScheme(common.Scheme).Build() request = common.Request{ Request: reconcile.Request{ @@ -67,7 +70,7 @@ var _ = Describe("Metrics operand", func() { _, err := operand.Reconcile(&request) Expect(err).ToNot(HaveOccurred()) - prometheusRule, err := newPrometheusRule(namespace) + prometheusRule, err := rules.BuildPrometheusRule(namespace) Expect(err).ToNot(HaveOccurred()) ExpectResourceExists(prometheusRule, request) @@ -82,7 +85,7 @@ var _ = Describe("Metrics operand", func() { os.Setenv(runbookURLTemplateEnv, template) } - prometheusRule, err := newPrometheusRule(namespace) + err := rules.SetupRules() if strings.Count(template, "%s") != 1 || strings.Count(template, "%") != 1 { Expect(err).To(HaveOccurred()) @@ -91,6 +94,9 @@ var _ = Describe("Metrics operand", func() { Expect(err).ToNot(HaveOccurred()) + prometheusRule, err := rules.BuildPrometheusRule(namespace) + Expect(err).ToNot(HaveOccurred()) + for _, group := range prometheusRule.Spec.Groups { for _, rule := range group.Rules { if rule.Alert != "" { diff --git a/internal/operands/metrics/resources.go b/internal/operands/metrics/resources.go index 6fb8c1555..173d23274 100644 --- a/internal/operands/metrics/resources.go +++ b/internal/operands/metrics/resources.go @@ -1,10 +1,6 @@ package metrics import ( - "errors" - "os" - "strings" - promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" rbac "k8s.io/api/rbac/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -13,7 +9,6 @@ import ( ) const ( - PrometheusRuleName = "prometheus-k8s-rules-cnv" MonitorNamespace = "openshift-monitoring" defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s" runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE" @@ -69,7 +64,7 @@ func newServiceMonitorCR(namespace string) *promv1.ServiceMonitor { return &promv1.ServiceMonitor{ ObjectMeta: metav1.ObjectMeta{ Namespace: namespace, - Name: PrometheusRuleName, + Name: rules.RuleName, Labels: ServiceMonitorLabels(), }, Spec: promv1.ServiceMonitorSpec{ @@ -96,44 +91,3 @@ func newServiceMonitorCR(namespace string) *promv1.ServiceMonitor { }, } } - -func newPrometheusRule(namespace string) (*promv1.PrometheusRule, error) { - runbookURLTemplate, err := getRunbookURLTemplate() - if err != nil { - return nil, err - } - - return &promv1.PrometheusRule{ - ObjectMeta: metav1.ObjectMeta{ - Name: PrometheusRuleName, - Namespace: namespace, - Labels: map[string]string{ - "prometheus": "k8s", - "role": "alert-rules", - "kubevirt.io": "prometheus-rules", - PrometheusLabelKey: PrometheusLabelValue, - }, - }, - Spec: promv1.PrometheusRuleSpec{ - Groups: []promv1.RuleGroup{ - { - Name: "cnv.rules", - Rules: append(rules.RecordRules(), rules.AlertRules(runbookURLTemplate)...), - }, - }, - }, - }, nil -} - -func getRunbookURLTemplate() (string, error) { - runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv) - if !exists { - runbookURLTemplate = defaultRunbookURLTemplate - } - - if strings.Count(runbookURLTemplate, "%s") != 1 || strings.Count(runbookURLTemplate, "%") != 1 { - return "", errors.New("runbook URL template must have exactly 1 %s substring") - } - - return runbookURLTemplate, nil -} diff --git a/internal/template-validator/validator/app.go b/internal/template-validator/validator/app.go index ba2750611..e9165d4db 100644 --- a/internal/template-validator/validator/app.go +++ b/internal/template-validator/validator/app.go @@ -77,7 +77,11 @@ func (app *App) Run() { registerReadinessProbe() // setup monitoring - validatorMetrics.SetupMetrics() + err = validatorMetrics.SetupMetrics() + if err != nil { + logger.Log.Error(err, "Error setting up metrics") + panic(err) + } logger.Log.Info("TLS certs directory", "directory", app.TLSInfo.CertsDirectory) diff --git a/main.go b/main.go index 42f8ab3a7..8f09b390f 100644 --- a/main.go +++ b/main.go @@ -45,6 +45,7 @@ import ( "kubevirt.io/ssp-operator/controllers" "kubevirt.io/ssp-operator/internal/common" sspMetrics "kubevirt.io/ssp-operator/pkg/monitoring/metrics/ssp-operator" + "kubevirt.io/ssp-operator/pkg/monitoring/rules" "kubevirt.io/ssp-operator/webhooks" // +kubebuilder:scaffold:imports ) @@ -181,15 +182,21 @@ func (s *prometheusServer) getPrometheusTLSConfig(ctx context.Context, certWatch } } -func newPrometheusServer(metricsAddr string, cache cache.Cache) *prometheusServer { - sspMetrics.SetupMetrics() +func newPrometheusServer(metricsAddr string, cache cache.Cache) (*prometheusServer, error) { + if err := sspMetrics.SetupMetrics(); err != nil { + return nil, err + } + + if err := rules.SetupRules(); err != nil { + return nil, err + } return &prometheusServer{ certPath: path.Join(sdkTLSDir, sdkTLSCrt), keyPath: path.Join(sdkTLSDir, sdkTLSKey), cache: cache, serverAddress: metricsAddr, - } + }, nil } func main() { @@ -249,7 +256,7 @@ func main() { } } - metricsServer := newPrometheusServer(metricsAddr, mgr.GetCache()) + metricsServer, err := newPrometheusServer(metricsAddr, mgr.GetCache()) if err != nil { setupLog.Error(err, "unable create Prometheus server") os.Exit(1) diff --git a/pkg/monitoring/metrics/ssp-operator/metrics.go b/pkg/monitoring/metrics/ssp-operator/metrics.go index 2d930aaae..84a3ca101 100644 --- a/pkg/monitoring/metrics/ssp-operator/metrics.go +++ b/pkg/monitoring/metrics/ssp-operator/metrics.go @@ -5,14 +5,17 @@ import ( runtimemetrics "sigs.k8s.io/controller-runtime/pkg/metrics" ) -func SetupMetrics() { +func SetupMetrics() error { operatormetrics.Register = runtimemetrics.Registry.Register - if err := operatormetrics.RegisterMetrics( + return operatormetrics.RegisterMetrics( operatorMetrics, rbdMetrics, templateMetrics, - ); err != nil { - panic(err) - } + ) +} + +// ListMetrics registered prometheus metrics +func ListMetrics() []operatormetrics.Metric { + return operatormetrics.ListMetrics() } diff --git a/pkg/monitoring/metrics/template-validator/metrics.go b/pkg/monitoring/metrics/template-validator/metrics.go index d91d581a9..deb25ee23 100644 --- a/pkg/monitoring/metrics/template-validator/metrics.go +++ b/pkg/monitoring/metrics/template-validator/metrics.go @@ -4,10 +4,13 @@ import ( "github.com/machadovilaca/operator-observability/pkg/operatormetrics" ) -func SetupMetrics() { - if err := operatormetrics.RegisterMetrics( +func SetupMetrics() error { + return operatormetrics.RegisterMetrics( templateMetrics, - ); err != nil { - panic(err) - } + ) +} + +// ListMetrics registered prometheus metrics +func ListMetrics() []operatormetrics.Metric { + return operatormetrics.ListMetrics() } diff --git a/pkg/monitoring/rules/alerts/operator.go b/pkg/monitoring/rules/alerts/operator.go new file mode 100644 index 000000000..0ceea395a --- /dev/null +++ b/pkg/monitoring/rules/alerts/operator.go @@ -0,0 +1,89 @@ +package alerts + +import ( + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/utils/ptr" +) + +const ( + severityAlertLabelKey = "severity" + healthImpactAlertLabelKey = "operator_health_impact" +) + +func operatorAlerts() []promv1.Rule { + return []promv1.Rule{ + { + Alert: "SSPDown", + Expr: intstr.FromString("kubevirt_ssp_operator_up == 0"), + For: ptr.To[promv1.Duration]("5m"), + Annotations: map[string]string{ + "summary": "All SSP operator pods are down.", + }, + Labels: map[string]string{ + severityAlertLabelKey: "critical", + healthImpactAlertLabelKey: "critical", + }, + }, + { + Alert: "SSPTemplateValidatorDown", + Expr: intstr.FromString("kubevirt_ssp_template_validator_up == 0"), + For: ptr.To[promv1.Duration]("5m"), + Annotations: map[string]string{ + "summary": "All Template Validator pods are down.", + }, + Labels: map[string]string{ + severityAlertLabelKey: "critical", + healthImpactAlertLabelKey: "critical", + }, + }, + { + Alert: "SSPFailingToReconcile", + Expr: intstr.FromString("(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)"), + For: ptr.To[promv1.Duration]("5m"), + Annotations: map[string]string{ + "summary": "The ssp-operator pod is up but failing to reconcile.", + }, + Labels: map[string]string{ + severityAlertLabelKey: "critical", + healthImpactAlertLabelKey: "critical", + }, + }, + { + Alert: "SSPHighRateRejectedVms", + Expr: intstr.FromString("kubevirt_ssp_template_validator_rejected_increase > 5"), + For: ptr.To[promv1.Duration]("5m"), + Annotations: map[string]string{ + "summary": "High rate of rejected Vms.", + }, + Labels: map[string]string{ + severityAlertLabelKey: "warning", + healthImpactAlertLabelKey: "warning", + }, + }, + { + Alert: "SSPCommonTemplatesModificationReverted", + Expr: intstr.FromString("kubevirt_ssp_common_templates_restored_increase > 0"), + For: ptr.To[promv1.Duration]("0m"), + Annotations: map[string]string{ + "summary": "Common Templates manual modifications were reverted by the operator.", + }, + Labels: map[string]string{ + severityAlertLabelKey: "warning", + healthImpactAlertLabelKey: "none", + }, + }, + { + Alert: "VMStorageClassWarning", + Expr: intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"), + Annotations: map[string]string{ + "summary": "{{ $value }} Virtual Machines may cause reports of bad crc/signature errors due to certain I/O patterns.", + "description": "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', VMs may cause reports of bad crc/signature errors due to certain I/O patterns. Cluster performance can be severely degraded if the number of re-transmissions due to crc errors causes network saturation.", + }, + Labels: map[string]string{ + severityAlertLabelKey: "warning", + healthImpactAlertLabelKey: "none", + }, + }, + } +} diff --git a/pkg/monitoring/rules/alerts/prometheus.go b/pkg/monitoring/rules/alerts/prometheus.go new file mode 100644 index 000000000..b85c842e1 --- /dev/null +++ b/pkg/monitoring/rules/alerts/prometheus.go @@ -0,0 +1,51 @@ +package alerts + +import ( + "errors" + "fmt" + "os" + "strings" + + "github.com/machadovilaca/operator-observability/pkg/operatorrules" +) + +const ( + prometheusRunbookAnnotationKey = "runbook_url" + partOfAlertLabelKey = "kubernetes_operator_part_of" + partOfAlertLabelValue = "kubevirt" + componentAlertLabelKey = "kubernetes_operator_component" + componentAlertLabelValue = "ssp-operator" + defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s" + runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE" +) + +func Register() error { + alerts := operatorAlerts() + + runbookURLTemplate, err := getRunbookURLTemplate() + if err != nil { + return err + } + + for i := range alerts { + alert := &alerts[i] + alert.Labels[partOfAlertLabelKey] = partOfAlertLabelValue + alert.Labels[componentAlertLabelKey] = componentAlertLabelValue + alert.Annotations[prometheusRunbookAnnotationKey] = fmt.Sprintf(runbookURLTemplate, alert.Alert) + } + + return operatorrules.RegisterAlerts(alerts) +} + +func getRunbookURLTemplate() (string, error) { + runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv) + if !exists { + runbookURLTemplate = defaultRunbookURLTemplate + } + + if strings.Count(runbookURLTemplate, "%s") != 1 { + return "", errors.New("runbook URL template must have exactly 1 %s substring") + } + + return runbookURLTemplate, nil +} diff --git a/pkg/monitoring/rules/recordingrules/operator.go b/pkg/monitoring/rules/recordingrules/operator.go new file mode 100644 index 000000000..46e1123b8 --- /dev/null +++ b/pkg/monitoring/rules/recordingrules/operator.go @@ -0,0 +1,57 @@ +package recordingrules + +import ( + "github.com/machadovilaca/operator-observability/pkg/operatormetrics" + "github.com/machadovilaca/operator-observability/pkg/operatorrules" + "k8s.io/apimachinery/pkg/util/intstr" +) + +const ( + CommonTemplatesRestoredIncreaseQuery = "sum(increase(kubevirt_ssp_common_templates_restored_total{pod=~'ssp-operator.*'}[1h]))" + TemplateValidatorRejectedIncreaseQuery = "sum(increase(kubevirt_ssp_template_validator_rejected_total{pod=~'virt-template-validator.*'}[1h]))" +) + +func operatorRecordingRules() []operatorrules.RecordingRule { + return []operatorrules.RecordingRule{ + { + MetricsOpts: operatormetrics.MetricOpts{ + Name: "kubevirt_ssp_operator_up", + Help: "The total number of running ssp-operator pods", + }, + MetricType: operatormetrics.GaugeType, + Expr: intstr.FromString("sum(up{pod=~'ssp-operator.*'}) OR on() vector(0)"), + }, + { + MetricsOpts: operatormetrics.MetricOpts{ + Name: "kubevirt_ssp_template_validator_up", + Help: "The total number of running virt-template-validator pods", + }, + MetricType: operatormetrics.GaugeType, + Expr: intstr.FromString("sum(up{pod=~'virt-template-validator.*'}) OR on() vector(0)"), + }, + { + MetricsOpts: operatormetrics.MetricOpts{ + Name: "kubevirt_ssp_operator_reconcile_succeeded_aggregated", + Help: "The total number of ssp-operator pods reconciling with no errors", + }, + MetricType: operatormetrics.GaugeType, + Expr: intstr.FromString("sum(kubevirt_ssp_operator_reconcile_succeeded)"), + }, + { + MetricsOpts: operatormetrics.MetricOpts{ + Name: "kubevirt_ssp_template_validator_rejected_increase", + Help: "The increase in the number of rejected template validators, over the last hour", + }, + MetricType: operatormetrics.GaugeType, + Expr: intstr.FromString(TemplateValidatorRejectedIncreaseQuery + " OR on() vector(0)"), + }, + { + MetricsOpts: operatormetrics.MetricOpts{ + Name: "kubevirt_ssp_common_templates_restored_increase", + Help: "The increase in the number of common templates restored by the operator back to their original state, over the last hour", + }, + MetricType: operatormetrics.GaugeType, + Expr: intstr.FromString(CommonTemplatesRestoredIncreaseQuery + " OR on() vector(0)"), + }, + } +} diff --git a/pkg/monitoring/rules/recordingrules/recordingrules.go b/pkg/monitoring/rules/recordingrules/recordingrules.go new file mode 100644 index 000000000..5857c79c3 --- /dev/null +++ b/pkg/monitoring/rules/recordingrules/recordingrules.go @@ -0,0 +1,9 @@ +package recordingrules + +import "github.com/machadovilaca/operator-observability/pkg/operatorrules" + +func Register() error { + return operatorrules.RegisterRecordingRules( + operatorRecordingRules(), + ) +} diff --git a/pkg/monitoring/rules/rules-tests.yaml b/pkg/monitoring/rules/rules-tests.yaml index c228f0a28..0f55aaa69 100644 --- a/pkg/monitoring/rules/rules-tests.yaml +++ b/pkg/monitoring/rules/rules-tests.yaml @@ -2,6 +2,10 @@ rule_files: - rules.json +group_eval_order: + - recordingRules.rules + - alerts.rules + tests: # SSPDown alert tests - interval: "1m" @@ -15,7 +19,7 @@ tests: exp_alerts: - exp_annotations: summary: "All SSP operator pods are down." - runbook_url: "test-runbook:SSPDown" + runbook_url: "https://kubevirt.io/monitoring/runbooks/SSPDown" exp_labels: severity: "critical" operator_health_impact: "critical" @@ -38,7 +42,7 @@ tests: exp_alerts: - exp_annotations: summary: "All Template Validator pods are down." - runbook_url: "test-runbook:SSPTemplateValidatorDown" + runbook_url: "https://kubevirt.io/monitoring/runbooks/SSPTemplateValidatorDown" exp_labels: severity: "critical" operator_health_impact: "critical" @@ -68,8 +72,8 @@ tests: alertname: "SSPFailingToReconcile" exp_alerts: - exp_annotations: - summary: "The ssp-operator pod is up but failing to reconcile" - runbook_url: "test-runbook:SSPFailingToReconcile" + summary: "The ssp-operator pod is up but failing to reconcile." + runbook_url: "https://kubevirt.io/monitoring/runbooks/SSPFailingToReconcile" exp_labels: severity: "critical" operator_health_impact: "critical" @@ -96,8 +100,8 @@ tests: alertname: "SSPHighRateRejectedVms" exp_alerts: - exp_annotations: - summary: "High rate of rejected Vms" - runbook_url: "test-runbook:SSPHighRateRejectedVms" + summary: "High rate of rejected Vms." + runbook_url: "https://kubevirt.io/monitoring/runbooks/SSPHighRateRejectedVms" exp_labels: severity: "warning" operator_health_impact: "warning" @@ -110,8 +114,8 @@ tests: alertname: "SSPHighRateRejectedVms" exp_alerts: - exp_annotations: - summary: "High rate of rejected Vms" - runbook_url: "test-runbook:SSPHighRateRejectedVms" + summary: "High rate of rejected Vms." + runbook_url: "https://kubevirt.io/monitoring/runbooks/SSPHighRateRejectedVms" exp_labels: severity: "warning" operator_health_impact: "warning" @@ -137,8 +141,8 @@ tests: alertname: "SSPCommonTemplatesModificationReverted" exp_alerts: - exp_annotations: - summary: "Common Templates manual modifications were reverted by the operator" - runbook_url: "test-runbook:SSPCommonTemplatesModificationReverted" + summary: "Common Templates manual modifications were reverted by the operator." + runbook_url: "https://kubevirt.io/monitoring/runbooks/SSPCommonTemplatesModificationReverted" exp_labels: severity: "warning" operator_health_impact: "none" @@ -151,8 +155,8 @@ tests: alertname: "SSPCommonTemplatesModificationReverted" exp_alerts: - exp_annotations: - summary: "Common Templates manual modifications were reverted by the operator" - runbook_url: "test-runbook:SSPCommonTemplatesModificationReverted" + summary: "Common Templates manual modifications were reverted by the operator." + runbook_url: "https://kubevirt.io/monitoring/runbooks/SSPCommonTemplatesModificationReverted" exp_labels: severity: "warning" operator_health_impact: "none" @@ -178,9 +182,9 @@ tests: alertname: "VMStorageClassWarning" exp_alerts: - exp_annotations: - summary: "1 Virtual Machines may cause reports of bad crc/signature errors due to certain I/O patterns" + summary: "1 Virtual Machines may cause reports of bad crc/signature errors due to certain I/O patterns." description: "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', VMs may cause reports of bad crc/signature errors due to certain I/O patterns. Cluster performance can be severely degraded if the number of re-transmissions due to crc errors causes network saturation." - runbook_url: "test-runbook:VMStorageClassWarning" + runbook_url: "https://kubevirt.io/monitoring/runbooks/VMStorageClassWarning" exp_labels: severity: "warning" operator_health_impact: "none" diff --git a/pkg/monitoring/rules/rules.go b/pkg/monitoring/rules/rules.go index afa64569b..0d33798eb 100644 --- a/pkg/monitoring/rules/rules.go +++ b/pkg/monitoring/rules/rules.go @@ -1,180 +1,50 @@ package rules import ( - "fmt" - + "github.com/machadovilaca/operator-observability/pkg/operatorrules" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" - "k8s.io/apimachinery/pkg/util/intstr" - "k8s.io/utils/ptr" -) -const ( - severityAlertLabelKey = "severity" - healthImpactAlertLabelKey = "operator_health_impact" - partOfAlertLabelKey = "kubernetes_operator_part_of" - partOfAlertLabelValue = "kubevirt" - componentAlertLabelKey = "kubernetes_operator_component" - componentAlertLabelValue = "ssp-operator" + "kubevirt.io/ssp-operator/pkg/monitoring/rules/alerts" + "kubevirt.io/ssp-operator/pkg/monitoring/rules/recordingrules" ) const ( - CommonTemplatesRestoredIncreaseQuery = "sum(increase(kubevirt_ssp_common_templates_restored_total{pod=~'ssp-operator.*'}[1h]))" - TemplateValidatorRejectedIncreaseQuery = "sum(increase(kubevirt_ssp_template_validator_rejected_total{pod=~'virt-template-validator.*'}[1h]))" + RuleName = "prometheus-k8s-rules-cnv" + PrometheusKey = "prometheus" + PrometheusValue = "k8s" + RoleLabelKey = "role" + RoleLabelValue = "alert-rules" + KubevirtLabelKey = "kubevirt.io" + KubevirtLabelValue = "prometheus-rules" + PrometheusLabelKey = "prometheus.ssp.kubevirt.io" + PrometheusLabelValue = "true" ) -// RecordRulesDesc represent SSP Operator Prometheus Record Rules -type RecordRulesDesc struct { - Name string - Expr intstr.IntOrString - Description string - Type string -} - -// recordRulesDescList lists all SSP Operator Prometheus Record Rules -var recordRulesDescList = []RecordRulesDesc{ - { - Name: "kubevirt_ssp_operator_up", - Expr: intstr.FromString("sum(up{pod=~'ssp-operator.*'}) OR on() vector(0)"), - Description: "The total number of running ssp-operator pods", - Type: "Gauge", - }, - { - Name: "kubevirt_ssp_template_validator_up", - Expr: intstr.FromString("sum(up{pod=~'virt-template-validator.*'}) OR on() vector(0)"), - Description: "The total number of running virt-template-validator pods", - Type: "Gauge", - }, - { - Name: "kubevirt_ssp_operator_reconcile_succeeded_aggregated", - Expr: intstr.FromString("sum(kubevirt_ssp_operator_reconcile_succeeded)"), - Description: "The total number of ssp-operator pods reconciling with no errors", - Type: "Gauge", - }, - { - Name: "kubevirt_ssp_template_validator_rejected_increase", - Expr: intstr.FromString(TemplateValidatorRejectedIncreaseQuery + " OR on() vector(0)"), - Description: "The increase in the number of rejected template validators, over the last hour", - Type: "Gauge", - }, - { - Name: "kubevirt_ssp_common_templates_restored_increase", - Expr: intstr.FromString(CommonTemplatesRestoredIncreaseQuery + " OR on() vector(0)"), - Description: "The increase in the number of common templates restored by the operator back to their original state, over the last hour", - Type: "Gauge", - }, -} - -func RecordRules() []promv1.Rule { - result := make([]promv1.Rule, 0, len(recordRulesDescList)) - for _, rrd := range recordRulesDescList { - result = append(result, promv1.Rule{Record: rrd.Name, Expr: rrd.Expr}) +func SetupRules() error { + if err := recordingrules.Register(); err != nil { + return err } - return result -} -func RecordRulesWithDescriptions() []RecordRulesDesc { - result := make([]RecordRulesDesc, 0, len(recordRulesDescList)) - for _, rrd := range recordRulesDescList { - result = append(result, rrd) + if err := alerts.Register(); err != nil { + return err } - return result + + return nil } -func AlertRules(runbookURLTemplate string) []promv1.Rule { - return []promv1.Rule{ - { - Expr: intstr.FromString("sum(kubevirt_vmi_phase_count{phase=\"running\"}) by (node,os,workload,flavor,instance_type,preference)"), - Record: "cnv:vmi_status_running:count", +func BuildPrometheusRule(namespace string) (*promv1.PrometheusRule, error) { + return operatorrules.BuildPrometheusRule( + RuleName, + namespace, + map[string]string{ + PrometheusKey: PrometheusValue, + RoleLabelKey: RoleLabelValue, + KubevirtLabelKey: KubevirtLabelValue, + PrometheusLabelKey: PrometheusLabelValue, }, - { - Alert: "SSPDown", - Expr: intstr.FromString("kubevirt_ssp_operator_up == 0"), - For: ptr.To[promv1.Duration]("5m"), - Annotations: map[string]string{ - "summary": "All SSP operator pods are down.", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPDown"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "critical", - healthImpactAlertLabelKey: "critical", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "SSPTemplateValidatorDown", - Expr: intstr.FromString("kubevirt_ssp_template_validator_up == 0"), - For: ptr.To[promv1.Duration]("5m"), - Annotations: map[string]string{ - "summary": "All Template Validator pods are down.", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPTemplateValidatorDown"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "critical", - healthImpactAlertLabelKey: "critical", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "SSPFailingToReconcile", - Expr: intstr.FromString("(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)"), - For: ptr.To[promv1.Duration]("5m"), - Annotations: map[string]string{ - "summary": "The ssp-operator pod is up but failing to reconcile", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPFailingToReconcile"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "critical", - healthImpactAlertLabelKey: "critical", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "SSPHighRateRejectedVms", - Expr: intstr.FromString("kubevirt_ssp_template_validator_rejected_increase > 5"), - For: ptr.To[promv1.Duration]("5m"), - Annotations: map[string]string{ - "summary": "High rate of rejected Vms", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPHighRateRejectedVms"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "warning", - healthImpactAlertLabelKey: "warning", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "SSPCommonTemplatesModificationReverted", - Expr: intstr.FromString("kubevirt_ssp_common_templates_restored_increase > 0"), - For: ptr.To[promv1.Duration]("0m"), - Annotations: map[string]string{ - "summary": "Common Templates manual modifications were reverted by the operator", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "SSPCommonTemplatesModificationReverted"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "warning", - healthImpactAlertLabelKey: "none", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - { - Alert: "VMStorageClassWarning", - Expr: intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"), - Annotations: map[string]string{ - "summary": "{{ $value }} Virtual Machines may cause reports of bad crc/signature errors due to certain I/O patterns", - "description": "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', VMs may cause reports of bad crc/signature errors due to certain I/O patterns. Cluster performance can be severely degraded if the number of re-transmissions due to crc errors causes network saturation.", - "runbook_url": fmt.Sprintf(runbookURLTemplate, "VMStorageClassWarning"), - }, - Labels: map[string]string{ - severityAlertLabelKey: "warning", - healthImpactAlertLabelKey: "none", - partOfAlertLabelKey: partOfAlertLabelValue, - componentAlertLabelKey: componentAlertLabelValue, - }, - }, - } + ) +} + +func ListRecordingRules() []operatorrules.RecordingRule { + return operatorrules.ListRecordingRules() } diff --git a/tests/metrics_test.go b/tests/metrics_test.go index 308933212..84f7af05e 100644 --- a/tests/metrics_test.go +++ b/tests/metrics_test.go @@ -2,7 +2,6 @@ package tests import ( "fmt" - common_templates "kubevirt.io/ssp-operator/internal/operands/common-templates" "net/http" "reflect" "time" @@ -18,7 +17,9 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "kubevirt.io/ssp-operator/internal/common" + common_templates "kubevirt.io/ssp-operator/internal/operands/common-templates" "kubevirt.io/ssp-operator/internal/operands/metrics" + "kubevirt.io/ssp-operator/pkg/monitoring/rules" ) func mergeMaps(maps ...map[string]string) map[string]string { @@ -43,7 +44,7 @@ var _ = Describe("Metrics", func() { expectedLabels := expectedLabelsFor("metrics", common.AppComponentMonitoring) serviceMonitorRes = testResource{ - Name: metrics.PrometheusRuleName, + Name: rules.RuleName, Namespace: strategy.GetNamespace(), Resource: &promv1.ServiceMonitor{}, ExpectedLabels: mergeMaps(expectedLabels, metrics.ServiceMonitorLabels()), @@ -81,7 +82,7 @@ var _ = Describe("Metrics", func() { } prometheusRuleRes = testResource{ - Name: metrics.PrometheusRuleName, + Name: rules.RuleName, Namespace: strategy.GetNamespace(), Resource: &promv1.PrometheusRule{}, ExpectedLabels: expectedLabelsFor("metrics", common.AppComponentMonitoring), diff --git a/tests/monitoring_test.go b/tests/monitoring_test.go index c1c87be68..03efeeacb 100644 --- a/tests/monitoring_test.go +++ b/tests/monitoring_test.go @@ -30,7 +30,7 @@ import ( ssp "kubevirt.io/ssp-operator/api/v1beta2" "kubevirt.io/ssp-operator/internal/operands/metrics" - "kubevirt.io/ssp-operator/pkg/monitoring/rules" + "kubevirt.io/ssp-operator/pkg/monitoring/rules/recordingrules" "kubevirt.io/ssp-operator/tests/env" ) @@ -49,7 +49,7 @@ var _ = Describe("Prometheus Alerts", func() { }) It("[test_id:8363] Should fire SSPCommonTemplatesModificationReverted", func() { // we have to wait for prometheus to pick up the series before we increase it. - waitForSeriesToBeDetected(rules.CommonTemplatesRestoredIncreaseQuery) + waitForSeriesToBeDetected(recordingrules.CommonTemplatesRestoredIncreaseQuery) expectTemplateUpdateToIncreaseTotalRestoredTemplatesCount(testTemplate) waitForAlertToActivate("SSPCommonTemplatesModificationReverted") }) @@ -114,7 +114,7 @@ var _ = Describe("Prometheus Alerts", func() { }) It("[test_id:8377] Should fire SSPHighRateRejectedVms", func() { - waitForSeriesToBeDetected(rules.TemplateValidatorRejectedIncreaseQuery) + waitForSeriesToBeDetected(recordingrules.TemplateValidatorRejectedIncreaseQuery) Expect(apiClient.Create(ctx, template)).ToNot(HaveOccurred(), "Failed to create template: %s", template.Name) for range [6]int{} { time.Sleep(time.Second * 5) diff --git a/tools/metricsdocs/metricsdocs.go b/tools/metricsdocs/metricsdocs.go index 323b72368..86ae23e32 100644 --- a/tools/metricsdocs/metricsdocs.go +++ b/tools/metricsdocs/metricsdocs.go @@ -2,106 +2,54 @@ package main import ( "fmt" - "sort" - "strings" - "github.com/machadovilaca/operator-observability/pkg/operatormetrics" + "github.com/machadovilaca/operator-observability/pkg/docs" sspMetrics "kubevirt.io/ssp-operator/pkg/monitoring/metrics/ssp-operator" validatorMetrics "kubevirt.io/ssp-operator/pkg/monitoring/metrics/template-validator" "kubevirt.io/ssp-operator/pkg/monitoring/rules" ) -const ( - title = "# SSP Operator metrics\n" - background = "This document aims to help users that are not familiar with metrics exposed by the SSP Operator.\n" + - "All metrics documented here are auto-generated by the utility tool `tools/metricsdocs` and reflects exactly what is being exposed.\n\n" +const tpl = `# SSP Operator metrics - KVSpecificMetrics = "## SSP Operator Metrics List\n" +{{- range . }} - opening = title + - background + - KVSpecificMetrics +{{ $deprecatedVersion := "" -}} +{{- with index .ExtraFields "DeprecatedVersion" -}} + {{- $deprecatedVersion = printf " in %s" . -}} +{{- end -}} - footerHeading = "## Developing new metrics\n" - footerContent = "After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document.\n" +{{- $stabilityLevel := "" -}} +{{- if and (.ExtraFields.StabilityLevel) (ne .ExtraFields.StabilityLevel "STABLE") -}} + {{- $stabilityLevel = printf "[%s%s] " .ExtraFields.StabilityLevel $deprecatedVersion -}} +{{- end -}} - footer = footerHeading + footerContent -) - -func main() { - metricsList := recordRulesDescToMetricList(rules.RecordRulesWithDescriptions()) - - sspMetrics.SetupMetrics() - validatorMetrics.SetupMetrics() - - for _, m := range operatormetrics.ListMetrics() { - metricsList = append(metricsList, metric{ - name: m.GetOpts().Name, - description: m.GetOpts().Help, - mtype: strings.TrimSuffix(string(m.GetType()), "Vec"), - }) - } +### {{ .Name }} +{{ print $stabilityLevel }}{{ .Help }}. Type: {{ .Type -}}. - sort.Sort(metricsList) - printMetrics(metricsList) -} +{{- end }} -func printMetrics(metricsList metricList) { - fmt.Print(opening) - metricsList.writeOut() - fmt.Print(footer) -} +## Developing new metrics -type metric struct { - name string - description string - mtype string -} +All metrics documented here are auto-generated and reflect exactly what is being +exposed. After developing new metrics or changing old ones please regenerate +this document. +` -func recordRulesDescToMetricList(mdl []rules.RecordRulesDesc) metricList { - res := make([]metric, len(mdl)) - for i, md := range mdl { - res[i] = metricDescriptionToMetric(md) +func main() { + if err := sspMetrics.SetupMetrics(); err != nil { + panic(err) } - return res -} - -func metricDescriptionToMetric(rrd rules.RecordRulesDesc) metric { - return metric{ - name: rrd.Name, - description: rrd.Description, - mtype: rrd.Type, + if err := validatorMetrics.SetupMetrics(); err != nil { + panic(err) } -} - -func (m metric) writeOut() { - fmt.Println("###", m.name) - fmt.Println(m.description + ". Type: " + m.mtype + ".") -} -type metricList []metric - -var _ sort.Interface = metricList{} - -// Len implements sort.Interface.Len -func (m metricList) Len() int { - return len(m) -} + if err := rules.SetupRules(); err != nil { + panic(err) + } -// Less implements sort.Interface.Less -func (m metricList) Less(i, j int) bool { - return m[i].name < m[j].name -} + docsString := docs.BuildMetricsDocsWithCustomTemplate(sspMetrics.ListMetrics(), rules.ListRecordingRules(), tpl) -// Swap implements sort.Interface.Swap -func (m metricList) Swap(i, j int) { - m[i], m[j] = m[j], m[i] -} - -func (m metricList) writeOut() { - for _, met := range m { - met.writeOut() - } + fmt.Print(docsString) } diff --git a/tools/prom-metrics-collector/metrics_collector.go b/tools/prom-metrics-collector/metrics_collector.go deleted file mode 100644 index a0c852f58..000000000 --- a/tools/prom-metrics-collector/metrics_collector.go +++ /dev/null @@ -1,31 +0,0 @@ -package main - -import ( - parser "github.com/kubevirt/monitoring/pkg/metrics/parser" - dto "github.com/prometheus/client_model/go" - - "kubevirt.io/ssp-operator/pkg/monitoring/rules" -) - -// This should be used only for very rare cases where the naming conventions that are explained in the best practices: -// https://sdk.operatorframework.io/docs/best-practices/observability-best-practices/#metrics-guidelines -// should be ignored. -var excludedMetrics = map[string]struct{}{} - -func readMetrics() []*dto.MetricFamily { - var metricFamilies []*dto.MetricFamily - sspMetrics := rules.RecordRulesWithDescriptions() - - for _, metric := range sspMetrics { - if _, isExcludedMetric := excludedMetrics[metric.Name]; !isExcludedMetric { - mf := parser.CreateMetricFamily(parser.Metric{ - Name: metric.Name, - Help: metric.Description, - Type: metric.Type, - }) - metricFamilies = append(metricFamilies, mf) - } - } - - return metricFamilies -} diff --git a/tools/prom-metrics-collector/metrics_json_generator.go b/tools/prom-metrics-collector/metrics_json_generator.go index 6c998f73d..8b60c16a5 100644 --- a/tools/prom-metrics-collector/metrics_json_generator.go +++ b/tools/prom-metrics-collector/metrics_json_generator.go @@ -3,17 +3,60 @@ package main import ( "encoding/json" "fmt" - "os" + "strings" + + "github.com/kubevirt/monitoring/pkg/metrics/parser" + + sspMetrics "kubevirt.io/ssp-operator/pkg/monitoring/metrics/ssp-operator" + validatorMetrics "kubevirt.io/ssp-operator/pkg/monitoring/metrics/template-validator" + "kubevirt.io/ssp-operator/pkg/monitoring/rules" ) +// This should be used only for very rare cases where the naming conventions that are explained in the best practices: +// https://sdk.operatorframework.io/docs/best-practices/observability-best-practices/#metrics-guidelines +// should be ignored. +var excludedMetrics = map[string]struct{}{} + func main() { - metricFamilies := readMetrics() + if err := sspMetrics.SetupMetrics(); err != nil { + panic(err) + } + + if err := validatorMetrics.SetupMetrics(); err != nil { + panic(err) + } - jsonBytes, err := json.Marshal(metricFamilies) - if err != nil { - fmt.Println(err) - os.Exit(1) + if err := rules.SetupRules(); err != nil { + panic(err) } - fmt.Println(string(jsonBytes)) + var metricFamilies []parser.Metric + + metricsList := sspMetrics.ListMetrics() + for _, m := range metricsList { + if _, isExcludedMetric := excludedMetrics[m.GetOpts().Name]; !isExcludedMetric { + metricFamilies = append(metricFamilies, parser.Metric{ + Name: m.GetOpts().Name, + Help: m.GetOpts().Help, + Type: strings.ToUpper(string(m.GetBaseType())), + }) + } + } + + rulesList := rules.ListRecordingRules() + for _, r := range rulesList { + if _, isExcludedMetric := excludedMetrics[r.GetOpts().Name]; !isExcludedMetric { + metricFamilies = append(metricFamilies, parser.Metric{ + Name: r.GetOpts().Name, + Help: r.GetOpts().Help, + Type: strings.ToUpper(string(r.GetType())), + }) + } + } + + if jsonBytes, err := json.Marshal(metricFamilies); err != nil { + panic(err) + } else { + fmt.Println(string(jsonBytes)) + } } diff --git a/tools/test-rules-writer/test_rules_writer.go b/tools/test-rules-writer/test_rules_writer.go index f571a00df..c65b687a7 100644 --- a/tools/test-rules-writer/test_rules_writer.go +++ b/tools/test-rules-writer/test_rules_writer.go @@ -5,26 +5,22 @@ import ( "fmt" "os" - promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" - "kubevirt.io/ssp-operator/pkg/monitoring/rules" ) func main() { - const runbookTemplate = "test-runbook:%s" - - allRules := append(rules.RecordRules(), rules.AlertRules(runbookTemplate)...) + if err := rules.SetupRules(); err != nil { + panic(err) + } - spec := promv1.PrometheusRuleSpec{ - Groups: []promv1.RuleGroup{{ - Name: "test.rules", - Rules: allRules, - }}, + pr, err := rules.BuildPrometheusRule("testnamespace") + if err != nil { + panic(err) } encoder := json.NewEncoder(os.Stdout) encoder.SetIndent("", " ") - if err := encoder.Encode(spec); err != nil { + if err := encoder.Encode(pr.Spec); err != nil { fmt.Fprintf(os.Stderr, "Error encoding prometheus spec: %v", err) os.Exit(1) } diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/docs/alerts.go b/vendor/github.com/machadovilaca/operator-observability/pkg/docs/alerts.go new file mode 100644 index 000000000..e5d65f922 --- /dev/null +++ b/vendor/github.com/machadovilaca/operator-observability/pkg/docs/alerts.go @@ -0,0 +1,96 @@ +package docs + +import ( + "bytes" + "log" + "sort" + "text/template" + + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" +) + +const defaultAlertsTemplate = `# Operator Alerts + +{{- range . }} + +### {{.Name}} +**Summary:** {{ index .Annotations "summary" }}. + +**Description:** {{ index .Annotations "description" }}. + +**Severity:** {{ index .Labels "severity" }}. +{{- if .For }} + +**For:** {{ .For }}. +{{- end -}} +{{- end }} + +## Developing new alerts + +All alerts documented here are auto-generated and reflect exactly what is being +exposed. After developing new alerts or changing old ones please regenerate +this document. +` + +type alertDocs struct { + Name string + Expr string + For string + Annotations map[string]string + Labels map[string]string +} + +// BuildAlertsDocsWithCustomTemplate returns a string with the documentation +// for the given alerts, using the given template. +func BuildAlertsDocsWithCustomTemplate( + alerts []promv1.Rule, + tplString string, +) string { + + tpl, err := template.New("alerts").Parse(tplString) + if err != nil { + log.Fatalln(err) + } + + var allDocs []alertDocs + + if alerts != nil { + allDocs = append(allDocs, buildAlertsDocs(alerts)...) + } + + buf := bytes.NewBufferString("") + err = tpl.Execute(buf, allDocs) + if err != nil { + log.Fatalln(err) + } + + return buf.String() +} + +// BuildAlertsDocs returns a string with the documentation for the given +// metrics. +func BuildAlertsDocs(alerts []promv1.Rule) string { + return BuildAlertsDocsWithCustomTemplate(alerts, defaultAlertsTemplate) +} + +func buildAlertsDocs(alerts []promv1.Rule) []alertDocs { + alertsDocs := make([]alertDocs, len(alerts)) + for i, alert := range alerts { + alertsDocs[i] = alertDocs{ + Name: alert.Alert, + Expr: alert.Expr.String(), + For: string(*alert.For), + Annotations: alert.Annotations, + Labels: alert.Labels, + } + } + sortAlertsDocs(alertsDocs) + + return alertsDocs +} + +func sortAlertsDocs(alertsDocs []alertDocs) { + sort.Slice(alertsDocs, func(i, j int) bool { + return alertsDocs[i].Name < alertsDocs[j].Name + }) +} diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/docs/metrics.go b/vendor/github.com/machadovilaca/operator-observability/pkg/docs/metrics.go new file mode 100644 index 000000000..8271230bd --- /dev/null +++ b/vendor/github.com/machadovilaca/operator-observability/pkg/docs/metrics.go @@ -0,0 +1,106 @@ +package docs + +import ( + "bytes" + "log" + "sort" + "strings" + "text/template" + + "github.com/machadovilaca/operator-observability/pkg/operatormetrics" + "github.com/machadovilaca/operator-observability/pkg/operatorrules" +) + +const defaultMetricsTemplate = `# Operator Metrics + +{{- range . }} + +### {{.Name}} +{{.Help}}. + +Type: {{.Type}}. +{{- end }} + +## Developing new metrics + +All metrics documented here are auto-generated and reflect exactly what is being +exposed. After developing new metrics or changing old ones please regenerate +this document. +` + +type metricDocs struct { + Name string + Help string + Type string + ExtraFields map[string]string +} + +type docOptions interface { + GetOpts() operatormetrics.MetricOpts + GetType() operatormetrics.MetricType +} + +// BuildMetricsDocsWithCustomTemplate returns a string with the documentation +// for the given metrics, using the given template. +func BuildMetricsDocsWithCustomTemplate( + metrics []operatormetrics.Metric, + recordingRules []operatorrules.RecordingRule, + tplString string, +) string { + + tpl, err := template.New("metrics").Parse(tplString) + if err != nil { + log.Fatalln(err) + } + + var allDocs []metricDocs + + if metrics != nil { + allDocs = append(allDocs, buildMetricsDocs(metrics)...) + } + + if recordingRules != nil { + allDocs = append(allDocs, buildMetricsDocs(recordingRules)...) + } + + sortMetricsDocs(allDocs) + + buf := bytes.NewBufferString("") + err = tpl.Execute(buf, allDocs) + if err != nil { + log.Fatalln(err) + } + + return buf.String() +} + +// BuildMetricsDocs returns a string with the documentation for the given +// metrics. +func BuildMetricsDocs(metrics []operatormetrics.Metric, recordingRules []operatorrules.RecordingRule) string { + return BuildMetricsDocsWithCustomTemplate(metrics, recordingRules, defaultMetricsTemplate) +} + +func buildMetricsDocs[T docOptions](items []T) []metricDocs { + metricsDocs := make([]metricDocs, len(items)) + for i, metric := range items { + metricOpts := metric.GetOpts() + metricsDocs[i] = metricDocs{ + Name: metricOpts.Name, + Help: metricOpts.Help, + Type: getAndConvertMetricType(metric.GetType()), + ExtraFields: metricOpts.ExtraFields, + } + } + + return metricsDocs +} + +func sortMetricsDocs(metricsDocs []metricDocs) { + sort.Slice(metricsDocs, func(i, j int) bool { + return metricsDocs[i].Name < metricsDocs[j].Name + }) +} + +func getAndConvertMetricType(metricType operatormetrics.MetricType) string { + return strings.ReplaceAll(string(metricType), "Vec", "") +} diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/prometheusrules.go b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/prometheusrules.go new file mode 100644 index 000000000..97b55d077 --- /dev/null +++ b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/prometheusrules.go @@ -0,0 +1,73 @@ +package operatorrules + +import ( + "cmp" + "fmt" + "slices" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" +) + +// BuildPrometheusRule builds a PrometheusRule object from the registered recording rules and alerts. +func BuildPrometheusRule(name, namespace string, labels map[string]string) (*promv1.PrometheusRule, error) { + spec, err := buildPrometheusRuleSpec() + if err != nil { + return nil, err + } + + return &promv1.PrometheusRule{ + TypeMeta: metav1.TypeMeta{ + APIVersion: promv1.SchemeGroupVersion.String(), + Kind: promv1.PrometheusRuleKind, + }, + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Labels: labels, + }, + Spec: *spec, + }, nil +} + +func buildPrometheusRuleSpec() (*promv1.PrometheusRuleSpec, error) { + var groups []promv1.RuleGroup + + if len(operatorRegistry.registeredRecordingRules) != 0 { + groups = append(groups, promv1.RuleGroup{ + Name: "recordingRules.rules", + Rules: buildRecordingRulesRules(), + }) + } + + if len(operatorRegistry.registeredAlerts) != 0 { + groups = append(groups, promv1.RuleGroup{ + Name: "alerts.rules", + Rules: ListAlerts(), + }) + } + + if len(groups) == 0 { + return nil, fmt.Errorf("no registered recording rule or alert") + } + + return &promv1.PrometheusRuleSpec{Groups: groups}, nil +} + +func buildRecordingRulesRules() []promv1.Rule { + var rules []promv1.Rule + + for _, recordingRule := range operatorRegistry.registeredRecordingRules { + rules = append(rules, promv1.Rule{ + Record: recordingRule.MetricsOpts.Name, + Expr: recordingRule.Expr, + }) + } + + slices.SortFunc(rules, func(a, b promv1.Rule) int { + return cmp.Compare(a.Record, b.Record) + }) + + return rules +} diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/rbac.go b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/rbac.go new file mode 100644 index 000000000..da582ceec --- /dev/null +++ b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/rbac.go @@ -0,0 +1,45 @@ +package operatorrules + +import ( + rbacv1 "k8s.io/api/rbac/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func BuildRoleAndRoleBinding(namePrefix, namespace, promSAName, promSANamespace string, labels map[string]string) (*rbacv1.Role, *rbacv1.RoleBinding) { + r := &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{ + Name: namePrefix + "-role", + Namespace: namespace, + Labels: labels, + }, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{""}, + Resources: []string{"services", "endpoints", "pods"}, + Verbs: []string{"get", "list"}, + }, + }, + } + + rb := &rbacv1.RoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: namePrefix + "-rolebinding", + Namespace: namespace, + Labels: labels, + }, + RoleRef: rbacv1.RoleRef{ + Kind: "Role", + Name: namePrefix + "-role", + APIGroup: rbacv1.GroupName, + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + Name: promSAName, + Namespace: promSANamespace, + }, + }, + } + + return r, rb +} diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/recordingrule.go b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/recordingrule.go new file mode 100644 index 000000000..8dfc50c3e --- /dev/null +++ b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/recordingrule.go @@ -0,0 +1,24 @@ +package operatorrules + +import ( + "k8s.io/apimachinery/pkg/util/intstr" + + "github.com/machadovilaca/operator-observability/pkg/operatormetrics" +) + +// RecordingRule is a struct that represents a Prometheus recording rule. +type RecordingRule struct { + MetricsOpts operatormetrics.MetricOpts + MetricType operatormetrics.MetricType + Expr intstr.IntOrString +} + +// GetOpts returns the metric options of the recording rule. +func (c RecordingRule) GetOpts() operatormetrics.MetricOpts { + return c.MetricsOpts +} + +// GetType returns the metric type of the recording rule. +func (c RecordingRule) GetType() operatormetrics.MetricType { + return c.MetricType +} diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/registry.go b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/registry.go new file mode 100644 index 000000000..cc72af21d --- /dev/null +++ b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/registry.go @@ -0,0 +1,78 @@ +package operatorrules + +import ( + "cmp" + "slices" + + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" +) + +var operatorRegistry = newRegistry() + +type operatorRegisterer struct { + registeredRecordingRules map[string]RecordingRule + registeredAlerts map[string]promv1.Rule +} + +func newRegistry() operatorRegisterer { + return operatorRegisterer{ + registeredRecordingRules: map[string]RecordingRule{}, + registeredAlerts: map[string]promv1.Rule{}, + } +} + +// RegisterRecordingRules registers the given recording rules. +func RegisterRecordingRules(recordingRules ...[]RecordingRule) error { + for _, recordingRuleList := range recordingRules { + for _, recordingRule := range recordingRuleList { + operatorRegistry.registeredRecordingRules[recordingRule.MetricsOpts.Name] = recordingRule + } + } + + return nil +} + +// RegisterAlerts registers the given alerts. +func RegisterAlerts(alerts ...[]promv1.Rule) error { + for _, alertList := range alerts { + for _, alert := range alertList { + operatorRegistry.registeredAlerts[alert.Alert] = alert + } + } + + return nil +} + +// ListRecordingRules returns the registered recording rules. +func ListRecordingRules() []RecordingRule { + var rules []RecordingRule + for _, rule := range operatorRegistry.registeredRecordingRules { + rules = append(rules, rule) + } + + slices.SortFunc(rules, func(a, b RecordingRule) int { + return cmp.Compare(a.GetOpts().Name, b.GetOpts().Name) + }) + + return rules +} + +// ListAlerts returns the registered alerts. +func ListAlerts() []promv1.Rule { + var alerts []promv1.Rule + for _, alert := range operatorRegistry.registeredAlerts { + alerts = append(alerts, alert) + } + + slices.SortFunc(alerts, func(a, b promv1.Rule) int { + return cmp.Compare(a.Alert, b.Alert) + }) + + return alerts +} + +// CleanRegistry removes all registered rules and alerts. +func CleanRegistry() error { + operatorRegistry = newRegistry() + return nil +} diff --git a/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/schema.go b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/schema.go new file mode 100644 index 000000000..c06a032d7 --- /dev/null +++ b/vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules/schema.go @@ -0,0 +1,22 @@ +package operatorrules + +import ( + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/runtime" + + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" +) + +func AddToScheme(scheme *runtime.Scheme) error { + err := promv1.AddToScheme(scheme) + if err != nil { + return err + } + + err = rbacv1.AddToScheme(scheme) + if err != nil { + return err + } + + return nil +} diff --git a/vendor/modules.txt b/vendor/modules.txt index ad9ba1ca0..31ed6705b 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -182,7 +182,9 @@ github.com/json-iterator/go github.com/kubevirt/monitoring/pkg/metrics/parser # github.com/machadovilaca/operator-observability v0.0.13 ## explicit; go 1.21 +github.com/machadovilaca/operator-observability/pkg/docs github.com/machadovilaca/operator-observability/pkg/operatormetrics +github.com/machadovilaca/operator-observability/pkg/operatorrules # github.com/mailru/easyjson v0.7.7 ## explicit; go 1.12 github.com/mailru/easyjson/buffer