Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor recording rules and alerts code #870

Merged
merged 1 commit into from
Feb 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions docs/metrics.md
Original file line number Diff line number Diff line change
@@ -1,25 +1,34 @@
# SSP Operator metrics
This document aims to help users that are not familiar with metrics exposed by the SSP Operator.
All metrics documented here are auto-generated by the utility tool `tools/metricsdocs` and reflect exactly what is being exposed.

## SSP Operator Metrics List
### kubevirt_ssp_common_templates_restored_increase
The increase in the number of common templates restored by the operator back to their original state, over the last hour. Type: Gauge.

### kubevirt_ssp_common_templates_restored_total
The total number of common templates restored by the operator back to their original state. Type: Counter.

### kubevirt_ssp_operator_reconcile_succeeded
Set to 1 if the reconcile process of all operands completes with no errors, and to 0 otherwise. Type: Gauge.

### kubevirt_ssp_operator_reconcile_succeeded_aggregated
The total number of ssp-operator pods reconciling with no errors. Type: Gauge.

### kubevirt_ssp_operator_up
The total number of running ssp-operator pods. Type: Gauge.

### kubevirt_ssp_template_validator_rejected_increase
The increase in the number of rejected template validators, over the last hour. Type: Gauge.

### kubevirt_ssp_template_validator_rejected_total
The total number of rejected template validators. Type: Counter.

### kubevirt_ssp_template_validator_up
The total number of running virt-template-validator pods. Type: Gauge.

### kubevirt_ssp_vm_rbd_block_volume_without_rxbounce
VM with RBD mounted Block volume (without rxbounce option set). Type: Gauge.
[ALPHA] VM with RBD mounted Block volume (without rxbounce option set). Type: Gauge.

## Developing new metrics
After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document.

All metrics documented here are auto-generated and reflect exactly what is being
exposed. After developing new metrics or changing old ones, please regenerate
this document.
7 changes: 6 additions & 1 deletion internal/operands/metrics/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"kubevirt.io/ssp-operator/internal/common"
"kubevirt.io/ssp-operator/internal/operands"
"kubevirt.io/ssp-operator/pkg/monitoring/rules"
)

// Define RBAC rules needed by this operand:
Expand Down Expand Up @@ -96,7 +97,11 @@ func reconcileMonitoringRbacRoleBinding(request *common.Request) (common.Reconci
}

func reconcilePrometheusRule(request *common.Request) (common.ReconcileResult, error) {
prometheusRule, err := newPrometheusRule(request.Namespace)
if err := rules.SetupRules(); err != nil {
return common.ReconcileResult{}, err
}

prometheusRule, err := rules.BuildPrometheusRule(request.Namespace)
if err != nil {
return common.ReconcileResult{}, err
}
Expand Down
10 changes: 8 additions & 2 deletions internal/operands/metrics/reconcile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (

ssp "kubevirt.io/ssp-operator/api/v1beta2"
"kubevirt.io/ssp-operator/internal/common"
"kubevirt.io/ssp-operator/pkg/monitoring/rules"
)

var log = logf.Log.WithName("metrics_operand")
Expand All @@ -34,6 +35,8 @@ var _ = Describe("Metrics operand", func() {
)

BeforeEach(func() {
Expect(rules.SetupRules()).To(Succeed())

client := fake.NewClientBuilder().WithScheme(common.Scheme).Build()
request = common.Request{
Request: reconcile.Request{
Expand Down Expand Up @@ -67,7 +70,7 @@ var _ = Describe("Metrics operand", func() {
_, err := operand.Reconcile(&request)
Expect(err).ToNot(HaveOccurred())

prometheusRule, err := newPrometheusRule(namespace)
prometheusRule, err := rules.BuildPrometheusRule(namespace)
Expect(err).ToNot(HaveOccurred())

ExpectResourceExists(prometheusRule, request)
Expand All @@ -82,7 +85,7 @@ var _ = Describe("Metrics operand", func() {
os.Setenv(runbookURLTemplateEnv, template)
}

prometheusRule, err := newPrometheusRule(namespace)
err := rules.SetupRules()

if strings.Count(template, "%s") != 1 || strings.Count(template, "%") != 1 {
Expect(err).To(HaveOccurred())
Expand All @@ -91,6 +94,9 @@ var _ = Describe("Metrics operand", func() {

Expect(err).ToNot(HaveOccurred())

prometheusRule, err := rules.BuildPrometheusRule(namespace)
Expect(err).ToNot(HaveOccurred())

for _, group := range prometheusRule.Spec.Groups {
for _, rule := range group.Rules {
if rule.Alert != "" {
Expand Down
48 changes: 1 addition & 47 deletions internal/operands/metrics/resources.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
package metrics

import (
"errors"
"os"
"strings"

promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
rbac "k8s.io/api/rbac/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -13,7 +9,6 @@ import (
)

const (
PrometheusRuleName = "prometheus-k8s-rules-cnv"
MonitorNamespace = "openshift-monitoring"
defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s"
runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE"
Expand Down Expand Up @@ -69,7 +64,7 @@ func newServiceMonitorCR(namespace string) *promv1.ServiceMonitor {
return &promv1.ServiceMonitor{
ObjectMeta: metav1.ObjectMeta{
Namespace: namespace,
Name: PrometheusRuleName,
Name: rules.RuleName,
Labels: ServiceMonitorLabels(),
},
Spec: promv1.ServiceMonitorSpec{
Expand All @@ -96,44 +91,3 @@ func newServiceMonitorCR(namespace string) *promv1.ServiceMonitor {
},
}
}

// newPrometheusRule assembles the PrometheusRule object for the given
// namespace. The object carries a single rule group named "cnv.rules" that
// holds the recording rules followed by the alert rules.
//
// An error is returned, together with a nil rule, when the runbook URL
// template cannot be resolved by getRunbookURLTemplate.
func newPrometheusRule(namespace string) (*promv1.PrometheusRule, error) {
	urlTemplate, err := getRunbookURLTemplate()
	if err != nil {
		return nil, err
	}

	meta := metav1.ObjectMeta{
		Name:      PrometheusRuleName,
		Namespace: namespace,
		Labels: map[string]string{
			"prometheus":       "k8s",
			"role":             "alert-rules",
			"kubevirt.io":      "prometheus-rules",
			PrometheusLabelKey: PrometheusLabelValue,
		},
	}

	// Recording rules come first; the alert rules are appended after them.
	groupRules := append(rules.RecordRules(), rules.AlertRules(urlTemplate)...)
	cnvGroup := promv1.RuleGroup{
		Name:  "cnv.rules",
		Rules: groupRules,
	}

	prometheusRule := &promv1.PrometheusRule{
		ObjectMeta: meta,
		Spec: promv1.PrometheusRuleSpec{
			Groups: []promv1.RuleGroup{cnvGroup},
		},
	}
	return prometheusRule, nil
}

func getRunbookURLTemplate() (string, error) {
runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv)
if !exists {
runbookURLTemplate = defaultRunbookURLTemplate
}

if strings.Count(runbookURLTemplate, "%s") != 1 || strings.Count(runbookURLTemplate, "%") != 1 {
return "", errors.New("runbook URL template must have exactly 1 %s substring")
}

return runbookURLTemplate, nil
}
6 changes: 5 additions & 1 deletion internal/template-validator/validator/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,11 @@ func (app *App) Run() {
registerReadinessProbe()

// setup monitoring
validatorMetrics.SetupMetrics()
err = validatorMetrics.SetupMetrics()
if err != nil {
logger.Log.Error(err, "Error setting up metrics")
avlitman marked this conversation as resolved.
Show resolved Hide resolved
panic(err)
avlitman marked this conversation as resolved.
Show resolved Hide resolved
}

logger.Log.Info("TLS certs directory", "directory", app.TLSInfo.CertsDirectory)

Expand Down
15 changes: 11 additions & 4 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ import (
"kubevirt.io/ssp-operator/controllers"
"kubevirt.io/ssp-operator/internal/common"
sspMetrics "kubevirt.io/ssp-operator/pkg/monitoring/metrics/ssp-operator"
"kubevirt.io/ssp-operator/pkg/monitoring/rules"
"kubevirt.io/ssp-operator/webhooks"
// +kubebuilder:scaffold:imports
)
Expand Down Expand Up @@ -181,15 +182,21 @@ func (s *prometheusServer) getPrometheusTLSConfig(ctx context.Context, certWatch
}
}

func newPrometheusServer(metricsAddr string, cache cache.Cache) *prometheusServer {
sspMetrics.SetupMetrics()
avlitman marked this conversation as resolved.
Show resolved Hide resolved
func newPrometheusServer(metricsAddr string, cache cache.Cache) (*prometheusServer, error) {
if err := sspMetrics.SetupMetrics(); err != nil {
return nil, err
}

if err := rules.SetupRules(); err != nil {
return nil, err
}

return &prometheusServer{
certPath: path.Join(sdkTLSDir, sdkTLSCrt),
keyPath: path.Join(sdkTLSDir, sdkTLSKey),
cache: cache,
serverAddress: metricsAddr,
}
}, nil
}

func main() {
Expand Down Expand Up @@ -249,7 +256,7 @@ func main() {
}
}

metricsServer := newPrometheusServer(metricsAddr, mgr.GetCache())
metricsServer, err := newPrometheusServer(metricsAddr, mgr.GetCache())
if err != nil {
setupLog.Error(err, "unable create Prometheus server")
os.Exit(1)
Expand Down
13 changes: 8 additions & 5 deletions pkg/monitoring/metrics/ssp-operator/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,17 @@ import (
runtimemetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

func SetupMetrics() {
func SetupMetrics() error {
operatormetrics.Register = runtimemetrics.Registry.Register

if err := operatormetrics.RegisterMetrics(
return operatormetrics.RegisterMetrics(
operatorMetrics,
rbdMetrics,
templateMetrics,
); err != nil {
panic(err)
}
)
}

// ListMetrics returns the metrics reported by operatormetrics.ListMetrics.
// NOTE(review): presumably these are the metrics registered through
// SetupMetrics — confirm against the operatormetrics package.
func ListMetrics() []operatormetrics.Metric {
return operatormetrics.ListMetrics()
}
13 changes: 8 additions & 5 deletions pkg/monitoring/metrics/template-validator/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@ import (
"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
)

func SetupMetrics() {
if err := operatormetrics.RegisterMetrics(
func SetupMetrics() error {
return operatormetrics.RegisterMetrics(
templateMetrics,
); err != nil {
panic(err)
}
)
}

// ListMetrics returns the metrics reported by operatormetrics.ListMetrics.
// NOTE(review): presumably these are the metrics registered through
// SetupMetrics — confirm against the operatormetrics package.
func ListMetrics() []operatormetrics.Metric {
return operatormetrics.ListMetrics()
}
89 changes: 89 additions & 0 deletions pkg/monitoring/rules/alerts/operator.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package alerts

import (
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
)

// Label keys attached to every alert rule defined in this file.
const (
// severityAlertLabelKey is the label key holding the alert's severity;
// the alerts in this file set it to "critical" or "warning".
severityAlertLabelKey = "severity"
// healthImpactAlertLabelKey is the label key holding the alert's impact on
// operator health; the alerts in this file set it to "critical", "warning"
// or "none".
healthImpactAlertLabelKey = "operator_health_impact"
)

// operatorAlerts returns the alert rules defined for the SSP operator and the
// template validator. Each call builds new label maps, annotation maps and
// duration pointers, so no map or pointer is shared between rules or calls.
func operatorAlerts() []promv1.Rule {
	// alertLabels builds the severity / health-impact label pair of one alert.
	alertLabels := func(severity, healthImpact string) map[string]string {
		return map[string]string{
			severityAlertLabelKey:     severity,
			healthImpactAlertLabelKey: healthImpact,
		}
	}

	// summaryOnly builds an annotation map that carries just a summary.
	summaryOnly := func(text string) map[string]string {
		return map[string]string{"summary": text}
	}

	// pendingFor returns a new pointer for a rule's "for" duration.
	pendingFor := func(d promv1.Duration) *promv1.Duration {
		return ptr.To(d)
	}

	sspDown := promv1.Rule{
		Alert:       "SSPDown",
		Expr:        intstr.FromString("kubevirt_ssp_operator_up == 0"),
		For:         pendingFor("5m"),
		Annotations: summaryOnly("All SSP operator pods are down."),
		Labels:      alertLabels("critical", "critical"),
	}

	validatorDown := promv1.Rule{
		Alert:       "SSPTemplateValidatorDown",
		Expr:        intstr.FromString("kubevirt_ssp_template_validator_up == 0"),
		For:         pendingFor("5m"),
		Annotations: summaryOnly("All Template Validator pods are down."),
		Labels:      alertLabels("critical", "critical"),
	}

	failingToReconcile := promv1.Rule{
		Alert:       "SSPFailingToReconcile",
		Expr:        intstr.FromString("(kubevirt_ssp_operator_reconcile_succeeded_aggregated == 0) and (kubevirt_ssp_operator_up > 0)"),
		For:         pendingFor("5m"),
		Annotations: summaryOnly("The ssp-operator pod is up but failing to reconcile."),
		Labels:      alertLabels("critical", "critical"),
	}

	highRateRejectedVms := promv1.Rule{
		Alert:       "SSPHighRateRejectedVms",
		Expr:        intstr.FromString("kubevirt_ssp_template_validator_rejected_increase > 5"),
		For:         pendingFor("5m"),
		Annotations: summaryOnly("High rate of rejected Vms."),
		Labels:      alertLabels("warning", "warning"),
	}

	templatesReverted := promv1.Rule{
		Alert:       "SSPCommonTemplatesModificationReverted",
		Expr:        intstr.FromString("kubevirt_ssp_common_templates_restored_increase > 0"),
		For:         pendingFor("0m"),
		Annotations: summaryOnly("Common Templates manual modifications were reverted by the operator."),
		Labels:      alertLabels("warning", "none"),
	}

	// This rule sets no "for" duration and carries a description in addition
	// to the summary.
	storageClassWarning := promv1.Rule{
		Alert: "VMStorageClassWarning",
		Expr:  intstr.FromString("(count(kubevirt_ssp_vm_rbd_block_volume_without_rxbounce > 0) or vector(0)) > 0"),
		Annotations: map[string]string{
			"summary":     "{{ $value }} Virtual Machines may cause reports of bad crc/signature errors due to certain I/O patterns.",
			"description": "When running VMs using ODF storage with 'rbd' mounter or 'rbd.csi.ceph.com provisioner', VMs may cause reports of bad crc/signature errors due to certain I/O patterns. Cluster performance can be severely degraded if the number of re-transmissions due to crc errors causes network saturation.",
		},
		Labels: alertLabels("warning", "none"),
	}

	return []promv1.Rule{
		sspDown,
		validatorDown,
		failingToReconcile,
		highRateRejectedVms,
		templatesReverted,
		storageClassWarning,
	}
}
Loading