From 2150cddfd445713fa54af0524bfec03a0e500e49 Mon Sep 17 00:00:00 2001 From: Serge Logvinov Date: Sun, 5 May 2024 19:44:32 +0300 Subject: [PATCH] feat: expose metrics Expose API call metrics and node CSR approve statistics. Signed-off-by: Serge Logvinov --- Makefile | 2 +- .../talos-cloud-controller-manager/Chart.yaml | 2 +- .../templates/deployment.yaml | 1 + .../values.yaml | 3 + docs/metrics.md | 62 +++++++++++++++++++ pkg/metrics/metrics.go | 20 ++++++ pkg/metrics/metrics_api.go | 51 +++++++++++++++ pkg/metrics/metrics_csr.go | 44 +++++++++++++ pkg/talos/helper.go | 5 ++ pkg/talos/instances.go | 9 ++- 10 files changed, 195 insertions(+), 4 deletions(-) create mode 100644 docs/metrics.md create mode 100644 pkg/metrics/metrics.go create mode 100644 pkg/metrics/metrics_api.go create mode 100644 pkg/metrics/metrics_csr.go diff --git a/Makefile b/Makefile index d790cdf..04de7aa 100644 --- a/Makefile +++ b/Makefile @@ -73,7 +73,7 @@ build: ## Build .PHONY: run run: build ./talos-cloud-controller-manager-$(ARCH) --v=5 --kubeconfig=kubeconfig --cloud-config=hack/ccm-config.yaml --controllers=cloud-node \ - --use-service-account-credentials --leader-elect=false --bind-address=127.0.0.1 + --use-service-account-credentials --leader-elect=false --bind-address=127.0.0.1 --authorization-always-allow-paths=/healthz,/livez,/readyz,/metrics .PHONY: lint lint: ## Lint Code diff --git a/charts/talos-cloud-controller-manager/Chart.yaml b/charts/talos-cloud-controller-manager/Chart.yaml index 200a267..9eaf4d9 100644 --- a/charts/talos-cloud-controller-manager/Chart.yaml +++ b/charts/talos-cloud-controller-manager/Chart.yaml @@ -11,5 +11,5 @@ keywords: maintainers: - name: sergelogvinov url: https://github.com/sergelogvinov -version: 0.3.0 +version: 0.3.1 appVersion: "v1.6.0" diff --git a/charts/talos-cloud-controller-manager/templates/deployment.yaml b/charts/talos-cloud-controller-manager/templates/deployment.yaml index 4765fcf..8fa50be 100644 --- 
a/charts/talos-cloud-controller-manager/templates/deployment.yaml +++ b/charts/talos-cloud-controller-manager/templates/deployment.yaml @@ -56,6 +56,7 @@ spec: - --leader-elect-resource-name=cloud-controller-manager-talos - --use-service-account-credentials - --secure-port={{ .Values.service.containerPort }} + - --authorization-always-allow-paths=/healthz,/livez,/readyz,/metrics {{- with .Values.extraArgs }} {{- toYaml . | nindent 12 }} {{- end }} diff --git a/charts/talos-cloud-controller-manager/values.yaml b/charts/talos-cloud-controller-manager/values.yaml index 7150b91..89eba57 100644 --- a/charts/talos-cloud-controller-manager/values.yaml +++ b/charts/talos-cloud-controller-manager/values.yaml @@ -105,6 +105,9 @@ service: containerPort: 50258 # -- Additional custom annotations for Service. annotations: {} + # prometheus.io/scrape: "true" + # prometheus.io/scheme: "https" + # prometheus.io/port: "50258" # -- Resource requests and limits. # ref: http://kubernetes.io/docs/user-guide/compute-resources/ diff --git a/docs/metrics.md b/docs/metrics.md new file mode 100644 index 0000000..4ef8621 --- /dev/null +++ b/docs/metrics.md @@ -0,0 +1,62 @@ +# Metrics documentation + +This document is a reflection of the current state of the exposed metrics of the Talos CCM. + +## Gather metrics from talos-cloud-controller-manager + +By default, the Talos CCM exposes metrics on the `https://localhost:50258/metrics` endpoint. + +Enabling the metrics is done by setting the `--secure-port` and the `--authorization-always-allow-paths` flags to allow access to the `/metrics` endpoint. + +```yaml +talos-cloud-controller-manager + --authorization-always-allow-paths="/metrics" + --secure-port=50258 +``` + +### Helm chart values + +The following values can be set in the Helm chart to expose the metrics of the Talos CCM. 
+ +```yaml +service: + containerPort: 50258 + annotations: + prometheus.io/scrape: "true" + prometheus.io/scheme: "https" + prometheus.io/port: "50258" +``` + +## Metrics exposed by the CCM + +### Talos API calls + +|Metric name|Metric type|Labels/tags| +|-----------|-----------|-----------| +|talosccm_api_request_duration_seconds|Histogram|`request`=<api_request>| +|talosccm_api_request_errors_total|Counter|`request`=<api_request>| + +```txt +talosccm_api_request_duration_seconds_bucket{request="addresses",le="0.1"} 10 +talosccm_api_request_duration_seconds_bucket{request="addresses",le="0.25"} 16 +talosccm_api_request_duration_seconds_bucket{request="addresses",le="0.5"} 16 +talosccm_api_request_duration_seconds_bucket{request="addresses",le="1"} 16 +talosccm_api_request_duration_seconds_bucket{request="addresses",le="2.5"} 16 +talosccm_api_request_duration_seconds_bucket{request="addresses",le="5"} 16 +talosccm_api_request_duration_seconds_bucket{request="addresses",le="10"} 16 +talosccm_api_request_duration_seconds_bucket{request="addresses",le="30"} 16 +talosccm_api_request_duration_seconds_bucket{request="addresses",le="+Inf"} 16 +talosccm_api_request_duration_seconds_sum{request="addresses"} 1.369387789 +talosccm_api_request_duration_seconds_count{request="addresses"} 16 +talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="0.1"} 14 +talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="0.25"} 16 +talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="0.5"} 16 +talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="1"} 16 +talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="2.5"} 16 +talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="5"} 16 +talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="10"} 16 +talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="30"} 16 
+talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="+Inf"} 16 +talosccm_api_request_duration_seconds_sum{request="platformmetadata"} 1.2046141220000002 +talosccm_api_request_duration_seconds_count{request="platformmetadata"} 16 +``` diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go new file mode 100644 index 0000000..e01ded0 --- /dev/null +++ b/pkg/metrics/metrics.go @@ -0,0 +1,20 @@ +// Package metrics collects metrics. +package metrics + +import ( + "time" +) + +// MetricContext indicates the context for Talos client metrics. +type MetricContext struct { + start time.Time + attributes []string +} + +// NewMetricContext creates a new MetricContext. +func NewMetricContext(resource string) *MetricContext { + return &MetricContext{ + start: time.Now(), + attributes: []string{resource}, + } +} diff --git a/pkg/metrics/metrics_api.go b/pkg/metrics/metrics_api.go new file mode 100644 index 0000000..1c531e4 --- /dev/null +++ b/pkg/metrics/metrics_api.go @@ -0,0 +1,51 @@ +package metrics + +import ( + "time" + + "k8s.io/component-base/metrics" + "k8s.io/component-base/metrics/legacyregistry" +) + +// TalosMetrics contains the metrics for Talos API calls. +type TalosMetrics struct { + Duration *metrics.HistogramVec + Errors *metrics.CounterVec +} + +var apiMetrics = registerAPIMetrics() + +// ObserveRequest records the request latency and counts the errors. 
+func (mc *MetricContext) ObserveRequest(err error) error { + apiMetrics.Duration.WithLabelValues(mc.attributes...).Observe( + time.Since(mc.start).Seconds()) + + if err != nil { + apiMetrics.Errors.WithLabelValues(mc.attributes...).Inc() + } + + return err +} + +func registerAPIMetrics() *TalosMetrics { + metrics := &TalosMetrics{ + Duration: metrics.NewHistogramVec( + &metrics.HistogramOpts{ + Name: "talosccm_api_request_duration_seconds", + Help: "Latency of an Talos API call", + Buckets: []float64{.1, .25, .5, 1, 2.5, 5, 10, 30}, + }, []string{"request"}), + Errors: metrics.NewCounterVec( + &metrics.CounterOpts{ + Name: "talosccm_api_request_errors_total", + Help: "Total number of errors for an Talos API call", + }, []string{"request"}), + } + + legacyregistry.MustRegister( + metrics.Duration, + metrics.Errors, + ) + + return metrics +} diff --git a/pkg/metrics/metrics_csr.go b/pkg/metrics/metrics_csr.go new file mode 100644 index 0000000..3bb4c5e --- /dev/null +++ b/pkg/metrics/metrics_csr.go @@ -0,0 +1,44 @@ +package metrics + +import ( + "k8s.io/component-base/metrics" + "k8s.io/component-base/metrics/legacyregistry" +) + +// CSRMetrics contains the metrics for certificate signing requests. +type CSRMetrics struct { + approvalCount *metrics.CounterVec +} + +// CSRApprovalStatus is the status of a CSR. +type CSRApprovalStatus string + +const ( + // ApprovalStatusDeny is used when a CSR is denied. + ApprovalStatusDeny CSRApprovalStatus = "deny" + // ApprovalStatusApprove is used when a CSR is approved. + ApprovalStatusApprove CSRApprovalStatus = "approve" +) + +var csrMetrics = registerCSRMetrics() + +// CSRApprovedCount counts the number of approved, denied and ignored CSRs. 
+func CSRApprovedCount(status CSRApprovalStatus) { + csrMetrics.approvalCount.WithLabelValues(string(status)).Inc() +} + +func registerCSRMetrics() *CSRMetrics { + metrics := &CSRMetrics{ + approvalCount: metrics.NewCounterVec( + &metrics.CounterOpts{ + Name: "talosccm_csr_approval_count", + Help: "Count of approved, denied and ignored node CSRs", + }, []string{"status"}), + } + + legacyregistry.MustRegister( + metrics.approvalCount, + ) + + return metrics +} diff --git a/pkg/talos/helper.go b/pkg/talos/helper.go index 368d904..6a60cf4 100644 --- a/pkg/talos/helper.go +++ b/pkg/talos/helper.go @@ -7,6 +7,7 @@ import ( "fmt" "strings" + "github.com/siderolabs/talos-cloud-controller-manager/pkg/metrics" "github.com/siderolabs/talos-cloud-controller-manager/pkg/transformer" utilsnet "github.com/siderolabs/talos-cloud-controller-manager/pkg/utils/net" "github.com/siderolabs/talos/pkg/machinery/resources/network" @@ -193,11 +194,15 @@ func csrNodeChecks(ctx context.Context, kclient clientkubernetes.Interface, x509 for _, ip := range x509cr.IPAddresses { if !slices.Contains(nodeAddrs, ip.String()) { + metrics.CSRApprovedCount(metrics.ApprovalStatusDeny) + return false, fmt.Errorf("csrNodeChecks: CSR %s Node IP addresses don't match corresponding "+ "Node IP addresses %q, got %q", x509cr.DNSNames[0], nodeAddrs, ip) } } + metrics.CSRApprovedCount(metrics.ApprovalStatusApprove) + return true, nil } diff --git a/pkg/talos/instances.go b/pkg/talos/instances.go index 7c859a4..51ab358 100644 --- a/pkg/talos/instances.go +++ b/pkg/talos/instances.go @@ -6,6 +6,7 @@ import ( "maps" "strings" + "github.com/siderolabs/talos-cloud-controller-manager/pkg/metrics" "github.com/siderolabs/talos-cloud-controller-manager/pkg/transformer" "github.com/siderolabs/talos-cloud-controller-manager/pkg/utils/net" "github.com/siderolabs/talos-cloud-controller-manager/pkg/utils/platform" @@ -64,9 +65,11 @@ func (i *instances) InstanceMetadata(ctx context.Context, node *v1.Node) (*cloud return nil, 
fmt.Errorf("error refreshing client connection: %w", err) } + mc := metrics.NewMetricContext(runtime.PlatformMetadataID) + for _, ip := range nodeIPs { meta, err = i.c.getNodeMetadata(ctx, ip) - if err == nil { + if mc.ObserveRequest(err) == nil { nodeIP = ip break @@ -99,8 +102,10 @@ func (i *instances) InstanceMetadata(ctx context.Context, node *v1.Node) (*cloud return nil, fmt.Errorf("error transforming node: %w", err) } + mc = metrics.NewMetricContext("addresses") + ifaces, err := i.c.getNodeIfaces(ctx, nodeIP) - if err != nil { + if mc.ObserveRequest(err) != nil { return nil, fmt.Errorf("error getting interfaces list from the node %s: %w", node.Name, err) }