Skip to content

Commit

Permalink
feat: expose metrics
Browse files Browse the repository at this point in the history
Expose API call metrics and node CSR approve statistics.

Signed-off-by: Serge Logvinov <[email protected]>
  • Loading branch information
sergelogvinov committed May 6, 2024
1 parent 0faf0ae commit 2150cdd
Show file tree
Hide file tree
Showing 10 changed files with 195 additions and 4 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ build: ## Build
.PHONY: run
run: build
./talos-cloud-controller-manager-$(ARCH) --v=5 --kubeconfig=kubeconfig --cloud-config=hack/ccm-config.yaml --controllers=cloud-node \
--use-service-account-credentials --leader-elect=false --bind-address=127.0.0.1
--use-service-account-credentials --leader-elect=false --bind-address=127.0.0.1 --authorization-always-allow-paths=/healthz,/livez,/readyz,/metrics

.PHONY: lint
lint: ## Lint Code
Expand Down
2 changes: 1 addition & 1 deletion charts/talos-cloud-controller-manager/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ keywords:
maintainers:
- name: sergelogvinov
url: https://github.com/sergelogvinov
version: 0.3.0
version: 0.3.1
appVersion: "v1.6.0"
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ spec:
- --leader-elect-resource-name=cloud-controller-manager-talos
- --use-service-account-credentials
- --secure-port={{ .Values.service.containerPort }}
- --authorization-always-allow-paths=/healthz,/livez,/readyz,/metrics
{{- with .Values.extraArgs }}
{{- toYaml . | nindent 12 }}
{{- end }}
Expand Down
3 changes: 3 additions & 0 deletions charts/talos-cloud-controller-manager/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ service:
containerPort: 50258
# -- Additional custom annotations for Service.
annotations: {}
# prometheus.io/scrape: "true"
# prometheus.io/scheme: "https"
# prometheus.io/port: "50258"

# -- Resource requests and limits.
# ref: http://kubernetes.io/docs/user-guide/compute-resources/
Expand Down
62 changes: 62 additions & 0 deletions docs/metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Metrics documentation

This document describes the metrics currently exposed by the Talos CCM.

## Gather metrics from talos-cloud-controller-manager

By default, the Talos CCM exposes metrics on the `https://localhost:50258/metrics` endpoint.

To enable the metrics, set the `--secure-port` flag and use the `--authorization-always-allow-paths` flag to allow access to the `/metrics` endpoint.

```yaml
talos-cloud-controller-manager
--authorization-always-allow-paths="/metrics"
--secure-port=50258
```

### Helm chart values

The following values can be set in the Helm chart to expose the metrics of the Talos CCM.

```yaml
service:
containerPort: 50258
annotations:
prometheus.io/scrape: "true"
prometheus.io/scheme: "https"
prometheus.io/port: "50258"
```
## Metrics exposed by the CCM
### Talos API calls
|Metric name|Metric type|Labels/tags|
|-----------|-----------|-----------|
|talosccm_api_request_duration_seconds|Histogram|`request`=<api_request>|
|talosccm_api_request_errors_total|Counter|`request`=<api_request>|

```txt
talosccm_api_request_duration_seconds_bucket{request="addresses",le="0.1"} 10
talosccm_api_request_duration_seconds_bucket{request="addresses",le="0.25"} 16
talosccm_api_request_duration_seconds_bucket{request="addresses",le="0.5"} 16
talosccm_api_request_duration_seconds_bucket{request="addresses",le="1"} 16
talosccm_api_request_duration_seconds_bucket{request="addresses",le="2.5"} 16
talosccm_api_request_duration_seconds_bucket{request="addresses",le="5"} 16
talosccm_api_request_duration_seconds_bucket{request="addresses",le="10"} 16
talosccm_api_request_duration_seconds_bucket{request="addresses",le="30"} 16
talosccm_api_request_duration_seconds_bucket{request="addresses",le="+Inf"} 16
talosccm_api_request_duration_seconds_sum{request="addresses"} 1.369387789
talosccm_api_request_duration_seconds_count{request="addresses"} 16
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="0.1"} 14
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="0.25"} 16
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="0.5"} 16
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="1"} 16
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="2.5"} 16
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="5"} 16
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="10"} 16
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="30"} 16
talosccm_api_request_duration_seconds_bucket{request="platformmetadata",le="+Inf"} 16
talosccm_api_request_duration_seconds_sum{request="platformmetadata"} 1.2046141220000002
talosccm_api_request_duration_seconds_count{request="platformmetadata"} 16
```
20 changes: 20 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
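// Package metrics defines and registers the metrics exposed by the Talos CCM:
// Talos API request latency and errors, and node CSR approval counts.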
package metrics

import (
"time"
)

// MetricContext carries the data needed to report a single Talos client
// request: when it started and which label values to attach to it.
type MetricContext struct {
	// start is the moment the context was created.
	start time.Time
	// attributes are the label values passed to the metric vectors.
	attributes []string
}

// NewMetricContext returns a MetricContext whose start time is the current
// time and whose only attribute is the given resource name.
func NewMetricContext(resource string) *MetricContext {
	mc := new(MetricContext)
	mc.start = time.Now()
	mc.attributes = []string{resource}

	return mc
}
51 changes: 51 additions & 0 deletions pkg/metrics/metrics_api.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package metrics

import (
"time"

"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
)

// TalosMetrics groups the metric vectors recorded for Talos API calls.
type TalosMetrics struct {
	// Duration is the histogram that request latencies are observed into.
	Duration *metrics.HistogramVec
	// Errors is the counter incremented for requests that returned an error.
	Errors *metrics.CounterVec
}

// apiMetrics holds the Talos API metric vectors used by ObserveRequest; it is
// populated by registerAPIMetrics during package variable initialisation.
var apiMetrics = registerAPIMetrics()

// ObserveRequest reports the outcome of the request tracked by mc.
//
// The time elapsed since the context was created is observed, in seconds, in
// the duration histogram. When err is non-nil the error counter is incremented
// as well. Both metrics are labelled with the context's attributes.
//
// The supplied err is returned unchanged so the call can be used inline in an
// error check.
func (mc *MetricContext) ObserveRequest(err error) error {
	latency := apiMetrics.Duration.WithLabelValues(mc.attributes...)
	latency.Observe(time.Since(mc.start).Seconds())

	if err == nil {
		return nil
	}

	apiMetrics.Errors.WithLabelValues(mc.attributes...).Inc()

	return err
}

// registerAPIMetrics builds the Talos API request metrics (a latency histogram
// and an error counter, both labelled by "request"), registers them with the
// legacy registry and returns them wrapped in a TalosMetrics.
func registerAPIMetrics() *TalosMetrics {
	// Locals are named so they do not shadow the imported metrics package.
	duration := metrics.NewHistogramVec(
		&metrics.HistogramOpts{
			Name:    "talosccm_api_request_duration_seconds",
			Help:    "Latency of an Talos API call",
			Buckets: []float64{.1, .25, .5, 1, 2.5, 5, 10, 30},
		},
		[]string{"request"},
	)

	errorsTotal := metrics.NewCounterVec(
		&metrics.CounterOpts{
			Name: "talosccm_api_request_errors_total",
			Help: "Total number of errors for an Talos API call",
		},
		[]string{"request"},
	)

	legacyregistry.MustRegister(duration, errorsTotal)

	return &TalosMetrics{
		Duration: duration,
		Errors:   errorsTotal,
	}
}
44 changes: 44 additions & 0 deletions pkg/metrics/metrics_csr.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package metrics

import (
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
)

// CSRMetrics groups the metric vectors recorded for certificate signing
// requests.
type CSRMetrics struct {
	// approvalCount counts CSR decisions, labelled by "status".
	approvalCount *metrics.CounterVec
}

// CSRApprovalStatus names the outcome of a CSR decision; its string value is
// used as the "status" label of the CSR approval counter.
type CSRApprovalStatus string

// Known CSR decision outcomes.
const (
	// ApprovalStatusDeny marks a CSR that was denied.
	ApprovalStatusDeny = CSRApprovalStatus("deny")
	// ApprovalStatusApprove marks a CSR that was approved.
	ApprovalStatusApprove = CSRApprovalStatus("approve")
)

// csrMetrics holds the CSR metric vectors used by CSRApprovedCount; it is
// populated by registerCSRMetrics during package variable initialisation.
var csrMetrics = registerCSRMetrics()

// CSRApprovedCount increments the CSR approval counter by one for the given
// status, using the status string as the "status" label value.
func CSRApprovedCount(status CSRApprovalStatus) {
	label := string(status)
	csrMetrics.approvalCount.WithLabelValues(label).Inc()
}

// registerCSRMetrics builds the node CSR approval counter (labelled by
// "status"), registers it with the legacy registry and returns it wrapped in a
// CSRMetrics.
func registerCSRMetrics() *CSRMetrics {
	// The local is named so it does not shadow the imported metrics package.
	approvals := metrics.NewCounterVec(
		&metrics.CounterOpts{
			Name: "talosccm_csr_approval_count",
			Help: "Count of approved, denied and ignored node CSRs",
		},
		[]string{"status"},
	)

	legacyregistry.MustRegister(approvals)

	return &CSRMetrics{approvalCount: approvals}
}
5 changes: 5 additions & 0 deletions pkg/talos/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"fmt"
"strings"

"github.com/siderolabs/talos-cloud-controller-manager/pkg/metrics"
"github.com/siderolabs/talos-cloud-controller-manager/pkg/transformer"
utilsnet "github.com/siderolabs/talos-cloud-controller-manager/pkg/utils/net"
"github.com/siderolabs/talos/pkg/machinery/resources/network"
Expand Down Expand Up @@ -193,11 +194,15 @@ func csrNodeChecks(ctx context.Context, kclient clientkubernetes.Interface, x509

for _, ip := range x509cr.IPAddresses {
if !slices.Contains(nodeAddrs, ip.String()) {
metrics.CSRApprovedCount(metrics.ApprovalStatusDeny)

return false, fmt.Errorf("csrNodeChecks: CSR %s Node IP addresses don't match corresponding "+
"Node IP addresses %q, got %q", x509cr.DNSNames[0], nodeAddrs, ip)
}
}

metrics.CSRApprovedCount(metrics.ApprovalStatusApprove)

return true, nil
}

Expand Down
9 changes: 7 additions & 2 deletions pkg/talos/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"maps"
"strings"

"github.com/siderolabs/talos-cloud-controller-manager/pkg/metrics"
"github.com/siderolabs/talos-cloud-controller-manager/pkg/transformer"
"github.com/siderolabs/talos-cloud-controller-manager/pkg/utils/net"
"github.com/siderolabs/talos-cloud-controller-manager/pkg/utils/platform"
Expand Down Expand Up @@ -64,9 +65,11 @@ func (i *instances) InstanceMetadata(ctx context.Context, node *v1.Node) (*cloud
return nil, fmt.Errorf("error refreshing client connection: %w", err)
}

mc := metrics.NewMetricContext(runtime.PlatformMetadataID)

for _, ip := range nodeIPs {
meta, err = i.c.getNodeMetadata(ctx, ip)
if err == nil {
if mc.ObserveRequest(err) == nil {
nodeIP = ip

break
Expand Down Expand Up @@ -99,8 +102,10 @@ func (i *instances) InstanceMetadata(ctx context.Context, node *v1.Node) (*cloud
return nil, fmt.Errorf("error transforming node: %w", err)
}

mc = metrics.NewMetricContext("addresses")

ifaces, err := i.c.getNodeIfaces(ctx, nodeIP)
if err != nil {
if mc.ObserveRequest(err) != nil {
return nil, fmt.Errorf("error getting interfaces list from the node %s: %w", node.Name, err)
}

Expand Down

0 comments on commit 2150cdd

Please sign in to comment.