Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve metrics for map and cache sizes #2291

Merged
merged 8 commits into from
Apr 11, 2024
27 changes: 23 additions & 4 deletions docs/content/en/docs/reference/metrics.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions pkg/eventcache/eventcache.go
Original file line number Diff line number Diff line change
Expand Up @@ -247,3 +247,7 @@ func New(s *server.Server) *Cache {
// Get returns the package-level cache variable. The result may be nil;
// other code in this package checks it against nil before use.
func Get() *Cache {
return cache
}

// len reports the number of entries currently held in ec.cache.
// NOTE(review): no locking is visible here — confirm that callers running on
// other goroutines (e.g. metrics collection) can safely read ec.cache.
func (ec *Cache) len() int {
return len(ec.cache)
}
39 changes: 39 additions & 0 deletions pkg/eventcache/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Tetragon

package eventcache

import (
"github.com/cilium/tetragon/pkg/metrics/consts"
"github.com/prometheus/client_golang/prometheus"
)

// cacheSizeMetric is a custom Prometheus collector for the event cache size.
// It stores only the metric descriptor; the value is read each time Collect
// is called.
type cacheSizeMetric struct {
// desc is the descriptor sent by Describe and attached to the metric in Collect.
desc *prometheus.Desc
}

// Describe sends the collector's single metric descriptor to ch.
func (m *cacheSizeMetric) Describe(ch chan<- *prometheus.Desc) {
ch <- m.desc
}

// Collect emits one gauge sample carrying the current number of entries in
// the event cache. If the package-level cache is nil, the sample value is 0.
func (m *cacheSizeMetric) Collect(ch chan<- prometheus.Metric) {
	var entries float64
	if cache != nil {
		entries = float64(cache.len())
	}
	ch <- prometheus.MustNewConstMetric(m.desc, prometheus.GaugeValue, entries)
}

// NewCacheCollector returns a collector for the "event_cache_entries" metric,
// named under consts.MetricsNamespace and carrying no labels.
func NewCacheCollector() prometheus.Collector {
	fqName := prometheus.BuildFQName(consts.MetricsNamespace, "", "event_cache_entries")
	desc := prometheus.NewDesc(
		fqName,
		"The number of entries in the event cache.",
		nil, nil,
	)
	return &cacheSizeMetric{desc: desc}
}
41 changes: 7 additions & 34 deletions pkg/metrics/mapmetrics/mapmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,14 @@ import (

var (
MapSize = metrics.NewBPFGauge(prometheus.NewDesc(
prometheus.BuildFQName(consts.MetricsNamespace, "", "map_in_use_gauge"),
prometheus.BuildFQName(consts.MetricsNamespace, "", "map_entries"),
"The total number of in-use entries per map.",
[]string{"map", "total"}, nil,
[]string{"map"}, nil,
))
MapCapacity = metrics.NewBPFGauge(prometheus.NewDesc(
prometheus.BuildFQName(consts.MetricsNamespace, "", "map_capacity"),
"Capacity of a BPF map. Expected to be constant.",
[]string{"map"}, nil,
))
MapErrors = metrics.NewBPFCounter(prometheus.NewDesc(
prometheus.BuildFQName(consts.MetricsNamespace, "", "map_errors_total"),
Expand All @@ -24,36 +29,4 @@ var (

// InitMetrics is a no-op and ignores the registry it is given: the body
// contains no statements because the custom collectors for these metrics are
// registered independently of this function.
func InitMetrics(_ *prometheus.Registry) {
// custom collectors are registered independently

// NOTES:
// * Delete (move/replace) map_drops_total as it's monitoring process cache not maps
// * Rename map_in_use_gauge metric (to e.g. map_entries) and delete total label?
// * Introduce a metric for map capacity
}

// bpfCollector implements prometheus.Collector. It collects metrics directly from BPF maps.
// NB: We can't register individual BPF collectors collecting map metrics, because they share the
// metrics descriptors. Sending duplicate descriptors from different collectors results in
// a panic. Sending duplicate descriptors from the same collector is fine, so we define a simple
// wrapper for all collectors collecting map metrics.
type bpfCollector struct {
// collectors are the wrapped collectors; Describe and Collect delegate to
// each of them in slice order.
collectors []prometheus.Collector
}

// NewBPFCollector wraps the given collectors in a single prometheus.Collector
// whose Describe and Collect methods delegate to each wrapped collector.
func NewBPFCollector(collectors ...prometheus.Collector) prometheus.Collector {
	wrapper := new(bpfCollector)
	wrapper.collectors = collectors
	return wrapper
}

// Describe forwards ch to the Describe method of every wrapped collector,
// in the order the collectors were supplied.
func (c *bpfCollector) Describe(ch chan<- *prometheus.Desc) {
	wrapped := c.collectors
	for i := 0; i < len(wrapped); i++ {
		wrapped[i].Describe(ch)
	}
}

// Collect forwards ch to the Collect method of every wrapped collector,
// in the order the collectors were supplied.
func (c *bpfCollector) Collect(ch chan<- prometheus.Metric) {
	wrapped := c.collectors
	for i := 0; i < len(wrapped); i++ {
		wrapped[i].Collect(ch)
	}
}
8 changes: 5 additions & 3 deletions pkg/metrics/metricsconfig/initmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package metricsconfig

import (
"github.com/cilium/tetragon/pkg/eventcache"
"github.com/cilium/tetragon/pkg/grpc/tracing"
"github.com/cilium/tetragon/pkg/metrics/errormetrics"
"github.com/cilium/tetragon/pkg/metrics/eventcachemetrics"
Expand All @@ -19,6 +20,7 @@ import (
"github.com/cilium/tetragon/pkg/metrics/syscallmetrics"
"github.com/cilium/tetragon/pkg/metrics/watchermetrics"
"github.com/cilium/tetragon/pkg/observer"
"github.com/cilium/tetragon/pkg/process"
"github.com/cilium/tetragon/pkg/version"
grpcmetrics "github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus"
"github.com/prometheus/client_golang/prometheus"
Expand All @@ -29,10 +31,12 @@ func initHealthMetrics(registry *prometheus.Registry) {
version.InitMetrics(registry)
errormetrics.InitMetrics(registry)
eventcachemetrics.InitMetrics(registry)
registry.MustRegister(eventcache.NewCacheCollector())
eventmetrics.InitHealthMetrics(registry)
mapmetrics.InitMetrics(registry)
opcodemetrics.InitMetrics(registry)
policyfiltermetrics.InitMetrics(registry)
process.InitMetrics(registry)
ringbufmetrics.InitMetrics(registry)
ringbufqueuemetrics.InitMetrics(registry)
watchermetrics.InitMetrics(registry)
Expand All @@ -51,9 +55,7 @@ func initAllHealthMetrics(registry *prometheus.Registry) {
policystatemetrics.InitMetrics(registry)

// register custom collectors
registry.MustRegister(mapmetrics.NewBPFCollector(
observer.NewBPFCollector(),
))
registry.MustRegister(observer.NewBPFCollector())
registry.MustRegister(eventmetrics.NewBPFCollector())
}

Expand Down
18 changes: 13 additions & 5 deletions pkg/observer/observer_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
package observer

import (
"fmt"
"path/filepath"
"strings"

Expand All @@ -26,6 +25,7 @@ func NewBPFCollector() prometheus.Collector {

// Describe sends the descriptors of the three map metrics to ch, in this
// order: MapSize, MapCapacity, MapErrors.
func (c *bpfCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- mapmetrics.MapSize.Desc()
ch <- mapmetrics.MapCapacity.Desc()
ch <- mapmetrics.MapErrors.Desc()
}

Expand Down Expand Up @@ -72,12 +72,16 @@ func (c *bpfCollector) Collect(ch chan<- prometheus.Metric) {
}
defer mapLink.Close()

updateMapSize(ch, mapLinkStats, int(mapLink.MaxEntries()), name)
updateMapSize(ch, mapLinkStats, name)
ch <- mapmetrics.MapCapacity.MustMetric(
float64(mapLink.MaxEntries()),
name,
)
updateMapErrors(ch, mapLinkStats, name)
}
}

func updateMapSize(ch chan<- prometheus.Metric, mapLinkStats *ebpf.Map, maxEntries int, name string) {
func updateMapSize(ch chan<- prometheus.Metric, mapLinkStats *ebpf.Map, name string) {
var values []int64
if err := mapLinkStats.Lookup(int32(0), &values); err != nil {
return
Expand All @@ -89,7 +93,7 @@ func updateMapSize(ch chan<- prometheus.Metric, mapLinkStats *ebpf.Map, maxEntri
}
ch <- mapmetrics.MapSize.MustMetric(
float64(sum),
name, fmt.Sprint(maxEntries),
name,
)
}

Expand Down Expand Up @@ -137,7 +141,11 @@ func (c *bpfZeroCollector) Collect(ch chan<- prometheus.Metric) {
for _, m := range monitoredMaps {
ch <- mapmetrics.MapSize.MustMetric(
0,
m, fmt.Sprint(0),
m,
)
ch <- mapmetrics.MapCapacity.MustMetric(
0,
m,
)
ch <- mapmetrics.MapErrors.MustMetric(
0,
Expand Down
31 changes: 31 additions & 0 deletions pkg/process/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,37 @@ var ProcessCacheTotal = prometheus.NewGauge(prometheus.GaugeOpts{
ConstLabels: nil,
})

// cacheCapacityMetric is a custom Prometheus collector for the process cache
// capacity. It stores only the metric descriptor; the value is read each time
// Collect is called.
type cacheCapacityMetric struct {
// desc is the descriptor sent by Describe and attached to the metric in Collect.
desc *prometheus.Desc
}

// Describe sends the collector's single metric descriptor to ch.
func (m *cacheCapacityMetric) Describe(ch chan<- *prometheus.Desc) {
ch <- m.desc
}

// Collect emits one gauge sample carrying procCache.size. If the
// package-level procCache is nil, the sample value is 0.
func (m *cacheCapacityMetric) Collect(ch chan<- prometheus.Metric) {
	var value float64
	if procCache != nil {
		value = float64(procCache.size)
	}
	ch <- prometheus.MustNewConstMetric(m.desc, prometheus.GaugeValue, value)
}

// NewCacheCollector returns a collector for the "process_cache_capacity"
// metric, named under consts.MetricsNamespace and carrying no labels.
func NewCacheCollector() prometheus.Collector {
	fqName := prometheus.BuildFQName(consts.MetricsNamespace, "", "process_cache_capacity")
	desc := prometheus.NewDesc(
		fqName,
		"The capacity of the process cache. Expected to be constant.",
		nil, nil,
	)
	return &cacheCapacityMetric{desc: desc}
}

// InitMetrics registers the process cache metrics with registry: the
// ProcessCacheTotal gauge first, then the collector built by
// NewCacheCollector. MustRegister panics if a registration fails.
func InitMetrics(registry *prometheus.Registry) {
	registry.MustRegister(
		ProcessCacheTotal,
		NewCacheCollector(),
	)
}
Loading