From c7354b069813c5bf1c47c8f1f83007f8dc974743 Mon Sep 17 00:00:00 2001 From: Anna Kapuscinska Date: Mon, 2 Sep 2024 00:29:40 +0200 Subject: [PATCH] Stop reporting process cache evictions and misses as errors Stop reporting tetragon_errors_total metric with "type" label values: process_cache_miss_on_get process_cache_evicted process_cache_miss_on_remove Evictions and misses are not errors, this is just cache's life, sometimes you hit, sometimes you miss, and sometimes you gotta evict a process. They are now reported by more intuitive metrics: tetragon_process_cache_evictions_total tetragon_process_cache_misses_total{operation=~"get|remove"} Signed-off-by: Anna Kapuscinska --- contrib/upgrade-notes/latest.md | 3 +++ docs/content/en/docs/reference/metrics.md | 2 +- pkg/metrics/errormetrics/errormetrics.go | 11 +---------- pkg/process/cache.go | 4 ---- 4 files changed, 5 insertions(+), 15 deletions(-) diff --git a/contrib/upgrade-notes/latest.md b/contrib/upgrade-notes/latest.md index dd6acfe6889..d0339048417 100644 --- a/contrib/upgrade-notes/latest.md +++ b/contrib/upgrade-notes/latest.md @@ -53,3 +53,6 @@ tetragon: * `tetragon_ringbuf_perf_event_lost_total` -> `tetragon_observer_ringbuf_events_lost_total` * `tetragon_ringbuf_queue_received_total` -> `tetragon_observer_ringbuf_queue_events_received_total` * `tetragon_ringbuf_queue_lost_total` -> `tetragon_observer_ringbuf_queue_events_lost_total` +* `tetragon_errors_total{type="process_cache_evicted"}` metric is replaced by `tetragon_process_cache_evictions_total`. +* `tetragon_errors_total{type=~"process_cache_miss_on_get|process_cache_miss_on_remove"}` metrics are replaced by + `tetragon_process_cache_misses_total{operation=~"get|remove"}`.
\ No newline at end of file diff --git a/docs/content/en/docs/reference/metrics.md b/docs/content/en/docs/reference/metrics.md index 30b9a0960c2..b62a77a20cb 100644 --- a/docs/content/en/docs/reference/metrics.md +++ b/docs/content/en/docs/reference/metrics.md @@ -51,7 +51,7 @@ The total number of Tetragon errors. For internal use only. | label | values | | ----- | ------ | -| `type ` | `event_finalize_process_info_failed, event_missing_process_info, handler_error, process_cache_evicted, process_cache_miss_on_get, process_cache_miss_on_remove, process_metadata_username_failed, process_metadata_username_ignored_not_in_host_namespaces, process_pid_tid_mismatch` | +| `type ` | `event_finalize_process_info_failed, event_missing_process_info, handler_error, process_metadata_username_failed, process_metadata_username_ignored_not_in_host_namespaces, process_pid_tid_mismatch` | ### `tetragon_event_cache_accesses_total` diff --git a/pkg/metrics/errormetrics/errormetrics.go b/pkg/metrics/errormetrics/errormetrics.go index 109f35e3746..1967066effb 100644 --- a/pkg/metrics/errormetrics/errormetrics.go +++ b/pkg/metrics/errormetrics/errormetrics.go @@ -15,14 +15,8 @@ import ( type ErrorType int const ( - // Process not found on get() call. - ProcessCacheMissOnGet ErrorType = iota - // Process evicted from the cache. - ProcessCacheEvicted - // Process not found on remove() call. - ProcessCacheMissOnRemove // Tid and Pid mismatch that could affect BPF and user space caching logic - ProcessPidTidMismatch + ProcessPidTidMismatch ErrorType = iota // An event is missing process info. EventMissingProcessInfo // An error occurred in an event handler. 
@@ -37,9 +31,6 @@ const ( ) var errorTypeLabelValues = map[ErrorType]string{ - ProcessCacheMissOnGet: "process_cache_miss_on_get", - ProcessCacheEvicted: "process_cache_evicted", - ProcessCacheMissOnRemove: "process_cache_miss_on_remove", ProcessPidTidMismatch: "process_pid_tid_mismatch", EventMissingProcessInfo: "event_missing_process_info", HandlerError: "handler_error", diff --git a/pkg/process/cache.go b/pkg/process/cache.go index 9618e20a075..7040db8480a 100644 --- a/pkg/process/cache.go +++ b/pkg/process/cache.go @@ -10,7 +10,6 @@ import ( "github.com/cilium/tetragon/api/v1/tetragon" "github.com/cilium/tetragon/pkg/logger" - "github.com/cilium/tetragon/pkg/metrics/errormetrics" lru "github.com/hashicorp/golang-lru/v2" ) @@ -132,7 +131,6 @@ func NewCache( lruCache, err := lru.NewWithEvict( processCacheSize, func(_ string, _ *ProcessInternal) { - errormetrics.ErrorTotalInc(errormetrics.ProcessCacheEvicted) processCacheEvictions.Inc() }, ) @@ -151,7 +149,6 @@ func (pc *Cache) get(processID string) (*ProcessInternal, error) { process, ok := pc.cache.Get(processID) if !ok { logger.GetLogger().WithField("id in event", processID).Debug("process not found in cache") - errormetrics.ErrorTotalInc(errormetrics.ProcessCacheMissOnGet) processCacheMisses.WithLabelValues("get").Inc() return nil, fmt.Errorf("invalid entry for process ID: %s", processID) } @@ -173,7 +170,6 @@ func (pc *Cache) remove(process *tetragon.Process) bool { if present { processCacheTotal.Dec() } else { - errormetrics.ErrorTotalInc(errormetrics.ProcessCacheMissOnRemove) processCacheMisses.WithLabelValues("remove").Inc() } return present