diff --git a/docs/content/en/docs/reference/helm-chart.md b/docs/content/en/docs/reference/helm-chart.md index 80b6c21d583..7dce1c9593c 100644 --- a/docs/content/en/docs/reference/helm-chart.md +++ b/docs/content/en/docs/reference/helm-chart.md @@ -81,6 +81,8 @@ To use [the values available](#values), with `helm install` or `helm upgrade`, u | tetragon.enableProcessCred | bool | `false` | Enable Capabilities visibility in exec and kprobe events. | | tetragon.enableProcessNs | bool | `false` | Enable Namespaces visibility in exec and kprobe events. | | tetragon.enabled | bool | `true` | | +| tetragon.eventCacheRetries | int | `15` | Configure the number of retries in tetragon's event cache. | +| tetragon.eventCacheRetryDelay | int | `2` | Configure the delay (in seconds) between retries in tetragon's event cache. | | tetragon.exportAllowList | string | `"{\"event_set\":[\"PROCESS_EXEC\", \"PROCESS_EXIT\", \"PROCESS_KPROBE\", \"PROCESS_UPROBE\", \"PROCESS_TRACEPOINT\", \"PROCESS_LSM\"]}"` | Allowlist for JSON export. For example, to export only process_connect events from the default namespace: exportAllowList: | {"namespace":["default"],"event_set":["PROCESS_EXEC"]} | | tetragon.exportDenyList | string | `"{\"health_check\":true}\n{\"namespace\":[\"\", \"cilium\", \"kube-system\"]}"` | Denylist for JSON export. For example, to exclude exec events that look similar to Kubernetes health checks and all the events from kube-system namespace and the host: exportDenyList: | {"health_check":true} {"namespace":["kube-system",""]} | | tetragon.exportFileCompress | bool | `false` | Compress rotated JSON export files. 
| diff --git a/docs/data/tetragon_flags.yaml b/docs/data/tetragon_flags.yaml index 11b1da257f0..a023cbc756f 100644 --- a/docs/data/tetragon_flags.yaml +++ b/docs/data/tetragon_flags.yaml @@ -76,6 +76,12 @@ options: default_value: "true" usage: | Enable TracingPolicy and TracingPolicyNamespaced custom resources + - name: event-cache-retries + default_value: "15" + usage: Number of retries for event cache + - name: event-cache-retry-delay + default_value: "2" + usage: Delay in seconds between event cache retries - name: event-queue-size default_value: "10000" usage: Set the size of the internal event queue. diff --git a/install/kubernetes/tetragon/README.md b/install/kubernetes/tetragon/README.md index 732ffcc162a..3f72e09d051 100644 --- a/install/kubernetes/tetragon/README.md +++ b/install/kubernetes/tetragon/README.md @@ -63,6 +63,8 @@ Helm chart for Tetragon | tetragon.enableProcessCred | bool | `false` | Enable Capabilities visibility in exec and kprobe events. | | tetragon.enableProcessNs | bool | `false` | Enable Namespaces visibility in exec and kprobe events. | | tetragon.enabled | bool | `true` | | +| tetragon.eventCacheRetries | int | `15` | Configure the number of retries in tetragon's event cache. | +| tetragon.eventCacheRetryDelay | int | `2` | Configure the delay (in seconds) between retries in tetragon's event cache. | | tetragon.exportAllowList | string | `"{\"event_set\":[\"PROCESS_EXEC\", \"PROCESS_EXIT\", \"PROCESS_KPROBE\", \"PROCESS_UPROBE\", \"PROCESS_TRACEPOINT\", \"PROCESS_LSM\"]}"` | Allowlist for JSON export. For example, to export only process_connect events from the default namespace: exportAllowList: | {"namespace":["default"],"event_set":["PROCESS_EXEC"]} | | tetragon.exportDenyList | string | `"{\"health_check\":true}\n{\"namespace\":[\"\", \"cilium\", \"kube-system\"]}"` | Denylist for JSON export. 
For example, to exclude exec events that look similar to Kubernetes health checks and all the events from kube-system namespace and the host: exportDenyList: | {"health_check":true} {"namespace":["kube-system",""]} | | tetragon.exportFileCompress | bool | `false` | Compress rotated JSON export files. | diff --git a/install/kubernetes/tetragon/templates/tetragon_configmap.yaml b/install/kubernetes/tetragon/templates/tetragon_configmap.yaml index 511692a2417..4d56e68c84e 100644 --- a/install/kubernetes/tetragon/templates/tetragon_configmap.yaml +++ b/install/kubernetes/tetragon/templates/tetragon_configmap.yaml @@ -67,4 +67,6 @@ data: {{- if .Values.tetragon.pprof.enabled }} pprof-address: {{ .Values.tetragon.pprof.address }}:{{ .Values.tetragon.pprof.port }} {{- end }} + event-cache-retries: {{ .Values.tetragon.eventCacheRetries | quote }} + event-cache-retry-delay: {{ .Values.tetragon.eventCacheRetryDelay | quote }} {{- include "configmap.extra" . | nindent 2 }} diff --git a/install/kubernetes/tetragon/values.yaml b/install/kubernetes/tetragon/values.yaml index f1686d31f08..60f2a76bb87 100644 --- a/install/kubernetes/tetragon/values.yaml +++ b/install/kubernetes/tetragon/values.yaml @@ -222,6 +222,10 @@ tetragon: extraVolumeMounts: [] # -- resources for the the oci-hook-setup init container resources: {} + # -- Configure the number of retries in tetragon's event cache. + eventCacheRetries: 15 + # -- Configure the delay (in seconds) between retries in tetragon's event cache. + eventCacheRetryDelay: 2 # Tetragon Operator settings tetragonOperator: # -- Enables the Tetragon Operator. 
diff --git a/pkg/defaults/defaults.go b/pkg/defaults/defaults.go index cb26d0eb420..b9920c42d53 100644 --- a/pkg/defaults/defaults.go +++ b/pkg/defaults/defaults.go @@ -45,6 +45,10 @@ const ( // Pid file where to write tetragon main PID DefaultPidFile = DefaultRunDir + "tetragon.pid" + + // defaults for the event cache + DefaultEventCacheNumRetries = 15 + DefaultEventCacheRetryDelay = 2 ) var ( diff --git a/pkg/eventcache/eventcache.go b/pkg/eventcache/eventcache.go index 8e29a4f2f68..c086bbc0601 100644 --- a/pkg/eventcache/eventcache.go +++ b/pkg/eventcache/eventcache.go @@ -9,6 +9,7 @@ import ( "github.com/cilium/tetragon/api/v1/tetragon" "github.com/cilium/tetragon/pkg/ktime" + "github.com/cilium/tetragon/pkg/logger" "github.com/cilium/tetragon/pkg/option" "github.com/cilium/tetragon/pkg/process" "github.com/cilium/tetragon/pkg/reader/node" @@ -23,13 +24,6 @@ const ( FROM_EV_CACHE ) -const ( - // garbage collection retries - CacheStrikes = 15 - // garbage collection run interval - EventRetryTimer = time.Second * 2 -) - var ( cache *Cache nodeName string @@ -135,7 +129,7 @@ func (ec *Cache) handleEvents() { } if err != nil { event.color++ - if event.color < CacheStrikes { + if event.color < option.Config.EventCacheNumRetries { tmp = append(tmp, event) continue } @@ -169,9 +163,9 @@ func (ec *Cache) loop() { for { select { case <-ticker.C: - /* Every 'EventRetryTimer' walk the slice of events pending pod info. If - * an event hasn't completed its podInfo after two iterations send the - * event anyways. + /* Every 'option.Config.EventCacheRetryDelay' seconds walk the slice of events + * pending pod info. If an event hasn't completed its podInfo after two iterations + * send the event anyways. 
*/ ec.handleEvents() @@ -228,6 +222,8 @@ func NewWithTimer(s *server.Server, dur time.Duration) *Cache { cache.done <- true } + logger.GetLogger().WithField("retries", option.Config.EventCacheNumRetries).WithField("delay", dur).Info("Creating new EventCache") + cache = &Cache{ objsChan: make(chan CacheObj), done: make(chan bool), @@ -241,7 +237,7 @@ func NewWithTimer(s *server.Server, dur time.Duration) *Cache { } func New(s *server.Server) *Cache { - return NewWithTimer(s, EventRetryTimer) + return NewWithTimer(s, time.Second*time.Duration(option.Config.EventCacheRetryDelay)) } func Get() *Cache { diff --git a/pkg/grpc/exec/exec_test_helper.go b/pkg/grpc/exec/exec_test_helper.go index 03b8b83226d..eb2bbf3455a 100644 --- a/pkg/grpc/exec/exec_test_helper.go +++ b/pkg/grpc/exec/exec_test_helper.go @@ -27,7 +27,7 @@ import ( ) const ( - CacheTimerMs = 100 + CacheTimerMs = 1 ) var ( @@ -415,7 +415,7 @@ func GrpcExecOutOfOrder[EXEC notify.Message, EXIT notify.Message](t *testing.T) AllEvents = append(AllEvents, e) } - time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work + time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work CheckExecEvents(t, AllEvents, parentPid, currentPid) } @@ -474,7 +474,7 @@ func GrpcExecMisingParent[EXEC notify.Message, EXIT notify.Message](t *testing.T AllEvents = append(AllEvents, e) } - time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work + time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work assert.Equal(t, len(AllEvents), 1) execEv := AllEvents[0].GetProcessExec() @@ -503,7 +503,7 @@ func GrpcMissingExec[EXEC notify.Message, EXIT notify.Message](t *testing.T) { AllEvents = append(AllEvents, e) } - time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * 
CacheTimerMs)) // wait for cache to do it's work + time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work assert.Equal(t, len(AllEvents), 1) ev := AllEvents[0] @@ -659,7 +659,7 @@ func GrpcExecCloneOutOfOrder[EXEC notify.Message, CLONE notify.Message, EXIT not AllEvents = append(AllEvents, e) } - time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work + time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work CheckCloneEvents(t, AllEvents, currentPid, clonePid) } @@ -793,8 +793,8 @@ func GrpcExecPodInfoInOrder[EXEC notify.Message, EXIT notify.Message](t *testing AllEvents = append(AllEvents, e) } - fakeWatcher.AddPod(dummyPod) // setup some dummy pod to return - time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work + fakeWatcher.AddPod(dummyPod) // setup some dummy pod to return + time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work CheckPodEvents(t, AllEvents) } @@ -838,7 +838,7 @@ func GrpcExecPodInfoOutOfOrder[EXEC notify.Message, EXIT notify.Message](t *test } fakeWatcher.AddPod(dummyPod) - time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work + time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work CheckPodEvents(t, AllEvents) } @@ -886,7 +886,7 @@ func GrpcExecPodInfoInOrderAfter[EXEC notify.Message, EXIT notify.Message](t *te AllEvents = append(AllEvents, e) } - time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work + time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache 
to do it's work CheckPodEvents(t, AllEvents) } @@ -933,7 +933,7 @@ func GrpcExecPodInfoOutOfOrderAfter[EXEC notify.Message, EXIT notify.Message](t AllEvents = append(AllEvents, e) } - time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work + time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work CheckPodEvents(t, AllEvents) } @@ -983,7 +983,7 @@ func GrpcExecPodInfoDelayedOutOfOrder[EXEC notify.Message, EXIT notify.Message]( fakeWatcher.AddPod(dummyPod) // setup some dummy pod to return - time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work + time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work CheckPodEvents(t, AllEvents) } @@ -1032,7 +1032,7 @@ func GrpcExecPodInfoDelayedInOrder[EXEC notify.Message, EXIT notify.Message](t * fakeWatcher.AddPod(dummyPod) // setup some dummy pod to return - time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work + time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work CheckPodEvents(t, AllEvents) } @@ -1079,7 +1079,7 @@ func GrpcDelayedExecK8sOutOfOrder[EXEC notify.Message, EXIT notify.Message](t *t AllEvents = append(AllEvents, e) } - time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work + time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work CheckPodEvents(t, AllEvents) } diff --git a/pkg/jsonchecker/jsonchecker.go b/pkg/jsonchecker/jsonchecker.go index 150eb1667cb..d0350bdafec 100644 --- a/pkg/jsonchecker/jsonchecker.go +++ b/pkg/jsonchecker/jsonchecker.go @@ -15,15 +15,14 @@ import ( 
"github.com/cilium/tetragon/api/v1/tetragon" ec "github.com/cilium/tetragon/api/v1/tetragon/codegen/eventchecker" "github.com/cilium/tetragon/api/v1/tetragon/codegen/helpers" - "github.com/cilium/tetragon/pkg/eventcache" "github.com/cilium/tetragon/pkg/logger" "github.com/cilium/tetragon/pkg/testutils" "github.com/sirupsen/logrus" ) -var ( +const ( Retries = 13 - RetryDelay = eventcache.EventRetryTimer + (1 * time.Second) + RetryDelay = 3 * time.Second ) // DebugError is an error that will create a debug output message diff --git a/pkg/option/config.go b/pkg/option/config.go index 867acb91391..924a588c7a2 100644 --- a/pkg/option/config.go +++ b/pkg/option/config.go @@ -10,6 +10,7 @@ import ( "path/filepath" "time" + "github.com/cilium/tetragon/pkg/defaults" "github.com/cilium/tetragon/pkg/logger" "github.com/cilium/tetragon/pkg/metrics" "github.com/spf13/viper" @@ -100,6 +101,9 @@ type config struct { EnableCgIDmap bool EnableCgIDmapDebug bool + + EventCacheNumRetries int + EventCacheRetryDelay int } var ( @@ -117,6 +121,11 @@ var ( // Enable all metrics labels by default MetricsLabelFilter: DefaultLabelFilter(), + + // set default values for the event cache + // mainly used in the case of testing + EventCacheNumRetries: defaults.DefaultEventCacheNumRetries, + EventCacheRetryDelay: defaults.DefaultEventCacheRetryDelay, } ) diff --git a/pkg/option/flags.go b/pkg/option/flags.go index 54fd9ca08f1..b2a6b9e1aae 100644 --- a/pkg/option/flags.go +++ b/pkg/option/flags.go @@ -112,6 +112,9 @@ const ( KeyEnableCgIDmap = "enable-cgidmap" KeyEnableCgIDmapDebug = "enable-cgidmap-debug" + + KeyEventCacheRetries = "event-cache-retries" + KeyEventCacheRetryDelay = "event-cache-retry-delay" ) type UsernameMetadaCode int @@ -238,6 +241,10 @@ func ReadAndSetFlags() error { Config.EnableCgIDmap = viper.GetBool(KeyEnableCgIDmap) Config.EnableCgIDmapDebug = viper.GetBool(KeyEnableCgIDmapDebug) + + Config.EventCacheNumRetries = viper.GetInt(KeyEventCacheRetries) + 
Config.EventCacheRetryDelay = viper.GetInt(KeyEventCacheRetryDelay) + return nil } @@ -401,4 +408,7 @@ func AddFlags(flags *pflag.FlagSet) { flags.Bool(KeyEnableCgIDmap, false, "enable pod resolution via cgroup ids") flags.Bool(KeyEnableCgIDmapDebug, false, "enable cgidmap debugging info") + + flags.Int(KeyEventCacheRetries, defaults.DefaultEventCacheNumRetries, "Number of retries for event cache") + flags.Int(KeyEventCacheRetryDelay, defaults.DefaultEventCacheRetryDelay, "Delay in seconds between event cache retries") }