Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make event cache tunable #2928

Merged
merged 3 commits into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/content/en/docs/reference/helm-chart.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions docs/data/tetragon_flags.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions install/kubernetes/tetragon/README.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -67,4 +67,6 @@ data:
{{- if .Values.tetragon.pprof.enabled }}
pprof-address: {{ .Values.tetragon.pprof.address }}:{{ .Values.tetragon.pprof.port }}
{{- end }}
event-cache-retries: {{ .Values.tetragon.eventCacheRetries | quote }}
event-cache-retry-delay: {{ .Values.tetragon.eventCacheRetryDelay | quote }}
{{- include "configmap.extra" . | nindent 2 }}
4 changes: 4 additions & 0 deletions install/kubernetes/tetragon/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,10 @@ tetragon:
extraVolumeMounts: []
# -- resources for the oci-hook-setup init container
resources: {}
# -- Configure the number of retries in tetragon's event cache.
eventCacheRetries: 15
# -- Configure the delay (in seconds) between retries in tetragon's event cache.
eventCacheRetryDelay: 2
# Tetragon Operator settings
tetragonOperator:
# -- Enables the Tetragon Operator.
Expand Down
4 changes: 4 additions & 0 deletions pkg/defaults/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ const (

// Pid file where to write tetragon main PID
DefaultPidFile = DefaultRunDir + "tetragon.pid"

// defaults for the event cache
DefaultEventCacheNumRetries = 15
DefaultEventCacheRetryDelay = 2
)

var (
Expand Down
20 changes: 8 additions & 12 deletions pkg/eventcache/eventcache.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/cilium/tetragon/api/v1/tetragon"
"github.com/cilium/tetragon/pkg/ktime"
"github.com/cilium/tetragon/pkg/logger"
"github.com/cilium/tetragon/pkg/option"
"github.com/cilium/tetragon/pkg/process"
"github.com/cilium/tetragon/pkg/reader/node"
Expand All @@ -23,13 +24,6 @@ const (
FROM_EV_CACHE
)

const (
// garbage collection retries
CacheStrikes = 15
// garbage collection run interval
EventRetryTimer = time.Second * 2
)

var (
cache *Cache
nodeName string
Expand Down Expand Up @@ -135,7 +129,7 @@ func (ec *Cache) handleEvents() {
}
if err != nil {
event.color++
if event.color < CacheStrikes {
if event.color < option.Config.EventCacheNumRetries {
tmp = append(tmp, event)
continue
}
Expand Down Expand Up @@ -169,9 +163,9 @@ func (ec *Cache) loop() {
for {
select {
case <-ticker.C:
/* Every 'EventRetryTimer' walk the slice of events pending pod info. If
* an event hasn't completed its podInfo after two iterations send the
* event anyways.
/* Every 'option.Config.EventCacheRetryDelay' seconds walk the slice of events
* pending pod info. If an event hasn't completed its podInfo after
* 'option.Config.EventCacheNumRetries' iterations send the event anyway.
*/
ec.handleEvents()

Expand Down Expand Up @@ -228,6 +222,8 @@ func NewWithTimer(s *server.Server, dur time.Duration) *Cache {
cache.done <- true
}

logger.GetLogger().WithField("retries", option.Config.EventCacheNumRetries).WithField("delay", dur).Info("Creating new EventCache")

cache = &Cache{
objsChan: make(chan CacheObj),
done: make(chan bool),
Expand All @@ -241,7 +237,7 @@ func NewWithTimer(s *server.Server, dur time.Duration) *Cache {
}

func New(s *server.Server) *Cache {
return NewWithTimer(s, EventRetryTimer)
return NewWithTimer(s, time.Second*time.Duration(option.Config.EventCacheRetryDelay))
}

func Get() *Cache {
Expand Down
26 changes: 13 additions & 13 deletions pkg/grpc/exec/exec_test_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ import (
)

const (
CacheTimerMs = 100
CacheTimerMs = 1
)

var (
Expand Down Expand Up @@ -415,7 +415,7 @@ func GrpcExecOutOfOrder[EXEC notify.Message, EXIT notify.Message](t *testing.T)
AllEvents = append(AllEvents, e)
}

time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work
CheckExecEvents(t, AllEvents, parentPid, currentPid)
}

Expand Down Expand Up @@ -474,7 +474,7 @@ func GrpcExecMisingParent[EXEC notify.Message, EXIT notify.Message](t *testing.T
AllEvents = append(AllEvents, e)
}

time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work

assert.Equal(t, len(AllEvents), 1)
execEv := AllEvents[0].GetProcessExec()
Expand Down Expand Up @@ -503,7 +503,7 @@ func GrpcMissingExec[EXEC notify.Message, EXIT notify.Message](t *testing.T) {
AllEvents = append(AllEvents, e)
}

time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work

assert.Equal(t, len(AllEvents), 1)
ev := AllEvents[0]
Expand Down Expand Up @@ -659,7 +659,7 @@ func GrpcExecCloneOutOfOrder[EXEC notify.Message, CLONE notify.Message, EXIT not
AllEvents = append(AllEvents, e)
}

time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work

CheckCloneEvents(t, AllEvents, currentPid, clonePid)
}
Expand Down Expand Up @@ -793,8 +793,8 @@ func GrpcExecPodInfoInOrder[EXEC notify.Message, EXIT notify.Message](t *testing
AllEvents = append(AllEvents, e)
}

fakeWatcher.AddPod(dummyPod) // setup some dummy pod to return
time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
fakeWatcher.AddPod(dummyPod) // setup some dummy pod to return
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work
CheckPodEvents(t, AllEvents)
}

Expand Down Expand Up @@ -838,7 +838,7 @@ func GrpcExecPodInfoOutOfOrder[EXEC notify.Message, EXIT notify.Message](t *test
}

fakeWatcher.AddPod(dummyPod)
time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work
CheckPodEvents(t, AllEvents)
}

Expand Down Expand Up @@ -886,7 +886,7 @@ func GrpcExecPodInfoInOrderAfter[EXEC notify.Message, EXIT notify.Message](t *te
AllEvents = append(AllEvents, e)
}

time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work
CheckPodEvents(t, AllEvents)
}

Expand Down Expand Up @@ -933,7 +933,7 @@ func GrpcExecPodInfoOutOfOrderAfter[EXEC notify.Message, EXIT notify.Message](t
AllEvents = append(AllEvents, e)
}

time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work
CheckPodEvents(t, AllEvents)
}

Expand Down Expand Up @@ -983,7 +983,7 @@ func GrpcExecPodInfoDelayedOutOfOrder[EXEC notify.Message, EXIT notify.Message](

fakeWatcher.AddPod(dummyPod) // setup some dummy pod to return

time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work

CheckPodEvents(t, AllEvents)
}
Expand Down Expand Up @@ -1032,7 +1032,7 @@ func GrpcExecPodInfoDelayedInOrder[EXEC notify.Message, EXIT notify.Message](t *

fakeWatcher.AddPod(dummyPod) // setup some dummy pod to return

time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work

CheckPodEvents(t, AllEvents)
}
Expand Down Expand Up @@ -1079,7 +1079,7 @@ func GrpcDelayedExecK8sOutOfOrder[EXEC notify.Message, EXIT notify.Message](t *t
AllEvents = append(AllEvents, e)
}

time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work

CheckPodEvents(t, AllEvents)
}
5 changes: 2 additions & 3 deletions pkg/jsonchecker/jsonchecker.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,14 @@ import (
"github.com/cilium/tetragon/api/v1/tetragon"
ec "github.com/cilium/tetragon/api/v1/tetragon/codegen/eventchecker"
"github.com/cilium/tetragon/api/v1/tetragon/codegen/helpers"
"github.com/cilium/tetragon/pkg/eventcache"
"github.com/cilium/tetragon/pkg/logger"
"github.com/cilium/tetragon/pkg/testutils"
"github.com/sirupsen/logrus"
)

var (
const (
Retries = 13
RetryDelay = eventcache.EventRetryTimer + (1 * time.Second)
RetryDelay = 3 * time.Second
)

// DebugError is an error that will create a debug output message
Expand Down
9 changes: 9 additions & 0 deletions pkg/option/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"path/filepath"
"time"

"github.com/cilium/tetragon/pkg/defaults"
"github.com/cilium/tetragon/pkg/logger"
"github.com/cilium/tetragon/pkg/metrics"
"github.com/spf13/viper"
Expand Down Expand Up @@ -100,6 +101,9 @@ type config struct {

EnableCgIDmap bool
EnableCgIDmapDebug bool

EventCacheNumRetries int
EventCacheRetryDelay int
}

var (
Expand All @@ -117,6 +121,11 @@ var (

// Enable all metrics labels by default
MetricsLabelFilter: DefaultLabelFilter(),

// set default values for the event cache
// mainly used in the case of testing
EventCacheNumRetries: defaults.DefaultEventCacheNumRetries,
EventCacheRetryDelay: defaults.DefaultEventCacheRetryDelay,
}
)

Expand Down
10 changes: 10 additions & 0 deletions pkg/option/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,9 @@ const (

KeyEnableCgIDmap = "enable-cgidmap"
KeyEnableCgIDmapDebug = "enable-cgidmap-debug"

KeyEventCacheRetries = "event-cache-retries"
KeyEventCacheRetryDelay = "event-cache-retry-delay"
)

type UsernameMetadaCode int
Expand Down Expand Up @@ -238,6 +241,10 @@ func ReadAndSetFlags() error {

Config.EnableCgIDmap = viper.GetBool(KeyEnableCgIDmap)
Config.EnableCgIDmapDebug = viper.GetBool(KeyEnableCgIDmapDebug)

Config.EventCacheNumRetries = viper.GetInt(KeyEventCacheRetries)
Config.EventCacheRetryDelay = viper.GetInt(KeyEventCacheRetryDelay)

return nil
}

Expand Down Expand Up @@ -401,4 +408,7 @@ func AddFlags(flags *pflag.FlagSet) {

flags.Bool(KeyEnableCgIDmap, false, "enable pod resolution via cgroup ids")
flags.Bool(KeyEnableCgIDmapDebug, false, "enable cgidmap debugging info")

flags.Int(KeyEventCacheRetries, defaults.DefaultEventCacheNumRetries, "Number of retries for event cache")
flags.Int(KeyEventCacheRetryDelay, defaults.DefaultEventCacheRetryDelay, "Delay in seconds between event cache retries")
}
Loading