Skip to content

Commit

Permalink
Make EventCache configurable
Browse files Browse the repository at this point in the history
The configurable settings are the number of retries and the delay between them.

New command line arguments to do that:
      --event-cache-retries int                   Number of retries for event cache (default 15)
      --event-cache-retry-delay int               Delay in seconds between event cache retries (default 2)

Signed-off-by: Anastasios Papagiannis <[email protected]>
  • Loading branch information
tpapagian committed Sep 18, 2024
1 parent 7bdd83b commit 62957f1
Show file tree
Hide file tree
Showing 7 changed files with 51 additions and 27 deletions.
6 changes: 6 additions & 0 deletions docs/data/tetragon_flags.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions pkg/defaults/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ const (

// Pid file where to write tetragon main PID
DefaultPidFile = DefaultRunDir + "tetragon.pid"

// defaults for the event cache
DefaultEventCacheNumRetries = 15
DefaultEventCacheRetryDelay = 2
)

var (
Expand Down
20 changes: 8 additions & 12 deletions pkg/eventcache/eventcache.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/cilium/tetragon/api/v1/tetragon"
"github.com/cilium/tetragon/pkg/ktime"
"github.com/cilium/tetragon/pkg/logger"
"github.com/cilium/tetragon/pkg/option"
"github.com/cilium/tetragon/pkg/process"
"github.com/cilium/tetragon/pkg/reader/node"
Expand All @@ -23,13 +24,6 @@ const (
FROM_EV_CACHE
)

const (
// CacheStrikes is the retry limit for a cached event: handleEvents
// increments the event's counter on each failed attempt and re-queues the
// event only while that counter is below this value.
CacheStrikes = 15
// EventRetryTimer is the interval between passes over the cached events;
// New hands it to NewWithTimer as the timer duration.
EventRetryTimer = time.Second * 2
)

var (
cache *Cache
nodeName string
Expand Down Expand Up @@ -135,7 +129,7 @@ func (ec *Cache) handleEvents() {
}
if err != nil {
event.color++
if event.color < CacheStrikes {
if event.color < option.Config.EventCacheNumRetries {
tmp = append(tmp, event)
continue
}
Expand Down Expand Up @@ -169,9 +163,9 @@ func (ec *Cache) loop() {
for {
select {
case <-ticker.C:
/* Every 'EventRetryTimer' walk the slice of events pending pod info. If
* an event hasn't completed its podInfo after two iterations send the
* event anyways.
/* Every 'option.Config.EventCacheRetryDelay' seconds walk the slice of events
* pending pod info. If an event still hasn't completed its podInfo after
* 'option.Config.EventCacheNumRetries' attempts, send the event anyway.
*/
ec.handleEvents()

Expand Down Expand Up @@ -228,6 +222,8 @@ func NewWithTimer(s *server.Server, dur time.Duration) *Cache {
cache.done <- true
}

logger.GetLogger().WithField("retries", option.Config.EventCacheNumRetries).WithField("delay", dur).Info("Creating new EventCache")

cache = &Cache{
objsChan: make(chan CacheObj),
done: make(chan bool),
Expand All @@ -241,7 +237,7 @@ func NewWithTimer(s *server.Server, dur time.Duration) *Cache {
}

func New(s *server.Server) *Cache {
return NewWithTimer(s, EventRetryTimer)
return NewWithTimer(s, time.Second*time.Duration(option.Config.EventCacheRetryDelay))
}

func Get() *Cache {
Expand Down
24 changes: 12 additions & 12 deletions pkg/grpc/exec/exec_test_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,7 @@ func GrpcExecOutOfOrder[EXEC notify.Message, EXIT notify.Message](t *testing.T)
AllEvents = append(AllEvents, e)
}

time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work
CheckExecEvents(t, AllEvents, parentPid, currentPid)
}

Expand Down Expand Up @@ -474,7 +474,7 @@ func GrpcExecMisingParent[EXEC notify.Message, EXIT notify.Message](t *testing.T
AllEvents = append(AllEvents, e)
}

time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work

assert.Equal(t, len(AllEvents), 1)
execEv := AllEvents[0].GetProcessExec()
Expand Down Expand Up @@ -503,7 +503,7 @@ func GrpcMissingExec[EXEC notify.Message, EXIT notify.Message](t *testing.T) {
AllEvents = append(AllEvents, e)
}

time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work

assert.Equal(t, len(AllEvents), 1)
ev := AllEvents[0]
Expand Down Expand Up @@ -659,7 +659,7 @@ func GrpcExecCloneOutOfOrder[EXEC notify.Message, CLONE notify.Message, EXIT not
AllEvents = append(AllEvents, e)
}

time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work

CheckCloneEvents(t, AllEvents, currentPid, clonePid)
}
Expand Down Expand Up @@ -793,8 +793,8 @@ func GrpcExecPodInfoInOrder[EXEC notify.Message, EXIT notify.Message](t *testing
AllEvents = append(AllEvents, e)
}

fakeWatcher.AddPod(dummyPod) // setup some dummy pod to return
time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
fakeWatcher.AddPod(dummyPod) // setup some dummy pod to return
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work
CheckPodEvents(t, AllEvents)
}

Expand Down Expand Up @@ -838,7 +838,7 @@ func GrpcExecPodInfoOutOfOrder[EXEC notify.Message, EXIT notify.Message](t *test
}

fakeWatcher.AddPod(dummyPod)
time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work
CheckPodEvents(t, AllEvents)
}

Expand Down Expand Up @@ -886,7 +886,7 @@ func GrpcExecPodInfoInOrderAfter[EXEC notify.Message, EXIT notify.Message](t *te
AllEvents = append(AllEvents, e)
}

time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work
CheckPodEvents(t, AllEvents)
}

Expand Down Expand Up @@ -933,7 +933,7 @@ func GrpcExecPodInfoOutOfOrderAfter[EXEC notify.Message, EXIT notify.Message](t
AllEvents = append(AllEvents, e)
}

time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work
CheckPodEvents(t, AllEvents)
}

Expand Down Expand Up @@ -983,7 +983,7 @@ func GrpcExecPodInfoDelayedOutOfOrder[EXEC notify.Message, EXIT notify.Message](

fakeWatcher.AddPod(dummyPod) // setup some dummy pod to return

time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work

CheckPodEvents(t, AllEvents)
}
Expand Down Expand Up @@ -1032,7 +1032,7 @@ func GrpcExecPodInfoDelayedInOrder[EXEC notify.Message, EXIT notify.Message](t *

fakeWatcher.AddPod(dummyPod) // setup some dummy pod to return

time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work

CheckPodEvents(t, AllEvents)
}
Expand Down Expand Up @@ -1079,7 +1079,7 @@ func GrpcDelayedExecK8sOutOfOrder[EXEC notify.Message, EXIT notify.Message](t *t
AllEvents = append(AllEvents, e)
}

time.Sleep(time.Millisecond * ((eventcache.CacheStrikes + 4) * CacheTimerMs)) // wait for cache to do it's work
time.Sleep(time.Millisecond * time.Duration((option.Config.EventCacheNumRetries+4)*CacheTimerMs)) // wait for cache to do it's work

CheckPodEvents(t, AllEvents)
}
5 changes: 2 additions & 3 deletions pkg/jsonchecker/jsonchecker.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,14 @@ import (
"github.com/cilium/tetragon/api/v1/tetragon"
ec "github.com/cilium/tetragon/api/v1/tetragon/codegen/eventchecker"
"github.com/cilium/tetragon/api/v1/tetragon/codegen/helpers"
"github.com/cilium/tetragon/pkg/eventcache"
"github.com/cilium/tetragon/pkg/logger"
"github.com/cilium/tetragon/pkg/testutils"
"github.com/sirupsen/logrus"
)

var (
const (
Retries = 13
RetryDelay = eventcache.EventRetryTimer + (1 * time.Second)
RetryDelay = 3 * time.Second
)

// DebugError is an error that will create a debug output message
Expand Down
9 changes: 9 additions & 0 deletions pkg/option/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"path/filepath"
"time"

"github.com/cilium/tetragon/pkg/defaults"
"github.com/cilium/tetragon/pkg/logger"
"github.com/cilium/tetragon/pkg/metrics"
"github.com/spf13/viper"
Expand Down Expand Up @@ -100,6 +101,9 @@ type config struct {

EnableCgIDmap bool
EnableCgIDmapDebug bool

EventCacheNumRetries int
EventCacheRetryDelay int
}

var (
Expand All @@ -117,6 +121,11 @@ var (

// Enable all metrics labels by default
MetricsLabelFilter: DefaultLabelFilter(),

// set default values for the event cache
// mainly used in the case of testing
EventCacheNumRetries: defaults.DefaultEventCacheNumRetries,
EventCacheRetryDelay: defaults.DefaultEventCacheRetryDelay,
}
)

Expand Down
10 changes: 10 additions & 0 deletions pkg/option/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,9 @@ const (

KeyEnableCgIDmap = "enable-cgidmap"
KeyEnableCgIDmapDebug = "enable-cgidmap-debug"

KeyEventCacheRetries = "event-cache-retries"
KeyEventCacheRetryDelay = "event-cache-retry-delay"
)

type UsernameMetadaCode int
Expand Down Expand Up @@ -238,6 +241,10 @@ func ReadAndSetFlags() error {

Config.EnableCgIDmap = viper.GetBool(KeyEnableCgIDmap)
Config.EnableCgIDmapDebug = viper.GetBool(KeyEnableCgIDmapDebug)

Config.EventCacheNumRetries = viper.GetInt(KeyEventCacheRetries)
Config.EventCacheRetryDelay = viper.GetInt(KeyEventCacheRetryDelay)

return nil
}

Expand Down Expand Up @@ -401,4 +408,7 @@ func AddFlags(flags *pflag.FlagSet) {

flags.Bool(KeyEnableCgIDmap, false, "enable pod resolution via cgroup ids")
flags.Bool(KeyEnableCgIDmapDebug, false, "enable cgidmap debugging info")

flags.Int(KeyEventCacheRetries, defaults.DefaultEventCacheNumRetries, "Number of retries for event cache")
flags.Int(KeyEventCacheRetryDelay, defaults.DefaultEventCacheRetryDelay, "Delay in seconds between event cache retries")
}

0 comments on commit 62957f1

Please sign in to comment.