Skip to content

Commit

Permalink
chore: add perfbuf metric per event (METRICS=1)
Browse files Browse the repository at this point in the history
Enabled only when built with METRICS=1.

BPFPerfEventSubmitAttemptsCount and BPFPerfEventSubmitFailuresCount
count the number of events processed by the eBPF programs and written to
or attempted to be written to the perf buffer.

The counters are incremented right after the attempt to write the event to
the perf buffer, making it possible to measure whether that event was
successfully written to the perf buffer or not.

This metric can be used to monitor the performance of individual eBPF
events and to detect potential bottlenecks.
  • Loading branch information
geyslan committed Dec 3, 2024
1 parent 5fe36d1 commit 5a5c2e7
Show file tree
Hide file tree
Showing 10 changed files with 414 additions and 38 deletions.
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,10 @@ else
GO_DEBUG_FLAG = -w
endif

# METRICS=1 appends -DMETRICS to BPF_DEBUG_FLAG, which is passed to clang
# when building tracee.bpf.o.
ifeq ($(METRICS),1)
BPF_DEBUG_FLAG += -DMETRICS
endif

ifeq ($(UNAME_M),x86_64)
ARCH = x86_64
LINUX_ARCH = x86
Expand Down Expand Up @@ -423,6 +427,7 @@ $(OUTPUT_DIR)/tracee.bpf.o: \
$(TRACEE_EBPF_OBJ_HEADERS)
#
$(CMD_CLANG) \
$(BPF_DEBUG_FLAG) \
-D__TARGET_ARCH_$(LINUX_ARCH) \
-D__BPF_TRACING__ \
-DCORE \
Expand Down Expand Up @@ -501,6 +506,7 @@ $(OUTPUT_DIR)/tracee: \
-ldflags="$(GO_DEBUG_FLAG) \
-extldflags \"$(CGO_EXT_LDFLAGS_EBPF)\" \
-X github.com/aquasecurity/tracee/pkg/version.version=$(VERSION) \
-X github.com/aquasecurity/tracee/pkg/version.metrics=$(METRICS) \
" \
-v -o $@ \
./cmd/tracee
Expand Down
33 changes: 32 additions & 1 deletion pkg/ebpf/c/common/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,24 @@ statfunc int save_args_to_submit_buf(event_data_t *event, args_t *args)
return arg_num;
}

#ifdef METRICS
// Per-event perf buffer submission counters (only compiled when METRICS is
// defined).
struct event_stats_values {
    u64 attempts; // incremented after each bpf_perf_event_output() call for the event
    u64 failures; // incremented when that call returned a negative value
};

typedef struct event_stats_values event_stats_values_t;

// Hash map holding one event_stats_values_t per event id.
struct events_stats {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, MAX_EVENT_ID);
    __type(key, u32); // eventid
    __type(value, event_stats_values_t);
} events_stats SEC(".maps");

typedef struct events_stats events_stats_t;
#endif

statfunc int events_perf_submit(program_data_t *p, long ret)
{
p->event->context.retval = ret;
Expand All @@ -484,7 +502,20 @@ statfunc int events_perf_submit(program_data_t *p, long ret)
:
: [size] "r"(size), [max_size] "i"(MAX_EVENT_SIZE));

return bpf_perf_event_output(p->ctx, &events, BPF_F_CURRENT_CPU, p->event, size);
long perf_ret = bpf_perf_event_output(p->ctx, &events, BPF_F_CURRENT_CPU, p->event, size);

#ifdef METRICS
// update event stats
event_stats_values_t *evt_stat = bpf_map_lookup_elem(&events_stats, &p->event->context.eventid);
if (unlikely(evt_stat == NULL))
return perf_ret;

__sync_fetch_and_add(&evt_stat->attempts, 1);
if (perf_ret < 0)
__sync_fetch_and_add(&evt_stat->failures, 1);
#endif

return perf_ret;
}

statfunc int signal_perf_submit(void *ctx, controlplane_signal_t *sig)
Expand Down
18 changes: 17 additions & 1 deletion pkg/ebpf/c/tracee.bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -5513,7 +5513,23 @@ statfunc u32 cgroup_skb_submit(void *map, struct __sk_buff *ctx,
neteventctx->eventctx.eventid = event_type;

// Submit the event.
return bpf_perf_event_output(ctx, map, flags, neteventctx, sizeof_net_event_context_t());
long perf_ret = bpf_perf_event_output(ctx, map, flags, neteventctx, sizeof_net_event_context_t());

#ifdef METRICS
if (map != &events)
return perf_ret;

// update event stats
event_stats_values_t *evt_stat = bpf_map_lookup_elem(&events_stats, &neteventctx->eventctx.eventid);
if (unlikely(evt_stat == NULL))
return perf_ret;

__sync_fetch_and_add(&evt_stat->attempts, 1);
if (perf_ret < 0)
__sync_fetch_and_add(&evt_stat->failures, 1);
#endif

return perf_ret;
}

// Submit a network event.
Expand Down
79 changes: 79 additions & 0 deletions pkg/ebpf/perf_count.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
package ebpf

import (
"context"
"encoding/binary"
"time"
"unsafe"

"github.com/aquasecurity/tracee/pkg/events"
"github.com/aquasecurity/tracee/pkg/logger"
)

// eventStatsValues mirrors the C struct event_stats_values (event_stats_values_t).
// A pointer to it is handed to the events_stats BPF map as a raw value, so the
// field order and sizes must stay in sync with the C definition.
type eventStatsValues struct {
	submitAttempts uint64 // C field: attempts
	submitFailures uint64 // C field: failures
}

// countPerfEventSubmissions is a goroutine that periodically counts the
// number of attempts and failures to submit events to the perf buffer.
//
// It first writes a zeroed events_stats entry for every event returned by
// policyManager.EventsToSubmit(). Then, every 10 seconds until ctx is done,
// it walks the events_stats BPF map and copies each event's counters into
// the stats collectors and their Prometheus gauges, and logs the result.
// Failures are logged; a failure on a single entry skips only that entry.
func (t *Tracee) countPerfEventSubmissions(ctx context.Context) {
	// Byte layout of one events_stats value: two consecutive u64 counters
	// (attempts, then failures).
	const (
		attemptsOffset = 0
		failuresOffset = 8
		statsValueSize = 16
	)

	logger.Debugw("Starting countPerfEventSubmissions goroutine")
	defer logger.Debugw("Stopped countPerfEventSubmissions goroutine")

	evtsCountsBPFMap, err := t.bpfModule.GetMap("events_stats")
	if err != nil {
		logger.Errorw("Failed to get events_stats map", "error", err)
		return
	}

	// Create a zeroed entry for each event that may be submitted.
	evtStatZero := eventStatsValues{}
	for _, id := range t.policyManager.EventsToSubmit() {
		key := uint32(id)
		if err := evtsCountsBPFMap.Update(unsafe.Pointer(&key), unsafe.Pointer(&evtStatZero)); err != nil {
			logger.Errorw("Failed to update events_stats map", "error", err)
		}
	}

	ticker := time.NewTicker(10 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			// Start from a clean slate so the collectors only hold what the
			// map reports on this tick.
			t.stats.BPFPerfEventSubmitAttemptsCount.Reset()
			t.stats.BPFPerfEventSubmitFailuresCount.Reset()

			// Get the counts of each event from the BPF map.
			// NOTE(review): keys and values are decoded as little-endian;
			// confirm that all supported hosts are little-endian.
			iter := evtsCountsBPFMap.Iterator()
			for iter.Next() {
				key := binary.LittleEndian.Uint32(iter.Key())
				value, err := evtsCountsBPFMap.GetValue(unsafe.Pointer(&key))
				if err != nil {
					logger.Errorw("Failed to get value from events_stats map", "error", err)
					continue
				}
				// Guard the slicing below: a short value would panic.
				if len(value) < statsValueSize {
					logger.Errorw(
						"Unexpected events_stats value size",
						"event_id", key,
						"size", len(value),
						"expected", statsValueSize,
					)
					continue
				}

				// Get counts
				id := events.ID(key)
				attempts := binary.LittleEndian.Uint64(value[attemptsOffset:failuresOffset])
				failures := binary.LittleEndian.Uint64(value[failuresOffset:statsValueSize])
				t.stats.BPFPerfEventSubmitAttemptsCount.Set(id, attempts)
				t.stats.BPFPerfEventSubmitFailuresCount.Set(id, failures)

				// Update Prometheus metrics for current event
				evtName := events.Core.GetDefinitionByID(id).GetName()
				t.stats.BPFPerfEventSubmitAttemptsCount.GaugeVec().WithLabelValues(evtName).Set(float64(attempts))
				t.stats.BPFPerfEventSubmitFailuresCount.GaugeVec().WithLabelValues(evtName).Set(float64(failures))
			}
			// Next() returning false can mean either "done" or "failed";
			// surface the failure instead of silently reporting partial data.
			if err := iter.Err(); err != nil {
				logger.Errorw("Failed to iterate events_stats map", "error", err)
			}

			// Log the counts
			t.stats.BPFPerfEventSubmitAttemptsCount.Log()
			t.stats.BPFPerfEventSubmitFailuresCount.Log()
		}
	}
}
12 changes: 10 additions & 2 deletions pkg/ebpf/tracee.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ import (
"github.com/aquasecurity/tracee/pkg/utils/environment"
"github.com/aquasecurity/tracee/pkg/utils/proc"
"github.com/aquasecurity/tracee/pkg/utils/sharedobjs"
"github.com/aquasecurity/tracee/pkg/version"
"github.com/aquasecurity/tracee/types/trace"
)

Expand All @@ -63,7 +64,7 @@ type Tracee struct {
running atomic.Bool
done chan struct{} // signal to safely stop end-stage processing
OutDir *os.File // use utils.XXX functions to create or write to this file
stats metrics.Stats
stats *metrics.Stats
sigEngine *engine.Engine
// Events
eventsSorter *sorting.EventsChronologicalSorter
Expand Down Expand Up @@ -129,7 +130,7 @@ type Tracee struct {
}

// Stats returns the metrics.Stats pointer held by this Tracee instance.
// The same pointer is returned on every call, so callers share it with Tracee.
func (t *Tracee) Stats() *metrics.Stats {
	return t.stats
}

func (t *Tracee) Engine() *engine.Engine {
Expand Down Expand Up @@ -225,6 +226,7 @@ func New(cfg config.Config) (*Tracee, error) {
t := &Tracee{
config: cfg,
done: make(chan struct{}),
stats: metrics.NewStats(),
writtenFiles: make(map[string]string),
readFiles: make(map[string]string),
capturedFiles: make(map[string]int64),
Expand Down Expand Up @@ -1382,6 +1384,12 @@ func (t *Tracee) Run(ctx gocontext.Context) error {
t.controlPlane.Start()
go t.controlPlane.Run(ctx)

// Measure event perf buffer write attempts (METRICS build only)

if version.MetricsBuild() {
go t.countPerfEventSubmissions(ctx)
}

// Main event loop (polling events perf buffer)

t.eventsPerfMap.Poll(pollTimeout)
Expand Down
85 changes: 85 additions & 0 deletions pkg/metrics/collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package metrics

import (
"maps"
"sync"

"github.com/prometheus/client_golang/prometheus"

"github.com/aquasecurity/tracee/pkg/counter"
"github.com/aquasecurity/tracee/pkg/logger"
)

// Collector is a mutex-guarded map of uint64 values keyed by K, bundled with
// a description string and a Prometheus GaugeVec.
type Collector[K comparable] struct {
	m           sync.RWMutex         // taken by every method before touching the fields below
	description string               // set by NewCollector, returned by Description
	values      map[K]uint64         // current value per key
	gaugeVec    *prometheus.GaugeVec // set by NewCollector, returned by GaugeVec
}

// NewCollector returns an empty Collector that carries the given description
// and Prometheus GaugeVec. The mutex is left at its zero value, which is
// ready to use.
func NewCollector[K comparable](description string, gv *prometheus.GaugeVec) *Collector[K] {
	collector := &Collector[K]{
		description: description,
		gaugeVec:    gv,
		values:      map[K]uint64{},
	}

	return collector
}

// Get returns the value stored for k and whether k was present. When k is
// absent it returns (0, false).
func (c *Collector[K]) Get(k K) (uint64, bool) {
	c.m.RLock()
	defer c.m.RUnlock()

	if value, found := c.values[k]; found {
		return value, true
	}

	return 0, false
}

// Set stores v as the value for k, replacing any previous value.
func (c *Collector[K]) Set(k K, v uint64) {
	c.m.Lock()
	defer c.m.Unlock()

	c.values[k] = v
}

// Total returns the sum of all stored values, accumulated in a
// counter.Counter. If an increment fails, the error is logged and the
// summation continues with the remaining values.
func (c *Collector[K]) Total() uint64 {
	c.m.RLock()
	defer c.m.RUnlock()

	sum := counter.NewCounter(0)
	for _, value := range c.values {
		if err := sum.Increment(value); err != nil {
			logger.Errorw("Failed to increment total counter", "error", err)
		}
	}

	return sum.Get()
}

// Reset discards all stored values by replacing the map with a new empty one.
func (c *Collector[K]) Reset() {
	c.m.Lock()
	defer c.m.Unlock()

	c.values = make(map[K]uint64)
}

// Description returns the description string given to NewCollector.
func (c *Collector[K]) Description() string {
	c.m.RLock()
	defer c.m.RUnlock()

	return c.description
}

// GaugeVec returns the Prometheus GaugeVec given to NewCollector.
func (c *Collector[K]) GaugeVec() *prometheus.GaugeVec {
	c.m.RLock()
	defer c.m.RUnlock()

	return c.gaugeVec
}

// Values returns a copy (maps.Clone) of the stored values, so the caller can
// read or modify the result without holding the collector's lock.
func (c *Collector[K]) Values() map[K]uint64 {
	c.m.RLock()
	defer c.m.RUnlock()

	return maps.Clone(c.values)
}
74 changes: 74 additions & 0 deletions pkg/metrics/event_collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package metrics

import (
"github.com/prometheus/client_golang/prometheus"

"github.com/aquasecurity/tracee/pkg/counter"
"github.com/aquasecurity/tracee/pkg/events"
"github.com/aquasecurity/tracee/pkg/logger"
)

// EventCollector is a Collector keyed by events.ID.
type EventCollector struct {
	c *Collector[events.ID] // underlying collector; all methods delegate to it
}

// NewEventCollector returns an EventCollector backed by a new, empty
// Collector[events.ID] created with the given description and GaugeVec.
func NewEventCollector(description string, gv *prometheus.GaugeVec) *EventCollector {
	inner := NewCollector[events.ID](description, gv)

	return &EventCollector{c: inner}
}

// Get returns the value stored for the given event ID. If no value is stored
// for id, an error is logged and the value reported by the underlying
// collector for a missing key is returned.
func (ec *EventCollector) Get(id events.ID) uint64 {
	value, found := ec.c.Get(id)
	if found {
		return value
	}

	logger.Errorw("Failed to get value from event collector", "event_id", id)
	return value
}

// Set stores v as the value for the given event ID.
func (ec *EventCollector) Set(id events.ID, v uint64) {
	ec.c.Set(id, v)
}

// Total returns the underlying collector's Total().
func (ec *EventCollector) Total() uint64 {
	return ec.c.Total()
}

// Reset calls Reset() on the underlying collector.
func (ec *EventCollector) Reset() {
	ec.c.Reset()
}

// Description returns the underlying collector's description.
func (ec *EventCollector) Description() string {
	return ec.c.Description()
}

// GaugeVec returns the underlying collector's Prometheus GaugeVec.
func (ec *EventCollector) GaugeVec() *prometheus.GaugeVec {
	return ec.c.GaugeVec()
}

// Values returns the underlying collector's Values(), keyed by event ID.
func (ec *EventCollector) Values() map[events.ID]uint64 {
	return ec.c.Values()
}

// Log writes one info-level entry whose message is the collector's
// description and whose key/value pairs are each event's name and value,
// followed by a "total" pair holding the sum of all values.
//
// It works on the map returned by Values() rather than on the collector's
// own state. Pairs are appended in Go map iteration order, so their order is
// not stable between calls. If adding a value to the total fails, the error
// is logged and the remaining values are still processed.
func (ec *EventCollector) Log() {
	values := ec.c.Values()
	description := ec.c.Description()

	// Two slots per event (name, value) plus two for the trailing "total"
	// pair, so the appends below never need to grow the slice.
	keyVals := make([]any, 0, len(values)*2+2)
	total := counter.NewCounter(0)
	for id, v := range values {
		keyVals = append(keyVals,
			events.Core.GetDefinitionByID(id).GetName(),
			v,
		)

		if err := total.Increment(v); err != nil {
			logger.Errorw("Failed to increment total counter", "error", err)
		}
	}

	// Log the counts
	keyVals = append(keyVals, "total", total.Get())
	logger.Infow(description, keyVals...)
}
Loading

0 comments on commit 5a5c2e7

Please sign in to comment.