Skip to content

Commit

Permalink
Add a metric to provide per-event missed events
Browse files Browse the repository at this point in the history
Example:
$ curl localhost:2112/metrics 2> /dev/null | grep 'missed_events_total\|ringbuf_perf_event_lost_total\|ringbuf_queue_lost_total\|msg_op_total\|ringbuf_queue_received_total'
tetragon_missed_events_total{event="clone_sent"} 323
tetragon_missed_events_total{event="data_failed"} 927
tetragon_missed_events_total{event="data_sent"} 616
tetragon_missed_events_total{event="exec_sent"} 323
tetragon_missed_events_total{event="exit_sent"} 321
tetragon_missed_events_total{event="kprobe_sent"} 52
tetragon_missed_events_total{event="total_failed"} 927
tetragon_missed_events_total{event="total_sent"} 1635
tetragon_msg_op_total{msg_op="13"} 52
tetragon_msg_op_total{msg_op="23"} 323
tetragon_msg_op_total{msg_op="24"} 616
tetragon_msg_op_total{msg_op="5"} 323
tetragon_msg_op_total{msg_op="7"} 321
tetragon_ringbuf_perf_event_lost_total 927
tetragon_ringbuf_queue_lost_total 0
tetragon_ringbuf_queue_received_total 1635

This PR adds an eBPF map collector for getting metrics directly from a
map. The map holds counters derived from the return value of every
perf_event_output call (i.e. whether the call succeeded or failed). This
gives us the ability to determine missed events per type. The metric
tetragon_missed_events_total contains this information.

Using the previous example, we can see that, from the user-space
perspective, we lost 927 events (tetragon_ringbuf_perf_event_lost_total).
This is the same as tetragon_missed_events_total{event="total_failed"}
gathered from the kernel. All of these missed events are data events
(tetragon_missed_events_total{event="data_failed"}).

The total number of events received from the user-space perspective is
tetragon_ringbuf_queue_received_total, while from the kernel perspective
it is tetragon_missed_events_total{event="total_sent"}.

As we have seen cases where
tetragon_missed_events_total{event="total_failed"} is not the same as
tetragon_ringbuf_perf_event_lost_total, we also provide the number of
per-type events that were sent successfully (and those that were not).

Signed-off-by: Anastasios Papagiannis <[email protected]>
  • Loading branch information
tpapagian committed Oct 30, 2023
1 parent cd215f2 commit 1c5ca91
Show file tree
Hide file tree
Showing 16 changed files with 174 additions and 12 deletions.
1 change: 1 addition & 0 deletions bpf/alignchecker/bpf_alignchecker.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ struct execve_map_value _execve_map_value;
struct event_config _event_config;
struct tetragon_conf _tetragon_conf;
struct cgroup_tracking_value _cgroup_tracking_value;
struct kernel_stats _kernel_stats;
2 changes: 1 addition & 1 deletion bpf/cgroup/bpf_cgroup_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ send_cgrp_event(struct bpf_raw_tracepoint_args *ctx,
memcpy(&msg->cgrp_data.name, &cgrp_track->name, KN_NAME_LENGTH);
probe_read_str(&msg->path, PATH_MAP_SIZE - 1, path);

perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, msg, size);
PERF_METRICS(METRIC_CGROUP, perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, msg, size));

return 0;
}
Expand Down
37 changes: 37 additions & 0 deletions bpf/lib/process.h
Original file line number Diff line number Diff line change
Expand Up @@ -535,4 +535,41 @@ execve_joined_info_map_get(__u64 tid)

_Static_assert(sizeof(struct execve_map_value) % 8 == 0,
"struct execve_map_value should have size multiple of 8 bytes");

/*
 * Event-type indexes into the sent[] and sent_failed[] arrays of
 * struct kernel_stats. The values mirror the Metric* constants declared in
 * pkg/api/processapi/processapi.go and must be kept in sync with them.
 * METRIC_MAX_VALUES is the number of event types, i.e. the array length.
 */
#define METRIC_EXEC 0
#define METRIC_CLONE 1
#define METRIC_EXIT 2
#define METRIC_DATA 3
#define METRIC_CGROUP 4
#define METRIC_LOADER 5
#define METRIC_TRACEPOINT 6
#define METRIC_KPROBE 7
#define METRIC_UPROBE 8
/* Misspelled legacy name, kept as an alias so existing users keep compiling. */
#define METRIC_UPORBE METRIC_UPROBE
#define METRIC_MAX_VALUES 9

/*
 * Per-event-type counters of perf_event_output() outcomes, indexed by the
 * METRIC_* values. PERF_METRICS() increments sent[i] when the call returned
 * a value >= 0 and sent_failed[i] otherwise.
 */
struct kernel_stats {
__u64 sent[METRIC_MAX_VALUES];
__u64 sent_failed[METRIC_MAX_VALUES];
};

/*
 * Per-CPU array with a single entry (key 0) holding one struct kernel_stats
 * per CPU. It is updated through PERF_METRICS().
 */
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__type(key, __u32);
__type(value, struct kernel_stats);
__uint(max_entries, 1);
} tg_stats_map SEC(".maps");

/*
 * PERF_METRICS(__x, __v) - record the outcome of sending an event.
 *
 * __x: METRIC_* index of the event type being accounted.
 * __v: expression yielding the result to record, typically the
 *      perf_event_output() call itself or its saved return value. It is
 *      evaluated exactly once.
 *
 * Looks up entry 0 of tg_stats_map and, if found, increments sent[__x] when
 * the result is >= 0 and sent_failed[__x] otherwise. Nothing is counted if
 * the lookup fails.
 *
 * Note: the macro declares a local named retval, so __v must not itself
 * refer to a caller variable with that name.
 */
#define PERF_METRICS(__x, __v) \
do { \
long retval = (__v); \
__u32 key = 0; \
struct kernel_stats *valp = map_lookup_elem(&tg_stats_map, &key); \
if (valp) { \
if (retval >= 0) \
__sync_fetch_and_add(&valp->sent[__x], 1); \
else \
__sync_fetch_and_add(&valp->sent_failed[__x], 1); \
} \
} while (0)

#endif //_PROCESS__
2 changes: 1 addition & 1 deletion bpf/process/bpf_execve_event.c
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,6 @@ execve_send(struct sched_execve_args *ctx)
sizeof(struct msg_capabilities) +
sizeof(struct msg_cred_minimal) + sizeof(struct msg_ns) +
sizeof(struct msg_execve_key) + p->size);
perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, event, size);
PERF_METRICS(METRIC_EXEC, perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, event, size));
return 0;
}
3 changes: 1 addition & 2 deletions bpf/process/bpf_exit.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,7 @@ static inline __attribute__((always_inline)) void event_exit_send(void *ctx, __u
probe_read(&exit->info.code, sizeof(exit->info.code),
_(&task->exit_code));

perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, exit,
size);
PERF_METRICS(METRIC_EXIT, perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, exit, size));
}
execve_map_delete(tgid);
}
Expand Down
3 changes: 1 addition & 2 deletions bpf/process/bpf_fork.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,7 @@ BPF_KPROBE(event_wake_up_new_task, struct task_struct *task)
/* Last: set any encountered error when setting cgroup info */
msg.flags |= error_flags;

perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, &msg,
size);
PERF_METRICS(METRIC_CLONE, perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, &msg, size));
}
return 0;
}
2 changes: 1 addition & 1 deletion bpf/process/bpf_generic_retkprobe.c
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,6 @@ BPF_KRETPROBE(generic_retkprobe_event, unsigned long ret)
: [total] "+r"(total)
:);
e->common.size = total;
perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, e, total);
PERF_METRICS(METRIC_KPROBE, perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, e, total));
return 0;
}
2 changes: 1 addition & 1 deletion bpf/process/bpf_loader.c
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,6 @@ loader_kprobe(struct pt_regs *ctx)
msg->common.op = MSG_OP_LOADER;
msg->common.flags = 0;

perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, msg, total);
PERF_METRICS(METRIC_LOADER, perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, msg, total));
return 0;
}
6 changes: 3 additions & 3 deletions bpf/process/data_event.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ __do_bytes(void *ctx, struct msg_data *msg, unsigned long uptr, size_t bytes)
return err;

msg->common.size = offsetof(struct msg_data, arg) + bytes;
perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, msg,
msg->common.size);
PERF_METRICS(METRIC_DATA, perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, msg,
msg->common.size));
return bytes;
b:
return -1;
Expand Down Expand Up @@ -106,7 +106,7 @@ __do_str(void *ctx, struct msg_data *msg, unsigned long arg, bool *done)
: [size] "+r"(size)
:);
msg->common.size = size;
perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, msg, size);
PERF_METRICS(METRIC_DATA, perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, msg, size));
return ret;
}

Expand Down
14 changes: 13 additions & 1 deletion bpf/process/types/basic.h
Original file line number Diff line number Diff line change
Expand Up @@ -2188,6 +2188,7 @@ generic_output(void *ctx, struct bpf_map_def *heap)
struct msg_generic_kprobe *e;
int zero = 0;
size_t total;
long ret;

e = map_lookup_elem(heap, &zero);
if (!e)
Expand Down Expand Up @@ -2226,7 +2227,18 @@ generic_output(void *ctx, struct bpf_map_def *heap)
:
: [total] "+r"(total)
:);
perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, e, total);
ret = perf_event_output(ctx, &tcpmon_map, BPF_F_CURRENT_CPU, e, total);
switch (e->common.op) {
case MSG_OP_GENERIC_TRACEPOINT:
PERF_METRICS(METRIC_TRACEPOINT, ret);
break;
case MSG_OP_GENERIC_KPROBE:
PERF_METRICS(METRIC_KPROBE, ret);
break;
case MSG_OP_GENERIC_UPROBE:
PERF_METRICS(METRIC_UPORBE, ret);
break;
}
return 1;
}

Expand Down
3 changes: 3 additions & 0 deletions pkg/alignchecker/alignchecker.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ func CheckStructAlignments(pathToObj string) error {

// cgroup
"cgroup_tracking_value": {cgrouptrackmap.CgrpTrackingValue{}},

// metrics
"kernel_stats": {processapi.KernelStats{}},
}

return alignchecker.CheckStructAlignments(pathToObj, alignments, true)
Expand Down
18 changes: 18 additions & 0 deletions pkg/api/processapi/processapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,3 +206,21 @@ type MsgCgroupEvent struct {
CgrpData MsgCgroupData `align:"cgrp_data"` // Complementary cgroup data
Path [CGROUP_PATH_LENGTH]byte `align:"path"` // Full path of the cgroup on fs
}

// Event-type indexes into the Sent and SentFailed arrays of KernelStats.
// The order mirrors the METRIC_* defines in bpf/lib/process.h and must be
// kept in sync with them; MetricMaxValues is the number of event types.
const (
	MetricExec       = iota // METRIC_EXEC
	MetricClone             // METRIC_CLONE
	MetricExit              // METRIC_EXIT
	MetricData              // METRIC_DATA
	MetricCgroup            // METRIC_CGROUP
	MetricLoader            // METRIC_LOADER
	MetricTracepoint        // METRIC_TRACEPOINT
	MetricKprobe            // METRIC_KPROBE
	MetricUprobe            // uprobe events
	MetricMaxValues         // METRIC_MAX_VALUES
)

// KernelStats is the Go view of the BPF-side struct kernel_stats; the align
// tags name the matching C fields and the layout is verified by
// pkg/alignchecker. Both arrays are indexed by the Metric* constants.
type KernelStats struct {
// Sent counts, per event type, the events recorded as sent.
Sent [MetricMaxValues]uint64 `align:"sent"`
// SentFailed counts, per event type, the events recorded as failed.
SentFailed [MetricMaxValues]uint64 `align:"sent_failed"`
}
84 changes: 84 additions & 0 deletions pkg/bpfmetrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Tetragon
package bpfmetrics

import (
"fmt"
"path/filepath"

"github.com/cilium/ebpf"
"github.com/cilium/tetragon/pkg/api/processapi"
"github.com/cilium/tetragon/pkg/metrics/eventmetrics"
"github.com/cilium/tetragon/pkg/option"
"github.com/prometheus/client_golang/prometheus"
)

// metrics maps each processapi.Metric* index to the event name that Collect
// uses as the prefix of the "event" label value ("<name>_sent" and
// "<name>_failed").
var metrics = map[int]string{
processapi.MetricExec: "exec",
processapi.MetricClone: "clone",
processapi.MetricExit: "exit",
processapi.MetricData: "data",
processapi.MetricCgroup: "cgroup",
processapi.MetricLoader: "loader",
processapi.MetricTracepoint: "tracepoint",
processapi.MetricKprobe: "kprobe",
processapi.MetricUprobe: "uprobe",
}

// bpfCollector is a stateless prometheus.Collector whose metrics are read
// directly from BPF maps each time it is collected.
type bpfCollector struct{}

// NewBPFCollector returns a collector that reports the per-event-type
// counters stored in the tg_stats_map BPF map.
func NewBPFCollector() prometheus.Collector {
	return new(bpfCollector)
}

// Describe implements prometheus.Collector. It sends the descriptor of the
// only metric this collector emits, eventmetrics.MissedEvents.
func (c *bpfCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- eventmetrics.MissedEvents.Desc()
}

// Collect implements prometheus.Collector. It opens the pinned tg_stats_map,
// reads the per-CPU KernelStats stored under key 0, adds the per-CPU values
// together and emits one eventmetrics.MissedEvents sample for every non-zero
// counter, labelled "<event>_sent" or "<event>_failed". After the per-event
// samples it emits the grand totals as "total_sent" and "total_failed", again
// only when non-zero.
//
// Collection is best effort: if the map cannot be opened or read, nothing is
// emitted and no error is reported.
func (c *bpfCollector) Collect(ch chan<- prometheus.Metric) {
	pinPath := filepath.Join(option.Config.MapDir, "tg_stats_map")
	statsMap, err := ebpf.LoadPinnedMap(pinPath, nil)
	if err != nil {
		return
	}
	defer statsMap.Close()

	var (
		key    uint32
		perCPU []processapi.KernelStats
	)
	if err := statsMap.Lookup(key, &perCPU); err != nil {
		return
	}

	// Fold the per-CPU copies into a single set of counters.
	var sent, failed [processapi.MetricMaxValues]uint64
	for _, cpu := range perCPU {
		for i := range sent {
			sent[i] += cpu.Sent[i]
			failed[i] += cpu.SentFailed[i]
		}
	}

	// Emit the per-event samples while accumulating the grand totals.
	var totalSent, totalFailed uint64
	for i := range sent {
		totalSent += sent[i]
		totalFailed += failed[i]

		if sent[i] > 0 {
			ch <- eventmetrics.MissedEvents.MustMetric(float64(sent[i]), fmt.Sprintf("%s_sent", metrics[i]))
		}
		if failed[i] > 0 {
			ch <- eventmetrics.MissedEvents.MustMetric(float64(failed[i]), fmt.Sprintf("%s_failed", metrics[i]))
		}
	}

	if totalSent > 0 {
		ch <- eventmetrics.MissedEvents.MustMetric(float64(totalSent), "total_sent")
	}
	if totalFailed > 0 {
		ch <- eventmetrics.MissedEvents.MustMetric(float64(totalFailed), "total_failed")
	}
}
5 changes: 5 additions & 0 deletions pkg/metrics/eventmetrics/eventmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ var (
Help: "The total number of Tetragon events",
ConstLabels: nil,
}, []string{"type"})
MissedEvents = metrics.NewBPFCounter(prometheus.NewDesc(
prometheus.BuildFQName(consts.MetricsNamespace, "", "missed_events_total"),
"The total number of missed Tetragon events per type.",
[]string{"event"}, nil,
))
FlagCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: consts.MetricsNamespace,
Name: "flags_total",
Expand Down
2 changes: 2 additions & 0 deletions pkg/metrics/metricsconfig/initmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package metricsconfig

import (
"github.com/cilium/tetragon/pkg/bpfmetrics"
"github.com/cilium/tetragon/pkg/eventcache"
"github.com/cilium/tetragon/pkg/grpc/tracing"
"github.com/cilium/tetragon/pkg/metrics/errormetrics"
Expand Down Expand Up @@ -49,6 +50,7 @@ func InitAllMetrics(registry *prometheus.Registry) {
eventcache.NewBPFCollector(),
observer.NewBPFCollector(),
process.NewBPFCollector(),
bpfmetrics.NewBPFCollector(),
))

// register common third-party collectors
Expand Down
2 changes: 2 additions & 0 deletions pkg/sensors/base/base.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ var (
/* Internal statistics for debugging */
ExecveStats = program.MapBuilder("execve_map_stats", Execve)
ExecveJoinMapStats = program.MapBuilder("tg_execve_joined_info_map_stats", ExecveBprmCommit)
StatsMap = program.MapBuilder("tg_stats_map", Execve)

sensor = sensors.Sensor{
Name: "__base__",
Expand Down Expand Up @@ -99,6 +100,7 @@ func GetDefaultMaps() []*program.Map {
NamesMap,
TCPMonMap,
TetragonConfMap,
StatsMap,
}
return maps

Expand Down

0 comments on commit 1c5ca91

Please sign in to comment.