From 5ea36d1ddc6ade04ee4c0ed72cc526565af99868 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 6 Jan 2024 21:47:23 +0000 Subject: [PATCH] tetragon: Add missed stats to kprobemetrics package Adding metrics for missed runs at the program and link level to the kprobemetrics package, together with the logic to store and collect the missed stats. The missed stats are supported for all programs and kprobe/kprobe-multi links. They are stored per 'attach name' and 'policy name'. For programs (not just kprobes): tetragon_missed_prog_probes_total{attach="__x64_sys_linkat",policy="sys-linkat-passwd"} 68 tetragon_missed_prog_probes_total{attach="acct_process",policy="__base__"} 60 tetragon_missed_prog_probes_total{attach="sched/sched_process_exec",policy="__base__"} 64 tetragon_missed_prog_probes_total{attach="security_bprm_committing_creds",policy="__base__"} 66 tetragon_missed_prog_probes_total{attach="wake_up_new_task",policy="__base__"} 62 For kprobe and kprobe-multi links: tetragon_missed_link_probes_total{attach="__x64_sys_linkat",policy="sys-linkat-passwd"} 45 tetragon_missed_link_probes_total{attach="acct_process",policy="__base__"} 39 tetragon_missed_link_probes_total{attach="security_bprm_committing_creds",policy="__base__"} 43 tetragon_missed_link_probes_total{attach="wake_up_new_task",policy="__base__"} 41 tetragon_missed_prog_probes_total{attach="acct_process",policy="__base__"} 40 tetragon_missed_prog_probes_total{attach="kprobe_multi (1 functions)",policy="sys-linkat-passwd"} 48 tetragon_missed_prog_probes_total{attach="sched/sched_process_exec",policy="__base__"} 44 tetragon_missed_prog_probes_total{attach="security_bprm_committing_creds",policy="__base__"} 46 tetragon_missed_prog_probes_total{attach="wake_up_new_task",policy="__base__"} 42 Note that this changes the healthMetrics group to be created as not constrained, so that it can carry the new metrics. This will be addressed in the future by adding a debug metrics group. 
Signed-off-by: Jiri Olsa --- docs/content/en/docs/reference/metrics.md | 18 +++++ pkg/metrics/kprobemetrics/collector.go | 83 +++++++++++++++++++++++ pkg/metrics/kprobemetrics/missed.go | 29 ++++++++ pkg/metricsconfig/healthmetrics.go | 4 +- 4 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 pkg/metrics/kprobemetrics/collector.go create mode 100644 pkg/metrics/kprobemetrics/missed.go diff --git a/docs/content/en/docs/reference/metrics.md b/docs/content/en/docs/reference/metrics.md index ed089fa40c6..f9d49d54a09 100644 --- a/docs/content/en/docs/reference/metrics.md +++ b/docs/content/en/docs/reference/metrics.md @@ -181,6 +181,24 @@ The total number of Tetragon events per type that are failed to sent from the ke | ----- | ------ | | `msg_op` | `11, 13, 14, 15, 23, 24, 25, 26, 5, 7` | +### `tetragon_missed_link_probes_total` + +The total number of Tetragon probe missed by link. + +| label | values | +| ----- | ------ | +| `attach` | `sys_panic` | +| `policy` | `monitor_panic` | + +### `tetragon_missed_prog_probes_total` + +The total number of Tetragon probe missed by program. + +| label | values | +| ----- | ------ | +| `attach` | `sys_panic` | +| `policy` | `monitor_panic` | + ### `tetragon_msg_op_total` The total number of times we encounter a given message opcode. For internal use only. 
diff --git a/pkg/metrics/kprobemetrics/collector.go b/pkg/metrics/kprobemetrics/collector.go new file mode 100644 index 00000000000..487e844b88c --- /dev/null +++ b/pkg/metrics/kprobemetrics/collector.go @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright Authors of Tetragon + +package kprobemetrics + +import ( + "github.com/cilium/ebpf/link" + "github.com/cilium/tetragon/pkg/bpf" + "github.com/cilium/tetragon/pkg/metrics" + "github.com/cilium/tetragon/pkg/sensors" + "github.com/cilium/tetragon/pkg/sensors/program" + "github.com/prometheus/client_golang/prometheus" + "golang.org/x/sys/unix" +) +// NewBPFCollector returns a custom collector for the MissedLink and MissedProg metrics, fed by collect and collectForDocs. +func NewBPFCollector() prometheus.Collector { + return metrics.NewCustomCollector( + metrics.CustomMetrics{ + MissedLink, + MissedProg, + }, + collect, + collectForDocs, + ) +} +// collectLink sends the MissedLink sample for load; nothing is sent if load has no link, Link.Info fails, or the HasMissedStats* check for the link type is false. +func collectLink(ch chan<- prometheus.Metric, load *program.Program) { + if load.Link == nil { + return + } + + info, err := load.Link.Info() + if err != nil { + return + } + + missed := uint64(0) // stays 0 unless a kprobe perf-event or kprobe-multi link supplies a count + + switch info.Type { + case link.PerfEventType: + if !bpf.HasMissedStatsPerfEvent() { + return + } + pevent := info.PerfEvent() + switch pevent.Type { // only kprobe/kretprobe perf events are read; other types keep missed at 0 + case unix.BPF_PERF_EVENT_KPROBE, unix.BPF_PERF_EVENT_KRETPROBE: + kprobe := pevent.Kprobe() + missed, _ = kprobe.Missed() // NOTE(review): second result is discarded; confirm reporting 0 is intended in that case + } + case link.KprobeMultiType: + if !bpf.HasMissedStatsKprobeMulti() { + return + } + kmulti := info.KprobeMulti() + missed, _ = kmulti.Missed() // second result is discarded here as well + default: // other link types: no counter is read, the sample below carries 0 + } + + ch <- MissedLink.MustMetric(float64(missed), load.Policy, load.Attach) +} +// collectProg sends the MissedProg sample for load using the program info's RecursionMisses value; nothing is sent if Prog.Info fails. +func collectProg(ch chan<- prometheus.Metric, load *program.Program) { + info, err := load.Prog.Info() + if err != nil { + return + } + + missed, _ := info.RecursionMisses() // second result is discarded + ch <- MissedProg.MustMetric(float64(missed), load.Policy, load.Attach) +} +// collect sends the link and program missed samples for every program returned by sensors.AllPrograms. +func collect(ch chan<- prometheus.Metric) { + allPrograms := sensors.AllPrograms() + for _, prog := range allPrograms { + collectLink(ch, prog) + collectProg(ch, prog) + } +} +// collectForDocs sends one zero-valued sample per metric, labeled "monitor_panic" and "sys_panic". +func collectForDocs(ch chan<- prometheus.Metric) 
{ + ch <- MissedLink.MustMetric(0, "monitor_panic", "sys_panic") + ch <- MissedProg.MustMetric(0, "monitor_panic", "sys_panic") +} diff --git a/pkg/metrics/kprobemetrics/missed.go b/pkg/metrics/kprobemetrics/missed.go new file mode 100644 index 00000000000..a10d92e0d8a --- /dev/null +++ b/pkg/metrics/kprobemetrics/missed.go @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright Authors of Tetragon + +package kprobemetrics + +import ( + "github.com/cilium/tetragon/pkg/metrics" + "github.com/cilium/tetragon/pkg/metrics/consts" +) +// MissedLink and MissedProg are custom counters for probe runs missed by link and by program; both carry the "policy" and "attach" labels, in that order. +var ( + MissedLink = metrics.MustNewCustomCounter(metrics.NewOpts( + consts.MetricsNamespace, "", "missed_link_probes_total", + "The total number of Tetragon probe missed by link.", + nil, nil, []metrics.UnconstrainedLabel{ + metrics.UnconstrainedLabel{Name: "policy", ExampleValue: "monitor_panic"}, + metrics.UnconstrainedLabel{Name: "attach", ExampleValue: "sys_panic"}, + }, + )) + + MissedProg = metrics.MustNewCustomCounter(metrics.NewOpts( + consts.MetricsNamespace, "", "missed_prog_probes_total", + "The total number of Tetragon probe missed by program.", + nil, nil, []metrics.UnconstrainedLabel{ + metrics.UnconstrainedLabel{Name: "policy", ExampleValue: "monitor_panic"}, + metrics.UnconstrainedLabel{Name: "attach", ExampleValue: "sys_panic"}, + }, + )) +) diff --git a/pkg/metricsconfig/healthmetrics.go b/pkg/metricsconfig/healthmetrics.go index 9de0b4719b1..70182dc5fc4 100644 --- a/pkg/metricsconfig/healthmetrics.go +++ b/pkg/metricsconfig/healthmetrics.go @@ -36,7 +36,7 @@ var ( func GetHealthGroup() metrics.Group { healthMetricsOnce.Do(func() { - healthMetrics = metrics.NewMetricsGroup(true) + healthMetrics = metrics.NewMetricsGroup(false) }) return healthMetrics } @@ -102,4 +102,6 @@ func registerHealthMetrics(group metrics.Group) { group.MustRegister(policystatemetrics.NewPolicyStateCollector()) // gRPC metrics group.MustRegister(grpcmetrics.NewServerMetrics()) + // missed metrics (program and link level) + 
group.MustRegister(kprobemetrics.NewBPFCollector()) }