diff --git a/docs/content/en/docs/reference/metrics.md b/docs/content/en/docs/reference/metrics.md
index ed089fa40c6..f9d49d54a09 100644
--- a/docs/content/en/docs/reference/metrics.md
+++ b/docs/content/en/docs/reference/metrics.md
@@ -181,6 +181,24 @@ The total number of Tetragon events per type that are failed to sent from the ke
 | ----- | ------ |
 | `msg_op` | `11, 13, 14, 15, 23, 24, 25, 26, 5, 7` |
 
+### `tetragon_missed_link_probes_total`
+
+The total number of Tetragon probe missed by link.
+
+| label | values |
+| ----- | ------ |
+| `attach` | `sys_panic` |
+| `policy` | `monitor_panic` |
+
+### `tetragon_missed_prog_probes_total`
+
+The total number of Tetragon probe missed by program.
+
+| label | values |
+| ----- | ------ |
+| `attach` | `sys_panic` |
+| `policy` | `monitor_panic` |
+
 ### `tetragon_msg_op_total`
 
 The total number of times we encounter a given message opcode. For internal use only.
diff --git a/pkg/metrics/kprobemetrics/collector.go b/pkg/metrics/kprobemetrics/collector.go
new file mode 100644
index 00000000000..487e844b88c
--- /dev/null
+++ b/pkg/metrics/kprobemetrics/collector.go
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright Authors of Tetragon
+
+package kprobemetrics
+
+import (
+	"github.com/cilium/ebpf/link"
+	"github.com/cilium/tetragon/pkg/bpf"
+	"github.com/cilium/tetragon/pkg/metrics"
+	"github.com/cilium/tetragon/pkg/sensors"
+	"github.com/cilium/tetragon/pkg/sensors/program"
+	"github.com/prometheus/client_golang/prometheus"
+	"golang.org/x/sys/unix"
+)
+
+// NewBPFCollector returns a collector for the MissedLink and MissedProg
+// metrics, gathered from every program returned by sensors.AllPrograms.
+func NewBPFCollector() prometheus.Collector {
+	return metrics.NewCustomCollector(
+		metrics.CustomMetrics{
+			MissedLink,
+			MissedProg,
+		},
+		collect,
+		collectForDocs,
+	)
+}
+
+// collectLink emits the missed counter read from the program's link. Nothing
+// is emitted when the program has no link, when the link info cannot be read,
+// or when bpf.HasMissedStatsPerfEvent / bpf.HasMissedStatsKprobeMulti returns
+// false for the matching link type.
+func collectLink(ch chan<- prometheus.Metric, load *program.Program) {
+	if load.Link == nil {
+		return
+	}
+
+	info, err := load.Link.Info()
+	if err != nil {
+		return
+	}
+
+	// Stays zero unless one of the cases below assigns it.
+	missed := uint64(0)
+
+	switch info.Type {
+	case link.PerfEventType:
+		if !bpf.HasMissedStatsPerfEvent() {
+			return
+		}
+		pevent := info.PerfEvent()
+		switch pevent.Type {
+		case unix.BPF_PERF_EVENT_KPROBE, unix.BPF_PERF_EVENT_KRETPROBE:
+			kprobe := pevent.Kprobe()
+			missed, _ = kprobe.Missed()
+		}
+	case link.KprobeMultiType:
+		if !bpf.HasMissedStatsKprobeMulti() {
+			return
+		}
+		kmulti := info.KprobeMulti()
+		missed, _ = kmulti.Missed()
+	default:
+	}
+
+	ch <- MissedLink.MustMetric(float64(missed), load.Policy, load.Attach)
+}
+
+// collectProg emits the recursion-miss counter read from the program info.
+// Nothing is emitted when the program handle is nil or its info cannot be
+// read.
+func collectProg(ch chan<- prometheus.Metric, load *program.Program) {
+	// Mirror the nil guard in collectLink so that Info is never called on
+	// a nil program.
+	if load.Prog == nil {
+		return
+	}
+
+	info, err := load.Prog.Info()
+	if err != nil {
+		return
+	}
+
+	missed, _ := info.RecursionMisses()
+	ch <- MissedProg.MustMetric(float64(missed), load.Policy, load.Attach)
+}
+
+// collect emits the link and program missed counters for every program
+// returned by sensors.AllPrograms.
+func collect(ch chan<- prometheus.Metric) {
+	allPrograms := sensors.AllPrograms()
+	for _, prog := range allPrograms {
+		collectLink(ch, prog)
+		collectProg(ch, prog)
+	}
+}
+
+// collectForDocs emits one zero-valued sample per metric using the example
+// label values "monitor_panic" and "sys_panic".
+func collectForDocs(ch chan<- prometheus.Metric) {
+	ch <- MissedLink.MustMetric(0, "monitor_panic", "sys_panic")
+	ch <- MissedProg.MustMetric(0, "monitor_panic", "sys_panic")
+}
diff --git a/pkg/metrics/kprobemetrics/missed.go b/pkg/metrics/kprobemetrics/missed.go
new file mode 100644
index 00000000000..a10d92e0d8a
--- /dev/null
+++ b/pkg/metrics/kprobemetrics/missed.go
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright Authors of Tetragon
+
+package kprobemetrics
+
+import (
+	"github.com/cilium/tetragon/pkg/metrics"
+	"github.com/cilium/tetragon/pkg/metrics/consts"
+)
+
+var (
+	// MissedLink is the counter emitted by collectLink; its labels are
+	// policy and attach.
+	MissedLink = metrics.MustNewCustomCounter(metrics.NewOpts(
+		consts.MetricsNamespace, "", "missed_link_probes_total",
+		"The total number of Tetragon probe missed by link.",
+		nil, nil, []metrics.UnconstrainedLabel{
+			{Name: "policy", ExampleValue: "monitor_panic"},
+			{Name: "attach", ExampleValue: "sys_panic"},
+		},
+	))
+
+	// MissedProg is the counter emitted by collectProg; its labels are
+	// policy and attach.
+	MissedProg = metrics.MustNewCustomCounter(metrics.NewOpts(
+		consts.MetricsNamespace, "", "missed_prog_probes_total",
+		"The total number of Tetragon probe missed by program.",
+		nil, nil, []metrics.UnconstrainedLabel{
+			{Name: "policy", ExampleValue: "monitor_panic"},
+			{Name: "attach", ExampleValue: "sys_panic"},
+		},
+	))
+)
diff --git a/pkg/metricsconfig/healthmetrics.go b/pkg/metricsconfig/healthmetrics.go
index 9de0b4719b1..70182dc5fc4 100644
--- a/pkg/metricsconfig/healthmetrics.go
+++ b/pkg/metricsconfig/healthmetrics.go
@@ -36,7 +36,7 @@ var (
 
 func GetHealthGroup() metrics.Group {
 	healthMetricsOnce.Do(func() {
-		healthMetrics = metrics.NewMetricsGroup(true)
+		healthMetrics = metrics.NewMetricsGroup(false)
 	})
 	return healthMetrics
 }
@@ -102,4 +102,6 @@ func registerHealthMetrics(group metrics.Group) {
 	group.MustRegister(policystatemetrics.NewPolicyStateCollector())
 	// gRPC metrics
 	group.MustRegister(grpcmetrics.NewServerMetrics())
+	// missed probe metrics
+	group.MustRegister(kprobemetrics.NewBPFCollector())
 }