From 9c66f6d4de7d223d9dd4de91bf33675c295cd86b Mon Sep 17 00:00:00 2001 From: Kornilios Kourtis Date: Tue, 2 Apr 2024 12:04:38 +0200 Subject: [PATCH 1/3] Revert "policyfiltermetrics: Remove error label" This reverts commit 425f91e2be6b83c4b654062d64ec49ec5e072d4f. Signed-off-by: Kornilios Kourtis --- go.mod | 2 +- .../policyfiltermetrics/policyfiltermetrics.go | 9 +++++++-- pkg/policyfilter/rthooks/rthooks.go | 2 +- pkg/policyfilter/state.go | 14 +++++++------- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/go.mod b/go.mod index 02ca81c304c..502a7fdaac5 100644 --- a/go.mod +++ b/go.mod @@ -30,6 +30,7 @@ require ( github.com/mennanov/fieldmask-utils v1.1.2 github.com/opencontainers/runtime-spec v1.2.0 github.com/pelletier/go-toml v1.9.5 + github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.19.0 github.com/prometheus/client_model v0.6.0 github.com/sirupsen/logrus v1.9.3 @@ -149,7 +150,6 @@ require ( github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b // indirect github.com/pelletier/go-toml/v2 v2.1.0 // indirect github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5 // indirect - github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect github.com/prometheus/common v0.48.0 // indirect diff --git a/pkg/metrics/policyfiltermetrics/policyfiltermetrics.go b/pkg/metrics/policyfiltermetrics/policyfiltermetrics.go index 0f3b8693734..e1465d0d50c 100644 --- a/pkg/metrics/policyfiltermetrics/policyfiltermetrics.go +++ b/pkg/metrics/policyfiltermetrics/policyfiltermetrics.go @@ -4,7 +4,11 @@ package policyfiltermetrics import ( + "fmt" + "strings" + "github.com/cilium/tetragon/pkg/metrics/consts" + "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" ) @@ -50,7 +54,7 @@ var ( Name: "policyfilter_metrics_total", Help: "Policy filter metrics. For internal use only.", ConstLabels: nil, - }, []string{"subsys", "op"}) + }, []string{"subsys", "op", "error_type"}) ) func InitMetrics(registry *prometheus.Registry) { @@ -68,8 +72,9 @@ func InitMetrics(registry *prometheus.Registry) { // * Rename policyfilter_metrics_total to get rid of _metrics? } -func OpInc(subsys Subsys, op Operation) { +func OpInc(subsys Subsys, op Operation, err error) { PolicyFilterOpMetrics.WithLabelValues( subsys.String(), op.String(), + strings.ReplaceAll(fmt.Sprintf("%T", errors.Cause(err)), "*", ""), ).Inc() } diff --git a/pkg/policyfilter/rthooks/rthooks.go b/pkg/policyfilter/rthooks/rthooks.go index 69d9542ad5e..4d9eab82afc 100644 --- a/pkg/policyfilter/rthooks/rthooks.go +++ b/pkg/policyfilter/rthooks/rthooks.go @@ -103,7 +103,7 @@ func createContainerHook(_ context.Context, arg *rthooks.CreateContainerArg) err if err := pfState.AddPodContainer(policyfilter.PodID(podID), namespace, pod.Labels, containerID, cgid); err != nil { log.WithError(err).Warn("failed to update policy filter, aborting hook.") } - policyfiltermetrics.OpInc(policyfiltermetrics.RTHooksSubsys, policyfiltermetrics.AddContainerOperation) + policyfiltermetrics.OpInc(policyfiltermetrics.RTHooksSubsys, policyfiltermetrics.AddContainerOperation, err) return nil } diff --git a/pkg/policyfilter/state.go b/pkg/policyfilter/state.go index 1f29c5c3e60..8859eb88a4d 100644 --- a/pkg/policyfilter/state.go +++ b/pkg/policyfilter/state.go @@ -284,7 +284,7 @@ func (m *state) updatePodHandler(pod *v1.Pod) error { "pod-id": podID, "container-ids": containerIDs, "namespace": namespace, - }).Warn("policyfilter: UpdatePod failed") + }).Warn("policyfilter, add pod-handler: AddPodContainer failed") return err } @@ -299,17 +299,17 @@ func (m *state) getPodEventHandlers() cache.ResourceEventHandlerFuncs { logger.GetLogger().Warn("policyfilter, add-pod handler: unexpected object type: %T", pod) return } - m.updatePodHandler(pod) - policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.AddPodOperation) + err := m.updatePodHandler(pod) + policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.AddPodOperation, err) }, UpdateFunc: func(_, newObj interface{}) { pod, ok := newObj.(*v1.Pod) if !ok { - logger.GetLogger().Warn("policyfilter, update-pod handler: unexpected object type(s): new:%T", pod) + logger.GetLogger().Warn("policyfilter, update-pod: unexpected object type(s): new:%T", pod) return } - m.updatePodHandler(pod) - policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.UpdatePodOperation) + err := m.updatePodHandler(pod) + policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.UpdatePodOperation, err) }, DeleteFunc: func(obj interface{}) { // Remove all containers for this pod @@ -332,7 +332,7 @@ func (m *state) getPodEventHandlers() cache.ResourceEventHandlerFuncs { "namespace": namespace, }).Warn("policyfilter, delete-pod handler: DelPod failed") } - policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.DeletePodOperation) + policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.DeletePodOperation, err) }, } } From 706b5aa5912d77a81cad461383a49f04645f6004 Mon Sep 17 00:00:00 2001 From: Kornilios Kourtis Date: Tue, 2 Apr 2024 14:01:57 +0200 Subject: [PATCH 2/3] policyfilter: misc message fixes Commit 425f91e2be6b83c4b654062d64ec49ec5e072d4f ("policyfiltermetrics: Remove error label") was reverted in the previous patch included some misc fixes for log messages. Add them here. Signed-off-by: Kornilios Kourtis --- pkg/policyfilter/state.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/policyfilter/state.go b/pkg/policyfilter/state.go index 8859eb88a4d..673b1f1ed61 100644 --- a/pkg/policyfilter/state.go +++ b/pkg/policyfilter/state.go @@ -284,7 +284,7 @@ func (m *state) updatePodHandler(pod *v1.Pod) error { "pod-id": podID, "container-ids": containerIDs, "namespace": namespace, - }).Warn("policyfilter, add pod-handler: AddPodContainer failed") + }).Warn("policyfilter, UpdatePod failed") return err } @@ -305,7 +305,7 @@ func (m *state) getPodEventHandlers() cache.ResourceEventHandlerFuncs { UpdateFunc: func(_, newObj interface{}) { pod, ok := newObj.(*v1.Pod) if !ok { - logger.GetLogger().Warn("policyfilter, update-pod: unexpected object type(s): new:%T", pod) + logger.GetLogger().Warn("policyfilter, update-pod handler: unexpected object type(s): new:%T", pod) return } err := m.updatePodHandler(pod) From c0fe8ab6b1b79b5f790a20476d2f4d731f21265f Mon Sep 17 00:00:00 2001 From: Kornilios Kourtis Date: Tue, 2 Apr 2024 14:12:23 +0200 Subject: [PATCH 3/3] policyfilter: implement ErrorLabel() for metrics This patch implements ErrorLabel() for generating error labels for metrics. This fixes an issue with the previous implementation where the cardinality of the error_type label was potentially unbounded. Signed-off-by: Kornilios Kourtis --- docs/content/en/docs/reference/metrics.md | 1 + go.mod | 2 +- .../policyfiltermetrics.go | 45 ++++++++++++------- pkg/policyfilter/error.go | 34 ++++++++++++++ pkg/policyfilter/error_test.go | 16 +++++++ pkg/policyfilter/rthooks/rthooks.go | 2 +- pkg/policyfilter/state.go | 10 ++--- 7 files changed, 88 insertions(+), 22 deletions(-) create mode 100644 pkg/policyfilter/error.go create mode 100644 pkg/policyfilter/error_test.go diff --git a/docs/content/en/docs/reference/metrics.md b/docs/content/en/docs/reference/metrics.md index f99ffe95e48..78582dbc06e 100644 --- a/docs/content/en/docs/reference/metrics.md +++ b/docs/content/en/docs/reference/metrics.md @@ -176,6 +176,7 @@ Policy filter metrics. For internal use only. | label | values | | ----- | ------ | +| `error` | `generic-error, pod-namespace-conflict` | | `op ` | `add, add-container, delete, update` | | `subsys` | `pod-handlers, rthooks` | diff --git a/go.mod b/go.mod index 502a7fdaac5..02ca81c304c 100644 --- a/go.mod +++ b/go.mod @@ -30,7 +30,6 @@ require ( github.com/mennanov/fieldmask-utils v1.1.2 github.com/opencontainers/runtime-spec v1.2.0 github.com/pelletier/go-toml v1.9.5 - github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.19.0 github.com/prometheus/client_model v0.6.0 github.com/sirupsen/logrus v1.9.3 @@ -150,6 +149,7 @@ require ( github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b // indirect github.com/pelletier/go-toml/v2 v2.1.0 // indirect github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5 // indirect + github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect github.com/prometheus/common v0.48.0 // indirect diff --git a/pkg/metrics/policyfiltermetrics/policyfiltermetrics.go b/pkg/metrics/policyfiltermetrics/policyfiltermetrics.go index e1465d0d50c..07c0f43fe1b 100644 --- a/pkg/metrics/policyfiltermetrics/policyfiltermetrics.go +++ b/pkg/metrics/policyfiltermetrics/policyfiltermetrics.go @@ -4,11 +4,7 @@ package policyfiltermetrics import ( - "fmt" - "strings" - "github.com/cilium/tetragon/pkg/metrics/consts" - "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" ) @@ -48,33 +44,52 @@ func (s Operation) String() string { return operationLabelValues[s] } +type OperationErr int + +const ( + NoErr OperationErr = iota + GenericErr + PodNamespaceConflictErr +) + +var operationErrLabels = map[OperationErr]string{ + NoErr: "", + GenericErr: "generic-error", + PodNamespaceConflictErr: "pod-namespace-conflict", +} + +func (s OperationErr) String() string { + return operationErrLabels[s] +} + var ( PolicyFilterOpMetrics = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: consts.MetricsNamespace, Name: "policyfilter_metrics_total", Help: "Policy filter metrics. For internal use only.", ConstLabels: nil, - }, []string{"subsys", "op", "error_type"}) + }, []string{"subsys", "op", "error"}) ) func InitMetrics(registry *prometheus.Registry) { registry.MustRegister(PolicyFilterOpMetrics) // Initialize metrics with labels - PolicyFilterOpMetrics.WithLabelValues(RTHooksSubsys.String(), AddContainerOperation.String()).Add(0) - PolicyFilterOpMetrics.WithLabelValues(PodHandlersSubsys.String(), AddPodOperation.String()).Add(0) - PolicyFilterOpMetrics.WithLabelValues(PodHandlersSubsys.String(), UpdatePodOperation.String()).Add(0) - PolicyFilterOpMetrics.WithLabelValues(PodHandlersSubsys.String(), DeletePodOperation.String()).Add(0) + for _, subsys := range subsysLabelValues { + for _, op := range operationLabelValues { + for _, err := range operationErrLabels { + PolicyFilterOpMetrics.WithLabelValues( + subsys, op, err, + ).Add(0) + } + } + } // NOTES: - // * error, error_type, type - standardize on a label // * Don't confuse op in policyfilter_metrics_total with ops.OpCode // * Rename policyfilter_metrics_total to get rid of _metrics? } -func OpInc(subsys Subsys, op Operation, err error) { - PolicyFilterOpMetrics.WithLabelValues( - subsys.String(), op.String(), - strings.ReplaceAll(fmt.Sprintf("%T", errors.Cause(err)), "*", ""), - ).Inc() +func OpInc(subsys Subsys, op Operation, err string) { + PolicyFilterOpMetrics.WithLabelValues(subsys.String(), op.String(), err).Inc() } diff --git a/pkg/policyfilter/error.go b/pkg/policyfilter/error.go new file mode 100644 index 00000000000..1781c91ab2c --- /dev/null +++ b/pkg/policyfilter/error.go @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright Authors of Tetragon + +package policyfilter + +import ( + "fmt" + + "github.com/cilium/tetragon/pkg/metrics/policyfiltermetrics" +) + +// podNamespaceConflictErr: even if a pod changes, we expect the namespace to remain the same +type podNamespaceConflictErr struct { + podID PodID + oldNs, newNs string +} + +func (e *podNamespaceConflictErr) Error() string { + return fmt.Sprintf("conflicting namespaces for pod with id '%s': old='%s' vs new='%s'", + e.podID.String(), e.oldNs, e.newNs) +} + +// ErrorLabel returns an error label with a small cardinality so it can be used in metrics +func ErrorLabel(err error) string { + if err == nil { + return policyfiltermetrics.NoErr.String() + } + switch err.(type) { + case *podNamespaceConflictErr: + return policyfiltermetrics.PodNamespaceConflictErr.String() + default: + return policyfiltermetrics.GenericErr.String() + } +} diff --git a/pkg/policyfilter/error_test.go b/pkg/policyfilter/error_test.go new file mode 100644 index 00000000000..205fee92392 --- /dev/null +++ b/pkg/policyfilter/error_test.go @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright Authors of Tetragon + +package policyfilter + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestErrorLabel(t *testing.T) { + var err error = &podNamespaceConflictErr{PodID{}, "foo", "lala"} + require.Equal(t, "", ErrorLabel(nil)) + require.Equal(t, "pod-namespace-conflict", ErrorLabel(err)) +} diff --git a/pkg/policyfilter/rthooks/rthooks.go b/pkg/policyfilter/rthooks/rthooks.go index 4d9eab82afc..daf142d70e1 100644 --- a/pkg/policyfilter/rthooks/rthooks.go +++ b/pkg/policyfilter/rthooks/rthooks.go @@ -103,7 +103,7 @@ func createContainerHook(_ context.Context, arg *rthooks.CreateContainerArg) err if err := pfState.AddPodContainer(policyfilter.PodID(podID), namespace, pod.Labels, containerID, cgid); err != nil { log.WithError(err).Warn("failed to update policy filter, aborting hook.") } - policyfiltermetrics.OpInc(policyfiltermetrics.RTHooksSubsys, policyfiltermetrics.AddContainerOperation, err) + policyfiltermetrics.OpInc(policyfiltermetrics.RTHooksSubsys, policyfiltermetrics.AddContainerOperation, policyfilter.ErrorLabel(err)) return nil } diff --git a/pkg/policyfilter/state.go b/pkg/policyfilter/state.go index 673b1f1ed61..af8d79f0372 100644 --- a/pkg/policyfilter/state.go +++ b/pkg/policyfilter/state.go @@ -300,7 +300,7 @@ func (m *state) getPodEventHandlers() cache.ResourceEventHandlerFuncs { return } err := m.updatePodHandler(pod) - policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.AddPodOperation, err) + policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.AddPodOperation, ErrorLabel(err)) }, UpdateFunc: func(_, newObj interface{}) { pod, ok := newObj.(*v1.Pod) @@ -309,7 +309,7 @@ func (m *state) getPodEventHandlers() cache.ResourceEventHandlerFuncs { return } err := m.updatePodHandler(pod) - policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.UpdatePodOperation, err) + policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.UpdatePodOperation, ErrorLabel(err)) }, DeleteFunc: func(obj interface{}) { // Remove all containers for this pod @@ -332,7 +332,7 @@ func (m *state) getPodEventHandlers() cache.ResourceEventHandlerFuncs { "namespace": namespace, }).Warn("policyfilter, delete-pod handler: DelPod failed") } - policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.DeletePodOperation, err) + policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.DeletePodOperation, ErrorLabel(err)) }, } } @@ -587,7 +587,7 @@ func (m *state) AddPodContainer(podID PodID, namespace string, podLabels labels. }).Info("AddPodContainer: added pod") } else if pod.namespace != namespace { // sanity check: old and new namespace should match - return fmt.Errorf("conflicting namespaces for pod with id %s: old='%s' vs new='%s'", podID, pod.namespace, namespace) + return &podNamespaceConflictErr{podID: podID, oldNs: pod.namespace, newNs: namespace} } m.addPodContainers(pod, []string{containerID}, []CgroupID{cgID}) @@ -788,7 +788,7 @@ func (m *state) UpdatePod(podID PodID, namespace string, podLabels labels.Labels dlog.Info("UpdatePod: added pod") } else if pod.namespace != namespace { // sanity check: old and new namespace should match - return fmt.Errorf("conflicting namespaces for pod with id %s: old='%s' vs new='%s'", podID, pod.namespace, namespace) + return &podNamespaceConflictErr{podID: podID, oldNs: pod.namespace, newNs: namespace} } // labels changed: check if there are policies ads that: