diff --git a/docs/content/en/docs/reference/metrics.md b/docs/content/en/docs/reference/metrics.md index f99ffe95e48..9099c54f99e 100644 --- a/docs/content/en/docs/reference/metrics.md +++ b/docs/content/en/docs/reference/metrics.md @@ -176,6 +176,7 @@ Policy filter metrics. For internal use only. | label | values | | ----- | ------ | +| `error_type` | ` ` | | `op ` | `add, add-container, delete, update` | | `subsys` | `pod-handlers, rthooks` | diff --git a/go.mod b/go.mod index 502a7fdaac5..02ca81c304c 100644 --- a/go.mod +++ b/go.mod @@ -30,7 +30,6 @@ require ( github.com/mennanov/fieldmask-utils v1.1.2 github.com/opencontainers/runtime-spec v1.2.0 github.com/pelletier/go-toml v1.9.5 - github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.19.0 github.com/prometheus/client_model v0.6.0 github.com/sirupsen/logrus v1.9.3 @@ -150,6 +149,7 @@ require ( github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b // indirect github.com/pelletier/go-toml/v2 v2.1.0 // indirect github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5 // indirect + github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect github.com/prometheus/common v0.48.0 // indirect diff --git a/pkg/metrics/policyfiltermetrics/policyfiltermetrics.go b/pkg/metrics/policyfiltermetrics/policyfiltermetrics.go index e1465d0d50c..7da5cadca4e 100644 --- a/pkg/metrics/policyfiltermetrics/policyfiltermetrics.go +++ b/pkg/metrics/policyfiltermetrics/policyfiltermetrics.go @@ -4,11 +4,7 @@ package policyfiltermetrics import ( - "fmt" - "strings" - "github.com/cilium/tetragon/pkg/metrics/consts" - "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" ) @@ -61,10 +57,10 @@ func InitMetrics(registry *prometheus.Registry) { registry.MustRegister(PolicyFilterOpMetrics) // Initialize metrics with labels - PolicyFilterOpMetrics.WithLabelValues(RTHooksSubsys.String(), AddContainerOperation.String()).Add(0) - PolicyFilterOpMetrics.WithLabelValues(PodHandlersSubsys.String(), AddPodOperation.String()).Add(0) - PolicyFilterOpMetrics.WithLabelValues(PodHandlersSubsys.String(), UpdatePodOperation.String()).Add(0) - PolicyFilterOpMetrics.WithLabelValues(PodHandlersSubsys.String(), DeletePodOperation.String()).Add(0) + PolicyFilterOpMetrics.WithLabelValues(RTHooksSubsys.String(), AddContainerOperation.String(), "").Add(0) + PolicyFilterOpMetrics.WithLabelValues(PodHandlersSubsys.String(), AddPodOperation.String(), "").Add(0) + PolicyFilterOpMetrics.WithLabelValues(PodHandlersSubsys.String(), UpdatePodOperation.String(), "").Add(0) + PolicyFilterOpMetrics.WithLabelValues(PodHandlersSubsys.String(), DeletePodOperation.String(), "").Add(0) // NOTES: // * error, error_type, type - standardize on a label @@ -72,9 +68,6 @@ func InitMetrics(registry *prometheus.Registry) { // * Rename policyfilter_metrics_total to get rid of _metrics? } -func OpInc(subsys Subsys, op Operation, err error) { - PolicyFilterOpMetrics.WithLabelValues( - subsys.String(), op.String(), - strings.ReplaceAll(fmt.Sprintf("%T", errors.Cause(err)), "*", ""), - ).Inc() +func OpInc(subsys Subsys, op Operation, err string) { + PolicyFilterOpMetrics.WithLabelValues(subsys.String(), op.String(), err).Inc() } diff --git a/pkg/policyfilter/error.go b/pkg/policyfilter/error.go new file mode 100644 index 00000000000..d73ff6725d4 --- /dev/null +++ b/pkg/policyfilter/error.go @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright Authors of Tetragon + +package policyfilter + +import ( + "fmt" +) + +// podNamespaceConflictErr: even if a pod changes, we expect the namespace to remain the same +type podNamespaceConflictErr struct { + podID PodID + oldNs, newNs string +} + +func (e *podNamespaceConflictErr) Error() string { + return fmt.Sprintf("conflicting namespaces for pod with id '%s': old='%s' vs new='%s'", + e.podID.String(), e.oldNs, e.newNs) +} + +// ErrorLabel returns an error label with a small cardinality so it can be used in metrics +func ErrorLabel(err error) string { + if err == nil { + return "" + } + switch err.(type) { + case *podNamespaceConflictErr: + return "pod-namespace-conflict" + default: + return "generic-error" + } +} diff --git a/pkg/policyfilter/error_test.go b/pkg/policyfilter/error_test.go new file mode 100644 index 00000000000..205fee92392 --- /dev/null +++ b/pkg/policyfilter/error_test.go @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright Authors of Tetragon + +package policyfilter + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestErrorLabel(t *testing.T) { + var err error = &podNamespaceConflictErr{PodID{}, "foo", "lala"} + require.Equal(t, "", ErrorLabel(nil)) + require.Equal(t, "pod-namespace-conflict", ErrorLabel(err)) +} diff --git a/pkg/policyfilter/rthooks/rthooks.go b/pkg/policyfilter/rthooks/rthooks.go index 4d9eab82afc..daf142d70e1 100644 --- a/pkg/policyfilter/rthooks/rthooks.go +++ b/pkg/policyfilter/rthooks/rthooks.go @@ -103,7 +103,7 @@ func createContainerHook(_ context.Context, arg *rthooks.CreateContainerArg) err if err := pfState.AddPodContainer(policyfilter.PodID(podID), namespace, pod.Labels, containerID, cgid); err != nil { log.WithError(err).Warn("failed to update policy filter, aborting hook.") } - policyfiltermetrics.OpInc(policyfiltermetrics.RTHooksSubsys, policyfiltermetrics.AddContainerOperation, err) + policyfiltermetrics.OpInc(policyfiltermetrics.RTHooksSubsys, policyfiltermetrics.AddContainerOperation, policyfilter.ErrorLabel(err)) return nil } diff --git a/pkg/policyfilter/state.go b/pkg/policyfilter/state.go index 673b1f1ed61..af8d79f0372 100644 --- a/pkg/policyfilter/state.go +++ b/pkg/policyfilter/state.go @@ -300,7 +300,7 @@ func (m *state) getPodEventHandlers() cache.ResourceEventHandlerFuncs { return } err := m.updatePodHandler(pod) - policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.AddPodOperation, err) + policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.AddPodOperation, ErrorLabel(err)) }, UpdateFunc: func(_, newObj interface{}) { pod, ok := newObj.(*v1.Pod) @@ -309,7 +309,7 @@ func (m *state) getPodEventHandlers() cache.ResourceEventHandlerFuncs { return } err := m.updatePodHandler(pod) - policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.UpdatePodOperation, err) + policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.UpdatePodOperation, ErrorLabel(err)) }, DeleteFunc: func(obj interface{}) { // Remove all containers for this pod @@ -332,7 +332,7 @@ func (m *state) getPodEventHandlers() cache.ResourceEventHandlerFuncs { "namespace": namespace, }).Warn("policyfilter, delete-pod handler: DelPod failed") } - policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.DeletePodOperation, err) + policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.DeletePodOperation, ErrorLabel(err)) }, } } @@ -587,7 +587,7 @@ func (m *state) AddPodContainer(podID PodID, namespace string, podLabels labels. }).Info("AddPodContainer: added pod") } else if pod.namespace != namespace { // sanity check: old and new namespace should match - return fmt.Errorf("conflicting namespaces for pod with id %s: old='%s' vs new='%s'", podID, pod.namespace, namespace) + return &podNamespaceConflictErr{podID: podID, oldNs: pod.namespace, newNs: namespace} } m.addPodContainers(pod, []string{containerID}, []CgroupID{cgID}) @@ -788,7 +788,7 @@ func (m *state) UpdatePod(podID PodID, namespace string, podLabels labels.Labels dlog.Info("UpdatePod: added pod") } else if pod.namespace != namespace { // sanity check: old and new namespace should match - return fmt.Errorf("conflicting namespaces for pod with id %s: old='%s' vs new='%s'", podID, pod.namespace, namespace) + return &podNamespaceConflictErr{podID: podID, oldNs: pod.namespace, newNs: namespace} } // labels changed: check if there are policies ads that: