Skip to content

Commit

Permalink
policyfilter: implement ErrorLabel() for metrics
Browse files Browse the repository at this point in the history
This patch implements ErrorLabel() for generating error labels for
metrics. This fixes an issue with the previous implementation where the
cardinality of the error_type label was potentially unbounded.

Signed-off-by: Kornilios Kourtis <[email protected]>
  • Loading branch information
kkourt committed Apr 5, 2024
1 parent 7024837 commit bd63a46
Show file tree
Hide file tree
Showing 7 changed files with 88 additions and 22 deletions.
1 change: 1 addition & 0 deletions docs/content/en/docs/reference/metrics.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ require (
github.com/mennanov/fieldmask-utils v1.1.2
github.com/opencontainers/runtime-spec v1.2.0
github.com/pelletier/go-toml v1.9.5
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.19.0
github.com/prometheus/client_model v0.6.1
github.com/sirupsen/logrus v1.9.3
Expand Down Expand Up @@ -150,6 +149,7 @@ require (
github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b // indirect
github.com/pelletier/go-toml/v2 v2.1.0 // indirect
github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
github.com/prometheus/common v0.48.0 // indirect
Expand Down
45 changes: 30 additions & 15 deletions pkg/metrics/policyfiltermetrics/policyfiltermetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,7 @@
package policyfiltermetrics

import (
"fmt"
"strings"

"github.com/cilium/tetragon/pkg/metrics/consts"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
)

Expand Down Expand Up @@ -48,33 +44,52 @@ func (s Operation) String() string {
return operationLabelValues[s]
}

type OperationErr int

const (
NoErr OperationErr = iota
GenericErr
PodNamespaceConflictErr
)

var operationErrLabels = map[OperationErr]string{
NoErr: "",
GenericErr: "generic-error",
PodNamespaceConflictErr: "pod-namespace-conflict",
}

func (s OperationErr) String() string {
return operationErrLabels[s]
}

var (
PolicyFilterOpMetrics = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: consts.MetricsNamespace,
Name: "policyfilter_metrics_total",
Help: "Policy filter metrics. For internal use only.",
ConstLabels: nil,
}, []string{"subsys", "op", "error_type"})
}, []string{"subsys", "op", "error"})
)

func InitMetrics(registry *prometheus.Registry) {
registry.MustRegister(PolicyFilterOpMetrics)

// Initialize metrics with labels
PolicyFilterOpMetrics.WithLabelValues(RTHooksSubsys.String(), AddContainerOperation.String()).Add(0)
PolicyFilterOpMetrics.WithLabelValues(PodHandlersSubsys.String(), AddPodOperation.String()).Add(0)
PolicyFilterOpMetrics.WithLabelValues(PodHandlersSubsys.String(), UpdatePodOperation.String()).Add(0)
PolicyFilterOpMetrics.WithLabelValues(PodHandlersSubsys.String(), DeletePodOperation.String()).Add(0)
for _, subsys := range subsysLabelValues {
for _, op := range operationLabelValues {
for _, err := range operationErrLabels {
PolicyFilterOpMetrics.WithLabelValues(
subsys, op, err,
).Add(0)
}
}
}

// NOTES:
// * error, error_type, type - standardize on a label
// * Don't confuse op in policyfilter_metrics_total with ops.OpCode
// * Rename policyfilter_metrics_total to get rid of _metrics?
}

func OpInc(subsys Subsys, op Operation, err error) {
PolicyFilterOpMetrics.WithLabelValues(
subsys.String(), op.String(),
strings.ReplaceAll(fmt.Sprintf("%T", errors.Cause(err)), "*", ""),
).Inc()
func OpInc(subsys Subsys, op Operation, err string) {
PolicyFilterOpMetrics.WithLabelValues(subsys.String(), op.String(), err).Inc()
}
34 changes: 34 additions & 0 deletions pkg/policyfilter/error.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Tetragon

package policyfilter

import (
"fmt"

"github.com/cilium/tetragon/pkg/metrics/policyfiltermetrics"
)

// podNamespaceConflictErr: even if a pod changes, we expect the namespace to remain the same
type podNamespaceConflictErr struct {
podID PodID
oldNs, newNs string
}

func (e *podNamespaceConflictErr) Error() string {
return fmt.Sprintf("conflicting namespaces for pod with id '%s': old='%s' vs new='%s'",
e.podID.String(), e.oldNs, e.newNs)
}

// ErrorLabel returns an error label with a small cardinality so it can be used in metrics
func ErrorLabel(err error) string {
if err == nil {
return policyfiltermetrics.NoErr.String()
}
switch err.(type) {
case *podNamespaceConflictErr:
return policyfiltermetrics.PodNamespaceConflictErr.String()
default:
return policyfiltermetrics.GenericErr.String()
}
}
16 changes: 16 additions & 0 deletions pkg/policyfilter/error_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Tetragon

package policyfilter

import (
"testing"

"github.com/stretchr/testify/require"
)

func TestErrorLabel(t *testing.T) {
var err error = &podNamespaceConflictErr{PodID{}, "foo", "lala"}
require.Equal(t, "", ErrorLabel(nil))
require.Equal(t, "pod-namespace-conflict", ErrorLabel(err))
}
2 changes: 1 addition & 1 deletion pkg/policyfilter/rthooks/rthooks.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ func createContainerHook(_ context.Context, arg *rthooks.CreateContainerArg) err
if err := pfState.AddPodContainer(policyfilter.PodID(podID), namespace, pod.Labels, containerID, cgid); err != nil {
log.WithError(err).Warn("failed to update policy filter, aborting hook.")
}
policyfiltermetrics.OpInc(policyfiltermetrics.RTHooksSubsys, policyfiltermetrics.AddContainerOperation, err)
policyfiltermetrics.OpInc(policyfiltermetrics.RTHooksSubsys, policyfiltermetrics.AddContainerOperation, policyfilter.ErrorLabel(err))

return nil
}
10 changes: 5 additions & 5 deletions pkg/policyfilter/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,7 @@ func (m *state) getPodEventHandlers() cache.ResourceEventHandlerFuncs {
return
}
err := m.updatePodHandler(pod)
policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.AddPodOperation, err)
policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.AddPodOperation, ErrorLabel(err))
},
UpdateFunc: func(_, newObj interface{}) {
pod, ok := newObj.(*v1.Pod)
Expand All @@ -309,7 +309,7 @@ func (m *state) getPodEventHandlers() cache.ResourceEventHandlerFuncs {
return
}
err := m.updatePodHandler(pod)
policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.UpdatePodOperation, err)
policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.UpdatePodOperation, ErrorLabel(err))
},
DeleteFunc: func(obj interface{}) {
// Remove all containers for this pod
Expand All @@ -332,7 +332,7 @@ func (m *state) getPodEventHandlers() cache.ResourceEventHandlerFuncs {
"namespace": namespace,
}).Warn("policyfilter, delete-pod handler: DelPod failed")
}
policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.DeletePodOperation, err)
policyfiltermetrics.OpInc(policyfiltermetrics.PodHandlersSubsys, policyfiltermetrics.DeletePodOperation, ErrorLabel(err))
},
}
}
Expand Down Expand Up @@ -587,7 +587,7 @@ func (m *state) AddPodContainer(podID PodID, namespace string, podLabels labels.
}).Info("AddPodContainer: added pod")
} else if pod.namespace != namespace {
// sanity check: old and new namespace should match
return fmt.Errorf("conflicting namespaces for pod with id %s: old='%s' vs new='%s'", podID, pod.namespace, namespace)
return &podNamespaceConflictErr{podID: podID, oldNs: pod.namespace, newNs: namespace}
}

m.addPodContainers(pod, []string{containerID}, []CgroupID{cgID})
Expand Down Expand Up @@ -788,7 +788,7 @@ func (m *state) UpdatePod(podID PodID, namespace string, podLabels labels.Labels
dlog.Info("UpdatePod: added pod")
} else if pod.namespace != namespace {
// sanity check: old and new namespace should match
return fmt.Errorf("conflicting namespaces for pod with id %s: old='%s' vs new='%s'", podID, pod.namespace, namespace)
return &podNamespaceConflictErr{podID: podID, oldNs: pod.namespace, newNs: namespace}
}

// labels changed: check if there are policies ads that:
Expand Down

0 comments on commit bd63a46

Please sign in to comment.