From a632034340295cfde699ab39c380ce72d535faef Mon Sep 17 00:00:00 2001 From: The Nando <89834752+the-nando@users.noreply.github.com> Date: Sun, 9 Jul 2023 10:16:56 +0200 Subject: [PATCH] Add extra telemetry to monitor failures --- policy/handler.go | 2 ++ policy/manager.go | 1 + policyeval/base_worker.go | 3 +++ 3 files changed, 6 insertions(+) diff --git a/policy/handler.go b/policy/handler.go index 81244eb4..ac0e515e 100644 --- a/policy/handler.go +++ b/policy/handler.go @@ -12,6 +12,7 @@ import ( "sync" "time" + metrics "github.com/armon/go-metrics" "github.com/google/go-cmp/cmp" hclog "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-multierror" @@ -228,6 +229,7 @@ func (h *Handler) handleTick(ctx context.Context, policy *sdk.ScalingPolicy) (*s status, err := target.Status(policy.Target.Config) if err != nil { + metrics.IncrCounter([]string{"policy", "target_status", "failure_count"}, 1) h.log.Warn("failed to get target status", "error", err) return nil, err } diff --git a/policy/manager.go b/policy/manager.go index 549ce363..ebf46e27 100644 --- a/policy/manager.go +++ b/policy/manager.go @@ -116,6 +116,7 @@ func (m *Manager) monitorPolicies(ctx context.Context, evalCh chan<- *sdk.Scalin case err := <-m.policyIDsErrCh: m.log.Error("encountered an error monitoring policy IDs", "error", err) + metrics.IncrCounter([]string{"policy", "monitor", "failure_count"}, 1) if isUnrecoverableError(err) { return err } diff --git a/policyeval/base_worker.go b/policyeval/base_worker.go index c574148a..d2af45c2 100644 --- a/policyeval/base_worker.go +++ b/policyeval/base_worker.go @@ -181,6 +181,8 @@ func (w *BaseWorker) handlePolicy(ctx context.Context, eval *sdk.ScalingEvaluati "on_check_error", eval.Policy.OnCheckError, "error", err) + metrics.IncrCounterWithLabels([]string{"policy", "run_check", "failure_count"}, 1, []metrics.Label{{Name: "check", Value: checkEval.Check.Name}}) + // Define how to handle error. // Use check behaviour if set or fail iff the policy is set to fail. switch checkEval.Check.OnError { @@ -287,6 +289,7 @@ func (w *BaseWorker) handlePolicy(ctx context.Context, eval *sdk.ScalingEvaluati err = w.scaleTarget(logger, target, eval.Policy, *winner.action, currentStatus) if err != nil { + metrics.IncrCounter([]string{"target", "scale", "failure_count"}, 1) return err }