diff --git a/policy/handler.go b/policy/handler.go index 81244eb4..358985d6 100644 --- a/policy/handler.go +++ b/policy/handler.go @@ -12,6 +12,7 @@ import ( "sync" "time" + metrics "github.com/armon/go-metrics" "github.com/google/go-cmp/cmp" hclog "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-multierror" @@ -228,6 +229,7 @@ func (h *Handler) handleTick(ctx context.Context, policy *sdk.ScalingPolicy) (*s status, err := target.Status(policy.Target.Config) if err != nil { + metrics.IncrCounter([]string{"target", "status", "failure_count"}, 1) h.log.Warn("failed to get target status", "error", err) return nil, err } diff --git a/policy/manager.go b/policy/manager.go index 549ce363..bdc6ce4b 100644 --- a/policy/manager.go +++ b/policy/manager.go @@ -116,6 +116,7 @@ func (m *Manager) monitorPolicies(ctx context.Context, evalCh chan<- *sdk.Scalin case err := <-m.policyIDsErrCh: m.log.Error("encountered an error monitoring policy IDs", "error", err) + metrics.IncrCounter([]string{"policy", "manager", "failure_count"}, 1) if isUnrecoverableError(err) { return err } diff --git a/policyeval/base_worker.go b/policyeval/base_worker.go index c574148a..d1567aaf 100644 --- a/policyeval/base_worker.go +++ b/policyeval/base_worker.go @@ -114,6 +114,7 @@ func (w *BaseWorker) handlePolicy(ctx context.Context, eval *sdk.ScalingEvaluati currentStatus, err := runTargetStatus(target, eval.Policy) if err != nil { + metrics.IncrCounter([]string{"target", "status", "failure_count"}, 1) return fmt.Errorf("failed to get target status: %v", err) } @@ -181,6 +182,8 @@ func (w *BaseWorker) handlePolicy(ctx context.Context, eval *sdk.ScalingEvaluati "on_check_error", eval.Policy.OnCheckError, "error", err) + metrics.IncrCounterWithLabels([]string{"target", "status", "failure_count"}, 1, []metrics.Label{{Name: "check", Value: checkEval.Check.Name}}) + // Define how to handle error. // Use check behaviour if set or fail iff the policy is set to fail. switch checkEval.Check.OnError { @@ -287,6 +290,7 @@ func (w *BaseWorker) handlePolicy(ctx context.Context, eval *sdk.ScalingEvaluati err = w.scaleTarget(logger, target, eval.Policy, *winner.action, currentStatus) if err != nil { + metrics.IncrCounter([]string{"target", "scale", "failure_count"}, 1) return err }