Skip to content

Commit

Permalink
Add extra telemetry to monitor failures
Browse files (browse the repository at this point in the history)
  • Loading branch information
the-nando committed Jul 9, 2023
1 parent d715a75 commit d14cf2a
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 0 deletions.
2 changes: 2 additions & 0 deletions policy/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"sync"
"time"

metrics "github.com/armon/go-metrics"
"github.com/google/go-cmp/cmp"
hclog "github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-multierror"
Expand Down Expand Up @@ -228,6 +229,7 @@ func (h *Handler) handleTick(ctx context.Context, policy *sdk.ScalingPolicy) (*s

status, err := target.Status(policy.Target.Config)
if err != nil {
metrics.IncrCounter([]string{"target", "status", "failure_count"}, 1)
h.log.Warn("failed to get target status", "error", err)
return nil, err
}
Expand Down
1 change: 1 addition & 0 deletions policy/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ func (m *Manager) monitorPolicies(ctx context.Context, evalCh chan<- *sdk.Scalin

case err := <-m.policyIDsErrCh:
m.log.Error("encountered an error monitoring policy IDs", "error", err)
metrics.IncrCounter([]string{"policy", "manager", "failure_count"}, 1)
if isUnrecoverableError(err) {
return err
}
Expand Down
4 changes: 4 additions & 0 deletions policyeval/base_worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ func (w *BaseWorker) handlePolicy(ctx context.Context, eval *sdk.ScalingEvaluati

currentStatus, err := runTargetStatus(target, eval.Policy)
if err != nil {
metrics.IncrCounter([]string{"target", "status", "failure_count"}, 1)
return fmt.Errorf("failed to get target status: %v", err)
}

Expand Down Expand Up @@ -181,6 +182,8 @@ func (w *BaseWorker) handlePolicy(ctx context.Context, eval *sdk.ScalingEvaluati
"on_check_error", eval.Policy.OnCheckError,
"error", err)

metrics.IncrCounterWithLabels([]string{"target", "status", "failure_count"}, 1, []metrics.Label{{Name: "check", Value: checkEval.Check.Name}})

// Define how to handle error.
// Use check behaviour if set or fail iff the policy is set to fail.
switch checkEval.Check.OnError {
Expand Down Expand Up @@ -287,6 +290,7 @@ func (w *BaseWorker) handlePolicy(ctx context.Context, eval *sdk.ScalingEvaluati

err = w.scaleTarget(logger, target, eval.Policy, *winner.action, currentStatus)
if err != nil {
metrics.IncrCounter([]string{"target", "scale", "failure_count"}, 1)
return err
}

Expand Down

0 comments on commit d14cf2a

Please sign in to comment.