Improve ruler observability on syncRules() (#5311)
Signed-off-by: Marco Pracucci <[email protected]>
pracucci authored Jun 22, 2023
1 parent 836c907 commit 16147a5
Showing 3 changed files with 19 additions and 8 deletions.
CHANGELOG.md: 1 addition, 0 deletions
@@ -29,6 +29,7 @@
 * [ENHANCEMENT] Querier: improve error message when streaming chunks from ingesters to queriers and a query limit is reached. #5245
 * [ENHANCEMENT] Use new data structure for labels, to reduce memory consumption. #3555
 * [ENHANCEMENT] Update alpine base image to 3.18.2. #5276
+* [ENHANCEMENT] Ruler: add `cortex_ruler_sync_rules_duration_seconds` metric, tracking the time spent syncing all rule groups owned by the ruler instance. #5311
 * [BUGFIX] Ingester: Handle when previous ring state is leaving and the number of tokens has changed. #5204

 ### Mixin
pkg/ruler/manager.go: 1 addition, 1 deletion
@@ -171,7 +171,7 @@ func (r *DefaultMultiTenantManager) syncRulesToManagerConcurrently(ctx context.C
 		users = append(users, userID)
 	}

-	// concurrenty.ForEachJob is a helper function that runs a function for each job in parallel.
+	// concurrency.ForEachJob is a helper function that runs a function for each job in parallel.
 	// It cancel context of jobFunc once iteration is done.
 	// That is why the context passed to syncRulesToManager should be the global context not the context of jobFunc.
 	err := concurrency.ForEachJob(ctx, len(users), 10, func(_ context.Context, idx int) error {
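The comment fixed above refers to dskit's concurrency.ForEachJob helper. As a rough illustration of why long-lived work must capture the enclosing context rather than the per-job one, here is a minimal sketch: the import path and signature are taken from grafana/dskit, while the tenant list and loop body are hypothetical and not part of this commit.

package main

import (
	"context"
	"fmt"

	"github.com/grafana/dskit/concurrency"
)

func main() {
	ctx := context.Background()
	users := []string{"tenant-1", "tenant-2", "tenant-3"}

	// Run at most 10 jobs in parallel, one per user. The jobCtx passed to the
	// callback is canceled once ForEachJob returns, so anything that must
	// outlive this loop (e.g. a per-tenant rules manager) has to use the outer ctx.
	err := concurrency.ForEachJob(ctx, len(users), 10, func(jobCtx context.Context, idx int) error {
		fmt.Println("syncing rules for", users[idx])
		_ = jobCtx // only safe for work scoped to this single job
		return nil
	})
	if err != nil {
		fmt.Println("sync failed:", err)
	}
}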
pkg/ruler/ruler.go: 17 additions, 7 deletions
@@ -196,10 +196,11 @@ func (cfg *Config) syncQueuePollFrequency() time.Duration {
 }

 type rulerMetrics struct {
-	listRules       prometheus.Histogram
-	loadRuleGroups  prometheus.Histogram
-	ringCheckErrors prometheus.Counter
-	rulerSync       *prometheus.CounterVec
+	listRules         prometheus.Histogram
+	loadRuleGroups    prometheus.Histogram
+	ringCheckErrors   prometheus.Counter
+	rulerSync         *prometheus.CounterVec
+	rulerSyncDuration prometheus.Histogram
 }

 func newRulerMetrics(reg prometheus.Registerer) *rulerMetrics {
@@ -222,6 +223,11 @@ func newRulerMetrics(reg prometheus.Registerer) *rulerMetrics {
 			Name: "cortex_ruler_sync_rules_total",
 			Help: "Total number of times the ruler sync operation triggered.",
 		}, []string{"reason"}),
+		rulerSyncDuration: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
+			Name:    "cortex_ruler_sync_rules_duration_seconds",
+			Help:    "Time spent syncing all rule groups owned by this ruler instance. This metric tracks the timing of both full and partial sync, and includes the time spent loading rule groups from the storage.",
+			Buckets: []float64{1, 5, 10, 30, 60, 300, 600, 1800, 3600},
+		}),
 	}

 	// Init metrics.
@@ -535,12 +541,16 @@ func (r *Ruler) run(ctx context.Context) error {
 // We expect this function is only called from Ruler.run().
 func (r *Ruler) syncRules(ctx context.Context, userIDs []string, reason rulesSyncReason, cacheLookupEnabled bool) {
 	var (
-		configs map[string]rulespb.RuleGroupList
-		err     error
+		configs   map[string]rulespb.RuleGroupList
+		err       error
+		startTime = time.Now()
 	)

-	level.Debug(r.logger).Log("msg", "syncing rules", "reason", reason)
+	level.Info(r.logger).Log("msg", "syncing rules", "reason", reason)
 	r.metrics.rulerSync.WithLabelValues(string(reason)).Inc()
+	defer func() {
+		r.metrics.rulerSyncDuration.Observe(time.Since(startTime).Seconds())
+	}()

 	// List rule groups to sync.
 	if len(userIDs) > 0 {
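The core of the change is the deferred duration observation in syncRules(). The following is a self-contained sketch of the same pattern outside the Mimir codebase; the metric name, bucket layout, and syncRules body here are illustrative stand-ins, not the actual ruler code.

package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// syncDuration mirrors the role of cortex_ruler_sync_rules_duration_seconds,
// but the name and registry used here are examples only.
var syncDuration = promauto.With(prometheus.DefaultRegisterer).NewHistogram(prometheus.HistogramOpts{
	Name:    "example_sync_rules_duration_seconds",
	Help:    "Time spent syncing all rule groups owned by this instance.",
	Buckets: []float64{1, 5, 10, 30, 60, 300, 600, 1800, 3600},
})

func syncRules() {
	startTime := time.Now()
	// The deferred observation runs on every return path, so partial syncs and
	// early exits are still recorded in the histogram.
	defer func() {
		syncDuration.Observe(time.Since(startTime).Seconds())
	}()

	// ... list and load rule groups, then apply them to the per-tenant managers ...
	time.Sleep(100 * time.Millisecond) // stand-in for the real sync work
}

func main() {
	syncRules()
}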
