From d5bfebb8940dc5e1d31d8dcf7a55e9ce75d7eed2 Mon Sep 17 00:00:00 2001
From: Vignesh Shanmugam
Date: Fri, 24 May 2024 07:59:38 -0700
Subject: [PATCH] [Heartbeat] improve monitor state loader failure attempts (#39621)

* [Heartbeat] improve state loader failure logs

* try increasing timeouts

* exit when there is no error

* add state loader id
---
 CHANGELOG.next.asciidoc                          |  2 ++
 .../monitors/wrappers/monitorstate/tracker.go    | 16 +++++++++-------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc
index 4a02b3c9a1d..1db083bad9a 100644
--- a/CHANGELOG.next.asciidoc
+++ b/CHANGELOG.next.asciidoc
@@ -45,6 +45,8 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff]
 
 *Heartbeat*
 
+- Fix monitor state loader to not wait extra seconds for the last attempt {pull}39621[39621]
+
 *Metricbeat*
 
 - Setting period for counter cache for Prometheus remote_write at least to 60sec {pull}38553[38553]
diff --git a/heartbeat/monitors/wrappers/monitorstate/tracker.go b/heartbeat/monitors/wrappers/monitorstate/tracker.go
index 40a4e8f2ded..18a3b88753f 100644
--- a/heartbeat/monitors/wrappers/monitorstate/tracker.go
+++ b/heartbeat/monitors/wrappers/monitorstate/tracker.go
@@ -101,7 +101,7 @@ func (t *Tracker) GetCurrentState(sf stdfields.StdMonitorFields, rc RetryConfig)
 	var loadedState *State
 	var err error
 	var i int
-	for i = 0; i < attempts; i++ {
+	for i = 1; i <= attempts; i++ {
 		loadedState, err = t.stateLoader(sf)
 		if err == nil {
 			if loadedState != nil {
@@ -111,7 +111,13 @@ func (t *Tracker) GetCurrentState(sf stdfields.StdMonitorFields, rc RetryConfig)
 		}
 		var loaderError LoaderError
 		if errors.As(err, &loaderError) && !loaderError.Retry {
-			logp.L().Warnf("could not load last externally recorded state: %v", loaderError)
+			logp.L().Warnf("failed to load previous monitor state: %v", loaderError)
+			break
+		}
+
+		// last attempt, exit and log error without sleeping
+		if i == attempts {
+			logp.L().Warnf("failed to load previous monitor state: %s after %d attempts: %v", sf.ID, i, err)
 			break
 		}
 
@@ -120,17 +126,13 @@ func (t *Tracker) GetCurrentState(sf stdfields.StdMonitorFields, rc RetryConfig)
 		if rc.waitFn != nil {
 			sleepFor = rc.waitFn()
 		}
-		logp.L().Warnf("could not load last externally recorded state, will retry again in %d milliseconds: %v", sleepFor.Milliseconds(), err)
+		logp.L().Warnf("could not load previous monitor state, retrying in %d milliseconds: %v", sleepFor.Milliseconds(), err)
 		time.Sleep(sleepFor)
 	}
-	if err != nil {
-		logp.L().Warnf("could not load prior state from elasticsearch after %d attempts, will create new state for monitor: %s", i+1, sf.ID)
-	}
 
 	if loadedState != nil {
 		t.states[sf.ID] = loadedState
 	}
 
-	// Return what we found, even if nil
 	return loadedState
 }