diff --git a/nomad/core_sched.go b/nomad/core_sched.go index 3433daac7cc..0c008489de5 100644 --- a/nomad/core_sched.go +++ b/nomad/core_sched.go @@ -1300,7 +1300,7 @@ func (c *CoreScheduler) getThreshold(eval *structs.Evaluation, objectName, confi c.logger.Debug(fmt.Sprintf("forced %s GC", objectName)) } else { // Compute the old threshold limit for GC using the FSM - // time table. This is a rough mapping of a time to the + // time table. This is a rough mapping of a time to the // Raft index it belongs to. tt := c.srv.fsm.TimeTable() cutoff := time.Now().UTC().Add(-1 * configThreshold) diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index 1ad94d3b6c8..a637661fd6a 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -3073,7 +3073,7 @@ func TestCoreScheduler_ExpiredACLTokenGC(t *testing.T) { // Overwrite the timetable. The existing timetable has an entry due to the // ACL bootstrapping which makes witnessing a new index at a timestamp in // the past impossible. - tt := NewTimeTable(timeTableGranularity, timeTableLimit) + tt := NewTimeTable(timeTableGranularity, timeTableDefaultLimit) tt.Witness(20, time.Now().UTC().Add(-1*testServer.config.ACLTokenExpirationGCThreshold)) testServer.fsm.timetable = tt diff --git a/nomad/fsm.go b/nomad/fsm.go index d862c36177c..4669b5acde0 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -26,9 +26,11 @@ import ( const ( // timeTableGranularity is the granularity of index to time tracking timeTableGranularity = 5 * time.Minute +) - // timeTableLimit is the maximum limit of our tracking - timeTableLimit = 72 * time.Hour +var ( + // timeTableDefaultLimit is the default maximum limit of our index to time tracking; NewFSM computes a longer limit when a configured GC threshold exceeds it + timeTableDefaultLimit = 72 * time.Hour ) // SnapshotType is prefixed to a record in the FSM snapshot @@ -192,6 +194,11 @@ type FSMConfig struct { // JobTrackedVersions is the number of historic job versions that are kept. 
JobTrackedVersions int + + // LongestThreshold is the longest GC threshold that has been set in the server + // config. When it is longer than timeTableDefaultLimit (72h), NewFSM computes + // a time table limit of twice this value; nil keeps the default. + LongestThreshold *time.Duration } // NewFSM is used to construct a new FSM with a blank state. @@ -209,6 +216,13 @@ func NewFSM(config *FSMConfig) (*nomadFSM, error) { return nil, err } + // Use twice the longest configured GC threshold as the timeTableLimit when that + // threshold is longer than the default 72h; otherwise keep the default. + timeTableLimit := timeTableDefaultLimit + if config.LongestThreshold != nil && *config.LongestThreshold > timeTableDefaultLimit { + timeTableLimit = *config.LongestThreshold * 2 + } + fsm := &nomadFSM{ evalBroker: config.EvalBroker, periodicDispatcher: config.Periodic, diff --git a/nomad/server.go b/nomad/server.go index 27dbc0587e9..dc58ceb764d 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -1388,6 +1388,13 @@ func (s *Server) setupRaft() error { EventBufferSize: s.config.EventBufferSize, JobTrackedVersions: s.config.JobTrackedVersions, } + + // Pass the longest configured GC threshold to the FSM; if it is zero, LongestThreshold stays nil + longestThreshold := s.findLongestThreshold() + if longestThreshold != 0 { + fsmConfig.LongestThreshold = &longestThreshold + } + var err error s.fsm, err = NewFSM(fsmConfig) if err != nil { @@ -1657,6 +1664,20 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) ( return serf.Create(conf) } +func (s *Server) findLongestThreshold() time.Duration { + return max( + s.config.ACLTokenExpirationGCThreshold, + s.config.BatchEvalGCThreshold, + s.config.CSIPluginGCThreshold, + s.config.CSIVolumeClaimGCThreshold, + s.config.DeploymentGCThreshold, + s.config.EvalGCThreshold, + s.config.JobGCThreshold, + s.config.NodeGCThreshold, + s.config.RootKeyGCThreshold, + ) +} + // shouldReloadSchedulers checks the new config to determine if the scheduler worker pool // needs to be updated. 
If so, returns true and a pointer to a populated SchedulerWorkerPoolArgs func shouldReloadSchedulers(s *Server, newPoolArgs *SchedulerWorkerPoolArgs) (bool, *SchedulerWorkerPoolArgs) {