Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fsm: adjust timeTableLimit according to longest GC threshold #24112

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion nomad/core_sched.go
Original file line number Diff line number Diff line change
Expand Up @@ -1300,7 +1300,7 @@ func (c *CoreScheduler) getThreshold(eval *structs.Evaluation, objectName, confi
c.logger.Debug(fmt.Sprintf("forced %s GC", objectName))
} else {
// Compute the old threshold limit for GC using the FSM
// time table. This is a rough mapping of a time to the
// time table. This is a rough mapping of a time to the
// Raft index it belongs to.
tt := c.srv.fsm.TimeTable()
cutoff := time.Now().UTC().Add(-1 * configThreshold)
Expand Down
2 changes: 1 addition & 1 deletion nomad/core_sched_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3073,7 +3073,7 @@ func TestCoreScheduler_ExpiredACLTokenGC(t *testing.T) {
// Overwrite the timetable. The existing timetable has an entry due to the
// ACL bootstrapping which makes witnessing a new index at a timestamp in
// the past impossible.
tt := NewTimeTable(timeTableGranularity, timeTableLimit)
tt := NewTimeTable(timeTableGranularity, timeTableDefaultLimit)
tt.Witness(20, time.Now().UTC().Add(-1*testServer.config.ACLTokenExpirationGCThreshold))
testServer.fsm.timetable = tt

Expand Down
18 changes: 16 additions & 2 deletions nomad/fsm.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@ import (
const (
// timeTableGranularity is the granularity of index to time tracking
timeTableGranularity = 5 * time.Minute
)

// timeTableLimit is the maximum limit of our tracking
timeTableLimit = 72 * time.Hour
var (
// timeTableDefaultLimit is the default maximum limit of our tracking
timeTableDefaultLimit = 72 * time.Hour
)

// SnapshotType is prefixed to a record in the FSM snapshot
Expand Down Expand Up @@ -192,6 +194,11 @@ type FSMConfig struct {

// JobTrackedVersions is the number of historic job versions that are kept.
JobTrackedVersions int

// LongestThreshold is the longest GC threshold that has been set in the server
// config. When it exceeds timeTableDefaultLimit (72h), NewFSM computes a larger
// timeTableLimit (twice this value) instead of using the default, since users
// can configure GC thresholds longer than the default limit.
LongestThreshold *time.Duration
}

// NewFSM is used to construct a new FSM with a blank state.
Expand All @@ -209,6 +216,13 @@ func NewFSM(config *FSMConfig) (*nomadFSM, error) {
return nil, err
}

// adjust the timeTableLimit if there's any configured GC threshold longer than
// the default 72h
timeTableLimit := timeTableDefaultLimit
if config.LongestThreshold != nil && *config.LongestThreshold > timeTableDefaultLimit {
timeTableLimit = *config.LongestThreshold * 2
}

fsm := &nomadFSM{
evalBroker: config.EvalBroker,
periodicDispatcher: config.Periodic,
Expand Down
21 changes: 21 additions & 0 deletions nomad/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -1388,6 +1388,13 @@ func (s *Server) setupRaft() error {
EventBufferSize: s.config.EventBufferSize,
JobTrackedVersions: s.config.JobTrackedVersions,
}

// Check for any GC thresholds that have been set
longestThreshold := s.findLongestThreshold()
if longestThreshold != 0 {
fsmConfig.LongestThreshold = &longestThreshold
}

var err error
s.fsm, err = NewFSM(fsmConfig)
if err != nil {
Expand Down Expand Up @@ -1657,6 +1664,20 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (
return serf.Create(conf)
}

// findLongestThreshold returns the largest of the GC threshold durations read
// from the server config: ACL token expiration, batch eval, CSI plugin, CSI
// volume claim, deployment, eval, job, node, and root key.
func (s *Server) findLongestThreshold() time.Duration {
	cfg := s.config

	thresholds := []time.Duration{
		cfg.ACLTokenExpirationGCThreshold,
		cfg.BatchEvalGCThreshold,
		cfg.CSIPluginGCThreshold,
		cfg.CSIVolumeClaimGCThreshold,
		cfg.DeploymentGCThreshold,
		cfg.EvalGCThreshold,
		cfg.JobGCThreshold,
		cfg.NodeGCThreshold,
		cfg.RootKeyGCThreshold,
	}

	// Seed with the first entry rather than zero so the result is the true
	// maximum of the configured values, whatever their sign.
	longest := thresholds[0]
	for _, candidate := range thresholds[1:] {
		if candidate > longest {
			longest = candidate
		}
	}
	return longest
}

// shouldReloadSchedulers checks the new config to determine if the scheduler worker pool
// needs to be updated. If so, returns true and a pointer to a populated SchedulerWorkerPoolArgs
func shouldReloadSchedulers(s *Server, newPoolArgs *SchedulerWorkerPoolArgs) (bool, *SchedulerWorkerPoolArgs) {
Expand Down