Skip to content

Commit

Permalink
add the ability to remove nodes older than a certain age even when th…
Browse files Browse the repository at this point in the history
…e node group has reached the min

- adds max_node_age field to node group configuration
- max_node_age by default is disabled, and needs to be a positive, greater than zero duration to enable
- when the node group has reached the minimum and there are nodes older than the max_node_age, the scale delta will be set to +1
- this will then force escalator to replace the oldest node

fixes #239

Signed-off-by: Alex Price <[email protected]>
  • Loading branch information
awprice committed May 27, 2024
1 parent 97246b1 commit d5813c5
Show file tree
Hide file tree
Showing 5 changed files with 311 additions and 8 deletions.
33 changes: 33 additions & 0 deletions pkg/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,12 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
nodesDelta = int(math.Max(float64(nodesDelta), 1))
}

if c.scaleOnMaxNodeAge(nodeGroup, untaintedNodes) {
log.WithField("nodegroup", nodegroup).
Info("Setting scale to minimum of 1 to rotate out a node older than the max node age")
nodesDelta = int(math.Max(float64(nodesDelta), 1))
}

log.WithField("nodegroup", nodegroup).Debugf("Delta: %v", nodesDelta)

scaleOptions := scaleOpts{
Expand Down Expand Up @@ -431,6 +437,33 @@ func (c *Controller) isScaleOnStarve(
len(untaintedNodes) < nodeGroup.Opts.MaxNodes
}

// scaleOnMaxNodeAge returns true if the node group is at the minimum and there is a node in the untaintedNodes older
// than the configured node group's maxNodeAge. The idea here is to constantly rotate out the oldest nodes
// when the node group is at the minimum. This is to ensure the nodes are receiving the most up-to-date configuration
// from the cloud provider.
func (c *Controller) scaleOnMaxNodeAge(nodeGroup *NodeGroupState, untaintedNodes []*v1.Node) bool {
// Only enable this functionality if the node group has the feature enabled
if nodeGroup.Opts.MaxNodeAgeDuration() <= 0 {
return false
}

// We don't want to attempt to rotate nodes that have reached the max age if we haven't reached the minimum node
// count, as the scaling down of the node group will remove the oldest first anyway.
if len(untaintedNodes) != nodeGroup.Opts.MinNodes || len(untaintedNodes) == 0 {
return false
}

// Determine if there is an untainted node that exceeds the max age, if so then we should scale up
// to trigger that node to be replaced.
for _, n := range untaintedNodes {
if time.Now().Sub(n.CreationTimestamp.Time) > nodeGroup.Opts.MaxNodeAgeDuration() {

Check failure on line 459 in pkg/controller/controller.go

View workflow job for this annotation

GitHub Actions / lint

S1012: should use `time.Since` instead of `time.Now().Sub` (gosimple)
return true
}
}

return false
}

// RunOnce performs the main autoscaler logic once
func (c *Controller) RunOnce() error {
startTime := time.Now()
Expand Down
224 changes: 216 additions & 8 deletions pkg/controller/controller_scale_node_group_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package controller

import (
"testing"
duration "time"
stdtime "time"

"github.com/atlassian/escalator/pkg/k8s"
"github.com/atlassian/escalator/pkg/k8s/resource"
Expand Down Expand Up @@ -39,7 +39,7 @@ func buildTestClient(nodes []*v1.Node, pods []*v1.Pod, nodeGroups []NodeGroupOpt
opts := Opts{
K8SClient: fakeClient,
NodeGroups: nodeGroups,
ScanInterval: 1 * duration.Minute,
ScanInterval: 1 * stdtime.Minute,
DryMode: false,
}
allPodLister, err := test.NewTestPodWatcher(pods, listerOptions.podListerOptions)
Expand Down Expand Up @@ -853,7 +853,7 @@ func TestScaleNodeGroup_MultipleRuns(t *testing.T) {
args args
scaleUpWithCachedCapacity bool
runs int
runInterval duration.Duration
runInterval stdtime.Duration
want int
err error
}{
Expand All @@ -879,7 +879,7 @@ func TestScaleNodeGroup_MultipleRuns(t *testing.T) {
},
false,
1,
duration.Minute,
stdtime.Minute,
-4,
nil,
},
Expand All @@ -905,7 +905,7 @@ func TestScaleNodeGroup_MultipleRuns(t *testing.T) {
},
false,
5,
duration.Minute,
stdtime.Minute,
-2,
nil,
},
Expand All @@ -931,7 +931,7 @@ func TestScaleNodeGroup_MultipleRuns(t *testing.T) {
},
false,
1,
duration.Minute,
stdtime.Minute,
-4,
nil,
},
Expand All @@ -958,7 +958,7 @@ func TestScaleNodeGroup_MultipleRuns(t *testing.T) {
},
false,
1,
duration.Minute,
stdtime.Minute,
1,
nil,
},
Expand All @@ -985,7 +985,7 @@ func TestScaleNodeGroup_MultipleRuns(t *testing.T) {
},
true,
1,
duration.Minute,
stdtime.Minute,
6,
nil,
},
Expand Down Expand Up @@ -1060,3 +1060,211 @@ func TestScaleNodeGroup_MultipleRuns(t *testing.T) {
})
}
}

func TestScaleNodeGroupNodeMaxAge(t *testing.T) {
buildNode := func(creation stdtime.Time, tainted bool) *v1.Node {
return test.BuildTestNode(test.NodeOpts{
CPU: int64(1000),
Mem: int64(1000),
Creation: creation,
Tainted: tainted,
})
}

type args struct {
nodes []*v1.Node
pods []*v1.Pod
nodeGroupOptions NodeGroupOptions
listerOptions ListerOptions
}

tests := []struct {
name string
args args
expectedNodeDelta int
err error
}{
{
"max_node_age disabled",
args{
nodes: []*v1.Node{
buildNode(time.Now().Add(-1*stdtime.Hour), false),
buildNode(time.Now().Add(-24*stdtime.Hour), false),
buildNode(time.Now().Add(-36*stdtime.Hour), false),
},
pods: nil,
nodeGroupOptions: NodeGroupOptions{
Name: "default",
CloudProviderGroupName: "default",
MinNodes: 3,
MaxNodes: 10,
ScaleUpThresholdPercent: 70,
MaxNodeAge: "0",
},
listerOptions: ListerOptions{},
},
0,
nil,
},
{
"max_node_age enabled, max node age 12 hours",
args{
nodes: []*v1.Node{
buildNode(time.Now().Add(-1*stdtime.Hour), false),
buildNode(time.Now().Add(-24*stdtime.Hour), false),
buildNode(time.Now().Add(-36*stdtime.Hour), false),
},
pods: nil,
nodeGroupOptions: NodeGroupOptions{
Name: "default",
CloudProviderGroupName: "default",
MinNodes: 3,
MaxNodes: 10,
ScaleUpThresholdPercent: 70,
MaxNodeAge: "12h",
},
listerOptions: ListerOptions{},
},
1,
nil,
},
{
"max_node_age enabled, max node age 48 hours",
args{
nodes: []*v1.Node{
buildNode(time.Now().Add(-1*stdtime.Hour), false),
buildNode(time.Now().Add(-24*stdtime.Hour), false),
buildNode(time.Now().Add(-36*stdtime.Hour), false),
},
pods: nil,
nodeGroupOptions: NodeGroupOptions{
Name: "default",
CloudProviderGroupName: "default",
MinNodes: 3,
MaxNodes: 10,
ScaleUpThresholdPercent: 70,
MaxNodeAge: "48h",
},
listerOptions: ListerOptions{},
},
0,
nil,
},
{
"max_node_age enabled, but not at node minimum",
args{
nodes: []*v1.Node{
buildNode(time.Now().Add(-1*stdtime.Hour), false),
buildNode(time.Now().Add(-24*stdtime.Hour), false),
buildNode(time.Now().Add(-36*stdtime.Hour), false),
},
pods: nil,
nodeGroupOptions: NodeGroupOptions{
Name: "default",
CloudProviderGroupName: "default",
MinNodes: 1,
MaxNodes: 10,
ScaleUpThresholdPercent: 70,
MaxNodeAge: "12h",
},
listerOptions: ListerOptions{},
},
0,
nil,
},
{
"max_node_age enabled, but no nodes",
args{
nodes: nil,
pods: nil,
nodeGroupOptions: NodeGroupOptions{
Name: "default",
CloudProviderGroupName: "default",
MinNodes: 1,
MaxNodes: 10,
ScaleUpThresholdPercent: 70,
MaxNodeAge: "12h",
},
listerOptions: ListerOptions{},
},
0,
nil,
},
{
"max_node_age enabled, some nodes are tainted",
args{
nodes: []*v1.Node{
buildNode(time.Now().Add(-1*stdtime.Hour), false),
buildNode(time.Now().Add(-24*stdtime.Hour), false),
buildNode(time.Now().Add(-36*stdtime.Hour), true),
},
pods: nil,
nodeGroupOptions: NodeGroupOptions{
Name: "default",
CloudProviderGroupName: "default",
MinNodes: 1,
MaxNodes: 10,
ScaleUpThresholdPercent: 70,
MaxNodeAge: "12h",
},
listerOptions: ListerOptions{},
},
0,
nil,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
nodeGroups := []NodeGroupOptions{tt.args.nodeGroupOptions}
ngName := tt.args.nodeGroupOptions.Name
client, opts, err := buildTestClient(tt.args.nodes, tt.args.pods, nodeGroups, tt.args.listerOptions)
require.NoError(t, err)

// For these test cases we only use 1 node group/cloud provider node group
nodeGroupSize := 1

// Create a test (mock) cloud provider
testCloudProvider := test.NewCloudProvider(nodeGroupSize)
testNodeGroup := test.NewNodeGroup(
tt.args.nodeGroupOptions.CloudProviderGroupName,
tt.args.nodeGroupOptions.Name,
int64(tt.args.nodeGroupOptions.MinNodes),
int64(tt.args.nodeGroupOptions.MaxNodes),
int64(len(tt.args.nodes)),
)
testCloudProvider.RegisterNodeGroup(testNodeGroup)

// Create a node group state with the mapping of node groups to the cloud providers node groups
nodeGroupsState := BuildNodeGroupsState(nodeGroupsStateOpts{
nodeGroups: nodeGroups,
client: *client,
})

controller := &Controller{
Client: client,
Opts: opts,
stopChan: nil,
nodeGroups: nodeGroupsState,
cloudProvider: testCloudProvider,
}

nodesDelta, err := controller.scaleNodeGroup(ngName, nodeGroupsState[ngName])

// Ensure there were no errors
if tt.err == nil {
require.NoError(t, err)
} else {
require.EqualError(t, tt.err, err.Error())
}

assert.Equal(t, tt.expectedNodeDelta, nodesDelta)
if nodesDelta <= 0 {
return
}

// Ensure the node group on the cloud provider side scales up to the correct amount
assert.Equal(t, int64(len(tt.args.nodes)+nodesDelta), testNodeGroup.TargetSize())
})
}
}
27 changes: 27 additions & 0 deletions pkg/controller/node_group.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,13 @@ type NodeGroupOptions struct {

AWS AWSNodeGroupOptions `json:"aws" yaml:"aws"`

MaxNodeAge string `json:"max_node_age,omitempty" yaml:"max_node_age,omitempty"`

// Private variables for storing the parsed duration from the string
softDeleteGracePeriodDuration time.Duration
hardDeleteGracePeriodDuration time.Duration
scaleUpCoolDownPeriodDuration time.Duration
maxNodeAgeDuration time.Duration
}

// AWSNodeGroupOptions represents a nodegroup running on a cluster that is
Expand Down Expand Up @@ -124,6 +127,9 @@ func ValidateNodeGroup(nodegroup NodeGroupOptions) []error {
checkThat(validTaintEffect(nodegroup.TaintEffect), "taint_effect must be valid kubernetes taint")

checkThat(validAWSLifecycle(nodegroup.AWS.Lifecycle), "aws.lifecycle must be '%v' or '%v' if provided.", aws.LifecycleOnDemand, aws.LifecycleSpot)

checkThat(validMaxNodeAgeDuration(nodegroup.MaxNodeAge), "max_node_age failed to parse into a time.Duration. Set to '0' or '' to disable, or a positive Go duration to enable.")

return problems
}

Expand All @@ -137,6 +143,15 @@ func validTaintEffect(taintEffect v1.TaintEffect) bool {
return len(taintEffect) == 0 || k8s.TaintEffectTypes[taintEffect]
}

func validMaxNodeAgeDuration(maxNodeAge string) bool {
// Accept a blank maxNodeAge as valid, which will disable the feature
if maxNodeAge == "" {
return true
}
_, err := time.ParseDuration(maxNodeAge)
return err == nil
}

// SoftDeleteGracePeriodDuration lazily returns/parses the softDeleteGracePeriod string into a duration
func (n *NodeGroupOptions) SoftDeleteGracePeriodDuration() time.Duration {
if n.softDeleteGracePeriodDuration == 0 {
Expand Down Expand Up @@ -176,6 +191,18 @@ func (n *NodeGroupOptions) ScaleUpCoolDownPeriodDuration() time.Duration {
return n.scaleUpCoolDownPeriodDuration
}

func (n *NodeGroupOptions) MaxNodeAgeDuration() time.Duration {
if n.maxNodeAgeDuration == 0 {
duration, err := time.ParseDuration(n.MaxNodeAge)
if err != nil {
return 0
}
n.maxNodeAgeDuration = duration
}

return n.maxNodeAgeDuration
}

// autoDiscoverMinMaxNodeOptions returns whether the min_nodes and max_nodes options should be "auto-discovered" from the cloud provider
func (n *NodeGroupOptions) autoDiscoverMinMaxNodeOptions() bool {
return n.MinNodes == 0 && n.MaxNodes == 0
Expand Down
Loading

0 comments on commit d5813c5

Please sign in to comment.