Extract getting nodes to delete for atomic node groups
Extract logic for overriding nodes to delete when deleting
nodes from a ZeroOrMaxNodeScaling node group.
Simplifies the code and removes code duplication.
bskiba committed Jul 18, 2024
1 parent dee3865 commit 1ea1385
Showing 2 changed files with 132 additions and 35 deletions.
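
Before the diffs, a minimal, self-contained Go sketch of the contract the commit message describes. The toy stand-in types and names here (options, nodeGroup, atomicGroup) are illustrative only, not the real cluster-autoscaler config and cloudprovider APIs; the real signature and call sites are in the diff below.

package main

import (
	"errors"
	"fmt"
)

// Toy stand-ins for the real config and cloudprovider types; only the
// behavior relevant to ZeroOrMaxNodeScaling is modeled.
var errNotImplemented = errors.New("not implemented")

type options struct{ zeroOrMaxNodeScaling bool }

type nodeGroup interface {
	id() string
	getOptions(defaults options) (*options, error)
	nodes() ([]string, error) // instance IDs
}

// zeroOrMaxNodesToDelete mirrors the extracted helper: for an atomically
// scaled ("zero or max") group it reports true and returns every node in
// the group, so cleanup after a failed scale-up stays all-or-nothing.
func zeroOrMaxNodesToDelete(defaults options, ng nodeGroup) (bool, []string, error) {
	opts, err := ng.getOptions(defaults)
	if err != nil && !errors.Is(err, errNotImplemented) {
		return false, nil, fmt.Errorf("failed to get options for %s: %w", ng.id(), err)
	}
	if opts != nil && opts.zeroOrMaxNodeScaling {
		all, err := ng.nodes()
		if err != nil {
			return false, nil, fmt.Errorf("failed to list nodes of %s: %w", ng.id(), err)
		}
		return true, all, nil
	}
	return false, nil, nil
}

// atomicGroup is a toy node group that scales atomically.
type atomicGroup struct{}

func (atomicGroup) id() string { return "ng-atomic" }
func (atomicGroup) getOptions(options) (*options, error) {
	return &options{zeroOrMaxNodeScaling: true}, nil
}
func (atomicGroup) nodes() ([]string, error) { return []string{"n1", "n2", "n3"}, nil }

func main() {
	// Only n2 was marked for deletion, but the override expands the
	// deletion set to the whole atomic group.
	nodesToDelete := []string{"n2"}
	if isAtomic, override, err := zeroOrMaxNodesToDelete(options{}, atomicGroup{}); err == nil && isAtomic {
		nodesToDelete = override
	}
	fmt.Println(nodesToDelete) // [n1 n2 n3]
}

Both cleanup paths in static_autoscaler.go consume the same (override?, nodes, error) triple, which is what removes the duplicated GetOptions and Nodes handling visible in the diff below.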
72 changes: 37 additions & 35 deletions cluster-autoscaler/core/static_autoscaler.go
@@ -743,7 +743,7 @@ func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clu
 	csr *clusterstate.ClusterStateRegistry, currentTime time.Time, logRecorder *utils.LogEventRecorder) (bool, error) {
 
 	nodeGroups := a.nodeGroupsById()
-	nodesToBeDeletedByNodeGroupId := make(map[string][]clusterstate.UnregisteredNode)
+	nodesToDeleteByNodeGroupId := make(map[string][]clusterstate.UnregisteredNode)
 	for _, unregisteredNode := range allUnregisteredNodes {
 		nodeGroup, err := a.CloudProvider.NodeGroupForNode(unregisteredNode.Node)
 		if err != nil {
@@ -762,12 +762,12 @@ func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clu
 
 		if unregisteredNode.UnregisteredSince.Add(maxNodeProvisionTime).Before(currentTime) {
 			klog.V(0).Infof("Marking unregistered node %v for removal", unregisteredNode.Node.Name)
-			nodesToBeDeletedByNodeGroupId[nodeGroup.Id()] = append(nodesToBeDeletedByNodeGroupId[nodeGroup.Id()], unregisteredNode)
+			nodesToDeleteByNodeGroupId[nodeGroup.Id()] = append(nodesToDeleteByNodeGroupId[nodeGroup.Id()], unregisteredNode)
 		}
 	}
 
 	removedAny := false
-	for nodeGroupId, unregisteredNodesToDelete := range nodesToBeDeletedByNodeGroupId {
+	for nodeGroupId, unregisteredNodesToDelete := range nodesToDeleteByNodeGroupId {
 		nodeGroup := nodeGroups[nodeGroupId]
 
 		klog.V(0).Infof("Removing %v unregistered nodes for node group %v", len(unregisteredNodesToDelete), nodeGroupId)
@@ -787,20 +787,13 @@ func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clu
 		}
 		nodesToDelete := toNodes(unregisteredNodesToDelete)
 
-		opts, err := nodeGroup.GetOptions(a.NodeGroupDefaults)
-		if err != nil && err != cloudprovider.ErrNotImplemented {
-			klog.Warningf("Failed to get node group options for %s: %s", nodeGroupId, err)
+		isZeroOrMaxNodesScaling, nodesToDeleteOverride, err := zeroOrMaxNodesToDelete(a.NodeGroupDefaults, nodeGroup)
+		if err != nil {
+			klog.Warningf("Failed to remove unregistered nodes from node group %s: %v", nodeGroupId, err)
 			continue
 		}
-		// If a scale-up of "ZeroOrMaxNodeScaling" node group failed, the cleanup
-		// should stick to the all-or-nothing principle. Deleting all nodes.
-		if opts != nil && opts.ZeroOrMaxNodeScaling {
-			instances, err := nodeGroup.Nodes()
-			if err != nil {
-				klog.Warningf("Failed to fill in unregistered nodes from group %s based on ZeroOrMaxNodeScaling option: %s", nodeGroupId, err)
-				continue
-			}
-			nodesToDelete = instancesToFakeNodes(instances)
+		if isZeroOrMaxNodesScaling {
+			nodesToDelete = nodesToDeleteOverride
 		}
 
 		err = nodeGroup.DeleteNodes(nodesToDelete)
@@ -835,35 +828,22 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
 	// We always schedule deleting of incoming errornous nodes
 	// TODO[lukaszos] Consider adding logic to not retry delete every loop iteration
 	nodeGroups := a.nodeGroupsById()
-	nodesToBeDeletedByNodeGroupId := a.clusterStateRegistry.GetCreatedNodesWithErrors()
+	nodesToDeleteByNodeGroupId := a.clusterStateRegistry.GetCreatedNodesWithErrors()
 
 	deletedAny := false
 
-	for nodeGroupId, nodesToBeDeleted := range nodesToBeDeletedByNodeGroupId {
+	for nodeGroupId, nodesToDelete := range nodesToDeleteByNodeGroupId {
 		var err error
-		klog.V(1).Infof("Deleting %v from %v node group because of create errors", len(nodesToBeDeleted), nodeGroupId)
+		klog.V(1).Infof("Deleting %v from %v node group because of create errors", len(nodesToDelete), nodeGroupId)
 
 		nodeGroup := nodeGroups[nodeGroupId]
 		if nodeGroup == nil {
 			err = fmt.Errorf("node group %s not found", nodeGroupId)
-		} else {
-			var opts *config.NodeGroupAutoscalingOptions
-			opts, err = nodeGroup.GetOptions(a.NodeGroupDefaults)
-			if err != nil && err != cloudprovider.ErrNotImplemented {
-				klog.Warningf("Failed to get node group options for %s: %s", nodeGroupId, err)
-				continue
-			}
-			// If a scale-up of "ZeroOrMaxNodeScaling" node group failed, the cleanup
-			// should stick to the all-or-nothing principle. Deleting all nodes.
-			if opts != nil && opts.ZeroOrMaxNodeScaling {
-				instances, err := nodeGroup.Nodes()
-				if err != nil {
-					klog.Warningf("Failed to fill in failed nodes from group %s based on ZeroOrMaxNodeScaling option: %s", nodeGroupId, err)
-					continue
-				}
-				nodesToBeDeleted = instancesToFakeNodes(instances)
+		} else if isZeroOrMaxNodesScaling, nodesToDeleteOverride, err := zeroOrMaxNodesToDelete(a.NodeGroupDefaults, nodeGroup); err == nil {
+			if isZeroOrMaxNodesScaling {
+				nodesToDelete = nodesToDeleteOverride
 			}
-			err = nodeGroup.DeleteNodes(nodesToBeDeleted)
+			err = nodeGroup.DeleteNodes(nodesToDelete)
 		}
 
 		if err != nil {
@@ -877,6 +857,28 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
 	return deletedAny
 }
 
+// zeroOrMaxNodesToDelete returns false if the node group is not a "ZeroOrMaxNodeScaling" node group.
+// For a "ZeroOrMaxNodeScaling" node group it returns true, together with an override list of nodes to delete:
+// node deletion in such a group is atomic and should delete all nodes.
+//
+// Ideally this would be handled by the node group itself, but with the current setup it would have to be implemented by every cloud provider.
+func zeroOrMaxNodesToDelete(defaults config.NodeGroupAutoscalingOptions, nodeGroup cloudprovider.NodeGroup) (bool, []*apiv1.Node, error) {
+	opts, err := nodeGroup.GetOptions(defaults)
+	if err != nil && err != cloudprovider.ErrNotImplemented {
+		return false, []*apiv1.Node{}, fmt.Errorf("Failed to get node group options for %s: %s", nodeGroup.Id(), err)
+	}
+	// If a scale-up of "ZeroOrMaxNodeScaling" node group failed, the cleanup
+	// should stick to the all-or-nothing principle. Deleting all nodes.
+	if opts != nil && opts.ZeroOrMaxNodeScaling {
+		instances, err := nodeGroup.Nodes()
+		if err != nil {
+			return false, []*apiv1.Node{}, fmt.Errorf("Failed to fill in failed nodes from group %s based on ZeroOrMaxNodeScaling option: %s", nodeGroup.Id(), err)
+		}
+		return true, instancesToFakeNodes(instances), nil
+	}
+	return false, []*apiv1.Node{}, nil
+}
+
 // instancesToNodes returns a list of fake nodes with just names populated,
 // so that they can be passed as nodes to delete
 func instancesToFakeNodes(instances []cloudprovider.Instance) []*apiv1.Node {
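The diff above truncates the body of instancesToFakeNodes. Its doc comment says it returns fake nodes with just names populated, and the test below expects those names to equal the instance IDs D1 and D2. A plausible self-contained sketch of such a helper, with toy types standing in for cloudprovider.Instance and apiv1.Node (an assumption, not the actual implementation):

package main

import "fmt"

// Toy stand-ins for cloudprovider.Instance and apiv1.Node (assumed; the
// actual types and helper body are truncated in the diff above).
type instance struct{ id string }

type node struct{ name string }

// instancesToFakeNodes sketches the truncated helper: build placeholder
// nodes carrying only a name (here the instance ID), which is enough for
// DeleteNodes to identify the cloud instances to remove.
func instancesToFakeNodes(instances []instance) []*node {
	nodes := make([]*node, 0, len(instances))
	for _, inst := range instances {
		nodes = append(nodes, &node{name: inst.id})
	}
	return nodes
}

func main() {
	for _, n := range instancesToFakeNodes([]instance{{id: "D1"}, {id: "D2"}}) {
		fmt.Println(n.name) // D1, then D2
	}
}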
95 changes: 95 additions & 0 deletions cluster-autoscaler/core/static_autoscaler_test.go
@@ -2266,6 +2266,101 @@ func TestRemoveOldUnregisteredNodesAtomic(t *testing.T) {
 	assert.ElementsMatch(t, wantNames, deletedNames)
 }
 
+func TestZeroOrMaxNodesToDelete(t *testing.T) {
+	defaultOptions := config.NodeGroupAutoscalingOptions{}
+	nodeGroupAtomic := &mockprovider.NodeGroup{}
+	nodeGroupAtomic.On("GetOptions", defaultOptions).Return(
+		&config.NodeGroupAutoscalingOptions{
+			ZeroOrMaxNodeScaling: true,
+		}, nil)
+	nodeGroupAtomic.On("Nodes").Return([]cloudprovider.Instance{
+		{
+			Id: "D1",
+			Status: &cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceRunning,
+			},
+		},
+		{
+			Id: "D2",
+			Status: &cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceCreating,
+				ErrorInfo: &cloudprovider.InstanceErrorInfo{
+					ErrorClass: cloudprovider.OtherErrorClass,
+					ErrorCode:  "OTHER",
+				},
+			},
+		},
+	}, nil)
+
+	nodeGroupRegular := &mockprovider.NodeGroup{}
+	nodeGroupRegular.On("GetOptions", defaultOptions).Return(
+		&config.NodeGroupAutoscalingOptions{}, nil)
+
+	nodeGroupOptionsError := &mockprovider.NodeGroup{}
+	nodeGroupOptionsError.On("GetOptions", defaultOptions).Return(
+		&config.NodeGroupAutoscalingOptions{}, fmt.Errorf("GetOptions error"))
+	nodeGroupOptionsError.On("Id").Return("ng-1")
+
+	nodeGroupInstancesError := &mockprovider.NodeGroup{}
+	nodeGroupInstancesError.On("GetOptions", defaultOptions).Return(
+		&config.NodeGroupAutoscalingOptions{
+			ZeroOrMaxNodeScaling: true,
+		}, nil)
+	nodeGroupInstancesError.On("Nodes").Return([]cloudprovider.Instance{}, fmt.Errorf("Instances error"))
+	nodeGroupInstancesError.On("Id").Return("ng-2")
+
+	testCases := []struct {
+		name              string
+		nodeGroup         cloudprovider.NodeGroup
+		wantIsZeroOrMax   bool
+		wantErr           bool
+		wantInstanceNames []string
+	}{
+		{
+			name:              "No override for non-atomic node group",
+			nodeGroup:         nodeGroupRegular,
+			wantIsZeroOrMax:   false,
+			wantErr:           false,
+			wantInstanceNames: []string{},
+		}, {
+			name:              "Override for all nodes from atomic node pool",
+			nodeGroup:         nodeGroupAtomic,
+			wantIsZeroOrMax:   true,
+			wantErr:           false,
+			wantInstanceNames: []string{"D1", "D2"},
+		}, {
+			name:              "Error on GetOptions",
+			nodeGroup:         nodeGroupOptionsError,
+			wantIsZeroOrMax:   false,
+			wantErr:           true,
+			wantInstanceNames: []string{},
+		}, {
+			name:              "Error on GetInstances",
+			nodeGroup:         nodeGroupInstancesError,
+			wantIsZeroOrMax:   false,
+			wantErr:           true,
+			wantInstanceNames: []string{},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			gotIsZeroOrMax, gotInstances, gotErr := zeroOrMaxNodesToDelete(defaultOptions, tc.nodeGroup)
+			assert.Equal(t, tc.wantIsZeroOrMax, gotIsZeroOrMax, "Unexpected isZeroOrMax, want:%v, got:%v", tc.wantIsZeroOrMax, gotIsZeroOrMax)
+			if tc.wantErr {
+				assert.Error(t, gotErr)
+			} else {
+				assert.NoError(t, gotErr)
+			}
+			gotInstanceNames := []string{}
+			for _, instance := range gotInstances {
+				gotInstanceNames = append(gotInstanceNames, instance.Name)
+			}
+			assert.ElementsMatch(t, tc.wantInstanceNames, gotInstanceNames, "Unexpected instances to delete, want: %v, got %v", tc.wantInstanceNames, gotInstanceNames)
+		})
+	}
+}
+
 func TestSubtractNodes(t *testing.T) {
 	ns := make([]*apiv1.Node, 5)
 	for i := 0; i < len(ns); i++ {
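Assuming a standard Go toolchain, the new test can be run on its own with something like go test -run TestZeroOrMaxNodesToDelete ./core/... from the cluster-autoscaler module root; the exact invocation depends on the checkout layout.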
