diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index d3c59b56c59c..3814af293687 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -743,7 +743,7 @@ func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clu
 	csr *clusterstate.ClusterStateRegistry, currentTime time.Time, logRecorder *utils.LogEventRecorder) (bool, error) {
 	nodeGroups := a.nodeGroupsById()
-	nodesToBeDeletedByNodeGroupId := make(map[string][]clusterstate.UnregisteredNode)
+	nodesToDeleteByNodeGroupId := make(map[string][]clusterstate.UnregisteredNode)
 	for _, unregisteredNode := range allUnregisteredNodes {
 		nodeGroup, err := a.CloudProvider.NodeGroupForNode(unregisteredNode.Node)
 		if err != nil {
@@ -762,12 +762,12 @@ func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clu
 		if unregisteredNode.UnregisteredSince.Add(maxNodeProvisionTime).Before(currentTime) {
 			klog.V(0).Infof("Marking unregistered node %v for removal", unregisteredNode.Node.Name)
-			nodesToBeDeletedByNodeGroupId[nodeGroup.Id()] = append(nodesToBeDeletedByNodeGroupId[nodeGroup.Id()], unregisteredNode)
+			nodesToDeleteByNodeGroupId[nodeGroup.Id()] = append(nodesToDeleteByNodeGroupId[nodeGroup.Id()], unregisteredNode)
 		}
 	}
 
 	removedAny := false
-	for nodeGroupId, unregisteredNodesToDelete := range nodesToBeDeletedByNodeGroupId {
+	for nodeGroupId, unregisteredNodesToDelete := range nodesToDeleteByNodeGroupId {
 		nodeGroup := nodeGroups[nodeGroupId]
 		klog.V(0).Infof("Removing %v unregistered nodes for node group %v", len(unregisteredNodesToDelete), nodeGroupId)
@@ -787,20 +787,13 @@ func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clu
 		}
 		nodesToDelete := toNodes(unregisteredNodesToDelete)
-		opts, err := nodeGroup.GetOptions(a.NodeGroupDefaults)
-		if err != nil && err != cloudprovider.ErrNotImplemented {
-			klog.Warningf("Failed to get node group options for %s: %s", nodeGroupId, err)
+		isZeroOrMaxNodesScaling, nodesToDeleteOverride, err := zeroOrMaxNodesToDelete(a.NodeGroupDefaults, nodeGroup)
+		if err != nil {
+			klog.Warningf("Failed to remove unregistered nodes from node group %s: %v", nodeGroupId, err)
 			continue
 		}
-		// If a scale-up of "ZeroOrMaxNodeScaling" node group failed, the cleanup
-		// should stick to the all-or-nothing principle. Deleting all nodes.
-		if opts != nil && opts.ZeroOrMaxNodeScaling {
-			instances, err := nodeGroup.Nodes()
-			if err != nil {
-				klog.Warningf("Failed to fill in unregistered nodes from group %s based on ZeroOrMaxNodeScaling option: %s", nodeGroupId, err)
-				continue
-			}
-			nodesToDelete = instancesToFakeNodes(instances)
+		if isZeroOrMaxNodesScaling {
+			nodesToDelete = nodesToDeleteOverride
 		}
 
 		err = nodeGroup.DeleteNodes(nodesToDelete)
@@ -835,35 +828,22 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
 	// We always schedule deleting of incoming errornous nodes
 	// TODO[lukaszos] Consider adding logic to not retry delete every loop iteration
 	nodeGroups := a.nodeGroupsById()
-	nodesToBeDeletedByNodeGroupId := a.clusterStateRegistry.GetCreatedNodesWithErrors()
+	nodesToDeleteByNodeGroupId := a.clusterStateRegistry.GetCreatedNodesWithErrors()
 
 	deletedAny := false
 
-	for nodeGroupId, nodesToBeDeleted := range nodesToBeDeletedByNodeGroupId {
+	for nodeGroupId, nodesToDelete := range nodesToDeleteByNodeGroupId {
 		var err error
-		klog.V(1).Infof("Deleting %v from %v node group because of create errors", len(nodesToBeDeleted), nodeGroupId)
+		klog.V(1).Infof("Deleting %v from %v node group because of create errors", len(nodesToDelete), nodeGroupId)
 
 		nodeGroup := nodeGroups[nodeGroupId]
 
 		if nodeGroup == nil {
 			err = fmt.Errorf("node group %s not found", nodeGroupId)
-		} else {
-			var opts *config.NodeGroupAutoscalingOptions
-			opts, err = nodeGroup.GetOptions(a.NodeGroupDefaults)
-			if err != nil && err != cloudprovider.ErrNotImplemented {
-				klog.Warningf("Failed to get node group options for %s: %s", nodeGroupId, err)
-				continue
-			}
-			// If a scale-up of "ZeroOrMaxNodeScaling" node group failed, the cleanup
-			// should stick to the all-or-nothing principle. Deleting all nodes.
-			if opts != nil && opts.ZeroOrMaxNodeScaling {
-				instances, err := nodeGroup.Nodes()
-				if err != nil {
-					klog.Warningf("Failed to fill in failed nodes from group %s based on ZeroOrMaxNodeScaling option: %s", nodeGroupId, err)
-					continue
-				}
-				nodesToBeDeleted = instancesToFakeNodes(instances)
+		} else if isZeroOrMaxNodesScaling, nodesToDeleteOverride, err := zeroOrMaxNodesToDelete(a.NodeGroupDefaults, nodeGroup); err == nil {
+			if isZeroOrMaxNodesScaling {
+				nodesToDelete = nodesToDeleteOverride
 			}
-			err = nodeGroup.DeleteNodes(nodesToBeDeleted)
+			err = nodeGroup.DeleteNodes(nodesToDelete)
 		}
 		if err != nil {
@@ -877,6 +857,28 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
 	return deletedAny
 }
 
+// zeroOrMaxNodesToDelete returns false if the node group is not a "ZeroOrMaxNodeScaling" scaling node group.
+// Returns true for "ZeroOrMaxNodeScaling" scaling node group, together with an override list of nodes to delete.
+// For a "ZeroOrMaxNodeScaling" node group node deletion is atomic and should delete all nodes.
+//
+// Ideally this would be handled by the node group itself, but with current setup, it would have to be implemented by every cloud provider.
+func zeroOrMaxNodesToDelete(defaults config.NodeGroupAutoscalingOptions, nodeGroup cloudprovider.NodeGroup) (bool, []*apiv1.Node, error) {
+	opts, err := nodeGroup.GetOptions(defaults)
+	if err != nil && err != cloudprovider.ErrNotImplemented {
+		return false, []*apiv1.Node{}, fmt.Errorf("Failed to get node group options for %s: %s", nodeGroup.Id(), err)
+	}
+	// If a scale-up of "ZeroOrMaxNodeScaling" node group failed, the cleanup
+	// should stick to the all-or-nothing principle. Deleting all nodes.
+	if opts != nil && opts.ZeroOrMaxNodeScaling {
+		instances, err := nodeGroup.Nodes()
+		if err != nil {
+			return false, []*apiv1.Node{}, fmt.Errorf("Failed to fill in failed nodes from group %s based on ZeroOrMaxNodeScaling option: %s", nodeGroup.Id(), err)
+		}
+		return true, instancesToFakeNodes(instances), nil
+	}
+	return false, []*apiv1.Node{}, nil
+}
+
 // instancesToNodes returns a list of fake nodes with just names populated,
 // so that they can be passed as nodes to delete
 func instancesToFakeNodes(instances []cloudprovider.Instance) []*apiv1.Node {
diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go
index f47e88f2021f..c476f4247f6f 100644
--- a/cluster-autoscaler/core/static_autoscaler_test.go
+++ b/cluster-autoscaler/core/static_autoscaler_test.go
@@ -2266,6 +2266,101 @@ func TestRemoveOldUnregisteredNodesAtomic(t *testing.T) {
 	assert.ElementsMatch(t, wantNames, deletedNames)
 }
 
+func TestZeroOrMaxNodesToDelete(t *testing.T) {
+	defaultOptions := config.NodeGroupAutoscalingOptions{}
+	nodeGroupAtomic := &mockprovider.NodeGroup{}
+	nodeGroupAtomic.On("GetOptions", defaultOptions).Return(
+		&config.NodeGroupAutoscalingOptions{
+			ZeroOrMaxNodeScaling: true,
+		}, nil)
+	nodeGroupAtomic.On("Nodes").Return([]cloudprovider.Instance{
+		{
+			Id: "D1",
+			Status: &cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceRunning,
+			},
+		},
+		{
+			Id: "D2",
+			Status: &cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceCreating,
+				ErrorInfo: &cloudprovider.InstanceErrorInfo{
+					ErrorClass: cloudprovider.OtherErrorClass,
+					ErrorCode:  "OTHER",
+				},
+			},
+		},
+	}, nil)
+
+	nodeGroupRegular := &mockprovider.NodeGroup{}
+	nodeGroupRegular.On("GetOptions", defaultOptions).Return(
+		&config.NodeGroupAutoscalingOptions{}, nil)
+
+	nodeGroupOptionsError := &mockprovider.NodeGroup{}
+	nodeGroupOptionsError.On("GetOptions", defaultOptions).Return(
+		&config.NodeGroupAutoscalingOptions{}, fmt.Errorf("GetOptions error"))
+	nodeGroupOptionsError.On("Id").Return("ng-1")
+
+	nodeGroupInstancesError := &mockprovider.NodeGroup{}
+	nodeGroupInstancesError.On("GetOptions", defaultOptions).Return(
+		&config.NodeGroupAutoscalingOptions{
+			ZeroOrMaxNodeScaling: true,
+		}, nil)
+	nodeGroupInstancesError.On("Nodes").Return([]cloudprovider.Instance{}, fmt.Errorf("Instances error"))
+	nodeGroupInstancesError.On("Id").Return("ng-2")
+
+	testCases := []struct {
+		name              string
+		nodeGroup         cloudprovider.NodeGroup
+		wantIsZeroOrMax   bool
+		wantErr           bool
+		wantInstanceNames []string
+	}{
+		{
+			name:              "No override for non-atomic node group",
+			nodeGroup:         nodeGroupRegular,
+			wantIsZeroOrMax:   false,
+			wantErr:           false,
+			wantInstanceNames: []string{},
+		}, {
+			name:              "Override for all nodes from atomic node pool",
+			nodeGroup:         nodeGroupAtomic,
+			wantIsZeroOrMax:   true,
+			wantErr:           false,
+			wantInstanceNames: []string{"D1", "D2"},
+		}, {
+			name:              "Error on GetOptions",
+			nodeGroup:         nodeGroupOptionsError,
+			wantIsZeroOrMax:   false,
+			wantErr:           true,
+			wantInstanceNames: []string{},
+		}, {
+			name:              "Error on GetInstances",
+			nodeGroup:         nodeGroupInstancesError,
+			wantIsZeroOrMax:   false,
+			wantErr:           true,
+			wantInstanceNames: []string{},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			gotIsZeroOrMax, gotInstances, gotErr := zeroOrMaxNodesToDelete(defaultOptions, tc.nodeGroup)
+			assert.Equal(t, tc.wantIsZeroOrMax, gotIsZeroOrMax, "Unexpected isZeroOrMax, want:%v, got:%v", tc.wantIsZeroOrMax, gotIsZeroOrMax)
+			if tc.wantErr {
+				assert.Error(t, gotErr)
+			} else {
+				assert.NoError(t, gotErr)
+			}
+			gotInstanceNames := []string{}
+			for _, instance := range gotInstances {
+				gotInstanceNames = append(gotInstanceNames, instance.Name)
+			}
+			assert.ElementsMatch(t, tc.wantInstanceNames, gotInstanceNames, "Unexpected instances to delete, want: %v, got %v", tc.wantInstanceNames, gotInstanceNames)
+		})
+	}
+}
+
 func TestSubtractNodes(t *testing.T) {
 	ns := make([]*apiv1.Node, 5)
 	for i := 0; i < len(ns); i++ {