Do not break the loop after removing failed scale up nodes
Clean up cluster state after removing failed scale up nodes,
so that the loop can continue. Most importantly, update the
target for the affected node group, so that the deleted nodes
are not considered upcoming.
bskiba committed Jul 26, 2024
1 parent d08f0b4 commit c766715
Showing 3 changed files with 99 additions and 6 deletions.
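For intuition behind the commit message above, the sketch below (not part of the commit; the function name and the simplification are illustrative) shows how upcoming nodes are roughly derived from a node group's current target size, and why a stale target after deleting failed nodes leaves phantom upcoming nodes.

package main

import "fmt"

// upcomingNodesSketch is an illustrative approximation, not autoscaler code:
// anything the node group is asked to run (its current target) that has not
// registered yet is treated as "upcoming". The real GetUpcomingNodes also
// distinguishes unready and long-unregistered nodes.
func upcomingNodesSketch(currentTarget, registeredNodes int) int {
    if upcoming := currentTarget - registeredNodes; upcoming > 0 {
        return upcoming
    }
    return 0
}

func main() {
    // A scale-up from 1 to 4 fails and the 3 broken nodes get deleted.
    fmt.Println(upcomingNodesSketch(4, 1)) // stale target: 3 nodes stay "upcoming" forever
    fmt.Println(upcomingNodesSketch(1, 1)) // refreshed target: 0, the loop can act again
}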
19 changes: 19 additions & 0 deletions cluster-autoscaler/clusterstate/clusterstate.go
@@ -333,6 +333,25 @@ func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprov
    csr.backoffNodeGroup(nodeGroup, errorInfo, currentTime)
}

// HandleDeletingFailedScaleUpNodes updates the registry after deleting nodes that failed
// to scale up. This ensures that the deleted nodes are not treated as upcoming when
// performing scale up. Upcoming nodes are calculated based on node group current target
// size (see GetUpcomingNodes function).
func (csr *ClusterStateRegistry) HandleDeletingFailedScaleUpNodes(nodeGroup cloudprovider.NodeGroup) error {
    klog.V(4).Infof("Updating state after failed scale up for %s nodeGroup", nodeGroup.Id())

    csr.InvalidateNodeInstancesCacheEntry(nodeGroup)
    targetSize, err := nodeGroup.TargetSize()
    if err != nil {
        return err
    }

    csr.Lock()
    defer csr.Unlock()
    csr.acceptableRanges[nodeGroup.Id()] = csr.acceptableRangeForNodeGroup(targetSize, nodeGroup.Id())
    return nil
}

// UpdateNodes updates the state of the nodes in the ClusterStateRegistry and recalculates the stats
func (csr *ClusterStateRegistry) UpdateNodes(nodes []*apiv1.Node, nodeInfosForGroups map[string]*schedulerframework.NodeInfo, currentTime time.Time) error {
    csr.updateNodeGroupMetrics()
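The doc comment on HandleDeletingFailedScaleUpNodes points at the acceptable range it recomputes from the freshly read target size. The sketch below is a simplified reconstruction consistent with the expectations in the test added below, not the actual acceptableRangeForNodeGroup (which, for example, also accounts for pending scale-down requests).

package main

import "fmt"

// AcceptableRange mirrors the fields asserted in the new test; the struct here
// is a stand-in for illustration, not the autoscaler's definition.
type AcceptableRange struct {
    MinNodes      int // fewest nodes the registry tolerates for the group
    CurrentTarget int // the node group's target size as last observed
    MaxNodes      int // most nodes the registry tolerates for the group
}

// acceptableRangeSketch approximates rebuilding the range from a refreshed
// target: a still-pending scale-up of pendingIncrease nodes lowers the
// minimum, because those nodes are allowed to be missing for now.
func acceptableRangeSketch(targetSize, pendingIncrease int) AcceptableRange {
    return AcceptableRange{
        MinNodes:      targetSize - pendingIncrease,
        CurrentTarget: targetSize,
        MaxNodes:      targetSize,
    }
}

func main() {
    // "partially failed scale up" case from the test below: target refreshed
    // to 6 with 1 node still legitimately on its way.
    fmt.Printf("%+v\n", acceptableRangeSketch(6, 1)) // {MinNodes:5 CurrentTarget:6 MaxNodes:6}
}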
67 changes: 67 additions & 0 deletions cluster-autoscaler/clusterstate/clusterstate_test.go
@@ -1340,6 +1340,73 @@ func TestUpdateIncorrectNodeGroupSizes(t *testing.T) {
    }
}

func TestHandleDeletingFailedScaleUpNodes(t *testing.T) {
    ngName := "ng1"
    testCases := []struct {
        name                string
        acceptableRange     AcceptableRange
        readiness           Readiness
        newTarget           int
        scaleUpRequest      *ScaleUpRequest
        wantAcceptableRange AcceptableRange
        wantUpcoming        int
    }{
        {
            name:                "failed scale up by 3 nodes",
            acceptableRange:     AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4},
            readiness:           Readiness{Ready: make([]string, 1)},
            newTarget:           1,
            wantAcceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 1, MaxNodes: 1},
            wantUpcoming:        0,
        }, {
            name:                "partially failed scale up",
            acceptableRange:     AcceptableRange{MinNodes: 5, CurrentTarget: 7, MaxNodes: 8},
            readiness:           Readiness{Ready: make([]string, 5)},
            newTarget:           6,
            wantAcceptableRange: AcceptableRange{MinNodes: 5, CurrentTarget: 6, MaxNodes: 6},
            scaleUpRequest:      &ScaleUpRequest{Increase: 1},
            wantUpcoming:        1,
        }, {
            name:                "scale up ongoing, no change",
            acceptableRange:     AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4},
            readiness:           Readiness{Ready: make([]string, 1)},
            newTarget:           4,
            wantAcceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4},
            scaleUpRequest:      &ScaleUpRequest{Increase: 3},
            wantUpcoming:        3,
        }, {
            name:                "no scale up, no change",
            acceptableRange:     AcceptableRange{MinNodes: 4, CurrentTarget: 4, MaxNodes: 4},
            readiness:           Readiness{Ready: make([]string, 4)},
            newTarget:           4,
            wantAcceptableRange: AcceptableRange{MinNodes: 4, CurrentTarget: 4, MaxNodes: 4},
            wantUpcoming:        0,
        },
    }
    for _, tc := range testCases {
        t.Run(tc.name, func(t *testing.T) {
            provider := testprovider.NewTestCloudProvider(nil, nil)
            provider.AddNodeGroup(ngName, 0, 1000, tc.newTarget)

            fakeLogRecorder, _ := utils.NewStatusMapRecorder(&fake.Clientset{}, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap")
            clusterState := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{}, fakeLogRecorder,
                newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{}))
            clusterState.acceptableRanges = map[string]AcceptableRange{ngName: tc.acceptableRange}
            clusterState.perNodeGroupReadiness = map[string]Readiness{ngName: tc.readiness}
            if tc.scaleUpRequest != nil {
                clusterState.scaleUpRequests = map[string]*ScaleUpRequest{ngName: tc.scaleUpRequest}
            }

            clusterState.HandleDeletingFailedScaleUpNodes(provider.GetNodeGroup(ngName))
            assert.Equal(t, tc.wantAcceptableRange, clusterState.acceptableRanges[ngName])
            upcomingCounts, _ := clusterState.GetUpcomingNodes()
            if upcoming, found := upcomingCounts[ngName]; found {
                assert.Equal(t, tc.wantUpcoming, upcoming, "Unexpected upcoming nodes count, want: %d got: %d", tc.wantUpcoming, upcomingCounts[ngName])
            }
        })
    }
}

func TestTruncateIfExceedMaxSize(t *testing.T) {
    testCases := []struct {
        name string
19 changes: 13 additions & 6 deletions cluster-autoscaler/core/static_autoscaler.go
@@ -448,9 +448,8 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
         return nil
     }

-    if deletedNodes := a.deleteCreatedNodesWithErrors(); deletedNodes {
-        klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration")
-        return nil
+    if err = a.deleteCreatedNodesWithErrors(); err != nil {
+        klog.Errorf("Error while deleting created nodes with errors, skipping the iteration: %v", err)
     }

     // Check if there has been a constant difference between the number of nodes in k8s and
@@ -831,13 +830,14 @@ func toNodes(unregisteredNodes []clusterstate.UnregisteredNode) []*apiv1.Node {
     return nodes
 }

-func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
+func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() error {
     // We always schedule deleting of incoming erroneous nodes
     // TODO[lukaszos] Consider adding logic to not retry delete every loop iteration
     nodeGroups := a.nodeGroupsById()
     nodesToBeDeletedByNodeGroupId := a.clusterStateRegistry.GetCreatedNodesWithErrors()

     deletedAny := false
+    var returnErr error

     for nodeGroupId, nodesToBeDeleted := range nodesToBeDeletedByNodeGroupId {
         var err error
@@ -870,11 +870,18 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
             klog.Warningf("Error while trying to delete nodes from %v: %v", nodeGroupId, err)
         } else {
             deletedAny = true
-            a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup)
+            err = a.clusterStateRegistry.HandleDeletingFailedScaleUpNodes(nodeGroup)
+            if err != nil {
+                returnErr = fmt.Errorf("Failed to clean up state after deleting failed nodes from %s node group: %v", nodeGroup.Id(), err)
+                klog.Warningf(returnErr.Error())
+            }
         }
     }

-    return deletedAny
+    if deletedAny {
+        klog.V(0).Infof("Some nodes that failed to create were removed.")
+    }
+    return returnErr
 }

 // instancesToNodes returns a list of fake nodes with just names populated,
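The net effect of the change above is a design shift: instead of returning early and skipping the rest of the iteration whenever failed nodes were deleted, the autoscaler now logs deletion problems, remembers the last cleanup error, and lets the iteration continue. A generic sketch of that aggregation pattern (an illustration, not the autoscaler's code):

package main

import (
    "fmt"
    "log"
)

// cleanUpAll deletes every broken item but keeps going on failure, returning
// the last error so the caller can report it without aborting its own loop.
// This mirrors the shape of the reworked deleteCreatedNodesWithErrors.
func cleanUpAll(items []string, deleteFn func(string) error) error {
    var lastErr error
    for _, item := range items {
        if err := deleteFn(item); err != nil {
            lastErr = fmt.Errorf("failed to clean up %s: %w", item, err)
            log.Print(lastErr)
        }
    }
    return lastErr
}

func main() {
    err := cleanUpAll([]string{"node-a", "node-b"}, func(name string) error {
        if name == "node-a" {
            return fmt.Errorf("instance not found")
        }
        return nil
    })
    fmt.Println("final error:", err) // the failure is surfaced, but the loop still finished
}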
