Do not break the loop after removing failed scale up nodes
Clean up cluster state after removing failed scale up nodes,
so that the loop can continue. Most importantly, update the
target for the affected node group, so that the deleted nodes
are not considered upcoming.
bskiba committed Jul 24, 2024
1 parent d08f0b4 commit e6f06b7
Showing 3 changed files with 87 additions and 1 deletion.
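For context on why the commit updates the node group target: the ClusterStateRegistry counts a node as "upcoming" when the group's current target size exceeds the nodes it can actually observe, so deleting failed nodes without refreshing the target leaves phantom upcoming nodes that block further scale-ups. The sketch below is illustrative only and not part of this commit; the function and parameter names are assumptions.

```go
// Illustrative sketch (not from this commit): roughly how the number of
// "upcoming" nodes is derived from a node group's current target size.
// If failed nodes are deleted but the target is left stale, the difference
// stays positive and the registry keeps waiting for nodes that will never appear.
func upcomingNodes(currentTarget, ready, unready, longUnregistered int) int {
	upcoming := currentTarget - (ready + unready + longUnregistered)
	if upcoming < 0 {
		return 0
	}
	return upcoming
}
```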
19 changes: 19 additions & 0 deletions cluster-autoscaler/clusterstate/clusterstate.go
@@ -333,6 +333,25 @@ func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprov
	csr.backoffNodeGroup(nodeGroup, errorInfo, currentTime)
}

// HandleDeletingFailedScaleUpNodes updates the registry after deleting nodes that failed
// to scale up. This ensures that the deleted nodes are not treated as upcoming when
// performing scale up. Upcoming nodes are calculated based on node group current target
// size (see GetUpcomingNodes function).
func (csr *ClusterStateRegistry) HandleDeletingFailedScaleUpNodes(nodeGroup cloudprovider.NodeGroup) error {
	klog.V(4).Infof("Updating state after failed scale up for %s nodeGroup", nodeGroup.Id())

	csr.InvalidateNodeInstancesCacheEntry(nodeGroup)
	targetSize, err := nodeGroup.TargetSize()
	if err != nil {
		return err
	}

	csr.Lock()
	defer csr.Unlock()
	csr.acceptableRanges[nodeGroup.Id()] = csr.acceptableRangeForNodeGroup(targetSize, nodeGroup.Id())
	return nil
}

// UpdateNodes updates the state of the nodes in the ClusterStateRegistry and recalculates the stats
func (csr *ClusterStateRegistry) UpdateNodes(nodes []*apiv1.Node, nodeInfosForGroups map[string]*schedulerframework.NodeInfo, currentTime time.Time) error {
	csr.updateNodeGroupMetrics()
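The new function above re-reads the target size from the cloud provider and recomputes the group's acceptable range from it. acceptableRangeForNodeGroup itself is not shown in this diff; judging from the expectations in the test below, it behaves roughly like the following hypothetical sketch (the helper name and parameters are assumptions, not the actual implementation).

```go
// Hypothetical approximation of how the acceptable range relates to the target:
// the range is centered on the current target, stretched down by any scale-up
// still in flight and up by any nodes pending deletion. Inferred from the test
// expectations in clusterstate_test.go; not the actual implementation.
func approxAcceptableRange(target, ongoingScaleUpIncrease, pendingDeletions int) AcceptableRange {
	return AcceptableRange{
		MinNodes:      target - ongoingScaleUpIncrease,
		MaxNodes:      target + pendingDeletions,
		CurrentTarget: target,
	}
}
```

Under that reading, dropping the target from 4 to 1 after a fully failed scale-up collapses the range to {1, 1, 1}, which is exactly what the first test case below expects.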
67 changes: 67 additions & 0 deletions cluster-autoscaler/clusterstate/clusterstate_test.go
@@ -1340,6 +1340,73 @@ func TestUpdateIncorrectNodeGroupSizes(t *testing.T) {
	}
}

func TestHandleDeletingFailedScaleUpNodes(t *testing.T) {
	ngName := "ng1"
	testCases := []struct {
		name                string
		acceptableRange     AcceptableRange
		readiness           Readiness
		newTarget           int
		scaleUpRequest      *ScaleUpRequest
		wantAcceptableRange AcceptableRange
		wantUpcoming        int
	}{
		{
			name:                "failed scale up by 3 nodes",
			acceptableRange:     AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4},
			readiness:           Readiness{Ready: make([]string, 1)},
			newTarget:           1,
			wantAcceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 1, MaxNodes: 1},
			wantUpcoming:        0,
		}, {
			name:                "partially failed scale up",
			acceptableRange:     AcceptableRange{MinNodes: 5, CurrentTarget: 7, MaxNodes: 8},
			readiness:           Readiness{Ready: make([]string, 5)},
			newTarget:           6,
			wantAcceptableRange: AcceptableRange{MinNodes: 5, CurrentTarget: 6, MaxNodes: 6},
			scaleUpRequest:      &ScaleUpRequest{Increase: 1},
			wantUpcoming:        1,
		}, {
			name:                "scale up ongoing, no change",
			acceptableRange:     AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4},
			readiness:           Readiness{Ready: make([]string, 1)},
			newTarget:           4,
			wantAcceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4},
			scaleUpRequest:      &ScaleUpRequest{Increase: 3},
			wantUpcoming:        3,
		}, {
			name:                "no scale up, no change",
			acceptableRange:     AcceptableRange{MinNodes: 4, CurrentTarget: 4, MaxNodes: 4},
			readiness:           Readiness{Ready: make([]string, 4)},
			newTarget:           4,
			wantAcceptableRange: AcceptableRange{MinNodes: 4, CurrentTarget: 4, MaxNodes: 4},
			wantUpcoming:        0,
		},
	}
	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			provider := testprovider.NewTestCloudProvider(nil, nil)
			provider.AddNodeGroup(ngName, 0, 1000, tc.newTarget)

			fakeLogRecorder, _ := utils.NewStatusMapRecorder(&fake.Clientset{}, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap")
			clusterState := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{}, fakeLogRecorder,
				newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{}))
			clusterState.acceptableRanges = map[string]AcceptableRange{ngName: tc.acceptableRange}
			clusterState.perNodeGroupReadiness = map[string]Readiness{ngName: tc.readiness}
			if tc.scaleUpRequest != nil {
				clusterState.scaleUpRequests = map[string]*ScaleUpRequest{ngName: tc.scaleUpRequest}
			}

			clusterState.HandleDeletingFailedScaleUpNodes(provider.GetNodeGroup(ngName))
			assert.Equal(t, tc.wantAcceptableRange, clusterState.acceptableRanges[ngName])
			upcomingCounts, _ := clusterState.GetUpcomingNodes()
			if upcoming, found := upcomingCounts[ngName]; found {
				assert.Equal(t, tc.wantUpcoming, upcoming, "Unexpected upcoming nodes count, want: %d got: %d", tc.wantUpcoming, upcomingCounts[ngName])
			}
		})
	}
}

func TestTruncateIfExceedMaxSize(t *testing.T) {
	testCases := []struct {
		name string
2 changes: 1 addition & 1 deletion cluster-autoscaler/core/static_autoscaler.go
@@ -870,7 +870,7 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
klog.Warningf("Error while trying to delete nodes from %v: %v", nodeGroupId, err)
} else {
deletedAny = true
a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup)
a.clusterStateRegistry.HandleDeletingFailedScaleUpNodes(nodeGroup)
}
}

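The one-line change replaces a bare instance-cache invalidation with the new state-cleanup call. For orientation, the surrounding loop in deleteCreatedNodesWithErrors is not part of the hunk; a simplified sketch of its assumed shape follows (variable names and the lookup map are illustrative, not the actual source):

```go
// Simplified, assumed shape of the surrounding loop: delete the nodes that
// failed to be created for each affected node group, then let the cluster
// state registry refresh its view of that group so the next iteration of the
// main loop does not treat the deleted nodes as still upcoming.
for nodeGroupId, nodesToDelete := range nodesToBeDeletedByNodeGroupId {
	nodeGroup := nodeGroupsById[nodeGroupId] // illustrative lookup; the real code resolves the group differently
	err := nodeGroup.DeleteNodes(nodesToDelete)
	if err != nil {
		klog.Warningf("Error while trying to delete nodes from %v: %v", nodeGroupId, err)
	} else {
		deletedAny = true
		a.clusterStateRegistry.HandleDeletingFailedScaleUpNodes(nodeGroup)
	}
}
```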
