From e6f06b79694910a105dbd5b0a743aee37ccef1ec Mon Sep 17 00:00:00 2001
From: "Beata Lach (Skiba)"
Date: Wed, 24 Jul 2024 11:32:16 +0000
Subject: [PATCH] Do not break the loop after removing failed scale up nodes

Clean up cluster state after removing failed scale up nodes, so that
the loop can continue. Most importantly, update the target for the
affected node group, so that the deleted nodes are not considered
upcoming.
---
 .../clusterstate/clusterstate.go             | 19 ++++++
 .../clusterstate/clusterstate_test.go        | 67 +++++++++++++++++++
 cluster-autoscaler/core/static_autoscaler.go |  2 +-
 3 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/cluster-autoscaler/clusterstate/clusterstate.go b/cluster-autoscaler/clusterstate/clusterstate.go
index 89af26cffc1f..1525ea141e77 100644
--- a/cluster-autoscaler/clusterstate/clusterstate.go
+++ b/cluster-autoscaler/clusterstate/clusterstate.go
@@ -333,6 +333,25 @@ func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprov
 	csr.backoffNodeGroup(nodeGroup, errorInfo, currentTime)
 }
 
+// HandleDeletingFailedScaleUpNodes updates the registry after deleting nodes that failed
+// to scale up. This ensures that the deleted nodes are not treated as upcoming when
+// performing scale up. Upcoming nodes are calculated based on node group current target
+// size (see GetUpcomingNodes function).
+func (csr *ClusterStateRegistry) HandleDeletingFailedScaleUpNodes(nodeGroup cloudprovider.NodeGroup) error {
+	klog.V(4).Infof("Updating state after failed scale up for %s nodeGroup", nodeGroup.Id())
+
+	csr.InvalidateNodeInstancesCacheEntry(nodeGroup)
+	targetSize, err := nodeGroup.TargetSize()
+	if err != nil {
+		return err
+	}
+
+	csr.Lock()
+	defer csr.Unlock()
+	csr.acceptableRanges[nodeGroup.Id()] = csr.acceptableRangeForNodeGroup(targetSize, nodeGroup.Id())
+	return nil
+}
+
 // UpdateNodes updates the state of the nodes in the ClusterStateRegistry and recalculates the stats
 func (csr *ClusterStateRegistry) UpdateNodes(nodes []*apiv1.Node, nodeInfosForGroups map[string]*schedulerframework.NodeInfo, currentTime time.Time) error {
 	csr.updateNodeGroupMetrics()
diff --git a/cluster-autoscaler/clusterstate/clusterstate_test.go b/cluster-autoscaler/clusterstate/clusterstate_test.go
index c9290e2dcb0f..278e23a8cb2d 100644
--- a/cluster-autoscaler/clusterstate/clusterstate_test.go
+++ b/cluster-autoscaler/clusterstate/clusterstate_test.go
@@ -1340,6 +1340,73 @@ func TestUpdateIncorrectNodeGroupSizes(t *testing.T) {
 	}
 }
 
+func TestHandleDeletingFailedScaleUpNodes(t *testing.T) {
+	ngName := "ng1"
+	testCases := []struct {
+		name                string
+		acceptableRange     AcceptableRange
+		readiness           Readiness
+		newTarget           int
+		scaleUpRequest      *ScaleUpRequest
+		wantAcceptableRange AcceptableRange
+		wantUpcoming        int
+	}{
+		{
+			name:                "failed scale up by 3 nodes",
+			acceptableRange:     AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4},
+			readiness:           Readiness{Ready: make([]string, 1)},
+			newTarget:           1,
+			wantAcceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 1, MaxNodes: 1},
+			wantUpcoming:        0,
+		}, {
+			name:                "partially failed scale up",
+			acceptableRange:     AcceptableRange{MinNodes: 5, CurrentTarget: 7, MaxNodes: 8},
+			readiness:           Readiness{Ready: make([]string, 5)},
+			newTarget:           6,
+			wantAcceptableRange: AcceptableRange{MinNodes: 5, CurrentTarget: 6, MaxNodes: 6},
+			scaleUpRequest:      &ScaleUpRequest{Increase: 1},
+			wantUpcoming:        1,
+		}, {
+			name:                "scale up ongoing, no change",
+			acceptableRange:     AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4},
+			readiness:           Readiness{Ready: make([]string, 1)},
+			newTarget:           4,
+			wantAcceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4},
+			scaleUpRequest:      &ScaleUpRequest{Increase: 3},
+			wantUpcoming:        3,
+		}, {
+			name:                "no scale up, no change",
+			acceptableRange:     AcceptableRange{MinNodes: 4, CurrentTarget: 4, MaxNodes: 4},
+			readiness:           Readiness{Ready: make([]string, 4)},
+			newTarget:           4,
+			wantAcceptableRange: AcceptableRange{MinNodes: 4, CurrentTarget: 4, MaxNodes: 4},
+			wantUpcoming:        0,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			provider := testprovider.NewTestCloudProvider(nil, nil)
+			provider.AddNodeGroup(ngName, 0, 1000, tc.newTarget)
+
+			fakeLogRecorder, _ := utils.NewStatusMapRecorder(&fake.Clientset{}, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap")
+			clusterState := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{}, fakeLogRecorder,
+				newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{}))
+			clusterState.acceptableRanges = map[string]AcceptableRange{ngName: tc.acceptableRange}
+			clusterState.perNodeGroupReadiness = map[string]Readiness{ngName: tc.readiness}
+			if tc.scaleUpRequest != nil {
+				clusterState.scaleUpRequests = map[string]*ScaleUpRequest{ngName: tc.scaleUpRequest}
+			}
+
+			clusterState.HandleDeletingFailedScaleUpNodes(provider.GetNodeGroup(ngName))
+			assert.Equal(t, tc.wantAcceptableRange, clusterState.acceptableRanges[ngName])
+			upcomingCounts, _ := clusterState.GetUpcomingNodes()
+			if upcoming, found := upcomingCounts[ngName]; found {
+				assert.Equal(t, tc.wantUpcoming, upcoming, "Unexpected upcoming nodes count, want: %d got: %d", tc.wantUpcoming, upcomingCounts[ngName])
+			}
+		})
+	}
+}
+
 func TestTruncateIfExceedMaxSize(t *testing.T) {
 	testCases := []struct {
 		name string
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index d3c59b56c59c..54ec599670b0 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -870,7 +870,7 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
 			klog.Warningf("Error while trying to delete nodes from %v: %v", nodeGroupId, err)
 		} else {
 			deletedAny = true
-			a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup)
+			a.clusterStateRegistry.HandleDeletingFailedScaleUpNodes(nodeGroup)
 		}
 	}
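
Note (not part of the patch): a minimal standalone sketch of the failure mode the commit message describes. The types and the simplified `upcoming = target - ready` arithmetic below are illustrative assumptions standing in for the real ClusterStateRegistry/GetUpcomingNodes bookkeeping; only the overall idea comes from the patch itself. If the stored target is not refreshed after the failed nodes are deleted, those nodes keep being counted as upcoming.

```go
package main

import "fmt"

// toyGroupState is a simplified stand-in for the per-node-group state the
// registry keeps (the real code tracks acceptable ranges and readiness).
type toyGroupState struct {
	currentTarget int // target size recorded for the node group
	readyNodes    int // nodes that actually registered and became ready
}

// upcoming mirrors the idea behind GetUpcomingNodes in simplified form:
// nodes are "upcoming" as long as the recorded target exceeds what showed up.
func (s toyGroupState) upcoming() int {
	if n := s.currentTarget - s.readyNodes; n > 0 {
		return n
	}
	return 0
}

func main() {
	// A scale up from 1 to 4 failed and the 3 broken nodes were deleted,
	// bringing the provider-side target back down to 1. With a stale
	// recorded target the registry still reports 3 phantom upcoming nodes.
	stale := toyGroupState{currentTarget: 4, readyNodes: 1}
	fmt.Println("stale target, upcoming:", stale.upcoming()) // 3

	// Refreshing the recorded target after the deletion, which is what
	// HandleDeletingFailedScaleUpNodes does via nodeGroup.TargetSize(),
	// clears the phantom upcoming count and lets the loop scale up again.
	refreshed := toyGroupState{currentTarget: 1, readyNodes: 1}
	fmt.Println("refreshed target, upcoming:", refreshed.upcoming()) // 0
}
```

In the patch itself, `HandleDeletingFailedScaleUpNodes` does no local subtraction: it invalidates the cached instances, re-reads the fresh target from the cloud provider with `nodeGroup.TargetSize()`, and recomputes the acceptable range, so the next iteration of the loop sees the post-deletion state.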