From dc53cb92897f96889a4e05ed147131a36bafd45b Mon Sep 17 00:00:00 2001 From: "Beata Lach (Skiba)" Date: Wed, 24 Jul 2024 11:32:16 +0000 Subject: [PATCH] Do not break the loop after removing failed scale up nodes Clean up cluster state after removing failed scale up nodes, so that the loop can continue. Most importantly, update the target for the affected node group, so that the deleted nodes are not considered upcoming. --- .../clusterstate/clusterstate_test.go | 71 +++++++++++++++++++ cluster-autoscaler/core/static_autoscaler.go | 12 ++-- .../core/static_autoscaler_test.go | 26 +++---- 3 files changed, 91 insertions(+), 18 deletions(-) diff --git a/cluster-autoscaler/clusterstate/clusterstate_test.go b/cluster-autoscaler/clusterstate/clusterstate_test.go index 7b4fb3d6c736..ac11a1cfa5c1 100644 --- a/cluster-autoscaler/clusterstate/clusterstate_test.go +++ b/cluster-autoscaler/clusterstate/clusterstate_test.go @@ -176,6 +176,77 @@ func TestHasNodeGroupStartedScaleUp(t *testing.T) { } } +// TestRecalculateStateAfterNodeGroupSizeChanged checks that Recalculate updates state correctly after +// some node group size changed. We verify that acceptable ranges are updated accordingly +// and that the UpcomingNodes reflect the node group size change (important for recalculating state after +// deleting scale-up nodes that failed to create). +func TestRecalculateStateAfterNodeGroupSizeChanged(t *testing.T) { + ngName := "ng1" + testCases := []struct { + name string + acceptableRange AcceptableRange + readiness Readiness + newTarget int + scaleUpRequest *ScaleUpRequest + wantAcceptableRange AcceptableRange + wantUpcoming int + }{ + { + name: "failed scale up by 3 nodes", + acceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4}, + readiness: Readiness{Ready: make([]string, 1)}, + newTarget: 1, + wantAcceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 1, MaxNodes: 1}, + wantUpcoming: 0, + }, { + name: "partially failed scale up", + acceptableRange: AcceptableRange{MinNodes: 5, CurrentTarget: 7, MaxNodes: 8}, + readiness: Readiness{Ready: make([]string, 5)}, + newTarget: 6, + wantAcceptableRange: AcceptableRange{MinNodes: 5, CurrentTarget: 6, MaxNodes: 6}, + scaleUpRequest: &ScaleUpRequest{Increase: 1}, + wantUpcoming: 1, + }, { + name: "scale up ongoing, no change", + acceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4}, + readiness: Readiness{Ready: make([]string, 1)}, + newTarget: 4, + wantAcceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4}, + scaleUpRequest: &ScaleUpRequest{Increase: 3}, + wantUpcoming: 3, + }, { + name: "no scale up, no change", + acceptableRange: AcceptableRange{MinNodes: 4, CurrentTarget: 4, MaxNodes: 4}, + readiness: Readiness{Ready: make([]string, 4)}, + newTarget: 4, + wantAcceptableRange: AcceptableRange{MinNodes: 4, CurrentTarget: 4, MaxNodes: 4}, + wantUpcoming: 0, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + provider := testprovider.NewTestCloudProvider(nil, nil) + provider.AddNodeGroup(ngName, 0, 1000, tc.newTarget) + + fakeLogRecorder, _ := utils.NewStatusMapRecorder(&fake.Clientset{}, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap") + clusterState := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{}, fakeLogRecorder, + newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{})) + clusterState.acceptableRanges = map[string]AcceptableRange{ngName: tc.acceptableRange} + clusterState.perNodeGroupReadiness = map[string]Readiness{ngName: tc.readiness} + if tc.scaleUpRequest != nil { + clusterState.scaleUpRequests = map[string]*ScaleUpRequest{ngName: tc.scaleUpRequest} + } + + clusterState.Recalculate() + assert.Equal(t, tc.wantAcceptableRange, clusterState.acceptableRanges[ngName]) + upcomingCounts, _ := clusterState.GetUpcomingNodes() + if upcoming, found := upcomingCounts[ngName]; found { + assert.Equal(t, tc.wantUpcoming, upcoming, "Unexpected upcoming nodes count, want: %d got: %d", tc.wantUpcoming, upcomingCounts[ngName]) + } + }) + } +} + func TestOKOneUnreadyNode(t *testing.T) { now := time.Now() diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index d3c59b56c59c..007d43d07e81 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -448,10 +448,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr return nil } - if deletedNodes := a.deleteCreatedNodesWithErrors(); deletedNodes { - klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration") - return nil - } + a.deleteCreatedNodesWithErrors() // Check if there has been a constant difference between the number of nodes in k8s and // the number of nodes on the cloud provider side. @@ -831,7 +828,7 @@ func toNodes(unregisteredNodes []clusterstate.UnregisteredNode) []*apiv1.Node { return nodes } -func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool { +func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() { // We always schedule deleting of incoming errornous nodes // TODO[lukaszos] Consider adding logic to not retry delete every loop iteration nodeGroups := a.nodeGroupsById() @@ -874,7 +871,10 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool { } } - return deletedAny + if deletedAny { + klog.V(0).Infof("Some nodes that failed to create were removed, recalculating cluster state.") + a.clusterStateRegistry.Recalculate() + } } // instancesToNodes returns a list of fake nodes with just names populated, diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go index f47e88f2021f..2962f42f8b22 100644 --- a/cluster-autoscaler/core/static_autoscaler_test.go +++ b/cluster-autoscaler/core/static_autoscaler_test.go @@ -1709,8 +1709,11 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - removedNodes := autoscaler.deleteCreatedNodesWithErrors() - assert.True(t, removedNodes) + autoscaler.deleteCreatedNodesWithErrors() + + // nodes should be deleted + expectedDeleteCalls := 1 + nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", expectedDeleteCalls) // check delete was called on correct nodes nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy( @@ -1734,10 +1737,12 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - removedNodes = autoscaler.deleteCreatedNodesWithErrors() - assert.True(t, removedNodes) + autoscaler.deleteCreatedNodesWithErrors() // nodes should be deleted again + expectedDeleteCalls += 1 + nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", expectedDeleteCalls) + nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy( func(nodes []*apiv1.Node) bool { if len(nodes) != 4 { @@ -1798,11 +1803,10 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - removedNodes = autoscaler.deleteCreatedNodesWithErrors() - assert.False(t, removedNodes) + autoscaler.deleteCreatedNodesWithErrors() - // we expect no more Delete Nodes - nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", 2) + // we expect no more Delete Nodes, don't increase expectedDeleteCalls + nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", expectedDeleteCalls) // failed node not included by NodeGroupForNode nodeGroupC := &mockprovider.NodeGroup{} @@ -1840,8 +1844,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, time.Now()) // No nodes are deleted when failed nodes don't have matching node groups - removedNodes = autoscaler.deleteCreatedNodesWithErrors() - assert.False(t, removedNodes) + autoscaler.deleteCreatedNodesWithErrors() nodeGroupC.AssertNumberOfCalls(t, "DeleteNodes", 0) nodeGroupAtomic := &mockprovider.NodeGroup{} @@ -1896,8 +1899,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) { clusterState.UpdateNodes([]*apiv1.Node{}, nil, now) // delete nodes with create errors - removedNodes = autoscaler.deleteCreatedNodesWithErrors() - assert.True(t, removedNodes) + autoscaler.deleteCreatedNodesWithErrors() nodeGroupAtomic.AssertCalled(t, "DeleteNodes", mock.MatchedBy( func(nodes []*apiv1.Node) bool {