Add retries around kubectl wait (#9065)
Signed-off-by: Rahul Ganesh <[email protected]>
Co-authored-by: Rahul Ganesh <[email protected]>
rahulbabu95 and Rahul Ganesh authored Dec 13, 2024
1 parent 01d134a commit 92985cc
Showing 2 changed files with 76 additions and 4 deletions.
24 changes: 20 additions & 4 deletions pkg/clustermanager/cluster_manager.go
@@ -61,6 +61,7 @@ const (
 var (
 	clusterctlNetworkErrorRegex              = regexp.MustCompile(`.*failed to connect to the management cluster:.*`)
 	clusterctlMoveProvisionedInfraErrorRegex = regexp.MustCompile(`.*failed to check for provisioned infrastructure*`)
+	kubectlResourceNotFoundRegex             = regexp.MustCompile(`.*the server doesn't have a resource type "(.*)".*`)
 	eksaClusterResourceType                  = fmt.Sprintf("clusters.%s", v1alpha1.GroupVersion.Group)
 )
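As a quick illustration (not part of this commit), the new pattern matches the exact error string exercised in the test below and captures the missing resource type:

package main

import (
	"fmt"
	"regexp"
)

var kubectlResourceNotFoundRegex = regexp.MustCompile(`.*the server doesn't have a resource type "(.*)".*`)

func main() {
	errMsg := `executing wait: error: the server doesn't have a resource type "clusters"`
	if m := kubectlResourceNotFoundRegex.FindStringSubmatch(errMsg); m != nil {
		// Prints: retryable, missing resource type: clusters
		fmt.Println("retryable, missing resource type:", m[1])
	}
}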

@@ -211,20 +212,32 @@ func WithNoTimeouts() ClusterManagerOpt {
 func clusterctlMoveWaitForInfrastructureRetryPolicy(totalRetries int, err error) (retry bool, wait time.Duration) {
 	// Retry both network and cluster move errors.
 	if match := (clusterctlNetworkErrorRegex.MatchString(err.Error()) || clusterctlMoveProvisionedInfraErrorRegex.MatchString(err.Error())); match {
-		return true, clusterctlMoveRetryWaitTime(totalRetries)
+		return true, exponentialRetryWaitTime(totalRetries)
 	}
 	return false, 0
 }
 
 func clusterctlMoveRetryPolicy(totalRetries int, err error) (retry bool, wait time.Duration) {
 	// Retry only network errors.
 	if match := clusterctlNetworkErrorRegex.MatchString(err.Error()); match {
-		return true, clusterctlMoveRetryWaitTime(totalRetries)
+		return true, exponentialRetryWaitTime(totalRetries)
 	}
 	return false, 0
 }
 
-func clusterctlMoveRetryWaitTime(totalRetries int) time.Duration {
+func kubectlWaitRetryPolicy(totalRetries int, err error) (retry bool, wait time.Duration) {
+	// Sometimes it is possible that the clusterctl move is successful,
+	// but the clusters.cluster.x-k8s.io resource is not available on the cluster yet.
+	//
+	// Retry on transient 'server doesn't have a resource type' errors.
+	// Use the existing exponential backoff implementation for retries on these errors.
+	if match := kubectlResourceNotFoundRegex.MatchString(err.Error()); match {
+		return true, exponentialRetryWaitTime(totalRetries)
+	}
+	return false, 0
+}
+
+func exponentialRetryWaitTime(totalRetries int) time.Duration {
 	// Exponential backoff on errors. Retrier built-in backoff is linear, so implementing here.
 
 	// Retrier first calls the policy before retry #1. We want it zero-based for exponentiation.
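The rest of exponentialRetryWaitTime is collapsed in this view; a minimal sketch of the zero-based exponential backoff its comments describe (the base wait constant below is an assumption for illustration, not the value in the commit):

package clustermanager // sketch only; not the commit's actual implementation

import (
	"math"
	"time"
)

// backoffBaseWait is a hypothetical base wait; the commit's actual constant may differ.
const backoffBaseWait = 10 * time.Second

func exponentialRetryWaitTimeSketch(totalRetries int) time.Duration {
	// The retrier calls the policy before retry #1, so shift to a zero-based exponent.
	exponent := totalRetries - 1
	return time.Duration(math.Pow(2, float64(exponent))) * backoffBaseWait
}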
Expand Down Expand Up @@ -297,7 +310,10 @@ func (c *ClusterManager) MoveCAPI(ctx context.Context, from, to *types.Cluster,
}

logger.V(3).Info("Waiting for workload cluster control plane to be ready after move")
err = c.clusterClient.WaitForControlPlaneReady(ctx, to, c.controlPlaneWaitAfterMoveTimeout.String(), clusterName)
r = retrier.New(c.clusterctlMoveTimeout, retrier.WithRetryPolicy(kubectlWaitRetryPolicy))
err = r.Retry(func() error {
return c.clusterClient.WaitForControlPlaneReady(ctx, to, c.controlPlaneWaitAfterMoveTimeout.String(), clusterName)
})
if err != nil {
return err
}
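For context on how the policy gates the new retry loop above: only errors matching the resource-not-found pattern trigger another attempt; anything else is surfaced immediately. Roughly (hypothetical calls, assuming the package's symbols):

retry, wait := kubectlWaitRetryPolicy(1, errors.New(`the server doesn't have a resource type "clusters"`))
// retry == true, wait == first exponential backoff step

retry, wait = kubectlWaitRetryPolicy(1, errors.New("connection refused"))
// retry == false, wait == 0: the retrier stops and returns the error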
56 changes: 56 additions & 0 deletions pkg/clustermanager/cluster_manager_test.go
@@ -532,6 +532,62 @@ func TestClusterManagerMoveCAPIRetrySuccess(t *testing.T) {
 	}
 }
 
+func TestClusterManagerMoveKubectlWaitRetrySuccess(t *testing.T) {
+	from := &types.Cluster{
+		Name: "from-cluster",
+	}
+	to := &types.Cluster{
+		Name: "to-cluster",
+	}
+	clusterSpec := test.NewClusterSpec(func(s *cluster.Spec) {
+		s.Cluster.Name = to.Name
+		s.Cluster.Spec.ControlPlaneConfiguration.Count = 3
+		s.Cluster.Spec.WorkerNodeGroupConfigurations = []v1alpha1.WorkerNodeGroupConfiguration{{Count: ptr.Int(3), MachineGroupRef: &v1alpha1.Ref{Name: "test-wn"}}}
+	})
+	ctx := context.Background()
+
+	c, m := newClusterManager(t)
+	kcp, mds := getKcpAndMdsForNodeCount(0)
+	m.client.EXPECT().GetKubeadmControlPlane(ctx,
+		from,
+		to.Name,
+		gomock.AssignableToTypeOf(executables.WithCluster(from)),
+		gomock.AssignableToTypeOf(executables.WithNamespace(constants.EksaSystemNamespace)),
+	).Return(kcp, nil)
+	m.client.EXPECT().GetMachineDeploymentsForCluster(ctx,
+		to.Name,
+		gomock.AssignableToTypeOf(executables.WithCluster(from)),
+		gomock.AssignableToTypeOf(executables.WithNamespace(constants.EksaSystemNamespace)),
+	).Return(mds, nil)
+	m.client.EXPECT().GetMachines(ctx, from, to.Name)
+	m.client.EXPECT().WaitForClusterReady(ctx, from, "1h0m0s", to.Name)
+	m.client.EXPECT().MoveManagement(ctx, from, to, to.Name)
+	firstTry := m.client.EXPECT().WaitForControlPlaneReady(ctx, to, "15m0s", to.Name).Return(errors.New("executing wait: error: the server doesn't have a resource type \"clusters\""))
+	secondTry := m.client.EXPECT().WaitForControlPlaneReady(ctx, to, "15m0s", to.Name).Return(nil)
+	gomock.InOrder(
+		firstTry,
+		secondTry,
+	)
+	m.client.EXPECT().ValidateControlPlaneNodes(ctx, to, to.Name)
+	m.client.EXPECT().CountMachineDeploymentReplicasReady(ctx, to.Name, to.KubeconfigFile)
+	m.client.EXPECT().GetKubeadmControlPlane(ctx,
+		to,
+		to.Name,
+		gomock.AssignableToTypeOf(executables.WithCluster(from)),
+		gomock.AssignableToTypeOf(executables.WithNamespace(constants.EksaSystemNamespace)),
+	).Return(kcp, nil)
+	m.client.EXPECT().GetMachineDeploymentsForCluster(ctx,
+		to.Name,
+		gomock.AssignableToTypeOf(executables.WithCluster(from)),
+		gomock.AssignableToTypeOf(executables.WithNamespace(constants.EksaSystemNamespace)),
+	).Return(mds, nil)
+	m.client.EXPECT().GetMachines(ctx, to, to.Name)
+
+	if err := c.MoveCAPI(ctx, from, to, to.Name, clusterSpec); err != nil {
+		t.Errorf("ClusterManager.MoveCAPI() error = %v, wantErr nil", err)
+	}
+}
+
 func TestClusterManagerMoveCAPIErrorMove(t *testing.T) {
 	from := &types.Cluster{
 		Name: "from-cluster",
