From 4b6a5f9c7ba7d13b4b7db77c5e50dd40bee3cfb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20Ma=C5=82ek?= Date: Thu, 9 May 2024 15:32:22 +0200 Subject: [PATCH] chore: handle Gateway cleanup errors (#248) * chore: handle Gateway cleanup errors --------- Co-authored-by: Jakub Warczarek --- controller/controlplane/controller.go | 55 ++++++++++++++++++------ controller/gateway/controller.go | 8 +++- controller/gateway/controller_cleanup.go | 51 ++++++++++++++++++---- 3 files changed, 92 insertions(+), 22 deletions(-) diff --git a/controller/controlplane/controller.go b/controller/controlplane/controller.go index 0a436ff24..184a58b20 100644 --- a/controller/controlplane/controller.go +++ b/controller/controlplane/controller.go @@ -228,10 +228,14 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu log.Trace(logger, "validating ControlPlane resource conditions", cp) if r.ensureIsMarkedScheduled(cp) { - err := r.patchStatus(ctx, logger, cp) + res, err := r.patchStatus(ctx, logger, cp) if err != nil { + log.Debug(logger, "unable to update ControlPlane resource", cp, "error", err) + return res, err + } + if res.Requeue { log.Debug(logger, "unable to update ControlPlane resource", cp) - return ctrl.Result{}, err + return res, nil } log.Debug(logger, "ControlPlane resource now marked as scheduled", cp) @@ -401,11 +405,16 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu if res != op.Noop { if !dataplaneIsSet { log.Debug(logger, "DataPlane not set, deployment for ControlPlane has been scaled down to 0 replicas", cp) - err := r.patchStatus(ctx, logger, cp) + res, err := r.patchStatus(ctx, logger, cp) if err != nil { - log.Debug(logger, "unable to reconcile ControlPlane status", cp) + log.Debug(logger, "unable to reconcile ControlPlane status", cp, "error", err) + return ctrl.Result{}, err } - return ctrl.Result{}, err + if res.Requeue { + log.Debug(logger, "unable to update ControlPlane resource", cp) + return res, nil + } + return ctrl.Result{}, nil } return ctrl.Result{}, nil // requeue will be triggered by the creation or update of the owned object } @@ -418,16 +427,31 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu k8sutils.NewCondition(k8sutils.ReadyType, metav1.ConditionFalse, k8sutils.WaitingToBecomeReadyReason, k8sutils.WaitingToBecomeReadyMessage), cp, ) - return ctrl.Result{}, r.patchStatus(ctx, logger, cp) + + res, err := r.patchStatus(ctx, logger, cp) + if err != nil { + log.Debug(logger, "unable to patch ControlPlane status", cp, "error", err) + return ctrl.Result{}, err + } + if res.Requeue { + log.Debug(logger, "unable to patch ControlPlane status", cp) + return res, nil + } + return ctrl.Result{}, nil } markAsProvisioned(cp) k8sutils.SetReady(cp) - if err = r.patchStatus(ctx, logger, cp); err != nil { - log.Debug(logger, "unable to reconcile the ControlPlane resource", cp) + result, err := r.patchStatus(ctx, logger, cp) + if err != nil { + log.Debug(logger, "unable to patch ControlPlane status", cp, "error", err) return ctrl.Result{}, err } + if result.Requeue { + log.Debug(logger, "unable to patch ControlPlane status", cp) + return result, nil + } log.Debug(logger, "reconciliation complete for ControlPlane resource", cp) return ctrl.Result{}, nil @@ -444,18 +468,25 @@ func validateControlPlane(controlPlane *operatorv1beta1.ControlPlane, devMode bo } // patchStatus Patches the resource status only when there are changes in the Conditions -func (r *Reconciler) patchStatus(ctx context.Context, logger logr.Logger, updated *operatorv1beta1.ControlPlane) error { +func (r *Reconciler) patchStatus(ctx context.Context, logger logr.Logger, updated *operatorv1beta1.ControlPlane) (ctrl.Result, error) { current := &operatorv1beta1.ControlPlane{} err := r.Client.Get(ctx, client.ObjectKeyFromObject(updated), current) if err != nil && !k8serrors.IsNotFound(err) { - return err + return ctrl.Result{}, err } if k8sutils.NeedsUpdate(current, updated) { log.Debug(logger, "patching ControlPlane status", updated, "status", updated.Status) - return r.Client.Status().Patch(ctx, updated, client.MergeFrom(current)) + if err := r.Client.Status().Patch(ctx, updated, client.MergeFrom(current)); err != nil { + if k8serrors.IsConflict(err) { + log.Debug(logger, "conflict found when updating ControlPlane, retrying", current) + return ctrl.Result{Requeue: true, RequeueAfter: requeueWithoutBackoff}, nil + } + return ctrl.Result{}, fmt.Errorf("failed updating ControlPlane's status : %w", err) + } + return ctrl.Result{}, nil } - return nil + return ctrl.Result{}, nil } diff --git a/controller/gateway/controller.go b/controller/gateway/controller.go index ff380e724..6078bbb6b 100644 --- a/controller/gateway/controller.go +++ b/controller/gateway/controller.go @@ -119,7 +119,13 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu if cpFinalizerSet || dpFinalizerSet || npFinalizerSet { log.Trace(logger, "Setting finalizers", gateway) if err := r.Client.Update(ctx, &gateway); err != nil { - return ctrl.Result{}, fmt.Errorf("failed updating Gateway's finalizers: %w", err) + res, err := handleGatewayFinalizerPatchOrUpdateError(err, &gateway, logger) + if err != nil { + return res, fmt.Errorf("failed updating Gateway's finalizers: %w", err) + } + if res.Requeue { + return res, nil + } } return ctrl.Result{}, nil } diff --git a/controller/gateway/controller_cleanup.go b/controller/gateway/controller_cleanup.go index 264b05cd9..75ab5f24a 100644 --- a/controller/gateway/controller_cleanup.go +++ b/controller/gateway/controller_cleanup.go @@ -5,6 +5,8 @@ import ( "time" "github.com/go-logr/logr" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -18,6 +20,8 @@ import ( // Reconciler - Cleanup // ---------------------------------------------------------------------------- +const requeueWithoutBackoff = 200 * time.Millisecond + // cleanup determines whether cleanup is needed/underway for a Gateway and // performs all necessary cleanup steps. Namely, it cleans up resources // managed on behalf of the Gateway and removes the finalizers once all @@ -64,9 +68,9 @@ func (r *Reconciler) cleanup( } else { oldGateway := gateway.DeepCopy() if controllerutil.RemoveFinalizer(gateway, string(GatewayFinalizerCleanupControlPlanes)) { - err := r.Client.Patch(ctx, gateway, client.MergeFrom(oldGateway)) - if err != nil { - return true, ctrl.Result{}, err + if err := r.Client.Patch(ctx, gateway, client.MergeFrom(oldGateway)); err != nil { + res, err := handleGatewayFinalizerPatchOrUpdateError(err, gateway, logger) + return true, res, err } log.Debug(logger, "finalizer for cleaning up controlplanes removed", gateway) return true, ctrl.Result{}, nil @@ -91,9 +95,9 @@ func (r *Reconciler) cleanup( } else { oldGateway := gateway.DeepCopy() if controllerutil.RemoveFinalizer(gateway, string(GatewayFinalizerCleanupDataPlanes)) { - err := r.Client.Patch(ctx, gateway, client.MergeFrom(oldGateway)) - if err != nil { - return true, ctrl.Result{}, err + if err := r.Client.Patch(ctx, gateway, client.MergeFrom(oldGateway)); err != nil { + res, err := handleGatewayFinalizerPatchOrUpdateError(err, gateway, logger) + return true, res, err } log.Debug(logger, "finalizer for cleaning up dataplanes removed", gateway) return true, ctrl.Result{}, nil @@ -117,9 +121,9 @@ func (r *Reconciler) cleanup( } else { oldGateway := gateway.DeepCopy() if controllerutil.RemoveFinalizer(gateway, string(GatewayFinalizerCleanupNetworkpolicies)) { - err := r.Client.Patch(ctx, gateway, client.MergeFrom(oldGateway)) - if err != nil { - return true, ctrl.Result{}, err + if err := r.Client.Patch(ctx, gateway, client.MergeFrom(oldGateway)); err != nil { + res, err := handleGatewayFinalizerPatchOrUpdateError(err, gateway, logger) + return true, res, err } log.Debug(logger, "finalizer for cleaning up network policies removed", gateway) return true, ctrl.Result{}, nil @@ -129,3 +133,32 @@ func (r *Reconciler) cleanup( log.Debug(logger, "owned resources cleanup completed", gateway) return true, ctrl.Result{}, nil } + +func handleGatewayFinalizerPatchOrUpdateError(err error, gateway *gatewayv1.Gateway, logger logr.Logger) (ctrl.Result, error) { + // Short cirtcuit. + if err == nil { + return ctrl.Result{}, nil + } + + // If the Gateway is not found, then requeue without an error. + if k8serrors.IsNotFound(err) { + return ctrl.Result{ + Requeue: true, + RequeueAfter: requeueWithoutBackoff, + }, nil + } + // Since controllers use cached clients, it's possible that the Gateway is out of sync with what + // is in the API server and this causes: + // Forbidden: no new finalizers can be added if the object is being deleted, found new finalizers []string{...} + // Code below handles that gracefully to not show users the errors that are not actionable. + if cause, ok := k8serrors.StatusCause(err, metav1.CauseTypeForbidden); k8serrors.IsInvalid(err) && ok { + log.Debug(logger, "failed to delete a finalizer on Gateway, requeueing request", gateway, "cause", cause) + return ctrl.Result{ + Requeue: true, + RequeueAfter: requeueWithoutBackoff, + }, nil + } + + // Return the error as is. + return ctrl.Result{}, err +}