From 0c91ae1e24189419f33166d64eb3dce7d77b1c1f Mon Sep 17 00:00:00 2001 From: razo7 Date: Tue, 1 Aug 2023 13:18:40 +0300 Subject: [PATCH 1/3] Add three status conditions for FAR CR Processing, FenceAgentActionSucceeded, and Succeeded status conditions have been added to verify FAR remediation status --- api/v1alpha1/fenceagentsremediation_types.go | 32 +++++++- api/v1alpha1/zz_generated.deepcopy.go | 10 ++- ...nts-remediation.clusterserviceversion.yaml | 8 ++ ...on.medik8s.io_fenceagentsremediations.yaml | 75 +++++++++++++++++++ ...on.medik8s.io_fenceagentsremediations.yaml | 75 +++++++++++++++++++ ...nts-remediation.clusterserviceversion.yaml | 8 ++ pkg/utils/errors.go | 9 +++ .../common/pkg/conditions/conditions.go | 13 ++++ vendor/modules.txt | 1 + 9 files changed, 227 insertions(+), 4 deletions(-) create mode 100644 pkg/utils/errors.go create mode 100644 vendor/github.com/medik8s/common/pkg/conditions/conditions.go diff --git a/api/v1alpha1/fenceagentsremediation_types.go b/api/v1alpha1/fenceagentsremediation_types.go index 6d159520..9f3f9cec 100644 --- a/api/v1alpha1/fenceagentsremediation_types.go +++ b/api/v1alpha1/fenceagentsremediation_types.go @@ -23,16 +23,34 @@ import ( // EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN! // NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized. -type ParameterName string -type NodeName string - const ( // FARFinalizer is a finalizer for a FenceAgentsRemediation CR deletion FARFinalizer string = "fence-agents-remediation.medik8s.io/far-finalizer" // Taints FARNoExecuteTaintKey = "medik8s.io/fence-agents-remediation" + // FenceAgentActionSucceededType is the condition type used to signal whether the Fence Agent action was succeeded successfully or not + FenceAgentActionSucceededType = "FenceAgentActionSucceeded" + // condition messages + RemediationStartedConditionMessage = "FAR CR was found, the CR name matches one of the cluster nodes, and a finalizer was set" + FenceAgentSucceededConditionMessage = "FAR taint was added, fence agent command has been created and executed successfully" + RemediationFinishedConditionMessage = "The unhealthy node was fully remediated (it was tainted, fenced using FA and all the node resources have been deleted)" ) +// ProcessingChangeReason represents the reason of updating the processing condition +type ProcessingChangeReason string + +const ( + // RemediationStarted - CR was found, its name matches a node, and a finalizer was set + RemediationStarted ProcessingChangeReason = "RemediationStarted" + // FenceAgentSucceeded - FAR taint was added, fence agent command has been created and executed successfully + FenceAgentSucceeded ProcessingChangeReason = "FenceAgentSucceeded" + // RemediationFinished - The unhealthy node was fully remediated (it was tainted, fenced by FA and all of its resources have been deleted) + RemediationFinished ProcessingChangeReason = "RemediationFinished" +) + +type ParameterName string +type NodeName string + // FenceAgentsRemediationSpec defines the desired state of FenceAgentsRemediation type FenceAgentsRemediationSpec struct { // Agent is the name of fence agent that will be used @@ -52,6 +70,14 @@ type FenceAgentsRemediationSpec struct { type FenceAgentsRemediationStatus struct { // INSERT ADDITIONAL STATUS FIELD - define observed state of cluster // Important: Run "make" to regenerate code after modifying this file + + // Represents the observations of a FenceAgentsRemediation's current state. + // Known .status.conditions.type are: "Processing", "FenceAgentActionSucceeded", and "Succeeded". + // +listType=map + // +listMapKey=type + //+optional + // +operator-sdk:csv:customresourcedefinitions:type=status,displayName="conditions",xDescriptors="urn:alm:descriptor:io.kubernetes.conditions" + Conditions []metav1.Condition `json:"conditions,omitempty"` } //+kubebuilder:object:root=true diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index c7dfa22e..f9d8dbdb 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -22,6 +22,7 @@ limitations under the License. package v1alpha1 import ( + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" ) @@ -31,7 +32,7 @@ func (in *FenceAgentsRemediation) DeepCopyInto(out *FenceAgentsRemediation) { out.TypeMeta = in.TypeMeta in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) in.Spec.DeepCopyInto(&out.Spec) - out.Status = in.Status + in.Status.DeepCopyInto(&out.Status) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FenceAgentsRemediation. @@ -126,6 +127,13 @@ func (in *FenceAgentsRemediationSpec) DeepCopy() *FenceAgentsRemediationSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *FenceAgentsRemediationStatus) DeepCopyInto(out *FenceAgentsRemediationStatus) { *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FenceAgentsRemediationStatus. diff --git a/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml b/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml index b602965e..4d42c7d5 100644 --- a/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml +++ b/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml @@ -79,6 +79,14 @@ spec: which node is about to be fenced (i.e., they are common for all the nodes) displayName: Shared Parameters path: sharedparameters + statusDescriptors: + - description: 'Represents the observations of a FenceAgentsRemediation''s current + state. Known .status.conditions.type are: "Processing", "FenceAgentActionSucceeded", + and "Succeeded".' + displayName: conditions + path: conditions + x-descriptors: + - urn:alm:descriptor:io.kubernetes.conditions version: v1alpha1 - description: FenceAgentsRemediationTemplate is the Schema for the fenceagentsremediationtemplates API diff --git a/bundle/manifests/fence-agents-remediation.medik8s.io_fenceagentsremediations.yaml b/bundle/manifests/fence-agents-remediation.medik8s.io_fenceagentsremediations.yaml index 677bfce5..bf2904a3 100644 --- a/bundle/manifests/fence-agents-remediation.medik8s.io_fenceagentsremediations.yaml +++ b/bundle/manifests/fence-agents-remediation.medik8s.io_fenceagentsremediations.yaml @@ -63,6 +63,81 @@ spec: status: description: FenceAgentsRemediationStatus defines the observed state of FenceAgentsRemediation + properties: + conditions: + description: 'Represents the observations of a FenceAgentsRemediation''s + current state. Known .status.conditions.type are: "Processing", + "FenceAgentActionSucceeded", and "Succeeded".' + items: + description: "Condition contains details for one aspect of the current + state of this API Resource. --- This struct is intended for direct + use as an array at the field path .status.conditions. For example, + \n type FooStatus struct{ // Represents the observations of a + foo's current state. // Known .status.conditions.type are: \"Available\", + \"Progressing\", and \"Degraded\" // +patchMergeKey=type // +patchStrategy=merge + // +listType=map // +listMapKey=type Conditions []metav1.Condition + `json:\"conditions,omitempty\" patchStrategy:\"merge\" patchMergeKey:\"type\" + protobuf:\"bytes,1,rep,name=conditions\"` \n // other fields }" + properties: + lastTransitionTime: + description: lastTransitionTime is the last time the condition + transitioned from one status to another. This should be when + the underlying condition changed. If that is not known, then + using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: message is a human readable message indicating + details about the transition. This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: observedGeneration represents the .metadata.generation + that the condition was set based upon. For instance, if .metadata.generation + is currently 12, but the .status.conditions[x].observedGeneration + is 9, the condition is out of date with respect to the current + state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: reason contains a programmatic identifier indicating + the reason for the condition's last transition. Producers + of specific condition types may define expected values and + meanings for this field, and whether the values are considered + a guaranteed API. The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + --- Many .condition.type values are consistent across resources + like Available, but because arbitrary conditions can be useful + (see .node.status.conditions), the ability to deconflict is + important. The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map type: object type: object served: true diff --git a/config/crd/bases/fence-agents-remediation.medik8s.io_fenceagentsremediations.yaml b/config/crd/bases/fence-agents-remediation.medik8s.io_fenceagentsremediations.yaml index bc97bd14..c0cfb7cf 100644 --- a/config/crd/bases/fence-agents-remediation.medik8s.io_fenceagentsremediations.yaml +++ b/config/crd/bases/fence-agents-remediation.medik8s.io_fenceagentsremediations.yaml @@ -61,6 +61,81 @@ spec: status: description: FenceAgentsRemediationStatus defines the observed state of FenceAgentsRemediation + properties: + conditions: + description: 'Represents the observations of a FenceAgentsRemediation''s + current state. Known .status.conditions.type are: "Processing", + "FenceAgentActionSucceeded", and "Succeeded".' + items: + description: "Condition contains details for one aspect of the current + state of this API Resource. --- This struct is intended for direct + use as an array at the field path .status.conditions. For example, + \n type FooStatus struct{ // Represents the observations of a + foo's current state. // Known .status.conditions.type are: \"Available\", + \"Progressing\", and \"Degraded\" // +patchMergeKey=type // +patchStrategy=merge + // +listType=map // +listMapKey=type Conditions []metav1.Condition + `json:\"conditions,omitempty\" patchStrategy:\"merge\" patchMergeKey:\"type\" + protobuf:\"bytes,1,rep,name=conditions\"` \n // other fields }" + properties: + lastTransitionTime: + description: lastTransitionTime is the last time the condition + transitioned from one status to another. This should be when + the underlying condition changed. If that is not known, then + using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: message is a human readable message indicating + details about the transition. This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: observedGeneration represents the .metadata.generation + that the condition was set based upon. For instance, if .metadata.generation + is currently 12, but the .status.conditions[x].observedGeneration + is 9, the condition is out of date with respect to the current + state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: reason contains a programmatic identifier indicating + the reason for the condition's last transition. Producers + of specific condition types may define expected values and + meanings for this field, and whether the values are considered + a guaranteed API. The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + --- Many .condition.type values are consistent across resources + like Available, but because arbitrary conditions can be useful + (see .node.status.conditions), the ability to deconflict is + important. The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map type: object type: object served: true diff --git a/config/manifests/bases/fence-agents-remediation.clusterserviceversion.yaml b/config/manifests/bases/fence-agents-remediation.clusterserviceversion.yaml index 02537490..622ac84a 100644 --- a/config/manifests/bases/fence-agents-remediation.clusterserviceversion.yaml +++ b/config/manifests/bases/fence-agents-remediation.clusterserviceversion.yaml @@ -38,6 +38,14 @@ spec: which node is about to be fenced (i.e., they are common for all the nodes) displayName: Shared Parameters path: sharedparameters + statusDescriptors: + - description: 'Represents the observations of a FenceAgentsRemediation''s current + state. Known .status.conditions.type are: "Processing", "FenceAgentActionSucceeded", + and "Succeeded".' + displayName: conditions + path: conditions + x-descriptors: + - urn:alm:descriptor:io.kubernetes.conditions version: v1alpha1 - description: FenceAgentsRemediationTemplate is the Schema for the fenceagentsremediationtemplates API diff --git a/pkg/utils/errors.go b/pkg/utils/errors.go new file mode 100644 index 00000000..8a23ad22 --- /dev/null +++ b/pkg/utils/errors.go @@ -0,0 +1,9 @@ +package utils + +const ( + // condition status error messages + NoFenceAgentsRemediationCRFound = "noFenceAgentsRemediationCRFound" + ConditionNotSetError = "ConditionNotSet" + ConditionSetButNoMatchError = "ConditionSetButNoMatch" + ConditionSetAndMatchSuccess = "ConditionSetAndMatchSuccess" +) diff --git a/vendor/github.com/medik8s/common/pkg/conditions/conditions.go b/vendor/github.com/medik8s/common/pkg/conditions/conditions.go new file mode 100644 index 00000000..fc84e504 --- /dev/null +++ b/vendor/github.com/medik8s/common/pkg/conditions/conditions.go @@ -0,0 +1,13 @@ +package conditions + +const ( + + // These are condition types used on remediation CRs + + // ProcessingType is the condition type used to signal the remediation has started and it is in progress, or has finished + ProcessingType = "Processing" + // SucceededType is the condition type used to signal whether the remediation was successful or not + SucceededType = "Succeeded" + // PermanentNodeDeletionExpectedType is the condition type used to signal that the unhealthy node will be permanently deleted. + PermanentNodeDeletionExpectedType = "PermanentNodeDeletionExpected" +) diff --git a/vendor/modules.txt b/vendor/modules.txt index 6b199f8f..3e829182 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -95,6 +95,7 @@ github.com/matttproud/golang_protobuf_extensions/pbutil # github.com/medik8s/common v1.7.0 ## explicit; go 1.20 github.com/medik8s/common/pkg/annotations +github.com/medik8s/common/pkg/conditions github.com/medik8s/common/pkg/labels # github.com/moby/spdystream v0.2.0 ## explicit; go 1.13 From 9b28a038d1f298a742549bedb16d9209496209d8 Mon Sep 17 00:00:00 2001 From: razo7 Date: Tue, 1 Aug 2023 13:38:22 +0300 Subject: [PATCH 2/3] Restructure reconcile using the status conditions of FAR CR Processing, FenceAgentActionSucceeded, and Succeeded status conditions help exclude the FA execution and workload deletion sections from every reconcile call. It would help us avoid any second (and more) remediation --- .../fenceagentsremediation_controller.go | 228 ++++++++++++++---- 1 file changed, 182 insertions(+), 46 deletions(-) diff --git a/controllers/fenceagentsremediation_controller.go b/controllers/fenceagentsremediation_controller.go index c4c1b5df..81273cda 100644 --- a/controllers/fenceagentsremediation_controller.go +++ b/controllers/fenceagentsremediation_controller.go @@ -20,12 +20,17 @@ import ( "context" "errors" "fmt" + "time" "github.com/go-logr/logr" commonAnnotations "github.com/medik8s/common/pkg/annotations" + commonConditions "github.com/medik8s/common/pkg/conditions" apiErrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + utilErrors "k8s.io/apimachinery/pkg/util/errors" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -39,9 +44,10 @@ const ( // errors errorMissingParams = "nodeParameters or sharedParameters or both are missing, and they cannot be empty" errorMissingNodeParams = "node parameter is required, and cannot be empty" - SuccessFAResponse = "Success: Rebooted" - parameterActionName = "--action" - parameterActionValue = "reboot" + + SuccessFAResponse = "Success: Rebooted" + parameterActionName = "--action" + parameterActionValue = "reboot" ) // FenceAgentsRemediationReconciler reconciles a FenceAgentsRemediation object @@ -76,7 +82,7 @@ func (r *FenceAgentsRemediationReconciler) SetupWithManager(mgr ctrl.Manager) er // // For more details, check Reconcile and its Result here: // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.11.2/pkg/reconcile -func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ctrl.Request) (finalResult ctrl.Result, finalErr error) { r.Log.Info("Begin FenceAgentsRemediation Reconcile") defer r.Log.Info("Finish FenceAgentsRemediation Reconcile") emptyResult := ctrl.Result{} @@ -93,6 +99,17 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct r.Log.Error(err, "Failed to get FenceAgentsRemediation CR") return emptyResult, err } + + // At the end of each Reconcile try to update CR's status + defer func() { + if updateErr := r.updateStatus(ctx, far); updateErr != nil { + if !apiErrors.IsConflict(updateErr) { + finalErr = utilErrors.NewAggregate([]error{updateErr, finalErr}) + } + finalResult.RequeueAfter = time.Second + } + }() + // Validate FAR CR name to match a nodeName from the cluster r.Log.Info("Check FAR CR's name") valid, err := utils.IsNodeNameValid(r.Client, req.Name) @@ -119,12 +136,23 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct return emptyResult, fmt.Errorf("failed to add finalizer to the CR - %w", err) } r.Log.Info("Finalizer was added", "CR Name", req.Name) - return emptyResult, nil - // TODO: should return Requeue: true when the CR has a status and not end reconcile with empty result when a finalizer has been added - //return ctrl.Result{Requeue: true}, nil + + if err := updateConditions(v1alpha1.RemediationStarted, &far.Status.Conditions, r.Log); err != nil { + return emptyResult, err + } + return ctrl.Result{Requeue: true}, nil } else if controllerutil.ContainsFinalizer(far, v1alpha1.FARFinalizer) && !far.ObjectMeta.DeletionTimestamp.IsZero() { // Delete CR only when a finalizer and DeletionTimestamp are set r.Log.Info("CR's deletion timestamp is not zero, and FAR finalizer exists", "CR Name", req.Name) + + if !meta.IsStatusConditionPresentAndEqual(far.Status.Conditions, commonConditions.SucceededType, metav1.ConditionTrue) { + processingCondition := meta.FindStatusCondition(far.Status.Conditions, commonConditions.ProcessingType).Status + fenceAgentActionSucceededCondition := meta.FindStatusCondition(far.Status.Conditions, v1alpha1.FenceAgentActionSucceededType).Status + succeededCondition := meta.FindStatusCondition(far.Status.Conditions, commonConditions.SucceededType).Status + r.Log.Info("FAR didn't finish remediate the node ", "CR Name", req.Name, "processing condition", processingCondition, + "fenceAgentActionSucceeded condition", fenceAgentActionSucceededCondition, "succeeded condition", succeededCondition) + } + // remove node's taints if err := utils.RemoveTaint(r.Client, far.Name); err != nil && !apiErrors.IsNotFound(err) { return emptyResult, err @@ -137,50 +165,69 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct r.Log.Info("Finalizer was removed", "CR Name", req.Name) return emptyResult, nil } - - // Fetch the FAR's pod - r.Log.Info("Fetch FAR's pod") - pod, err := utils.GetFenceAgentsRemediationPod(r.Client) - if err != nil { - r.Log.Error(err, "Can't find FAR's pod by its label", "CR's Name", req.Name) - return emptyResult, err - } - //TODO: Check that FA is excutable? run cli.IsExecuteable - - // Build FA parameters - r.Log.Info("Combine fence agent parameters", "Fence Agent", far.Spec.Agent, "Node Name", req.Name) - faParams, err := buildFenceAgentParams(far) - if err != nil { - r.Log.Error(err, "Invalid sharedParameters/nodeParameters from CR - edit/recreate the CR", "CR's Name", req.Name) - return emptyResult, nil - } - // Add medik8s remediation taint - r.Log.Info("Add Medik8s remediation taint", "Fence Agent", far.Spec.Agent, "Node Name", req.Name) + // Add FAR (medik8s) remediation taint + r.Log.Info("Try adding FAR (Medik8s) remediation taint", "Fence Agent", far.Spec.Agent, "Node Name", req.Name) if err := utils.AppendTaint(r.Client, far.Name); err != nil { return emptyResult, err } - cmd := append([]string{far.Spec.Agent}, faParams...) - // The Fence Agent is excutable and the parameters structure are valid, but we don't check their values - r.Log.Info("Execute the fence agent", "Fence Agent", far.Spec.Agent, "Node Name", req.Name) - outputRes, outputErr, err := r.Executor.Execute(pod, cmd) - if err != nil { - // response was a failure message - r.Log.Error(err, "Fence Agent response was a failure", "CR's Name", req.Name) - return emptyResult, err - } - if outputErr != "" || outputRes != SuccessFAResponse+"\n" { - // response wasn't failure or sucesss message - err := fmt.Errorf("unknown fence agent response - expecting `%s` response, but we received `%s`", SuccessFAResponse, outputRes) - r.Log.Error(err, "Fence Agent response wasn't a success message", "CR's Name", req.Name) - return emptyResult, err + + if meta.IsStatusConditionTrue(far.Status.Conditions, commonConditions.ProcessingType) && + !meta.IsStatusConditionTrue(far.Status.Conditions, v1alpha1.FenceAgentActionSucceededType) { + // The remeditation has already been processed, thus we can begin with exuecting the FA for the node + // We run the FA until its action (reboot) was succeeded, and we verify it with the fenceAgentActionSucceeded condition + + // Fetch the FAR's pod + r.Log.Info("Fetch FAR's pod") + pod, err := utils.GetFenceAgentsRemediationPod(r.Client) + if err != nil { + r.Log.Error(err, "Can't find FAR's pod by its label", "CR's Name", req.Name) + return emptyResult, err + } + //TODO: Check that FA is excutable? run cli.IsExecuteable + + // Build FA parameters + r.Log.Info("Combine fence agent parameters", "Fence Agent", far.Spec.Agent, "Node Name", req.Name) + faParams, err := buildFenceAgentParams(far) + if err != nil { + r.Log.Error(err, "Invalid sharedParameters/nodeParameters from CR - edit/recreate the CR", "CR's Name", req.Name) + return emptyResult, nil + } + + cmd := append([]string{far.Spec.Agent}, faParams...) + // The Fence Agent is excutable and the parameters structure are valid, but we don't check their values + r.Log.Info("Execute the fence agent", "Fence Agent", far.Spec.Agent, "Node Name", req.Name) + outputRes, outputErr, err := r.Executor.Execute(pod, cmd) + if err != nil { + // response was a failure message + r.Log.Error(err, "Fence Agent response was a failure", "CR's Name", req.Name) + return emptyResult, err + } + if outputErr != "" || outputRes != SuccessFAResponse+"\n" { + // response wasn't failure or sucesss message + err := fmt.Errorf("unknown fence agent response - expecting `%s` response, but we received `%s`", SuccessFAResponse, outputRes) + r.Log.Error(err, "Fence Agent response wasn't a success message", "CR's Name", req.Name) + return emptyResult, err + } + + r.Log.Info("Fence Agent command was finished successfully", "Fence Agent", far.Spec.Agent, "Node name", req.Name, "Response", SuccessFAResponse) + if err := updateConditions(v1alpha1.FenceAgentSucceeded, &far.Status.Conditions, r.Log); err != nil { + return emptyResult, err + } + return ctrl.Result{Requeue: true}, nil } - r.Log.Info("Fence Agent command was finished successfully", "Fence Agent", far.Spec.Agent, "Node name", req.Name, "Response", SuccessFAResponse) + if meta.IsStatusConditionTrue(far.Status.Conditions, v1alpha1.FenceAgentActionSucceededType) && + !meta.IsStatusConditionTrue(far.Status.Conditions, commonConditions.SucceededType) { + // Fence agent action succeeded, and now we try to remove workloads (pods and their volume attachments) + r.Log.Info("Manual workload deletion", "Fence Agent", far.Spec.Agent, "Node Name", req.Name) + if err := utils.DeleteResources(ctx, r.Client, req.Name); err != nil { + r.Log.Error(err, "Manual workload deletion has failed", "CR's Name", req.Name) + return emptyResult, err + } - // Reboot was finished and now we remove workloads (pods and their VA) - r.Log.Info("Manual workload deletion", "Fence Agent", far.Spec.Agent, "Node Name", req.Name) - if err := utils.DeleteResources(ctx, r.Client, req.Name); err != nil { - r.Log.Error(err, "Manual workload deletion has failed", "CR's Name", req.Name) - return emptyResult, err + if err := updateConditions(v1alpha1.RemediationFinished, &far.Status.Conditions, r.Log); err != nil { + return emptyResult, err + } + r.Log.Info("FenceAgentsRemediation CR has completed to remediate the node", "Node Name", req.Name) } return emptyResult, nil @@ -195,6 +242,95 @@ func isTimedOutByNHC(far *v1alpha1.FenceAgentsRemediation) bool { return false } +// updateStatus updates the CR status, and returns an error if it fails +func (r *FenceAgentsRemediationReconciler) updateStatus(ctx context.Context, far *v1alpha1.FenceAgentsRemediation) error { + if err := r.Client.Status().Update(ctx, far); err != nil { + if !apiErrors.IsConflict(err) { + r.Log.Error(err, "failed to update far status") + } + return err + } + return nil +} + +// updateConditions updates the status conditions of a FenceAgentsRemediation object based on the provided ProcessingChangeReason. +// return an error if an unknown ProcessingChangeReason is provided +func updateConditions(reason v1alpha1.ProcessingChangeReason, currentConditions *[]metav1.Condition, log logr.Logger) error { + + var ( + processingConditionStatus, fenceAgentActionSucceededConditionStatus, succeededConditionStatus metav1.ConditionStatus + conditionMessage string + ) + // All the ProcessingChangeReasons can happen one after another + // RemediationStarted will always be the first reason. + // FenceAgentSucceeded can only happen after RemediationStarted happened + // RemediationFinished can only happen after FenceAgentSucceeded happened + + switch reason { + case v1alpha1.RemediationStarted: + processingConditionStatus = metav1.ConditionTrue + fenceAgentActionSucceededConditionStatus = metav1.ConditionUnknown + succeededConditionStatus = metav1.ConditionUnknown + conditionMessage = v1alpha1.RemediationStartedConditionMessage + case v1alpha1.FenceAgentSucceeded: + fenceAgentActionSucceededConditionStatus = metav1.ConditionTrue + conditionMessage = v1alpha1.FenceAgentSucceededConditionMessage + case v1alpha1.RemediationFinished: + processingConditionStatus = metav1.ConditionFalse + succeededConditionStatus = metav1.ConditionTrue + conditionMessage = v1alpha1.RemediationFinishedConditionMessage + default: + err := fmt.Errorf("unknown processingChangeReason:%s", reason) + log.Error(err, "couldn't update FAR Status Conditions") + return err + } + + // if ProcessingType is already false, then it cannot be changed to true again + if processingConditionStatus == metav1.ConditionTrue && + meta.IsStatusConditionFalse(*currentConditions, commonConditions.ProcessingType) { + return nil + } + + // if FenceAgentActionSucceededType is already false, then it cannot be changed to true again + if fenceAgentActionSucceededConditionStatus == metav1.ConditionTrue && + meta.IsStatusConditionFalse(*currentConditions, v1alpha1.FenceAgentActionSucceededType) { + return nil + } + + log.Info("updating Status Condition", "processingConditionStatus", processingConditionStatus, "fenceAgentActionSucceededConditionStatus", fenceAgentActionSucceededConditionStatus, "succededConditionStatus", succeededConditionStatus, "reason", string(reason)) + // if the requested Status.Conditions.Processing is different then the current one, then update Status.Conditions.Processing value + if processingConditionStatus != "" && !meta.IsStatusConditionPresentAndEqual(*currentConditions, commonConditions.ProcessingType, processingConditionStatus) { + meta.SetStatusCondition(currentConditions, metav1.Condition{ + Type: commonConditions.ProcessingType, + Status: processingConditionStatus, + Reason: string(reason), + Message: conditionMessage, + }) + } + + // if the requested Status.Conditions.FenceAgentActionSucceeded is different then the current one, then update Status.Conditions.FenceAgentActionSucceeded value + if fenceAgentActionSucceededConditionStatus != "" && !meta.IsStatusConditionPresentAndEqual(*currentConditions, v1alpha1.FenceAgentActionSucceededType, fenceAgentActionSucceededConditionStatus) { + meta.SetStatusCondition(currentConditions, metav1.Condition{ + Type: v1alpha1.FenceAgentActionSucceededType, + Status: fenceAgentActionSucceededConditionStatus, + Reason: string(reason), + Message: conditionMessage, + }) + } + + // if the requested Status.Conditions.Succeeded is different then the current one, then update Status.Conditions.Succeeded value + if succeededConditionStatus != "" && !meta.IsStatusConditionPresentAndEqual(*currentConditions, commonConditions.SucceededType, succeededConditionStatus) { + meta.SetStatusCondition(currentConditions, metav1.Condition{ + Type: commonConditions.SucceededType, + Status: succeededConditionStatus, + Reason: string(reason), + Message: conditionMessage, + }) + } + + return nil +} + // buildFenceAgentParams collects the FAR's parameters for the node based on FAR CR, and if the CR is missing parameters // or the CR's name don't match nodeParamter name or it has an action which is different than reboot, then return an error func buildFenceAgentParams(far *v1alpha1.FenceAgentsRemediation) ([]string, error) { From 1b38f7f5b58070d19f423999b062a3a16a609bb9 Mon Sep 17 00:00:00 2001 From: razo7 Date: Tue, 1 Aug 2023 13:38:31 +0300 Subject: [PATCH 3/3] Add tests for status conditions Unit tests and e2e tests have been added to verify the expected behaviour by looking whether the status conditions have been set and what's their value --- .../fenceagentsremediation_controller_test.go | 44 +++++++++++++++++-- test/e2e/far_e2e_test.go | 27 ++++++++++++ 2 files changed, 67 insertions(+), 4 deletions(-) diff --git a/controllers/fenceagentsremediation_controller_test.go b/controllers/fenceagentsremediation_controller_test.go index 43387093..46c2aedc 100644 --- a/controllers/fenceagentsremediation_controller_test.go +++ b/controllers/fenceagentsremediation_controller_test.go @@ -23,12 +23,14 @@ import ( "time" "github.com/go-logr/logr" + commonConditions "github.com/medik8s/common/pkg/conditions" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" storagev1 "k8s.io/api/storage/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -46,6 +48,10 @@ const ( testPodName = "far-pod-test-1" vaName1 = "va-test-1" vaName2 = "va-test-2" + + // intervals + timeoutDeletion = 10 * time.Second // this timeout is used after all the other steps have finished successfully + pollInterval = 250 * time.Millisecond ) var ( @@ -158,6 +164,11 @@ var _ = Describe("FAR Controller", func() { testVADeletion(vaName1, resourceDeletionWasTriggered) testVADeletion(vaName2, resourceDeletionWasTriggered) testPodDeletion(testPodName, resourceDeletionWasTriggered) + + By("Having Succeed condition set to true") + verifyExpectedStatusConditionError(underTestFAR, commonConditions.ProcessingType, utils.ConditionSetAndMatchSuccess, metav1.ConditionFalse) + verifyExpectedStatusConditionError(underTestFAR, v1alpha1.FenceAgentActionSucceededType, utils.ConditionSetAndMatchSuccess, metav1.ConditionTrue) + verifyExpectedStatusConditionError(underTestFAR, commonConditions.SucceededType, utils.ConditionSetAndMatchSuccess, metav1.ConditionTrue) }) }) When("creating invalid FAR CR Name", func() { @@ -186,6 +197,11 @@ var _ = Describe("FAR Controller", func() { testVADeletion(vaName1, resourceDeletionWasTriggered) testVADeletion(vaName2, resourceDeletionWasTriggered) testPodDeletion(testPodName, resourceDeletionWasTriggered) + + By("Not having any condition set") + verifyExpectedStatusConditionError(underTestFAR, commonConditions.ProcessingType, utils.ConditionNotSetError, metav1.ConditionUnknown) + verifyExpectedStatusConditionError(underTestFAR, v1alpha1.FenceAgentActionSucceededType, utils.ConditionNotSetError, metav1.ConditionUnknown) + verifyExpectedStatusConditionError(underTestFAR, commonConditions.SucceededType, utils.ConditionNotSetError, metav1.ConditionUnknown) }) }) }) @@ -294,6 +310,8 @@ func cliCommandsEquality(far *v1alpha1.FenceAgentsRemediation) (bool, error) { return isEqualStringLists(mocksExecuter.command, expectedCommand), nil } +// TODO: Think about using Generics for the next two functions + // testVADeletion tests whether the volume attachment no longer exist for successful FAR CR // and consistently check if the volume attachment exist and was not deleted func testVADeletion(vaName string, resourceDeletionWasTriggered bool) { @@ -307,7 +325,7 @@ func testVADeletion(vaName string, resourceDeletionWasTriggered bool) { err := k8sClient.Get(context.Background(), vaKey, va) return apierrors.IsNotFound(err) - }, 5*time.Second, 250*time.Millisecond).Should(BeTrue()) + }, timeoutDeletion, pollInterval).Should(BeTrue()) log.Info("Volume attachment is no longer exist", "va", vaName) } else { ConsistentlyWithOffset(1, func() bool { @@ -315,7 +333,7 @@ func testVADeletion(vaName string, resourceDeletionWasTriggered bool) { err := k8sClient.Get(context.Background(), vaKey, va) return apierrors.IsNotFound(err) - }, 5*time.Second, 250*time.Millisecond).Should(BeFalse()) + }, timeoutDeletion, pollInterval).Should(BeFalse()) log.Info("Volume attachment exist", "va", vaName) } } @@ -333,7 +351,7 @@ func testPodDeletion(podName string, resourceDeletionWasTriggered bool) { err := k8sClient.Get(context.Background(), podKey, pod) return apierrors.IsNotFound(err) - }, 5*time.Second, 250*time.Millisecond).Should(BeTrue()) + }, timeoutDeletion, pollInterval).Should(BeTrue()) log.Info("Pod is no longer exist", "pod", podName) } else { ConsistentlyWithOffset(1, func() bool { @@ -341,11 +359,29 @@ func testPodDeletion(podName string, resourceDeletionWasTriggered bool) { err := k8sClient.Get(context.Background(), podKey, pod) return apierrors.IsNotFound(err) - }, 5*time.Second, 250*time.Millisecond).Should(BeFalse()) + }, timeoutDeletion, pollInterval).Should(BeFalse()) log.Info("Pod exist", "pod", podName) } } +// verifyExpectedStatusConditionState checks whether the status condition state matches the expectedResult +func verifyExpectedStatusConditionError(testFAR *v1alpha1.FenceAgentsRemediation, conditionType, expectedError string, conditionStatus metav1.ConditionStatus) { + far := &v1alpha1.FenceAgentsRemediation{} + Eventually(func() string { + if err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(testFAR), far); err != nil { + return utils.NoFenceAgentsRemediationCRFound + } + if gotCondition := meta.FindStatusCondition(far.Status.Conditions, conditionType); gotCondition == nil { + return utils.ConditionNotSetError + } + if meta.IsStatusConditionPresentAndEqual(far.Status.Conditions, conditionType, conditionStatus) { + return utils.ConditionSetAndMatchSuccess + } + return utils.ConditionSetButNoMatchError + + }, timeoutDeletion, pollInterval).Should(Equal(expectedError), "'%v' status condition was expected to be %v", conditionType, conditionStatus) +} + // Implements Execute function to mock/test Execute of FenceAgentsRemediationReconciler type mockExecuter struct { command []string diff --git a/test/e2e/far_e2e_test.go b/test/e2e/far_e2e_test.go index 0a41d267..aaf7afe4 100644 --- a/test/e2e/far_e2e_test.go +++ b/test/e2e/far_e2e_test.go @@ -6,6 +6,7 @@ import ( "math/rand" "time" + commonConditions "github.com/medik8s/common/pkg/conditions" medik8sLabels "github.com/medik8s/common/pkg/labels" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -13,6 +14,7 @@ import ( corev1 "k8s.io/api/core/v1" storagev1 "k8s.io/api/storage/v1" apiErrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/selection" @@ -422,6 +424,25 @@ func checkPodDeleted(pod *corev1.Pod) { log.Info("Pod has already been deleted", "pod name", pod.Name) } +// verifyExpectedStatusConditionState checks whether the status condition state matches the expectedResult +func verifyExpectedStatusConditionError(nodeName, conditionType, expectedError string, conditionStatus metav1.ConditionStatus) { + far := &v1alpha1.FenceAgentsRemediation{} + farNamespacedName := client.ObjectKey{Name: nodeName, Namespace: operatorNsName} + Eventually(func() string { + if err := k8sClient.Get(context.Background(), farNamespacedName, far); err != nil { + return utils.NoFenceAgentsRemediationCRFound + } + if gotCondition := meta.FindStatusCondition(far.Status.Conditions, conditionType); gotCondition == nil { + return utils.ConditionNotSetError + } + if meta.IsStatusConditionPresentAndEqual(far.Status.Conditions, conditionType, conditionStatus) { + return utils.ConditionSetAndMatchSuccess + } + return utils.ConditionSetButNoMatchError + + }, timeoutDeletion, pollInterval).Should(Equal(expectedError), "'%v' status condition was expected to be %v", conditionType, conditionStatus) +} + // checkRemediation verify whether the node was remediated func checkRemediation(nodeName string, nodeBootTimeBefore time.Time, oldPodCreationTime time.Time, va *storagev1.VolumeAttachment, pod *corev1.Pod) { By("Check if FAR NoExecute taint was added") @@ -439,4 +460,10 @@ func checkRemediation(nodeName string, nodeBootTimeBefore time.Time, oldPodCreat By("checking if old pod has been deleted") checkPodDeleted(pod) + + By("checking if the status conditions match a successful remediation") + verifyExpectedStatusConditionError(nodeName, commonConditions.ProcessingType, utils.ConditionSetAndMatchSuccess, metav1.ConditionFalse) + verifyExpectedStatusConditionError(nodeName, v1alpha1.FenceAgentActionSucceededType, utils.ConditionSetAndMatchSuccess, metav1.ConditionTrue) + verifyExpectedStatusConditionError(nodeName, commonConditions.SucceededType, utils.ConditionSetAndMatchSuccess, metav1.ConditionTrue) + }