diff --git a/api/v1alpha1/fenceagentsremediation_types.go b/api/v1alpha1/fenceagentsremediation_types.go index 6d159520..9f3f9cec 100644 --- a/api/v1alpha1/fenceagentsremediation_types.go +++ b/api/v1alpha1/fenceagentsremediation_types.go @@ -23,16 +23,34 @@ import ( // EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN! // NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized. -type ParameterName string -type NodeName string - const ( // FARFinalizer is a finalizer for a FenceAgentsRemediation CR deletion FARFinalizer string = "fence-agents-remediation.medik8s.io/far-finalizer" // Taints FARNoExecuteTaintKey = "medik8s.io/fence-agents-remediation" + // FenceAgentActionSucceededType is the condition type used to signal whether the Fence Agent action was succeeded successfully or not + FenceAgentActionSucceededType = "FenceAgentActionSucceeded" + // condition messages + RemediationStartedConditionMessage = "FAR CR was found, the CR name matches one of the cluster nodes, and a finalizer was set" + FenceAgentSucceededConditionMessage = "FAR taint was added, fence agent command has been created and executed successfully" + RemediationFinishedConditionMessage = "The unhealthy node was fully remediated (it was tainted, fenced using FA and all the node resources have been deleted)" ) +// ProcessingChangeReason represents the reason of updating the processing condition +type ProcessingChangeReason string + +const ( + // RemediationStarted - CR was found, its name matches a node, and a finalizer was set + RemediationStarted ProcessingChangeReason = "RemediationStarted" + // FenceAgentSucceeded - FAR taint was added, fence agent command has been created and executed successfully + FenceAgentSucceeded ProcessingChangeReason = "FenceAgentSucceeded" + // RemediationFinished - The unhealthy node was fully remediated (it was tainted, fenced by FA and all of its resources have been deleted) + RemediationFinished ProcessingChangeReason = "RemediationFinished" +) + +type ParameterName string +type NodeName string + // FenceAgentsRemediationSpec defines the desired state of FenceAgentsRemediation type FenceAgentsRemediationSpec struct { // Agent is the name of fence agent that will be used @@ -52,6 +70,14 @@ type FenceAgentsRemediationSpec struct { type FenceAgentsRemediationStatus struct { // INSERT ADDITIONAL STATUS FIELD - define observed state of cluster // Important: Run "make" to regenerate code after modifying this file + + // Represents the observations of a FenceAgentsRemediation's current state. + // Known .status.conditions.type are: "Processing", "FenceAgentActionSucceeded", and "Succeeded". + // +listType=map + // +listMapKey=type + //+optional + // +operator-sdk:csv:customresourcedefinitions:type=status,displayName="conditions",xDescriptors="urn:alm:descriptor:io.kubernetes.conditions" + Conditions []metav1.Condition `json:"conditions,omitempty"` } //+kubebuilder:object:root=true diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index c7dfa22e..f9d8dbdb 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -22,6 +22,7 @@ limitations under the License. package v1alpha1 import ( + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" ) @@ -31,7 +32,7 @@ func (in *FenceAgentsRemediation) DeepCopyInto(out *FenceAgentsRemediation) { out.TypeMeta = in.TypeMeta in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) in.Spec.DeepCopyInto(&out.Spec) - out.Status = in.Status + in.Status.DeepCopyInto(&out.Status) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FenceAgentsRemediation. @@ -126,6 +127,13 @@ func (in *FenceAgentsRemediationSpec) DeepCopy() *FenceAgentsRemediationSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *FenceAgentsRemediationStatus) DeepCopyInto(out *FenceAgentsRemediationStatus) { *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FenceAgentsRemediationStatus. diff --git a/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml b/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml index b602965e..4d42c7d5 100644 --- a/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml +++ b/bundle/manifests/fence-agents-remediation.clusterserviceversion.yaml @@ -79,6 +79,14 @@ spec: which node is about to be fenced (i.e., they are common for all the nodes) displayName: Shared Parameters path: sharedparameters + statusDescriptors: + - description: 'Represents the observations of a FenceAgentsRemediation''s current + state. Known .status.conditions.type are: "Processing", "FenceAgentActionSucceeded", + and "Succeeded".' + displayName: conditions + path: conditions + x-descriptors: + - urn:alm:descriptor:io.kubernetes.conditions version: v1alpha1 - description: FenceAgentsRemediationTemplate is the Schema for the fenceagentsremediationtemplates API diff --git a/bundle/manifests/fence-agents-remediation.medik8s.io_fenceagentsremediations.yaml b/bundle/manifests/fence-agents-remediation.medik8s.io_fenceagentsremediations.yaml index 677bfce5..bf2904a3 100644 --- a/bundle/manifests/fence-agents-remediation.medik8s.io_fenceagentsremediations.yaml +++ b/bundle/manifests/fence-agents-remediation.medik8s.io_fenceagentsremediations.yaml @@ -63,6 +63,81 @@ spec: status: description: FenceAgentsRemediationStatus defines the observed state of FenceAgentsRemediation + properties: + conditions: + description: 'Represents the observations of a FenceAgentsRemediation''s + current state. Known .status.conditions.type are: "Processing", + "FenceAgentActionSucceeded", and "Succeeded".' + items: + description: "Condition contains details for one aspect of the current + state of this API Resource. --- This struct is intended for direct + use as an array at the field path .status.conditions. For example, + \n type FooStatus struct{ // Represents the observations of a + foo's current state. // Known .status.conditions.type are: \"Available\", + \"Progressing\", and \"Degraded\" // +patchMergeKey=type // +patchStrategy=merge + // +listType=map // +listMapKey=type Conditions []metav1.Condition + `json:\"conditions,omitempty\" patchStrategy:\"merge\" patchMergeKey:\"type\" + protobuf:\"bytes,1,rep,name=conditions\"` \n // other fields }" + properties: + lastTransitionTime: + description: lastTransitionTime is the last time the condition + transitioned from one status to another. This should be when + the underlying condition changed. If that is not known, then + using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: message is a human readable message indicating + details about the transition. This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: observedGeneration represents the .metadata.generation + that the condition was set based upon. For instance, if .metadata.generation + is currently 12, but the .status.conditions[x].observedGeneration + is 9, the condition is out of date with respect to the current + state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: reason contains a programmatic identifier indicating + the reason for the condition's last transition. Producers + of specific condition types may define expected values and + meanings for this field, and whether the values are considered + a guaranteed API. The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + --- Many .condition.type values are consistent across resources + like Available, but because arbitrary conditions can be useful + (see .node.status.conditions), the ability to deconflict is + important. The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map type: object type: object served: true diff --git a/config/crd/bases/fence-agents-remediation.medik8s.io_fenceagentsremediations.yaml b/config/crd/bases/fence-agents-remediation.medik8s.io_fenceagentsremediations.yaml index bc97bd14..c0cfb7cf 100644 --- a/config/crd/bases/fence-agents-remediation.medik8s.io_fenceagentsremediations.yaml +++ b/config/crd/bases/fence-agents-remediation.medik8s.io_fenceagentsremediations.yaml @@ -61,6 +61,81 @@ spec: status: description: FenceAgentsRemediationStatus defines the observed state of FenceAgentsRemediation + properties: + conditions: + description: 'Represents the observations of a FenceAgentsRemediation''s + current state. Known .status.conditions.type are: "Processing", + "FenceAgentActionSucceeded", and "Succeeded".' + items: + description: "Condition contains details for one aspect of the current + state of this API Resource. --- This struct is intended for direct + use as an array at the field path .status.conditions. For example, + \n type FooStatus struct{ // Represents the observations of a + foo's current state. // Known .status.conditions.type are: \"Available\", + \"Progressing\", and \"Degraded\" // +patchMergeKey=type // +patchStrategy=merge + // +listType=map // +listMapKey=type Conditions []metav1.Condition + `json:\"conditions,omitempty\" patchStrategy:\"merge\" patchMergeKey:\"type\" + protobuf:\"bytes,1,rep,name=conditions\"` \n // other fields }" + properties: + lastTransitionTime: + description: lastTransitionTime is the last time the condition + transitioned from one status to another. This should be when + the underlying condition changed. If that is not known, then + using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: message is a human readable message indicating + details about the transition. This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: observedGeneration represents the .metadata.generation + that the condition was set based upon. For instance, if .metadata.generation + is currently 12, but the .status.conditions[x].observedGeneration + is 9, the condition is out of date with respect to the current + state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: reason contains a programmatic identifier indicating + the reason for the condition's last transition. Producers + of specific condition types may define expected values and + meanings for this field, and whether the values are considered + a guaranteed API. The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + --- Many .condition.type values are consistent across resources + like Available, but because arbitrary conditions can be useful + (see .node.status.conditions), the ability to deconflict is + important. The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map type: object type: object served: true diff --git a/config/manifests/bases/fence-agents-remediation.clusterserviceversion.yaml b/config/manifests/bases/fence-agents-remediation.clusterserviceversion.yaml index 02537490..622ac84a 100644 --- a/config/manifests/bases/fence-agents-remediation.clusterserviceversion.yaml +++ b/config/manifests/bases/fence-agents-remediation.clusterserviceversion.yaml @@ -38,6 +38,14 @@ spec: which node is about to be fenced (i.e., they are common for all the nodes) displayName: Shared Parameters path: sharedparameters + statusDescriptors: + - description: 'Represents the observations of a FenceAgentsRemediation''s current + state. Known .status.conditions.type are: "Processing", "FenceAgentActionSucceeded", + and "Succeeded".' + displayName: conditions + path: conditions + x-descriptors: + - urn:alm:descriptor:io.kubernetes.conditions version: v1alpha1 - description: FenceAgentsRemediationTemplate is the Schema for the fenceagentsremediationtemplates API diff --git a/controllers/fenceagentsremediation_controller.go b/controllers/fenceagentsremediation_controller.go index c4c1b5df..81273cda 100644 --- a/controllers/fenceagentsremediation_controller.go +++ b/controllers/fenceagentsremediation_controller.go @@ -20,12 +20,17 @@ import ( "context" "errors" "fmt" + "time" "github.com/go-logr/logr" commonAnnotations "github.com/medik8s/common/pkg/annotations" + commonConditions "github.com/medik8s/common/pkg/conditions" apiErrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + utilErrors "k8s.io/apimachinery/pkg/util/errors" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -39,9 +44,10 @@ const ( // errors errorMissingParams = "nodeParameters or sharedParameters or both are missing, and they cannot be empty" errorMissingNodeParams = "node parameter is required, and cannot be empty" - SuccessFAResponse = "Success: Rebooted" - parameterActionName = "--action" - parameterActionValue = "reboot" + + SuccessFAResponse = "Success: Rebooted" + parameterActionName = "--action" + parameterActionValue = "reboot" ) // FenceAgentsRemediationReconciler reconciles a FenceAgentsRemediation object @@ -76,7 +82,7 @@ func (r *FenceAgentsRemediationReconciler) SetupWithManager(mgr ctrl.Manager) er // // For more details, check Reconcile and its Result here: // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.11.2/pkg/reconcile -func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ctrl.Request) (finalResult ctrl.Result, finalErr error) { r.Log.Info("Begin FenceAgentsRemediation Reconcile") defer r.Log.Info("Finish FenceAgentsRemediation Reconcile") emptyResult := ctrl.Result{} @@ -93,6 +99,17 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct r.Log.Error(err, "Failed to get FenceAgentsRemediation CR") return emptyResult, err } + + // At the end of each Reconcile try to update CR's status + defer func() { + if updateErr := r.updateStatus(ctx, far); updateErr != nil { + if !apiErrors.IsConflict(updateErr) { + finalErr = utilErrors.NewAggregate([]error{updateErr, finalErr}) + } + finalResult.RequeueAfter = time.Second + } + }() + // Validate FAR CR name to match a nodeName from the cluster r.Log.Info("Check FAR CR's name") valid, err := utils.IsNodeNameValid(r.Client, req.Name) @@ -119,12 +136,23 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct return emptyResult, fmt.Errorf("failed to add finalizer to the CR - %w", err) } r.Log.Info("Finalizer was added", "CR Name", req.Name) - return emptyResult, nil - // TODO: should return Requeue: true when the CR has a status and not end reconcile with empty result when a finalizer has been added - //return ctrl.Result{Requeue: true}, nil + + if err := updateConditions(v1alpha1.RemediationStarted, &far.Status.Conditions, r.Log); err != nil { + return emptyResult, err + } + return ctrl.Result{Requeue: true}, nil } else if controllerutil.ContainsFinalizer(far, v1alpha1.FARFinalizer) && !far.ObjectMeta.DeletionTimestamp.IsZero() { // Delete CR only when a finalizer and DeletionTimestamp are set r.Log.Info("CR's deletion timestamp is not zero, and FAR finalizer exists", "CR Name", req.Name) + + if !meta.IsStatusConditionPresentAndEqual(far.Status.Conditions, commonConditions.SucceededType, metav1.ConditionTrue) { + processingCondition := meta.FindStatusCondition(far.Status.Conditions, commonConditions.ProcessingType).Status + fenceAgentActionSucceededCondition := meta.FindStatusCondition(far.Status.Conditions, v1alpha1.FenceAgentActionSucceededType).Status + succeededCondition := meta.FindStatusCondition(far.Status.Conditions, commonConditions.SucceededType).Status + r.Log.Info("FAR didn't finish remediate the node ", "CR Name", req.Name, "processing condition", processingCondition, + "fenceAgentActionSucceeded condition", fenceAgentActionSucceededCondition, "succeeded condition", succeededCondition) + } + // remove node's taints if err := utils.RemoveTaint(r.Client, far.Name); err != nil && !apiErrors.IsNotFound(err) { return emptyResult, err @@ -137,50 +165,69 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct r.Log.Info("Finalizer was removed", "CR Name", req.Name) return emptyResult, nil } - - // Fetch the FAR's pod - r.Log.Info("Fetch FAR's pod") - pod, err := utils.GetFenceAgentsRemediationPod(r.Client) - if err != nil { - r.Log.Error(err, "Can't find FAR's pod by its label", "CR's Name", req.Name) - return emptyResult, err - } - //TODO: Check that FA is excutable? run cli.IsExecuteable - - // Build FA parameters - r.Log.Info("Combine fence agent parameters", "Fence Agent", far.Spec.Agent, "Node Name", req.Name) - faParams, err := buildFenceAgentParams(far) - if err != nil { - r.Log.Error(err, "Invalid sharedParameters/nodeParameters from CR - edit/recreate the CR", "CR's Name", req.Name) - return emptyResult, nil - } - // Add medik8s remediation taint - r.Log.Info("Add Medik8s remediation taint", "Fence Agent", far.Spec.Agent, "Node Name", req.Name) + // Add FAR (medik8s) remediation taint + r.Log.Info("Try adding FAR (Medik8s) remediation taint", "Fence Agent", far.Spec.Agent, "Node Name", req.Name) if err := utils.AppendTaint(r.Client, far.Name); err != nil { return emptyResult, err } - cmd := append([]string{far.Spec.Agent}, faParams...) - // The Fence Agent is excutable and the parameters structure are valid, but we don't check their values - r.Log.Info("Execute the fence agent", "Fence Agent", far.Spec.Agent, "Node Name", req.Name) - outputRes, outputErr, err := r.Executor.Execute(pod, cmd) - if err != nil { - // response was a failure message - r.Log.Error(err, "Fence Agent response was a failure", "CR's Name", req.Name) - return emptyResult, err - } - if outputErr != "" || outputRes != SuccessFAResponse+"\n" { - // response wasn't failure or sucesss message - err := fmt.Errorf("unknown fence agent response - expecting `%s` response, but we received `%s`", SuccessFAResponse, outputRes) - r.Log.Error(err, "Fence Agent response wasn't a success message", "CR's Name", req.Name) - return emptyResult, err + + if meta.IsStatusConditionTrue(far.Status.Conditions, commonConditions.ProcessingType) && + !meta.IsStatusConditionTrue(far.Status.Conditions, v1alpha1.FenceAgentActionSucceededType) { + // The remeditation has already been processed, thus we can begin with exuecting the FA for the node + // We run the FA until its action (reboot) was succeeded, and we verify it with the fenceAgentActionSucceeded condition + + // Fetch the FAR's pod + r.Log.Info("Fetch FAR's pod") + pod, err := utils.GetFenceAgentsRemediationPod(r.Client) + if err != nil { + r.Log.Error(err, "Can't find FAR's pod by its label", "CR's Name", req.Name) + return emptyResult, err + } + //TODO: Check that FA is excutable? run cli.IsExecuteable + + // Build FA parameters + r.Log.Info("Combine fence agent parameters", "Fence Agent", far.Spec.Agent, "Node Name", req.Name) + faParams, err := buildFenceAgentParams(far) + if err != nil { + r.Log.Error(err, "Invalid sharedParameters/nodeParameters from CR - edit/recreate the CR", "CR's Name", req.Name) + return emptyResult, nil + } + + cmd := append([]string{far.Spec.Agent}, faParams...) + // The Fence Agent is excutable and the parameters structure are valid, but we don't check their values + r.Log.Info("Execute the fence agent", "Fence Agent", far.Spec.Agent, "Node Name", req.Name) + outputRes, outputErr, err := r.Executor.Execute(pod, cmd) + if err != nil { + // response was a failure message + r.Log.Error(err, "Fence Agent response was a failure", "CR's Name", req.Name) + return emptyResult, err + } + if outputErr != "" || outputRes != SuccessFAResponse+"\n" { + // response wasn't failure or sucesss message + err := fmt.Errorf("unknown fence agent response - expecting `%s` response, but we received `%s`", SuccessFAResponse, outputRes) + r.Log.Error(err, "Fence Agent response wasn't a success message", "CR's Name", req.Name) + return emptyResult, err + } + + r.Log.Info("Fence Agent command was finished successfully", "Fence Agent", far.Spec.Agent, "Node name", req.Name, "Response", SuccessFAResponse) + if err := updateConditions(v1alpha1.FenceAgentSucceeded, &far.Status.Conditions, r.Log); err != nil { + return emptyResult, err + } + return ctrl.Result{Requeue: true}, nil } - r.Log.Info("Fence Agent command was finished successfully", "Fence Agent", far.Spec.Agent, "Node name", req.Name, "Response", SuccessFAResponse) + if meta.IsStatusConditionTrue(far.Status.Conditions, v1alpha1.FenceAgentActionSucceededType) && + !meta.IsStatusConditionTrue(far.Status.Conditions, commonConditions.SucceededType) { + // Fence agent action succeeded, and now we try to remove workloads (pods and their volume attachments) + r.Log.Info("Manual workload deletion", "Fence Agent", far.Spec.Agent, "Node Name", req.Name) + if err := utils.DeleteResources(ctx, r.Client, req.Name); err != nil { + r.Log.Error(err, "Manual workload deletion has failed", "CR's Name", req.Name) + return emptyResult, err + } - // Reboot was finished and now we remove workloads (pods and their VA) - r.Log.Info("Manual workload deletion", "Fence Agent", far.Spec.Agent, "Node Name", req.Name) - if err := utils.DeleteResources(ctx, r.Client, req.Name); err != nil { - r.Log.Error(err, "Manual workload deletion has failed", "CR's Name", req.Name) - return emptyResult, err + if err := updateConditions(v1alpha1.RemediationFinished, &far.Status.Conditions, r.Log); err != nil { + return emptyResult, err + } + r.Log.Info("FenceAgentsRemediation CR has completed to remediate the node", "Node Name", req.Name) } return emptyResult, nil @@ -195,6 +242,95 @@ func isTimedOutByNHC(far *v1alpha1.FenceAgentsRemediation) bool { return false } +// updateStatus updates the CR status, and returns an error if it fails +func (r *FenceAgentsRemediationReconciler) updateStatus(ctx context.Context, far *v1alpha1.FenceAgentsRemediation) error { + if err := r.Client.Status().Update(ctx, far); err != nil { + if !apiErrors.IsConflict(err) { + r.Log.Error(err, "failed to update far status") + } + return err + } + return nil +} + +// updateConditions updates the status conditions of a FenceAgentsRemediation object based on the provided ProcessingChangeReason. +// return an error if an unknown ProcessingChangeReason is provided +func updateConditions(reason v1alpha1.ProcessingChangeReason, currentConditions *[]metav1.Condition, log logr.Logger) error { + + var ( + processingConditionStatus, fenceAgentActionSucceededConditionStatus, succeededConditionStatus metav1.ConditionStatus + conditionMessage string + ) + // All the ProcessingChangeReasons can happen one after another + // RemediationStarted will always be the first reason. + // FenceAgentSucceeded can only happen after RemediationStarted happened + // RemediationFinished can only happen after FenceAgentSucceeded happened + + switch reason { + case v1alpha1.RemediationStarted: + processingConditionStatus = metav1.ConditionTrue + fenceAgentActionSucceededConditionStatus = metav1.ConditionUnknown + succeededConditionStatus = metav1.ConditionUnknown + conditionMessage = v1alpha1.RemediationStartedConditionMessage + case v1alpha1.FenceAgentSucceeded: + fenceAgentActionSucceededConditionStatus = metav1.ConditionTrue + conditionMessage = v1alpha1.FenceAgentSucceededConditionMessage + case v1alpha1.RemediationFinished: + processingConditionStatus = metav1.ConditionFalse + succeededConditionStatus = metav1.ConditionTrue + conditionMessage = v1alpha1.RemediationFinishedConditionMessage + default: + err := fmt.Errorf("unknown processingChangeReason:%s", reason) + log.Error(err, "couldn't update FAR Status Conditions") + return err + } + + // if ProcessingType is already false, then it cannot be changed to true again + if processingConditionStatus == metav1.ConditionTrue && + meta.IsStatusConditionFalse(*currentConditions, commonConditions.ProcessingType) { + return nil + } + + // if FenceAgentActionSucceededType is already false, then it cannot be changed to true again + if fenceAgentActionSucceededConditionStatus == metav1.ConditionTrue && + meta.IsStatusConditionFalse(*currentConditions, v1alpha1.FenceAgentActionSucceededType) { + return nil + } + + log.Info("updating Status Condition", "processingConditionStatus", processingConditionStatus, "fenceAgentActionSucceededConditionStatus", fenceAgentActionSucceededConditionStatus, "succededConditionStatus", succeededConditionStatus, "reason", string(reason)) + // if the requested Status.Conditions.Processing is different then the current one, then update Status.Conditions.Processing value + if processingConditionStatus != "" && !meta.IsStatusConditionPresentAndEqual(*currentConditions, commonConditions.ProcessingType, processingConditionStatus) { + meta.SetStatusCondition(currentConditions, metav1.Condition{ + Type: commonConditions.ProcessingType, + Status: processingConditionStatus, + Reason: string(reason), + Message: conditionMessage, + }) + } + + // if the requested Status.Conditions.FenceAgentActionSucceeded is different then the current one, then update Status.Conditions.FenceAgentActionSucceeded value + if fenceAgentActionSucceededConditionStatus != "" && !meta.IsStatusConditionPresentAndEqual(*currentConditions, v1alpha1.FenceAgentActionSucceededType, fenceAgentActionSucceededConditionStatus) { + meta.SetStatusCondition(currentConditions, metav1.Condition{ + Type: v1alpha1.FenceAgentActionSucceededType, + Status: fenceAgentActionSucceededConditionStatus, + Reason: string(reason), + Message: conditionMessage, + }) + } + + // if the requested Status.Conditions.Succeeded is different then the current one, then update Status.Conditions.Succeeded value + if succeededConditionStatus != "" && !meta.IsStatusConditionPresentAndEqual(*currentConditions, commonConditions.SucceededType, succeededConditionStatus) { + meta.SetStatusCondition(currentConditions, metav1.Condition{ + Type: commonConditions.SucceededType, + Status: succeededConditionStatus, + Reason: string(reason), + Message: conditionMessage, + }) + } + + return nil +} + // buildFenceAgentParams collects the FAR's parameters for the node based on FAR CR, and if the CR is missing parameters // or the CR's name don't match nodeParamter name or it has an action which is different than reboot, then return an error func buildFenceAgentParams(far *v1alpha1.FenceAgentsRemediation) ([]string, error) { diff --git a/controllers/fenceagentsremediation_controller_test.go b/controllers/fenceagentsremediation_controller_test.go index 43387093..46c2aedc 100644 --- a/controllers/fenceagentsremediation_controller_test.go +++ b/controllers/fenceagentsremediation_controller_test.go @@ -23,12 +23,14 @@ import ( "time" "github.com/go-logr/logr" + commonConditions "github.com/medik8s/common/pkg/conditions" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" storagev1 "k8s.io/api/storage/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -46,6 +48,10 @@ const ( testPodName = "far-pod-test-1" vaName1 = "va-test-1" vaName2 = "va-test-2" + + // intervals + timeoutDeletion = 10 * time.Second // this timeout is used after all the other steps have finished successfully + pollInterval = 250 * time.Millisecond ) var ( @@ -158,6 +164,11 @@ var _ = Describe("FAR Controller", func() { testVADeletion(vaName1, resourceDeletionWasTriggered) testVADeletion(vaName2, resourceDeletionWasTriggered) testPodDeletion(testPodName, resourceDeletionWasTriggered) + + By("Having Succeed condition set to true") + verifyExpectedStatusConditionError(underTestFAR, commonConditions.ProcessingType, utils.ConditionSetAndMatchSuccess, metav1.ConditionFalse) + verifyExpectedStatusConditionError(underTestFAR, v1alpha1.FenceAgentActionSucceededType, utils.ConditionSetAndMatchSuccess, metav1.ConditionTrue) + verifyExpectedStatusConditionError(underTestFAR, commonConditions.SucceededType, utils.ConditionSetAndMatchSuccess, metav1.ConditionTrue) }) }) When("creating invalid FAR CR Name", func() { @@ -186,6 +197,11 @@ var _ = Describe("FAR Controller", func() { testVADeletion(vaName1, resourceDeletionWasTriggered) testVADeletion(vaName2, resourceDeletionWasTriggered) testPodDeletion(testPodName, resourceDeletionWasTriggered) + + By("Not having any condition set") + verifyExpectedStatusConditionError(underTestFAR, commonConditions.ProcessingType, utils.ConditionNotSetError, metav1.ConditionUnknown) + verifyExpectedStatusConditionError(underTestFAR, v1alpha1.FenceAgentActionSucceededType, utils.ConditionNotSetError, metav1.ConditionUnknown) + verifyExpectedStatusConditionError(underTestFAR, commonConditions.SucceededType, utils.ConditionNotSetError, metav1.ConditionUnknown) }) }) }) @@ -294,6 +310,8 @@ func cliCommandsEquality(far *v1alpha1.FenceAgentsRemediation) (bool, error) { return isEqualStringLists(mocksExecuter.command, expectedCommand), nil } +// TODO: Think about using Generics for the next two functions + // testVADeletion tests whether the volume attachment no longer exist for successful FAR CR // and consistently check if the volume attachment exist and was not deleted func testVADeletion(vaName string, resourceDeletionWasTriggered bool) { @@ -307,7 +325,7 @@ func testVADeletion(vaName string, resourceDeletionWasTriggered bool) { err := k8sClient.Get(context.Background(), vaKey, va) return apierrors.IsNotFound(err) - }, 5*time.Second, 250*time.Millisecond).Should(BeTrue()) + }, timeoutDeletion, pollInterval).Should(BeTrue()) log.Info("Volume attachment is no longer exist", "va", vaName) } else { ConsistentlyWithOffset(1, func() bool { @@ -315,7 +333,7 @@ func testVADeletion(vaName string, resourceDeletionWasTriggered bool) { err := k8sClient.Get(context.Background(), vaKey, va) return apierrors.IsNotFound(err) - }, 5*time.Second, 250*time.Millisecond).Should(BeFalse()) + }, timeoutDeletion, pollInterval).Should(BeFalse()) log.Info("Volume attachment exist", "va", vaName) } } @@ -333,7 +351,7 @@ func testPodDeletion(podName string, resourceDeletionWasTriggered bool) { err := k8sClient.Get(context.Background(), podKey, pod) return apierrors.IsNotFound(err) - }, 5*time.Second, 250*time.Millisecond).Should(BeTrue()) + }, timeoutDeletion, pollInterval).Should(BeTrue()) log.Info("Pod is no longer exist", "pod", podName) } else { ConsistentlyWithOffset(1, func() bool { @@ -341,11 +359,29 @@ func testPodDeletion(podName string, resourceDeletionWasTriggered bool) { err := k8sClient.Get(context.Background(), podKey, pod) return apierrors.IsNotFound(err) - }, 5*time.Second, 250*time.Millisecond).Should(BeFalse()) + }, timeoutDeletion, pollInterval).Should(BeFalse()) log.Info("Pod exist", "pod", podName) } } +// verifyExpectedStatusConditionState checks whether the status condition state matches the expectedResult +func verifyExpectedStatusConditionError(testFAR *v1alpha1.FenceAgentsRemediation, conditionType, expectedError string, conditionStatus metav1.ConditionStatus) { + far := &v1alpha1.FenceAgentsRemediation{} + Eventually(func() string { + if err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(testFAR), far); err != nil { + return utils.NoFenceAgentsRemediationCRFound + } + if gotCondition := meta.FindStatusCondition(far.Status.Conditions, conditionType); gotCondition == nil { + return utils.ConditionNotSetError + } + if meta.IsStatusConditionPresentAndEqual(far.Status.Conditions, conditionType, conditionStatus) { + return utils.ConditionSetAndMatchSuccess + } + return utils.ConditionSetButNoMatchError + + }, timeoutDeletion, pollInterval).Should(Equal(expectedError), "'%v' status condition was expected to be %v", conditionType, conditionStatus) +} + // Implements Execute function to mock/test Execute of FenceAgentsRemediationReconciler type mockExecuter struct { command []string diff --git a/pkg/utils/errors.go b/pkg/utils/errors.go new file mode 100644 index 00000000..8a23ad22 --- /dev/null +++ b/pkg/utils/errors.go @@ -0,0 +1,9 @@ +package utils + +const ( + // condition status error messages + NoFenceAgentsRemediationCRFound = "noFenceAgentsRemediationCRFound" + ConditionNotSetError = "ConditionNotSet" + ConditionSetButNoMatchError = "ConditionSetButNoMatch" + ConditionSetAndMatchSuccess = "ConditionSetAndMatchSuccess" +) diff --git a/test/e2e/far_e2e_test.go b/test/e2e/far_e2e_test.go index 0a41d267..aaf7afe4 100644 --- a/test/e2e/far_e2e_test.go +++ b/test/e2e/far_e2e_test.go @@ -6,6 +6,7 @@ import ( "math/rand" "time" + commonConditions "github.com/medik8s/common/pkg/conditions" medik8sLabels "github.com/medik8s/common/pkg/labels" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -13,6 +14,7 @@ import ( corev1 "k8s.io/api/core/v1" storagev1 "k8s.io/api/storage/v1" apiErrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/selection" @@ -422,6 +424,25 @@ func checkPodDeleted(pod *corev1.Pod) { log.Info("Pod has already been deleted", "pod name", pod.Name) } +// verifyExpectedStatusConditionState checks whether the status condition state matches the expectedResult +func verifyExpectedStatusConditionError(nodeName, conditionType, expectedError string, conditionStatus metav1.ConditionStatus) { + far := &v1alpha1.FenceAgentsRemediation{} + farNamespacedName := client.ObjectKey{Name: nodeName, Namespace: operatorNsName} + Eventually(func() string { + if err := k8sClient.Get(context.Background(), farNamespacedName, far); err != nil { + return utils.NoFenceAgentsRemediationCRFound + } + if gotCondition := meta.FindStatusCondition(far.Status.Conditions, conditionType); gotCondition == nil { + return utils.ConditionNotSetError + } + if meta.IsStatusConditionPresentAndEqual(far.Status.Conditions, conditionType, conditionStatus) { + return utils.ConditionSetAndMatchSuccess + } + return utils.ConditionSetButNoMatchError + + }, timeoutDeletion, pollInterval).Should(Equal(expectedError), "'%v' status condition was expected to be %v", conditionType, conditionStatus) +} + // checkRemediation verify whether the node was remediated func checkRemediation(nodeName string, nodeBootTimeBefore time.Time, oldPodCreationTime time.Time, va *storagev1.VolumeAttachment, pod *corev1.Pod) { By("Check if FAR NoExecute taint was added") @@ -439,4 +460,10 @@ func checkRemediation(nodeName string, nodeBootTimeBefore time.Time, oldPodCreat By("checking if old pod has been deleted") checkPodDeleted(pod) + + By("checking if the status conditions match a successful remediation") + verifyExpectedStatusConditionError(nodeName, commonConditions.ProcessingType, utils.ConditionSetAndMatchSuccess, metav1.ConditionFalse) + verifyExpectedStatusConditionError(nodeName, v1alpha1.FenceAgentActionSucceededType, utils.ConditionSetAndMatchSuccess, metav1.ConditionTrue) + verifyExpectedStatusConditionError(nodeName, commonConditions.SucceededType, utils.ConditionSetAndMatchSuccess, metav1.ConditionTrue) + } diff --git a/vendor/github.com/medik8s/common/pkg/conditions/conditions.go b/vendor/github.com/medik8s/common/pkg/conditions/conditions.go new file mode 100644 index 00000000..fc84e504 --- /dev/null +++ b/vendor/github.com/medik8s/common/pkg/conditions/conditions.go @@ -0,0 +1,13 @@ +package conditions + +const ( + + // These are condition types used on remediation CRs + + // ProcessingType is the condition type used to signal the remediation has started and it is in progress, or has finished + ProcessingType = "Processing" + // SucceededType is the condition type used to signal whether the remediation was successful or not + SucceededType = "Succeeded" + // PermanentNodeDeletionExpectedType is the condition type used to signal that the unhealthy node will be permanently deleted. + PermanentNodeDeletionExpectedType = "PermanentNodeDeletionExpected" +) diff --git a/vendor/modules.txt b/vendor/modules.txt index 6b199f8f..3e829182 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -95,6 +95,7 @@ github.com/matttproud/golang_protobuf_extensions/pbutil # github.com/medik8s/common v1.7.0 ## explicit; go 1.20 github.com/medik8s/common/pkg/annotations +github.com/medik8s/common/pkg/conditions github.com/medik8s/common/pkg/labels # github.com/moby/spdystream v0.2.0 ## explicit; go 1.13