diff --git a/api/v1alpha1/fenceagentsremediation_types.go b/api/v1alpha1/fenceagentsremediation_types.go index 9f3f9cec..8486b9fa 100644 --- a/api/v1alpha1/fenceagentsremediation_types.go +++ b/api/v1alpha1/fenceagentsremediation_types.go @@ -31,21 +31,27 @@ const ( // FenceAgentActionSucceededType is the condition type used to signal whether the Fence Agent action was succeeded successfully or not FenceAgentActionSucceededType = "FenceAgentActionSucceeded" // condition messages - RemediationStartedConditionMessage = "FAR CR was found, the CR name matches one of the cluster nodes, and a finalizer was set" - FenceAgentSucceededConditionMessage = "FAR taint was added, fence agent command has been created and executed successfully" - RemediationFinishedConditionMessage = "The unhealthy node was fully remediated (it was tainted, fenced using FA and all the node resources have been deleted)" + RemediationFinishedNodeNotFoundConditionMessage = "Node Healthcheck timeout annotation has been set" + RemediationInterruptedByNHCConditionMessage = "FAR CR name doesn't match a node name" + RemediationStartedConditionMessage = "FAR CR was found, its name matches one of the cluster nodes, and a finalizer was set to the CR" + FenceAgentSucceededConditionMessage = "FAR taint was added, fence agent command has been created and executed successfully" + RemediationFinishedSuccessfullyConditionMessage = "The unhealthy node was fully remediated (it was tainted, fenced using FA and all the node resources have been deleted)" ) -// ProcessingChangeReason represents the reason of updating the processing condition -type ProcessingChangeReason string +// ConditionsChangeReason represents the reason of updating the some or all the conditions +type ConditionsChangeReason string const ( + // RemediationFinishedNodeNotFound - CR was found but its name doesn't matche a node + RemediationFinishedNodeNotFound ConditionsChangeReason = "RemediationFinishedNodeNotFound" + // RemediationInterruptedByNHC - Remediation was interrupted by NHC timeout annotation + RemediationInterruptedByNHC ConditionsChangeReason = "RemediationInterruptedByNHC" // RemediationStarted - CR was found, its name matches a node, and a finalizer was set - RemediationStarted ProcessingChangeReason = "RemediationStarted" + RemediationStarted ConditionsChangeReason = "RemediationStarted" // FenceAgentSucceeded - FAR taint was added, fence agent command has been created and executed successfully - FenceAgentSucceeded ProcessingChangeReason = "FenceAgentSucceeded" - // RemediationFinished - The unhealthy node was fully remediated (it was tainted, fenced by FA and all of its resources have been deleted) - RemediationFinished ProcessingChangeReason = "RemediationFinished" + FenceAgentSucceeded ConditionsChangeReason = "FenceAgentSucceeded" + // RemediationFinishedSuccessfully - The unhealthy node was fully remediated/fenced (it was tainted, fenced by FA and all of its resources have been deleted) + RemediationFinishedSuccessfully ConditionsChangeReason = "RemediationFinishedSuccessfully" ) type ParameterName string diff --git a/controllers/fenceagentsremediation_controller.go b/controllers/fenceagentsremediation_controller.go index 81273cda..10aaf754 100644 --- a/controllers/fenceagentsremediation_controller.go +++ b/controllers/fenceagentsremediation_controller.go @@ -119,14 +119,15 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct } if !valid { r.Log.Error(err, "Didn't find a node matching the CR's name", "CR's Name", req.Name) - return emptyResult, nil + err := updateConditions(v1alpha1.RemediationFinishedNodeNotFound, &far.Status.Conditions, r.Log) + return emptyResult, err } // Check NHC timeout annotation if isTimedOutByNHC(far) { r.Log.Info("FAR remediation was stopped by Node Healthcheck Operator") - // TODO: update status and return its error - return emptyResult, nil + err := updateConditions(v1alpha1.RemediationInterruptedByNHC, &far.Status.Conditions, r.Log) + return emptyResult, err } // Add finalizer when the CR is created @@ -224,7 +225,7 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct return emptyResult, err } - if err := updateConditions(v1alpha1.RemediationFinished, &far.Status.Conditions, r.Log); err != nil { + if err := updateConditions(v1alpha1.RemediationFinishedSuccessfully, &far.Status.Conditions, r.Log); err != nil { return emptyResult, err } r.Log.Info("FenceAgentsRemediation CR has completed to remediate the node", "Node Name", req.Name) @@ -253,20 +254,36 @@ func (r *FenceAgentsRemediationReconciler) updateStatus(ctx context.Context, far return nil } -// updateConditions updates the status conditions of a FenceAgentsRemediation object based on the provided ProcessingChangeReason. -// return an error if an unknown ProcessingChangeReason is provided -func updateConditions(reason v1alpha1.ProcessingChangeReason, currentConditions *[]metav1.Condition, log logr.Logger) error { +// updateConditions updates the status conditions of a FenceAgentsRemediation object based on the provided ConditionsChangeReason. +// return an error if an unknown ConditionsChangeReason is provided +func updateConditions(reason v1alpha1.ConditionsChangeReason, currentConditions *[]metav1.Condition, log logr.Logger) error { var ( processingConditionStatus, fenceAgentActionSucceededConditionStatus, succeededConditionStatus metav1.ConditionStatus conditionMessage string ) - // All the ProcessingChangeReasons can happen one after another - // RemediationStarted will always be the first reason. + // RemediationFinishedNodeNotFound and RemediationInterruptedByNHC reasons can happen at any time the Reconcile runs + // Except these two reasons, there are another three reasons that can only happen one after another + // RemediationStarted will always be the first reason (out of these three) // FenceAgentSucceeded can only happen after RemediationStarted happened - // RemediationFinished can only happen after FenceAgentSucceeded happened + // RemediationFinishedSuccessfully can only happen after FenceAgentSucceeded happened switch reason { + case v1alpha1.RemediationFinishedNodeNotFound, v1alpha1.RemediationInterruptedByNHC: + processingConditionStatus = metav1.ConditionFalse + fenceAgentActionSucceededConditionStatus = metav1.ConditionFalse + succeededConditionStatus = metav1.ConditionFalse + // Different reasons share the same effect to the conditions, but they have different message + switch reason { + case v1alpha1.RemediationFinishedNodeNotFound: + conditionMessage = v1alpha1.RemediationFinishedNodeNotFoundConditionMessage + case v1alpha1.RemediationInterruptedByNHC: + conditionMessage = v1alpha1.RemediationInterruptedByNHCConditionMessage + default: + err := fmt.Errorf("unknown ConditionsChangeReason:%s", reason) + log.Error(err, "couldn't update FAR Status Conditions") + return err + } case v1alpha1.RemediationStarted: processingConditionStatus = metav1.ConditionTrue fenceAgentActionSucceededConditionStatus = metav1.ConditionUnknown @@ -275,28 +292,16 @@ func updateConditions(reason v1alpha1.ProcessingChangeReason, currentConditions case v1alpha1.FenceAgentSucceeded: fenceAgentActionSucceededConditionStatus = metav1.ConditionTrue conditionMessage = v1alpha1.FenceAgentSucceededConditionMessage - case v1alpha1.RemediationFinished: + case v1alpha1.RemediationFinishedSuccessfully: processingConditionStatus = metav1.ConditionFalse succeededConditionStatus = metav1.ConditionTrue - conditionMessage = v1alpha1.RemediationFinishedConditionMessage + conditionMessage = v1alpha1.RemediationFinishedSuccessfullyConditionMessage default: - err := fmt.Errorf("unknown processingChangeReason:%s", reason) + err := fmt.Errorf("unknown ConditionsChangeReason:%s", reason) log.Error(err, "couldn't update FAR Status Conditions") return err } - // if ProcessingType is already false, then it cannot be changed to true again - if processingConditionStatus == metav1.ConditionTrue && - meta.IsStatusConditionFalse(*currentConditions, commonConditions.ProcessingType) { - return nil - } - - // if FenceAgentActionSucceededType is already false, then it cannot be changed to true again - if fenceAgentActionSucceededConditionStatus == metav1.ConditionTrue && - meta.IsStatusConditionFalse(*currentConditions, v1alpha1.FenceAgentActionSucceededType) { - return nil - } - log.Info("updating Status Condition", "processingConditionStatus", processingConditionStatus, "fenceAgentActionSucceededConditionStatus", fenceAgentActionSucceededConditionStatus, "succededConditionStatus", succeededConditionStatus, "reason", string(reason)) // if the requested Status.Conditions.Processing is different then the current one, then update Status.Conditions.Processing value if processingConditionStatus != "" && !meta.IsStatusConditionPresentAndEqual(*currentConditions, commonConditions.ProcessingType, processingConditionStatus) { diff --git a/controllers/fenceagentsremediation_controller_test.go b/controllers/fenceagentsremediation_controller_test.go index 8b90b815..51e0f67a 100644 --- a/controllers/fenceagentsremediation_controller_test.go +++ b/controllers/fenceagentsremediation_controller_test.go @@ -125,6 +125,7 @@ var _ = Describe("FAR Controller", func() { farNamespacedName := client.ObjectKey{Name: workerNode, Namespace: defaultNamespace} farNoExecuteTaint := utils.CreateFARNoExecuteTaint() resourceDeletionWasTriggered := true // corresponds to testVADeletion bool value + conditionStatusPointer := func(status metav1.ConditionStatus) *metav1.ConditionStatus { return &status } BeforeEach(func() { // Create two VAs and two pods, and at the end clean them up with DeferCleanup va1 := createVA(vaName1, workerNode) @@ -165,8 +166,7 @@ var _ = Describe("FAR Controller", func() { testVADeletion(vaName2, resourceDeletionWasTriggered) testPodDeletion(testPodName, resourceDeletionWasTriggered) - By("Having Succeed condition set to true") - conditionStatusPointer := func(status metav1.ConditionStatus) *metav1.ConditionStatus { return &status } + By("Having Succeeded, FenceAgentActionSucceeded conditions set to true, and Processing set to false") verifyStatusCondition(workerNode, commonConditions.ProcessingType, conditionStatusPointer(metav1.ConditionFalse)) verifyStatusCondition(workerNode, v1alpha1.FenceAgentActionSucceededType, conditionStatusPointer(metav1.ConditionTrue)) verifyStatusCondition(workerNode, commonConditions.SucceededType, conditionStatusPointer(metav1.ConditionTrue)) @@ -199,10 +199,10 @@ var _ = Describe("FAR Controller", func() { testVADeletion(vaName2, resourceDeletionWasTriggered) testPodDeletion(testPodName, resourceDeletionWasTriggered) - By("Not having any condition set") - verifyStatusCondition(dummyNode, commonConditions.ProcessingType, nil) - verifyStatusCondition(dummyNode, v1alpha1.FenceAgentActionSucceededType, nil) - verifyStatusCondition(dummyNode, commonConditions.SucceededType, nil) + By("Having all three conditions set to false") + verifyStatusCondition(dummyNode, commonConditions.ProcessingType, conditionStatusPointer(metav1.ConditionFalse)) + verifyStatusCondition(dummyNode, v1alpha1.FenceAgentActionSucceededType, conditionStatusPointer(metav1.ConditionFalse)) + verifyStatusCondition(dummyNode, commonConditions.SucceededType, conditionStatusPointer(metav1.ConditionFalse)) }) }) })