From 9126ce453937283ea37547af53a20783980e8108 Mon Sep 17 00:00:00 2001 From: razo7 Date: Mon, 14 Aug 2023 17:25:53 +0300 Subject: [PATCH 1/5] Add two reasons for updating and stoping remediation When either FAR CR name doesn't match a node name or Node Healthcheck timeout annotation has been set we update the CR three conditions to false --- api/v1alpha1/fenceagentsremediation_types.go | 23 ++++++++----- .../fenceagentsremediation_controller.go | 34 ++++++++++++------- .../fenceagentsremediation_controller_test.go | 8 ++--- 3 files changed, 39 insertions(+), 26 deletions(-) diff --git a/api/v1alpha1/fenceagentsremediation_types.go b/api/v1alpha1/fenceagentsremediation_types.go index 9f3f9cec..1d785954 100644 --- a/api/v1alpha1/fenceagentsremediation_types.go +++ b/api/v1alpha1/fenceagentsremediation_types.go @@ -31,21 +31,26 @@ const ( // FenceAgentActionSucceededType is the condition type used to signal whether the Fence Agent action was succeeded successfully or not FenceAgentActionSucceededType = "FenceAgentActionSucceeded" // condition messages - RemediationStartedConditionMessage = "FAR CR was found, the CR name matches one of the cluster nodes, and a finalizer was set" - FenceAgentSucceededConditionMessage = "FAR taint was added, fence agent command has been created and executed successfully" - RemediationFinishedConditionMessage = "The unhealthy node was fully remediated (it was tainted, fenced using FA and all the node resources have been deleted)" + RemediationInterruptedConditionMessage = "FAR CR name doesn't match a node name or Node Healthcheck timeout annotation has been set" + RemediationStartedConditionMessage = "FAR CR was found, its name matches one of the cluster nodes, and a finalizer was set to the CR" + FenceAgentSucceededConditionMessage = "FAR taint was added, fence agent command has been created and executed successfully" + RemediationFinishedSuccessfullyConditionMessage = "The unhealthy node was fully remediated (it was tainted, fenced using FA and all the node resources have been deleted)" ) -// ProcessingChangeReason represents the reason of updating the processing condition -type ProcessingChangeReason string +// ConditionsChangeReason represents the reason of updating the some or all the conditions +type ConditionsChangeReason string const ( + // RemediationFinishedNodeNotFound - CR was found but its name doesn't matche a node + RemediationFinishedNodeNotFound ConditionsChangeReason = "RemediationFinishedNodeNotFound" + // RemediationStoppedByNHC - Remediation was interrupted by NHC timeout annotation + RemediationStoppedByNHC ConditionsChangeReason = "RemediationStoppedByNHC" // RemediationStarted - CR was found, its name matches a node, and a finalizer was set - RemediationStarted ProcessingChangeReason = "RemediationStarted" + RemediationStarted ConditionsChangeReason = "RemediationStarted" // FenceAgentSucceeded - FAR taint was added, fence agent command has been created and executed successfully - FenceAgentSucceeded ProcessingChangeReason = "FenceAgentSucceeded" - // RemediationFinished - The unhealthy node was fully remediated (it was tainted, fenced by FA and all of its resources have been deleted) - RemediationFinished ProcessingChangeReason = "RemediationFinished" + FenceAgentSucceeded ConditionsChangeReason = "FenceAgentSucceeded" + // RemediationFinishedSuccessfully - The unhealthy node was fully remediated/fenced (it was tainted, fenced by FA and all of its resources have been deleted) + RemediationFinishedSuccessfully ConditionsChangeReason = "RemediationFinishedSuccessfully" ) type ParameterName string diff --git a/controllers/fenceagentsremediation_controller.go b/controllers/fenceagentsremediation_controller.go index 81273cda..090d7606 100644 --- a/controllers/fenceagentsremediation_controller.go +++ b/controllers/fenceagentsremediation_controller.go @@ -119,14 +119,15 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct } if !valid { r.Log.Error(err, "Didn't find a node matching the CR's name", "CR's Name", req.Name) - return emptyResult, nil + err := updateConditions(v1alpha1.RemediationFinishedNodeNotFound, &far.Status.Conditions, r.Log) + return emptyResult, err } // Check NHC timeout annotation if isTimedOutByNHC(far) { r.Log.Info("FAR remediation was stopped by Node Healthcheck Operator") - // TODO: update status and return its error - return emptyResult, nil + err := updateConditions(v1alpha1.RemediationStoppedByNHC, &far.Status.Conditions, r.Log) + return emptyResult, err } // Add finalizer when the CR is created @@ -224,7 +225,7 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct return emptyResult, err } - if err := updateConditions(v1alpha1.RemediationFinished, &far.Status.Conditions, r.Log); err != nil { + if err := updateConditions(v1alpha1.RemediationFinishedSuccessfully, &far.Status.Conditions, r.Log); err != nil { return emptyResult, err } r.Log.Info("FenceAgentsRemediation CR has completed to remediate the node", "Node Name", req.Name) @@ -253,20 +254,27 @@ func (r *FenceAgentsRemediationReconciler) updateStatus(ctx context.Context, far return nil } -// updateConditions updates the status conditions of a FenceAgentsRemediation object based on the provided ProcessingChangeReason. -// return an error if an unknown ProcessingChangeReason is provided -func updateConditions(reason v1alpha1.ProcessingChangeReason, currentConditions *[]metav1.Condition, log logr.Logger) error { +// updateConditions updates the status conditions of a FenceAgentsRemediation object based on the provided ConditionsChangeReason. +// return an error if an unknown ConditionsChangeReason is provided +func updateConditions(reason v1alpha1.ConditionsChangeReason, currentConditions *[]metav1.Condition, log logr.Logger) error { var ( processingConditionStatus, fenceAgentActionSucceededConditionStatus, succeededConditionStatus metav1.ConditionStatus conditionMessage string ) - // All the ProcessingChangeReasons can happen one after another - // RemediationStarted will always be the first reason. + // RemediationFinishedNodeNotFound and RemediationStoppedByNHC reasons can happen at any time the Reconcile runs + // Except these two reasons, there are another three reasons that can only happen one after another + // RemediationStarted will always be the first reason (out of these three) // FenceAgentSucceeded can only happen after RemediationStarted happened - // RemediationFinished can only happen after FenceAgentSucceeded happened + // RemediationFinishedSuccessfully can only happen after FenceAgentSucceeded happened switch reason { + case v1alpha1.RemediationFinishedNodeNotFound, v1alpha1.RemediationStoppedByNHC: + processingConditionStatus = metav1.ConditionFalse + fenceAgentActionSucceededConditionStatus = metav1.ConditionFalse + succeededConditionStatus = metav1.ConditionFalse + // These two reasons share the same message as they share the same effect + conditionMessage = v1alpha1.RemediationInterruptedConditionMessage case v1alpha1.RemediationStarted: processingConditionStatus = metav1.ConditionTrue fenceAgentActionSucceededConditionStatus = metav1.ConditionUnknown @@ -275,12 +283,12 @@ func updateConditions(reason v1alpha1.ProcessingChangeReason, currentConditions case v1alpha1.FenceAgentSucceeded: fenceAgentActionSucceededConditionStatus = metav1.ConditionTrue conditionMessage = v1alpha1.FenceAgentSucceededConditionMessage - case v1alpha1.RemediationFinished: + case v1alpha1.RemediationFinishedSuccessfully: processingConditionStatus = metav1.ConditionFalse succeededConditionStatus = metav1.ConditionTrue - conditionMessage = v1alpha1.RemediationFinishedConditionMessage + conditionMessage = v1alpha1.RemediationFinishedSuccessfullyConditionMessage default: - err := fmt.Errorf("unknown processingChangeReason:%s", reason) + err := fmt.Errorf("unknown ConditionsChangeReason:%s", reason) log.Error(err, "couldn't update FAR Status Conditions") return err } diff --git a/controllers/fenceagentsremediation_controller_test.go b/controllers/fenceagentsremediation_controller_test.go index 8b90b815..9a58e11e 100644 --- a/controllers/fenceagentsremediation_controller_test.go +++ b/controllers/fenceagentsremediation_controller_test.go @@ -125,6 +125,7 @@ var _ = Describe("FAR Controller", func() { farNamespacedName := client.ObjectKey{Name: workerNode, Namespace: defaultNamespace} farNoExecuteTaint := utils.CreateFARNoExecuteTaint() resourceDeletionWasTriggered := true // corresponds to testVADeletion bool value + conditionStatusPointer := func(status metav1.ConditionStatus) *metav1.ConditionStatus { return &status } BeforeEach(func() { // Create two VAs and two pods, and at the end clean them up with DeferCleanup va1 := createVA(vaName1, workerNode) @@ -166,7 +167,6 @@ var _ = Describe("FAR Controller", func() { testPodDeletion(testPodName, resourceDeletionWasTriggered) By("Having Succeed condition set to true") - conditionStatusPointer := func(status metav1.ConditionStatus) *metav1.ConditionStatus { return &status } verifyStatusCondition(workerNode, commonConditions.ProcessingType, conditionStatusPointer(metav1.ConditionFalse)) verifyStatusCondition(workerNode, v1alpha1.FenceAgentActionSucceededType, conditionStatusPointer(metav1.ConditionTrue)) verifyStatusCondition(workerNode, commonConditions.SucceededType, conditionStatusPointer(metav1.ConditionTrue)) @@ -200,9 +200,9 @@ var _ = Describe("FAR Controller", func() { testPodDeletion(testPodName, resourceDeletionWasTriggered) By("Not having any condition set") - verifyStatusCondition(dummyNode, commonConditions.ProcessingType, nil) - verifyStatusCondition(dummyNode, v1alpha1.FenceAgentActionSucceededType, nil) - verifyStatusCondition(dummyNode, commonConditions.SucceededType, nil) + verifyStatusCondition(dummyNode, commonConditions.ProcessingType, conditionStatusPointer(metav1.ConditionFalse)) + verifyStatusCondition(dummyNode, v1alpha1.FenceAgentActionSucceededType, conditionStatusPointer(metav1.ConditionFalse)) + verifyStatusCondition(dummyNode, commonConditions.SucceededType, conditionStatusPointer(metav1.ConditionFalse)) }) }) }) From 8005d8a3381bf2ddb06395e7dd4d3fb665991e0c Mon Sep 17 00:00:00 2001 From: razo7 Date: Mon, 14 Aug 2023 17:26:55 +0300 Subject: [PATCH 2/5] Don't stop update of condition from false to true It is not necessary --- controllers/fenceagentsremediation_controller.go | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/controllers/fenceagentsremediation_controller.go b/controllers/fenceagentsremediation_controller.go index 090d7606..1c1b9a13 100644 --- a/controllers/fenceagentsremediation_controller.go +++ b/controllers/fenceagentsremediation_controller.go @@ -293,18 +293,6 @@ func updateConditions(reason v1alpha1.ConditionsChangeReason, currentConditions return err } - // if ProcessingType is already false, then it cannot be changed to true again - if processingConditionStatus == metav1.ConditionTrue && - meta.IsStatusConditionFalse(*currentConditions, commonConditions.ProcessingType) { - return nil - } - - // if FenceAgentActionSucceededType is already false, then it cannot be changed to true again - if fenceAgentActionSucceededConditionStatus == metav1.ConditionTrue && - meta.IsStatusConditionFalse(*currentConditions, v1alpha1.FenceAgentActionSucceededType) { - return nil - } - log.Info("updating Status Condition", "processingConditionStatus", processingConditionStatus, "fenceAgentActionSucceededConditionStatus", fenceAgentActionSucceededConditionStatus, "succededConditionStatus", succeededConditionStatus, "reason", string(reason)) // if the requested Status.Conditions.Processing is different then the current one, then update Status.Conditions.Processing value if processingConditionStatus != "" && !meta.IsStatusConditionPresentAndEqual(*currentConditions, commonConditions.ProcessingType, processingConditionStatus) { From 6f62b11df8cec55fb6e8417db0612c17cdd5f181 Mon Sep 17 00:00:00 2001 From: razo7 Date: Tue, 15 Aug 2023 09:36:32 +0300 Subject: [PATCH 3/5] Better reason for NHC annotation signal RemediationStoppedByNHC -> RemediationInterruptedByNHC. NHC is signalling a timeout that interrupts the remediation and it does not stopping/terminating a remediation. FAR does that when it notice the annotation. --- api/v1alpha1/fenceagentsremediation_types.go | 4 ++-- controllers/fenceagentsremediation_controller.go | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/api/v1alpha1/fenceagentsremediation_types.go b/api/v1alpha1/fenceagentsremediation_types.go index 1d785954..6f3109d5 100644 --- a/api/v1alpha1/fenceagentsremediation_types.go +++ b/api/v1alpha1/fenceagentsremediation_types.go @@ -43,8 +43,8 @@ type ConditionsChangeReason string const ( // RemediationFinishedNodeNotFound - CR was found but its name doesn't matche a node RemediationFinishedNodeNotFound ConditionsChangeReason = "RemediationFinishedNodeNotFound" - // RemediationStoppedByNHC - Remediation was interrupted by NHC timeout annotation - RemediationStoppedByNHC ConditionsChangeReason = "RemediationStoppedByNHC" + // RemediationInterruptedByNHC - Remediation was interrupted by NHC timeout annotation + RemediationInterruptedByNHC ConditionsChangeReason = "RemediationInterruptedByNHC" // RemediationStarted - CR was found, its name matches a node, and a finalizer was set RemediationStarted ConditionsChangeReason = "RemediationStarted" // FenceAgentSucceeded - FAR taint was added, fence agent command has been created and executed successfully diff --git a/controllers/fenceagentsremediation_controller.go b/controllers/fenceagentsremediation_controller.go index 1c1b9a13..5ea0b851 100644 --- a/controllers/fenceagentsremediation_controller.go +++ b/controllers/fenceagentsremediation_controller.go @@ -126,7 +126,7 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct // Check NHC timeout annotation if isTimedOutByNHC(far) { r.Log.Info("FAR remediation was stopped by Node Healthcheck Operator") - err := updateConditions(v1alpha1.RemediationStoppedByNHC, &far.Status.Conditions, r.Log) + err := updateConditions(v1alpha1.RemediationInterruptedByNHC, &far.Status.Conditions, r.Log) return emptyResult, err } @@ -262,14 +262,14 @@ func updateConditions(reason v1alpha1.ConditionsChangeReason, currentConditions processingConditionStatus, fenceAgentActionSucceededConditionStatus, succeededConditionStatus metav1.ConditionStatus conditionMessage string ) - // RemediationFinishedNodeNotFound and RemediationStoppedByNHC reasons can happen at any time the Reconcile runs + // RemediationFinishedNodeNotFound and RemediationInterruptedByNHC reasons can happen at any time the Reconcile runs // Except these two reasons, there are another three reasons that can only happen one after another // RemediationStarted will always be the first reason (out of these three) // FenceAgentSucceeded can only happen after RemediationStarted happened // RemediationFinishedSuccessfully can only happen after FenceAgentSucceeded happened switch reason { - case v1alpha1.RemediationFinishedNodeNotFound, v1alpha1.RemediationStoppedByNHC: + case v1alpha1.RemediationFinishedNodeNotFound, v1alpha1.RemediationInterruptedByNHC: processingConditionStatus = metav1.ConditionFalse fenceAgentActionSucceededConditionStatus = metav1.ConditionFalse succeededConditionStatus = metav1.ConditionFalse From 01241d676408489ae80c55c7eb3f51cdd84f5cd2 Mon Sep 17 00:00:00 2001 From: razo7 Date: Tue, 15 Aug 2023 10:39:22 +0300 Subject: [PATCH 4/5] Separate condition message for RemediationFinishedNodeNotFound and RemediationInterruptedByNHC Clearer message for the users --- api/v1alpha1/fenceagentsremediation_types.go | 3 ++- controllers/fenceagentsremediation_controller.go | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/api/v1alpha1/fenceagentsremediation_types.go b/api/v1alpha1/fenceagentsremediation_types.go index 6f3109d5..8486b9fa 100644 --- a/api/v1alpha1/fenceagentsremediation_types.go +++ b/api/v1alpha1/fenceagentsremediation_types.go @@ -31,7 +31,8 @@ const ( // FenceAgentActionSucceededType is the condition type used to signal whether the Fence Agent action was succeeded successfully or not FenceAgentActionSucceededType = "FenceAgentActionSucceeded" // condition messages - RemediationInterruptedConditionMessage = "FAR CR name doesn't match a node name or Node Healthcheck timeout annotation has been set" + RemediationFinishedNodeNotFoundConditionMessage = "Node Healthcheck timeout annotation has been set" + RemediationInterruptedByNHCConditionMessage = "FAR CR name doesn't match a node name" RemediationStartedConditionMessage = "FAR CR was found, its name matches one of the cluster nodes, and a finalizer was set to the CR" FenceAgentSucceededConditionMessage = "FAR taint was added, fence agent command has been created and executed successfully" RemediationFinishedSuccessfullyConditionMessage = "The unhealthy node was fully remediated (it was tainted, fenced using FA and all the node resources have been deleted)" diff --git a/controllers/fenceagentsremediation_controller.go b/controllers/fenceagentsremediation_controller.go index 5ea0b851..9ab33c55 100644 --- a/controllers/fenceagentsremediation_controller.go +++ b/controllers/fenceagentsremediation_controller.go @@ -273,8 +273,12 @@ func updateConditions(reason v1alpha1.ConditionsChangeReason, currentConditions processingConditionStatus = metav1.ConditionFalse fenceAgentActionSucceededConditionStatus = metav1.ConditionFalse succeededConditionStatus = metav1.ConditionFalse - // These two reasons share the same message as they share the same effect - conditionMessage = v1alpha1.RemediationInterruptedConditionMessage + if reason == v1alpha1.RemediationFinishedNodeNotFound { + conditionMessage = v1alpha1.RemediationFinishedNodeNotFoundConditionMessage + } else { + conditionMessage = v1alpha1.RemediationInterruptedByNHCConditionMessage + } + case v1alpha1.RemediationStarted: processingConditionStatus = metav1.ConditionTrue fenceAgentActionSucceededConditionStatus = metav1.ConditionUnknown From c0e69f298892e8edefda1f8a9bab203e8670e653 Mon Sep 17 00:00:00 2001 From: razo7 Date: Wed, 16 Aug 2023 12:08:55 +0300 Subject: [PATCH 5/5] Switch case usage for conditionMessage of a failure It helps us forgetting this part when adding another reason to the case statement above. --- controllers/fenceagentsremediation_controller.go | 11 ++++++++--- controllers/fenceagentsremediation_controller_test.go | 4 ++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/controllers/fenceagentsremediation_controller.go b/controllers/fenceagentsremediation_controller.go index 9ab33c55..10aaf754 100644 --- a/controllers/fenceagentsremediation_controller.go +++ b/controllers/fenceagentsremediation_controller.go @@ -273,12 +273,17 @@ func updateConditions(reason v1alpha1.ConditionsChangeReason, currentConditions processingConditionStatus = metav1.ConditionFalse fenceAgentActionSucceededConditionStatus = metav1.ConditionFalse succeededConditionStatus = metav1.ConditionFalse - if reason == v1alpha1.RemediationFinishedNodeNotFound { + // Different reasons share the same effect to the conditions, but they have different message + switch reason { + case v1alpha1.RemediationFinishedNodeNotFound: conditionMessage = v1alpha1.RemediationFinishedNodeNotFoundConditionMessage - } else { + case v1alpha1.RemediationInterruptedByNHC: conditionMessage = v1alpha1.RemediationInterruptedByNHCConditionMessage + default: + err := fmt.Errorf("unknown ConditionsChangeReason:%s", reason) + log.Error(err, "couldn't update FAR Status Conditions") + return err } - case v1alpha1.RemediationStarted: processingConditionStatus = metav1.ConditionTrue fenceAgentActionSucceededConditionStatus = metav1.ConditionUnknown diff --git a/controllers/fenceagentsremediation_controller_test.go b/controllers/fenceagentsremediation_controller_test.go index 9a58e11e..51e0f67a 100644 --- a/controllers/fenceagentsremediation_controller_test.go +++ b/controllers/fenceagentsremediation_controller_test.go @@ -166,7 +166,7 @@ var _ = Describe("FAR Controller", func() { testVADeletion(vaName2, resourceDeletionWasTriggered) testPodDeletion(testPodName, resourceDeletionWasTriggered) - By("Having Succeed condition set to true") + By("Having Succeeded, FenceAgentActionSucceeded conditions set to true, and Processing set to false") verifyStatusCondition(workerNode, commonConditions.ProcessingType, conditionStatusPointer(metav1.ConditionFalse)) verifyStatusCondition(workerNode, v1alpha1.FenceAgentActionSucceededType, conditionStatusPointer(metav1.ConditionTrue)) verifyStatusCondition(workerNode, commonConditions.SucceededType, conditionStatusPointer(metav1.ConditionTrue)) @@ -199,7 +199,7 @@ var _ = Describe("FAR Controller", func() { testVADeletion(vaName2, resourceDeletionWasTriggered) testPodDeletion(testPodName, resourceDeletionWasTriggered) - By("Not having any condition set") + By("Having all three conditions set to false") verifyStatusCondition(dummyNode, commonConditions.ProcessingType, conditionStatusPointer(metav1.ConditionFalse)) verifyStatusCondition(dummyNode, v1alpha1.FenceAgentActionSucceededType, conditionStatusPointer(metav1.ConditionFalse)) verifyStatusCondition(dummyNode, commonConditions.SucceededType, conditionStatusPointer(metav1.ConditionFalse))