Skip to content

Commit

Permalink
Merge pull request #76 from razo7/stop-remediation-for-2-reasons
Browse files Browse the repository at this point in the history
Add Two Reasons for Updating and Stopping Remediation
  • Loading branch information
openshift-merge-robot authored Aug 16, 2023
2 parents 001c1c5 + c0e69f2 commit 200c062
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 40 deletions.
24 changes: 15 additions & 9 deletions api/v1alpha1/fenceagentsremediation_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,21 +31,27 @@ const (
// FenceAgentActionSucceededType is the condition type used to signal whether the Fence Agent action was succeeded successfully or not
FenceAgentActionSucceededType = "FenceAgentActionSucceeded"
// condition messages
RemediationStartedConditionMessage = "FAR CR was found, the CR name matches one of the cluster nodes, and a finalizer was set"
FenceAgentSucceededConditionMessage = "FAR taint was added, fence agent command has been created and executed successfully"
RemediationFinishedConditionMessage = "The unhealthy node was fully remediated (it was tainted, fenced using FA and all the node resources have been deleted)"
RemediationFinishedNodeNotFoundConditionMessage = "Node Healthcheck timeout annotation has been set"
RemediationInterruptedByNHCConditionMessage = "FAR CR name doesn't match a node name"
RemediationStartedConditionMessage = "FAR CR was found, its name matches one of the cluster nodes, and a finalizer was set to the CR"
FenceAgentSucceededConditionMessage = "FAR taint was added, fence agent command has been created and executed successfully"
RemediationFinishedSuccessfullyConditionMessage = "The unhealthy node was fully remediated (it was tainted, fenced using FA and all the node resources have been deleted)"
)

// ProcessingChangeReason represents the reason of updating the processing condition
type ProcessingChangeReason string
// ConditionsChangeReason represents the reason of updating the some or all the conditions
type ConditionsChangeReason string

const (
// RemediationFinishedNodeNotFound - CR was found but its name doesn't matche a node
RemediationFinishedNodeNotFound ConditionsChangeReason = "RemediationFinishedNodeNotFound"
// RemediationInterruptedByNHC - Remediation was interrupted by NHC timeout annotation
RemediationInterruptedByNHC ConditionsChangeReason = "RemediationInterruptedByNHC"
// RemediationStarted - CR was found, its name matches a node, and a finalizer was set
RemediationStarted ProcessingChangeReason = "RemediationStarted"
RemediationStarted ConditionsChangeReason = "RemediationStarted"
// FenceAgentSucceeded - FAR taint was added, fence agent command has been created and executed successfully
FenceAgentSucceeded ProcessingChangeReason = "FenceAgentSucceeded"
// RemediationFinished - The unhealthy node was fully remediated (it was tainted, fenced by FA and all of its resources have been deleted)
RemediationFinished ProcessingChangeReason = "RemediationFinished"
FenceAgentSucceeded ConditionsChangeReason = "FenceAgentSucceeded"
// RemediationFinishedSuccessfully - The unhealthy node was fully remediated/fenced (it was tainted, fenced by FA and all of its resources have been deleted)
RemediationFinishedSuccessfully ConditionsChangeReason = "RemediationFinishedSuccessfully"
)

type ParameterName string
Expand Down
55 changes: 30 additions & 25 deletions controllers/fenceagentsremediation_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,14 +119,15 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
}
if !valid {
r.Log.Error(err, "Didn't find a node matching the CR's name", "CR's Name", req.Name)
return emptyResult, nil
err := updateConditions(v1alpha1.RemediationFinishedNodeNotFound, &far.Status.Conditions, r.Log)
return emptyResult, err
}

// Check NHC timeout annotation
if isTimedOutByNHC(far) {
r.Log.Info("FAR remediation was stopped by Node Healthcheck Operator")
// TODO: update status and return its error
return emptyResult, nil
err := updateConditions(v1alpha1.RemediationInterruptedByNHC, &far.Status.Conditions, r.Log)
return emptyResult, err
}

// Add finalizer when the CR is created
Expand Down Expand Up @@ -224,7 +225,7 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
return emptyResult, err
}

if err := updateConditions(v1alpha1.RemediationFinished, &far.Status.Conditions, r.Log); err != nil {
if err := updateConditions(v1alpha1.RemediationFinishedSuccessfully, &far.Status.Conditions, r.Log); err != nil {
return emptyResult, err
}
r.Log.Info("FenceAgentsRemediation CR has completed to remediate the node", "Node Name", req.Name)
Expand Down Expand Up @@ -253,20 +254,36 @@ func (r *FenceAgentsRemediationReconciler) updateStatus(ctx context.Context, far
return nil
}

// updateConditions updates the status conditions of a FenceAgentsRemediation object based on the provided ProcessingChangeReason.
// return an error if an unknown ProcessingChangeReason is provided
func updateConditions(reason v1alpha1.ProcessingChangeReason, currentConditions *[]metav1.Condition, log logr.Logger) error {
// updateConditions updates the status conditions of a FenceAgentsRemediation object based on the provided ConditionsChangeReason.
// return an error if an unknown ConditionsChangeReason is provided
func updateConditions(reason v1alpha1.ConditionsChangeReason, currentConditions *[]metav1.Condition, log logr.Logger) error {

var (
processingConditionStatus, fenceAgentActionSucceededConditionStatus, succeededConditionStatus metav1.ConditionStatus
conditionMessage string
)
// All the ProcessingChangeReasons can happen one after another
// RemediationStarted will always be the first reason.
// RemediationFinishedNodeNotFound and RemediationInterruptedByNHC reasons can happen at any time the Reconcile runs
// Except these two reasons, there are another three reasons that can only happen one after another
// RemediationStarted will always be the first reason (out of these three)
// FenceAgentSucceeded can only happen after RemediationStarted happened
// RemediationFinished can only happen after FenceAgentSucceeded happened
// RemediationFinishedSuccessfully can only happen after FenceAgentSucceeded happened

switch reason {
case v1alpha1.RemediationFinishedNodeNotFound, v1alpha1.RemediationInterruptedByNHC:
processingConditionStatus = metav1.ConditionFalse
fenceAgentActionSucceededConditionStatus = metav1.ConditionFalse
succeededConditionStatus = metav1.ConditionFalse
// Different reasons share the same effect to the conditions, but they have different message
switch reason {
case v1alpha1.RemediationFinishedNodeNotFound:
conditionMessage = v1alpha1.RemediationFinishedNodeNotFoundConditionMessage
case v1alpha1.RemediationInterruptedByNHC:
conditionMessage = v1alpha1.RemediationInterruptedByNHCConditionMessage
default:
err := fmt.Errorf("unknown ConditionsChangeReason:%s", reason)
log.Error(err, "couldn't update FAR Status Conditions")
return err
}
case v1alpha1.RemediationStarted:
processingConditionStatus = metav1.ConditionTrue
fenceAgentActionSucceededConditionStatus = metav1.ConditionUnknown
Expand All @@ -275,28 +292,16 @@ func updateConditions(reason v1alpha1.ProcessingChangeReason, currentConditions
case v1alpha1.FenceAgentSucceeded:
fenceAgentActionSucceededConditionStatus = metav1.ConditionTrue
conditionMessage = v1alpha1.FenceAgentSucceededConditionMessage
case v1alpha1.RemediationFinished:
case v1alpha1.RemediationFinishedSuccessfully:
processingConditionStatus = metav1.ConditionFalse
succeededConditionStatus = metav1.ConditionTrue
conditionMessage = v1alpha1.RemediationFinishedConditionMessage
conditionMessage = v1alpha1.RemediationFinishedSuccessfullyConditionMessage
default:
err := fmt.Errorf("unknown processingChangeReason:%s", reason)
err := fmt.Errorf("unknown ConditionsChangeReason:%s", reason)
log.Error(err, "couldn't update FAR Status Conditions")
return err
}

// if ProcessingType is already false, then it cannot be changed to true again
if processingConditionStatus == metav1.ConditionTrue &&
meta.IsStatusConditionFalse(*currentConditions, commonConditions.ProcessingType) {
return nil
}

// if FenceAgentActionSucceededType is already false, then it cannot be changed to true again
if fenceAgentActionSucceededConditionStatus == metav1.ConditionTrue &&
meta.IsStatusConditionFalse(*currentConditions, v1alpha1.FenceAgentActionSucceededType) {
return nil
}

log.Info("updating Status Condition", "processingConditionStatus", processingConditionStatus, "fenceAgentActionSucceededConditionStatus", fenceAgentActionSucceededConditionStatus, "succededConditionStatus", succeededConditionStatus, "reason", string(reason))
// if the requested Status.Conditions.Processing is different then the current one, then update Status.Conditions.Processing value
if processingConditionStatus != "" && !meta.IsStatusConditionPresentAndEqual(*currentConditions, commonConditions.ProcessingType, processingConditionStatus) {
Expand Down
12 changes: 6 additions & 6 deletions controllers/fenceagentsremediation_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ var _ = Describe("FAR Controller", func() {
farNamespacedName := client.ObjectKey{Name: workerNode, Namespace: defaultNamespace}
farNoExecuteTaint := utils.CreateFARNoExecuteTaint()
resourceDeletionWasTriggered := true // corresponds to testVADeletion bool value
conditionStatusPointer := func(status metav1.ConditionStatus) *metav1.ConditionStatus { return &status }
BeforeEach(func() {
// Create two VAs and two pods, and at the end clean them up with DeferCleanup
va1 := createVA(vaName1, workerNode)
Expand Down Expand Up @@ -165,8 +166,7 @@ var _ = Describe("FAR Controller", func() {
testVADeletion(vaName2, resourceDeletionWasTriggered)
testPodDeletion(testPodName, resourceDeletionWasTriggered)

By("Having Succeed condition set to true")
conditionStatusPointer := func(status metav1.ConditionStatus) *metav1.ConditionStatus { return &status }
By("Having Succeeded, FenceAgentActionSucceeded conditions set to true, and Processing set to false")
verifyStatusCondition(workerNode, commonConditions.ProcessingType, conditionStatusPointer(metav1.ConditionFalse))
verifyStatusCondition(workerNode, v1alpha1.FenceAgentActionSucceededType, conditionStatusPointer(metav1.ConditionTrue))
verifyStatusCondition(workerNode, commonConditions.SucceededType, conditionStatusPointer(metav1.ConditionTrue))
Expand Down Expand Up @@ -199,10 +199,10 @@ var _ = Describe("FAR Controller", func() {
testVADeletion(vaName2, resourceDeletionWasTriggered)
testPodDeletion(testPodName, resourceDeletionWasTriggered)

By("Not having any condition set")
verifyStatusCondition(dummyNode, commonConditions.ProcessingType, nil)
verifyStatusCondition(dummyNode, v1alpha1.FenceAgentActionSucceededType, nil)
verifyStatusCondition(dummyNode, commonConditions.SucceededType, nil)
By("Having all three conditions set to false")
verifyStatusCondition(dummyNode, commonConditions.ProcessingType, conditionStatusPointer(metav1.ConditionFalse))
verifyStatusCondition(dummyNode, v1alpha1.FenceAgentActionSucceededType, conditionStatusPointer(metav1.ConditionFalse))
verifyStatusCondition(dummyNode, commonConditions.SucceededType, conditionStatusPointer(metav1.ConditionFalse))
})
})
})
Expand Down

0 comments on commit 200c062

Please sign in to comment.