Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Two Reasons for Updating and Stopping Remediation #76

Merged
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 15 additions & 9 deletions api/v1alpha1/fenceagentsremediation_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,21 +31,27 @@ const (
// FenceAgentActionSucceededType is the condition type used to signal whether the Fence Agent action was succeeded successfully or not
FenceAgentActionSucceededType = "FenceAgentActionSucceeded"
// condition messages
RemediationStartedConditionMessage = "FAR CR was found, the CR name matches one of the cluster nodes, and a finalizer was set"
FenceAgentSucceededConditionMessage = "FAR taint was added, fence agent command has been created and executed successfully"
RemediationFinishedConditionMessage = "The unhealthy node was fully remediated (it was tainted, fenced using FA and all the node resources have been deleted)"
RemediationFinishedNodeNotFoundConditionMessage = "Node Healthcheck timeout annotation has been set"
RemediationInterruptedByNHCConditionMessage = "FAR CR name doesn't match a node name"
RemediationStartedConditionMessage = "FAR CR was found, its name matches one of the cluster nodes, and a finalizer was set to the CR"
FenceAgentSucceededConditionMessage = "FAR taint was added, fence agent command has been created and executed successfully"
RemediationFinishedSuccessfullyConditionMessage = "The unhealthy node was fully remediated (it was tainted, fenced using FA and all the node resources have been deleted)"
)

// ProcessingChangeReason represents the reason of updating the processing condition
type ProcessingChangeReason string
// ConditionsChangeReason represents the reason of updating the some or all the conditions
type ConditionsChangeReason string

const (
// RemediationFinishedNodeNotFound - CR was found but its name doesn't matche a node
RemediationFinishedNodeNotFound ConditionsChangeReason = "RemediationFinishedNodeNotFound"
// RemediationInterruptedByNHC - Remediation was interrupted by NHC timeout annotation
RemediationInterruptedByNHC ConditionsChangeReason = "RemediationInterruptedByNHC"
// RemediationStarted - CR was found, its name matches a node, and a finalizer was set
RemediationStarted ProcessingChangeReason = "RemediationStarted"
RemediationStarted ConditionsChangeReason = "RemediationStarted"
// FenceAgentSucceeded - FAR taint was added, fence agent command has been created and executed successfully
FenceAgentSucceeded ProcessingChangeReason = "FenceAgentSucceeded"
// RemediationFinished - The unhealthy node was fully remediated (it was tainted, fenced by FA and all of its resources have been deleted)
RemediationFinished ProcessingChangeReason = "RemediationFinished"
FenceAgentSucceeded ConditionsChangeReason = "FenceAgentSucceeded"
// RemediationFinishedSuccessfully - The unhealthy node was fully remediated/fenced (it was tainted, fenced by FA and all of its resources have been deleted)
RemediationFinishedSuccessfully ConditionsChangeReason = "RemediationFinishedSuccessfully"
)

type ParameterName string
Expand Down
50 changes: 25 additions & 25 deletions controllers/fenceagentsremediation_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,14 +119,15 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
}
if !valid {
r.Log.Error(err, "Didn't find a node matching the CR's name", "CR's Name", req.Name)
return emptyResult, nil
err := updateConditions(v1alpha1.RemediationFinishedNodeNotFound, &far.Status.Conditions, r.Log)
return emptyResult, err
}

// Check NHC timeout annotation
if isTimedOutByNHC(far) {
r.Log.Info("FAR remediation was stopped by Node Healthcheck Operator")
// TODO: update status and return its error
return emptyResult, nil
err := updateConditions(v1alpha1.RemediationInterruptedByNHC, &far.Status.Conditions, r.Log)
return emptyResult, err
}

// Add finalizer when the CR is created
Expand Down Expand Up @@ -224,7 +225,7 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
return emptyResult, err
}

if err := updateConditions(v1alpha1.RemediationFinished, &far.Status.Conditions, r.Log); err != nil {
if err := updateConditions(v1alpha1.RemediationFinishedSuccessfully, &far.Status.Conditions, r.Log); err != nil {
return emptyResult, err
}
r.Log.Info("FenceAgentsRemediation CR has completed to remediate the node", "Node Name", req.Name)
Expand Down Expand Up @@ -253,20 +254,31 @@ func (r *FenceAgentsRemediationReconciler) updateStatus(ctx context.Context, far
return nil
}

// updateConditions updates the status conditions of a FenceAgentsRemediation object based on the provided ProcessingChangeReason.
// return an error if an unknown ProcessingChangeReason is provided
func updateConditions(reason v1alpha1.ProcessingChangeReason, currentConditions *[]metav1.Condition, log logr.Logger) error {
// updateConditions updates the status conditions of a FenceAgentsRemediation object based on the provided ConditionsChangeReason.
// return an error if an unknown ConditionsChangeReason is provided
func updateConditions(reason v1alpha1.ConditionsChangeReason, currentConditions *[]metav1.Condition, log logr.Logger) error {

var (
processingConditionStatus, fenceAgentActionSucceededConditionStatus, succeededConditionStatus metav1.ConditionStatus
conditionMessage string
)
// All the ProcessingChangeReasons can happen one after another
// RemediationStarted will always be the first reason.
// RemediationFinishedNodeNotFound and RemediationInterruptedByNHC reasons can happen at any time the Reconcile runs
// Except these two reasons, there are another three reasons that can only happen one after another
// RemediationStarted will always be the first reason (out of these three)
// FenceAgentSucceeded can only happen after RemediationStarted happened
// RemediationFinished can only happen after FenceAgentSucceeded happened
// RemediationFinishedSuccessfully can only happen after FenceAgentSucceeded happened

switch reason {
case v1alpha1.RemediationFinishedNodeNotFound, v1alpha1.RemediationInterruptedByNHC:
processingConditionStatus = metav1.ConditionFalse
fenceAgentActionSucceededConditionStatus = metav1.ConditionFalse
succeededConditionStatus = metav1.ConditionFalse
if reason == v1alpha1.RemediationFinishedNodeNotFound {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMHO a switch makes more sense here, which errors when a reason isn't handled. Prevents forgetting this part when adding another reason to the case statement above.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point

conditionMessage = v1alpha1.RemediationFinishedNodeNotFoundConditionMessage
} else {
conditionMessage = v1alpha1.RemediationInterruptedByNHCConditionMessage
}

case v1alpha1.RemediationStarted:
processingConditionStatus = metav1.ConditionTrue
fenceAgentActionSucceededConditionStatus = metav1.ConditionUnknown
Expand All @@ -275,28 +287,16 @@ func updateConditions(reason v1alpha1.ProcessingChangeReason, currentConditions
case v1alpha1.FenceAgentSucceeded:
fenceAgentActionSucceededConditionStatus = metav1.ConditionTrue
conditionMessage = v1alpha1.FenceAgentSucceededConditionMessage
case v1alpha1.RemediationFinished:
case v1alpha1.RemediationFinishedSuccessfully:
processingConditionStatus = metav1.ConditionFalse
succeededConditionStatus = metav1.ConditionTrue
conditionMessage = v1alpha1.RemediationFinishedConditionMessage
conditionMessage = v1alpha1.RemediationFinishedSuccessfullyConditionMessage
default:
err := fmt.Errorf("unknown processingChangeReason:%s", reason)
err := fmt.Errorf("unknown ConditionsChangeReason:%s", reason)
log.Error(err, "couldn't update FAR Status Conditions")
return err
}

// if ProcessingType is already false, then it cannot be changed to true again
if processingConditionStatus == metav1.ConditionTrue &&
meta.IsStatusConditionFalse(*currentConditions, commonConditions.ProcessingType) {
return nil
}

// if FenceAgentActionSucceededType is already false, then it cannot be changed to true again
if fenceAgentActionSucceededConditionStatus == metav1.ConditionTrue &&
meta.IsStatusConditionFalse(*currentConditions, v1alpha1.FenceAgentActionSucceededType) {
return nil
}

log.Info("updating Status Condition", "processingConditionStatus", processingConditionStatus, "fenceAgentActionSucceededConditionStatus", fenceAgentActionSucceededConditionStatus, "succededConditionStatus", succeededConditionStatus, "reason", string(reason))
// if the requested Status.Conditions.Processing is different then the current one, then update Status.Conditions.Processing value
if processingConditionStatus != "" && !meta.IsStatusConditionPresentAndEqual(*currentConditions, commonConditions.ProcessingType, processingConditionStatus) {
Expand Down
8 changes: 4 additions & 4 deletions controllers/fenceagentsremediation_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ var _ = Describe("FAR Controller", func() {
farNamespacedName := client.ObjectKey{Name: workerNode, Namespace: defaultNamespace}
farNoExecuteTaint := utils.CreateFARNoExecuteTaint()
resourceDeletionWasTriggered := true // corresponds to testVADeletion bool value
conditionStatusPointer := func(status metav1.ConditionStatus) *metav1.ConditionStatus { return &status }
BeforeEach(func() {
// Create two VAs and two pods, and at the end clean them up with DeferCleanup
va1 := createVA(vaName1, workerNode)
Expand Down Expand Up @@ -166,7 +167,6 @@ var _ = Describe("FAR Controller", func() {
testPodDeletion(testPodName, resourceDeletionWasTriggered)

By("Having Succeed condition set to true")
conditionStatusPointer := func(status metav1.ConditionStatus) *metav1.ConditionStatus { return &status }
verifyStatusCondition(workerNode, commonConditions.ProcessingType, conditionStatusPointer(metav1.ConditionFalse))
verifyStatusCondition(workerNode, v1alpha1.FenceAgentActionSucceededType, conditionStatusPointer(metav1.ConditionTrue))
verifyStatusCondition(workerNode, commonConditions.SucceededType, conditionStatusPointer(metav1.ConditionTrue))
Expand Down Expand Up @@ -200,9 +200,9 @@ var _ = Describe("FAR Controller", func() {
testPodDeletion(testPodName, resourceDeletionWasTriggered)

By("Not having any condition set")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

message does not fit anymore

verifyStatusCondition(dummyNode, commonConditions.ProcessingType, nil)
verifyStatusCondition(dummyNode, v1alpha1.FenceAgentActionSucceededType, nil)
verifyStatusCondition(dummyNode, commonConditions.SucceededType, nil)
verifyStatusCondition(dummyNode, commonConditions.ProcessingType, conditionStatusPointer(metav1.ConditionFalse))
verifyStatusCondition(dummyNode, v1alpha1.FenceAgentActionSucceededType, conditionStatusPointer(metav1.ConditionFalse))
verifyStatusCondition(dummyNode, commonConditions.SucceededType, conditionStatusPointer(metav1.ConditionFalse))
})
})
})
Expand Down