Skip to content

Commit

Permalink
Merge pull request #74 from razo7/requeue-reconcile-with-offset
Browse files Browse the repository at this point in the history
Stale Cache on Fast Reconcile
  • Loading branch information
openshift-merge-robot authored Aug 18, 2023
2 parents 200c062 + 8f87c07 commit 3e0294e
Show file tree
Hide file tree
Showing 8 changed files with 81 additions and 23 deletions.
8 changes: 8 additions & 0 deletions api/v1alpha1/fenceagentsremediation_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,14 @@ type FenceAgentsRemediationStatus struct {
//+optional
// +operator-sdk:csv:customresourcedefinitions:type=status,displayName="conditions",xDescriptors="urn:alm:descriptor:io.kubernetes.conditions"
Conditions []metav1.Condition `json:"conditions,omitempty"`

// LastUpdateTime is the last time the status was updated.
//
//+optional
//+kubebuilder:validation:Type=string
//+kubebuilder:validation:Format=date-time
//+operator-sdk:csv:customresourcedefinitions:type=status
LastUpdateTime *metav1.Time `json:"lastUpdateTime,omitempty"`
}

//+kubebuilder:object:root=true
Expand Down
4 changes: 4 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ spec:
path: conditions
x-descriptors:
- urn:alm:descriptor:io.kubernetes.conditions
- description: LastUpdateTime is the last time the status was updated.
displayName: Last Update Time
path: lastUpdateTime
version: v1alpha1
- description: FenceAgentsRemediationTemplate is the Schema for the fenceagentsremediationtemplates
API
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,10 @@ spec:
x-kubernetes-list-map-keys:
- type
x-kubernetes-list-type: map
lastUpdateTime:
description: LastUpdateTime is the last time the status was updated.
format: date-time
type: string
type: object
type: object
served: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,10 @@ spec:
x-kubernetes-list-map-keys:
- type
x-kubernetes-list-type: map
lastUpdateTime:
description: LastUpdateTime is the last time the status was updated.
format: date-time
type: string
type: object
type: object
served: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ spec:
path: conditions
x-descriptors:
- urn:alm:descriptor:io.kubernetes.conditions
- description: LastUpdateTime is the last time the status was updated.
displayName: Last Update Time
path: lastUpdateTime
version: v1alpha1
- description: FenceAgentsRemediationTemplate is the Schema for the fenceagentsremediationtemplates
API
Expand Down
61 changes: 45 additions & 16 deletions controllers/fenceagentsremediation_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
utilErrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/wait"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
Expand Down Expand Up @@ -85,7 +86,9 @@ func (r *FenceAgentsRemediationReconciler) SetupWithManager(mgr ctrl.Manager) er
func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ctrl.Request) (finalResult ctrl.Result, finalErr error) {
r.Log.Info("Begin FenceAgentsRemediation Reconcile")
defer r.Log.Info("Finish FenceAgentsRemediation Reconcile")
// Reconcile requeue results
emptyResult := ctrl.Result{}
requeueImmediately := ctrl.Result{Requeue: true}

// Fetch the FenceAgentsRemediation instance
far := &v1alpha1.FenceAgentsRemediation{}
Expand All @@ -100,13 +103,13 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
return emptyResult, err
}

// At the end of each Reconcile try to update CR's status
// At the end of each Reconcile we try to update CR's status
defer func() {
if updateErr := r.updateStatus(ctx, far); updateErr != nil {
if !apiErrors.IsConflict(updateErr) {
finalErr = utilErrors.NewAggregate([]error{updateErr, finalErr})
if apiErrors.IsConflict(updateErr) {
r.Log.Info("Conflict has occurred on updating the CR status")
}
finalResult.RequeueAfter = time.Second
finalErr = utilErrors.NewAggregate([]error{updateErr, finalErr})
}
}()

Expand All @@ -119,14 +122,14 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
}
if !valid {
r.Log.Error(err, "Didn't find a node matching the CR's name", "CR's Name", req.Name)
err := updateConditions(v1alpha1.RemediationFinishedNodeNotFound, &far.Status.Conditions, r.Log)
err := updateConditions(v1alpha1.RemediationFinishedNodeNotFound, far, r.Log)
return emptyResult, err
}

// Check NHC timeout annotation
if isTimedOutByNHC(far) {
r.Log.Info("FAR remediation was stopped by Node Healthcheck Operator")
err := updateConditions(v1alpha1.RemediationInterruptedByNHC, &far.Status.Conditions, r.Log)
err := updateConditions(v1alpha1.RemediationInterruptedByNHC, far, r.Log)
return emptyResult, err
}

Expand All @@ -138,10 +141,10 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
}
r.Log.Info("Finalizer was added", "CR Name", req.Name)

if err := updateConditions(v1alpha1.RemediationStarted, &far.Status.Conditions, r.Log); err != nil {
if err := updateConditions(v1alpha1.RemediationStarted, far, r.Log); err != nil {
return emptyResult, err
}
return ctrl.Result{Requeue: true}, nil
return requeueImmediately, nil
} else if controllerutil.ContainsFinalizer(far, v1alpha1.FARFinalizer) && !far.ObjectMeta.DeletionTimestamp.IsZero() {
// Delete CR only when a finalizer and DeletionTimestamp are set
r.Log.Info("CR's deletion timestamp is not zero, and FAR finalizer exists", "CR Name", req.Name)
Expand Down Expand Up @@ -211,10 +214,10 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
}

r.Log.Info("Fence Agent command was finished successfully", "Fence Agent", far.Spec.Agent, "Node name", req.Name, "Response", SuccessFAResponse)
if err := updateConditions(v1alpha1.FenceAgentSucceeded, &far.Status.Conditions, r.Log); err != nil {
if err := updateConditions(v1alpha1.FenceAgentSucceeded, far, r.Log); err != nil {
return emptyResult, err
}
return ctrl.Result{Requeue: true}, nil
return requeueImmediately, nil
}
if meta.IsStatusConditionTrue(far.Status.Conditions, v1alpha1.FenceAgentActionSucceededType) &&
!meta.IsStatusConditionTrue(far.Status.Conditions, commonConditions.SucceededType) {
Expand All @@ -224,8 +227,7 @@ func (r *FenceAgentsRemediationReconciler) Reconcile(ctx context.Context, req ct
r.Log.Error(err, "Manual workload deletion has failed", "CR's Name", req.Name)
return emptyResult, err
}

if err := updateConditions(v1alpha1.RemediationFinishedSuccessfully, &far.Status.Conditions, r.Log); err != nil {
if err := updateConditions(v1alpha1.RemediationFinishedSuccessfully, far, r.Log); err != nil {
return emptyResult, err
}
r.Log.Info("FenceAgentsRemediation CR has completed to remediate the node", "Node Name", req.Name)
Expand All @@ -247,27 +249,46 @@ func isTimedOutByNHC(far *v1alpha1.FenceAgentsRemediation) bool {
func (r *FenceAgentsRemediationReconciler) updateStatus(ctx context.Context, far *v1alpha1.FenceAgentsRemediation) error {
if err := r.Client.Status().Update(ctx, far); err != nil {
if !apiErrors.IsConflict(err) {
r.Log.Error(err, "failed to update far status")
r.Log.Error(err, "failed to update far status in case on a non conflict")
}
return err
}
// Wait until the cache is updated in order to prevent reading a stale status in the next reconcile
// and making wrong decisions based on it.
pollingTimeout := 5 * time.Second
pollErr := wait.PollUntilContextTimeout(ctx, 200*time.Millisecond, pollingTimeout, true, func(ctx context.Context) (bool, error) {
tmpFar := &v1alpha1.FenceAgentsRemediation{}
if err := r.Client.Get(ctx, client.ObjectKeyFromObject(far), tmpFar); err != nil {
if apiErrors.IsNotFound(err) {
// nothing we can do anymore
return true, nil
}
return false, nil
}
return tmpFar.Status.LastUpdateTime != nil && (tmpFar.Status.LastUpdateTime.Equal(far.Status.LastUpdateTime) || tmpFar.Status.LastUpdateTime.After(far.Status.LastUpdateTime.Time)), nil
})
if pollErr != nil {
return fmt.Errorf("failed to wait for updated cache to be updated in status update after %f seconds of timeout - %w", pollingTimeout.Seconds(), pollErr)
}
return nil
}

// updateConditions updates the status conditions of a FenceAgentsRemediation object based on the provided ConditionsChangeReason.
// return an error if an unknown ConditionsChangeReason is provided
func updateConditions(reason v1alpha1.ConditionsChangeReason, currentConditions *[]metav1.Condition, log logr.Logger) error {
func updateConditions(reason v1alpha1.ConditionsChangeReason, far *v1alpha1.FenceAgentsRemediation, log logr.Logger) error {

var (
processingConditionStatus, fenceAgentActionSucceededConditionStatus, succeededConditionStatus metav1.ConditionStatus
conditionMessage string
)
currentConditions := &far.Status.Conditions
conditionHasBeenChanged := false

// RemediationFinishedNodeNotFound and RemediationInterruptedByNHC reasons can happen at any time the Reconcile runs
// Except these two reasons, there are another three reasons that can only happen one after another
// RemediationStarted will always be the first reason (out of these three)
// FenceAgentSucceeded can only happen after RemediationStarted happened
// RemediationFinishedSuccessfully can only happen after FenceAgentSucceeded happened

switch reason {
case v1alpha1.RemediationFinishedNodeNotFound, v1alpha1.RemediationInterruptedByNHC:
processingConditionStatus = metav1.ConditionFalse
Expand Down Expand Up @@ -302,7 +323,6 @@ func updateConditions(reason v1alpha1.ConditionsChangeReason, currentConditions
return err
}

log.Info("updating Status Condition", "processingConditionStatus", processingConditionStatus, "fenceAgentActionSucceededConditionStatus", fenceAgentActionSucceededConditionStatus, "succededConditionStatus", succeededConditionStatus, "reason", string(reason))
// if the requested Status.Conditions.Processing is different then the current one, then update Status.Conditions.Processing value
if processingConditionStatus != "" && !meta.IsStatusConditionPresentAndEqual(*currentConditions, commonConditions.ProcessingType, processingConditionStatus) {
meta.SetStatusCondition(currentConditions, metav1.Condition{
Expand All @@ -311,6 +331,7 @@ func updateConditions(reason v1alpha1.ConditionsChangeReason, currentConditions
Reason: string(reason),
Message: conditionMessage,
})
conditionHasBeenChanged = true
}

// if the requested Status.Conditions.FenceAgentActionSucceeded is different then the current one, then update Status.Conditions.FenceAgentActionSucceeded value
Expand All @@ -321,6 +342,7 @@ func updateConditions(reason v1alpha1.ConditionsChangeReason, currentConditions
Reason: string(reason),
Message: conditionMessage,
})
conditionHasBeenChanged = true
}

// if the requested Status.Conditions.Succeeded is different then the current one, then update Status.Conditions.Succeeded value
Expand All @@ -331,7 +353,14 @@ func updateConditions(reason v1alpha1.ConditionsChangeReason, currentConditions
Reason: string(reason),
Message: conditionMessage,
})
conditionHasBeenChanged = true
}
// Only update lastUpdate when there were other changes
if conditionHasBeenChanged {
now := metav1.Now()
far.Status.LastUpdateTime = &now
}
log.Info("Updating Status Condition", "processingConditionStatus", processingConditionStatus, "fenceAgentActionSucceededConditionStatus", fenceAgentActionSucceededConditionStatus, "succededConditionStatus", succeededConditionStatus, "reason", string(reason), "LastUpdateTime", far.Status.LastUpdateTime)

return nil
}
Expand Down
17 changes: 10 additions & 7 deletions controllers/fenceagentsremediation_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,9 @@ const (
vaName2 = "va-test-2"

// intervals
timeoutDeletion = 10 * time.Second // this timeout is used after all the other steps have finished successfully
pollInterval = 250 * time.Millisecond
timeoutDeletion = 2 * time.Second // this timeout is used after all the other steps have finished successfully
timeoutFinalizer = 1 * time.Second
pollInterval = 250 * time.Millisecond
)

var (
Expand Down Expand Up @@ -155,7 +156,7 @@ var _ = Describe("FAR Controller", func() {
g.Expect(k8sClient.Get(context.Background(), farNamespacedName, underTestFAR)).To(Succeed())
res, _ := cliCommandsEquality(underTestFAR)
return utils.TaintExists(node.Spec.Taints, &farNoExecuteTaint) && res
}, 100*time.Millisecond, 10*time.Millisecond).Should(BeTrue(), "taint should be added, and command format is correct")
}, timeoutFinalizer, pollInterval).Should(BeTrue(), "taint should be added, and command format is correct")

// If taint was added, then definitely the finalizer was added as well
By("Having a finalizer if we have a remediation taint")
Expand All @@ -166,7 +167,8 @@ var _ = Describe("FAR Controller", func() {
testVADeletion(vaName2, resourceDeletionWasTriggered)
testPodDeletion(testPodName, resourceDeletionWasTriggered)

By("Having Succeeded, FenceAgentActionSucceeded conditions set to true, and Processing set to false")
By("Verifying correct conditions for successfull remediation")
Expect(underTestFAR.Status.LastUpdateTime).ToNot(BeNil())
verifyStatusCondition(workerNode, commonConditions.ProcessingType, conditionStatusPointer(metav1.ConditionFalse))
verifyStatusCondition(workerNode, v1alpha1.FenceAgentActionSucceededType, conditionStatusPointer(metav1.ConditionTrue))
verifyStatusCondition(workerNode, commonConditions.SucceededType, conditionStatusPointer(metav1.ConditionTrue))
Expand All @@ -184,10 +186,10 @@ var _ = Describe("FAR Controller", func() {

By("Not having finalizer")
farNamespacedName.Name = underTestFAR.Name
Eventually(func(g Gomega) bool {
Consistently(func(g Gomega) bool {
g.Expect(k8sClient.Get(context.Background(), farNamespacedName, underTestFAR)).To(Succeed())
return controllerutil.ContainsFinalizer(underTestFAR, v1alpha1.FARFinalizer)
}, 100*time.Millisecond, 10*time.Millisecond).Should(BeFalse(), "finalizer shouldn't be added")
}, timeoutFinalizer, pollInterval).Should(BeFalse(), "finalizer shouldn't be added")

// If finalizer is missing, then a taint shouldn't be existed
By("Not having remediation taint")
Expand All @@ -199,7 +201,8 @@ var _ = Describe("FAR Controller", func() {
testVADeletion(vaName2, resourceDeletionWasTriggered)
testPodDeletion(testPodName, resourceDeletionWasTriggered)

By("Having all three conditions set to false")
By("Verifying correct conditions for unsuccessfull remediation")
Expect(underTestFAR.Status.LastUpdateTime).ToNot(BeNil())
verifyStatusCondition(dummyNode, commonConditions.ProcessingType, conditionStatusPointer(metav1.ConditionFalse))
verifyStatusCondition(dummyNode, v1alpha1.FenceAgentActionSucceededType, conditionStatusPointer(metav1.ConditionFalse))
verifyStatusCondition(dummyNode, commonConditions.SucceededType, conditionStatusPointer(metav1.ConditionFalse))
Expand Down

0 comments on commit 3e0294e

Please sign in to comment.