Retry completion status patch for backup and restore resources
Signed-off-by: Tiger Kaovilai <[email protected]>

update to design vmware-tanzu#8063

Signed-off-by: Tiger Kaovilai <[email protected]>
kaovilai committed Sep 10, 2024
1 parent 46801a0 commit b9b7504
Showing 15 changed files with 582 additions and 16 deletions.
1 change: 1 addition & 0 deletions changelogs/unreleased/7845-kaovilai
@@ -0,0 +1 @@
Retry backup/restore completion/finalizing status patching to unstick InProgress backups/restores
32 changes: 32 additions & 0 deletions changelogs/unreleased/8068-kaovilai
@@ -0,0 +1,32 @@
Add retries with a timeout to the existing patch calls that move a backup/restore from InProgress/Finalizing to a final status phase (see the sketch after the list below).

- backup_controller.go
  - Should already reconcile again:
    - From `""` or `New` to `FailedValidation`/`InProgress`
  - Retry added:
    - `InProgress` to
      - `WaitingForPluginOperations`
      - `WaitingForPluginOperationsPartiallyFailed`
      - `Finalizing`
      - `FinalizingPartiallyFailed`
      - `Failed`
- backup_operations_controller.go
  - Should already reconcile again:
    - `WaitingForPluginOperations`/`WaitingForPluginOperationsPartiallyFailed` to `*`
- backup_finalizer_controller.go
  - Retry added:
    - `Finalizing`/`FinalizingPartiallyFailed` to
      - `Completed`
      - `PartiallyFailed`
- restore_controller.go
  - Retry added:
    - `InProgress` to
      - `WaitingForPluginOperations`
      - `WaitingForPluginOperationsPartiallyFailed`
      - `Finalizing`
      - `FinalizingPartiallyFailed`
- restore_finalizer_controller.go
  - Retry added:
    - `Finalizing`/`FinalizingPartiallyFailed` to
      - `Completed`
      - `PartiallyFailed`
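
A compact sketch of the pattern these retries introduce; the wrapper function below is illustrative only and is not part of the diff:

```go
package controller

import (
	"context"
	"time"

	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
	"github.com/vmware-tanzu/velero/pkg/client"
	kbclient "sigs.k8s.io/controller-runtime/pkg/client"
)

// patchFinalPhase is illustrative only: it shows how a controller's
// final-status patch is wrapped in the new retry helper so that a transient
// API error cannot leave a backup stuck in InProgress or Finalizing.
func patchFinalPhase(ctx context.Context, kbClient kbclient.Client, backup *velerov1api.Backup, resourceTimeout time.Duration) error {
	original := backup.DeepCopy() // snapshot before mutating, used as the patch base
	backup.Status.Phase = velerov1api.BackupPhaseCompleted
	return client.RetryOnErrorMaxBackOff(resourceTimeout, func() error {
		return kbClient.Patch(ctx, backup, kbclient.MergeFrom(original))
	})
}
```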
4 changes: 4 additions & 0 deletions design/retry-patching-configuration_design.md
@@ -37,6 +37,10 @@ and from above non final phases to
- Completed
- PartiallyFailed

Once a backup/restore is in one of the following phases, it will already be reconciled again periodically and does not need an additional retry:
- WaitingForPluginOperations
- WaitingForPluginOperationsPartiallyFailed

## Detailed Design
Relevant reconcilers will have `resourceTimeout time.Duration` added to their structs and to the parameters of the New[Backup|Restore]XReconciler functions.
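
A minimal sketch of that pattern for one reconciler; the real structs and constructors carry many more fields and parameters, as the diffs below show:

```go
package controller

import "time"

// Sketch only: the design threads a resourceTimeout through each relevant
// reconciler; all other fields and constructor parameters are omitted here.
type backupFinalizerReconciler struct {
	resourceTimeout time.Duration
}

func NewBackupFinalizerReconciler(resourceTimeout time.Duration) *backupFinalizerReconciler {
	return &backupFinalizerReconciler{resourceTimeout: resourceTimeout}
}
```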

27 changes: 27 additions & 0 deletions pkg/client/retry.go
@@ -18,8 +18,11 @@ package client

import (
"context"
"math"
"time"

apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/util/retry"
kbclient "sigs.k8s.io/controller-runtime/pkg/client"
)
@@ -36,3 +39,27 @@ func CreateRetryGenerateName(client kbclient.Client, ctx context.Context, obj kb
return client.Create(ctx, obj, &kbclient.CreateOptions{})
}
}

// CapBackoff provides a backoff with a set backoff cap
func CapBackoff(cap time.Duration) wait.Backoff {
if cap < 0 {
cap = 0
}
return wait.Backoff{
Steps: math.MaxInt,
Duration: 10 * time.Millisecond,
Cap: cap,
Factor: retry.DefaultBackoff.Factor,
Jitter: retry.DefaultBackoff.Jitter,
}
}

// RetryOnRetriableMaxBackOff retries the provided function, with exponential backoff capped at maxDuration, as long as the retriable function returns true for the returned error.
func RetryOnRetriableMaxBackOff(maxDuration time.Duration, fn func() error, retriable func(error) bool) error {
return retry.OnError(CapBackoff(maxDuration), func(err error) bool { return retriable(err) }, fn)
}

// RetryOnErrorMaxBackOff retries the provided function, with exponential backoff capped at maxDuration, as long as it returns a non-nil error.
func RetryOnErrorMaxBackOff(maxDuration time.Duration, fn func() error) error {
return RetryOnRetriableMaxBackOff(maxDuration, fn, func(err error) bool { return err != nil })
}
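
A minimal usage sketch for the helpers above, as a standalone program; the transient error and attempt counting are illustrative:

```go
package main

import (
	"errors"
	"fmt"
	"time"

	"github.com/vmware-tanzu/velero/pkg/client"
)

func main() {
	attempts := 0
	// Retry the function with exponential backoff starting at 10ms; retries
	// stop once it succeeds or the backoff delay reaches the one-second cap.
	err := client.RetryOnErrorMaxBackOff(time.Second, func() error {
		attempts++
		if attempts < 3 {
			return errors.New("transient API error")
		}
		return nil
	})
	fmt.Printf("attempts=%d err=%v\n", attempts, err)
}
```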
2 changes: 2 additions & 0 deletions pkg/cmd/server/server.go
@@ -689,6 +689,7 @@ func (s *server) runControllers(defaultVolumeSnapshotLocations map[string]string
backupStoreGetter,
s.logger,
s.metrics,
s.config.ResourceTimeout,
)
if err := r.SetupWithManager(s.mgr); err != nil {
s.logger.Fatal(err, "unable to create controller", "controller", constant.ControllerBackupFinalizer)
@@ -814,6 +815,7 @@ func (s *server) runControllers(defaultVolumeSnapshotLocations map[string]string
s.config.DefaultItemOperationTimeout,
s.config.DisableInformerCache,
s.crClient,
s.config.ResourceTimeout,
)

if err = r.SetupWithManager(s.mgr); err != nil {
18 changes: 14 additions & 4 deletions pkg/controller/backup_controller.go
@@ -247,7 +247,10 @@ func (b *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
request.Status.StartTimestamp = &metav1.Time{Time: b.clock.Now()}
}

// update status
// update status to
// BackupPhaseFailedValidation
// BackupPhaseInProgress
// if the patch fails, the backup will be reconciled again since the phase would still be "" or New
if err := kubeutil.PatchResource(original, request.Backup, b.kbClient); err != nil {
return ctrl.Result{}, errors.Wrapf(err, "error updating Backup status to %s", request.Status.Phase)
}
@@ -304,9 +307,16 @@ func (b *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
b.metrics.RegisterBackupValidationFailure(backupScheduleName)
b.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusFailure)
}
log.Info("Updating backup's final status")
if err := kubeutil.PatchResource(original, request.Backup, b.kbClient); err != nil {
log.WithError(err).Error("error updating backup's final status")
log.Info("Updating backup's status")
// Phases were updated in runBackup()
// This patch, with retries, updates the phase from InProgress to one of:
// BackupPhaseWaitingForPluginOperations -> backup_operations_controller.go will now reconcile
// BackupPhaseWaitingForPluginOperationsPartiallyFailed -> backup_operations_controller.go will now reconcile
// BackupPhaseFinalizing -> backup_finalizer_controller.go will now reconcile
// BackupPhaseFinalizingPartiallyFailed -> backup_finalizer_controller.go will now reconcile
// BackupPhaseFailed
if err := kubeutil.PatchResourceWithRetriesOnErrors(b.resourceTimeout, original, request.Backup, b.kbClient); err != nil {
log.WithError(err).Errorf("error updating backup's status from %v to %v", original.Status.Phase, request.Backup.Status.Phase)
}
return ctrl.Result{}, nil
}
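
The controller above now calls `kubeutil.PatchResourceWithRetriesOnErrors`, whose implementation lives in one of the changed files not rendered on this page. A plausible sketch, assuming it simply wraps the existing `PatchResource` helper in the new `RetryOnErrorMaxBackOff`, would be:

```go
package kube

import (
	"context"
	"time"

	"github.com/vmware-tanzu/velero/pkg/client"
	kbclient "sigs.k8s.io/controller-runtime/pkg/client"
)

// PatchResource is a simplified stand-in for the existing kubeutil helper:
// it applies `updated` over `original` as a merge patch.
func PatchResource(original, updated kbclient.Object, kbClient kbclient.Client) error {
	return kbClient.Patch(context.Background(), updated, kbclient.MergeFrom(original))
}

// PatchResourceWithRetriesOnErrors (assumed shape) retries the patch until it
// succeeds or the backoff capped at maxDuration gives up, matching the call
// sites in the controllers on this page.
func PatchResourceWithRetriesOnErrors(maxDuration time.Duration, original, updated kbclient.Object, kbClient kbclient.Client) error {
	return client.RetryOnErrorMaxBackOff(maxDuration, func() error {
		return PatchResource(original, updated, kbClient)
	})
}
```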
10 changes: 9 additions & 1 deletion pkg/controller/backup_finalizer_controller.go
@@ -20,6 +20,7 @@ import (
"bytes"
"context"
"os"
"time"

"github.com/pkg/errors"
"github.com/sirupsen/logrus"
@@ -31,6 +32,7 @@

velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
pkgbackup "github.com/vmware-tanzu/velero/pkg/backup"
"github.com/vmware-tanzu/velero/pkg/client"
"github.com/vmware-tanzu/velero/pkg/itemoperation"
"github.com/vmware-tanzu/velero/pkg/kuberesource"
"github.com/vmware-tanzu/velero/pkg/metrics"
@@ -51,6 +53,7 @@ type backupFinalizerReconciler struct {
metrics *metrics.ServerMetrics
backupStoreGetter persistence.ObjectBackupStoreGetter
log logrus.FieldLogger
resourceTimeout time.Duration
}

// NewBackupFinalizerReconciler initializes and returns backupFinalizerReconciler struct.
@@ -64,6 +67,7 @@ func NewBackupFinalizerReconciler(
backupStoreGetter persistence.ObjectBackupStoreGetter,
log logrus.FieldLogger,
metrics *metrics.ServerMetrics,
resourceTimeout time.Duration,
) *backupFinalizerReconciler {
return &backupFinalizerReconciler{
client: client,
@@ -119,7 +123,11 @@ func (r *backupFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
r.backupTracker.Delete(backup.Namespace, backup.Name)
}
// Always attempt to Patch the backup object and status after each reconciliation.
if err := r.client.Patch(ctx, backup, kbclient.MergeFrom(original)); err != nil {
//
// if this patch fails, there may not be another opportunity to update the backup object without an external update event,
// so we retry.
// This retries updating Finalizing/FinalizingPartiallyFailed to Completed/PartiallyFailed.
if err := client.RetryOnErrorMaxBackOff(r.resourceTimeout, func() error { return r.client.Patch(ctx, backup, kbclient.MergeFrom(original)) }); err != nil {
log.WithError(err).Error("Error updating backup")
return
}
1 change: 1 addition & 0 deletions pkg/controller/backup_finalizer_controller_test.go
@@ -58,6 +58,7 @@ func mockBackupFinalizerReconciler(fakeClient kbclient.Client, fakeGlobalClient
NewFakeSingleObjectBackupStoreGetter(backupStore),
logrus.StandardLogger(),
metrics.NewServerMetrics(),
10*time.Minute,
), backupper
}
func TestBackupFinalizerReconcile(t *testing.T) {
19 changes: 14 additions & 5 deletions pkg/controller/restore_controller.go
@@ -109,6 +109,7 @@ type restoreReconciler struct {
newPluginManager func(logger logrus.FieldLogger) clientmgmt.Manager
backupStoreGetter persistence.ObjectBackupStoreGetter
globalCrClient client.Client
resourceTimeout time.Duration
}

type backupInfo struct {
@@ -130,6 +131,7 @@ func NewRestoreReconciler(
defaultItemOperationTimeout time.Duration,
disableInformerCache bool,
globalCrClient client.Client,
resourceTimeout time.Duration,
) *restoreReconciler {
r := &restoreReconciler{
ctx: ctx,
@@ -149,7 +151,8 @@
newPluginManager: newPluginManager,
backupStoreGetter: backupStoreGetter,

globalCrClient: globalCrClient,
globalCrClient: globalCrClient,
resourceTimeout: resourceTimeout,
}

// Move the periodical backup and restore metrics computing logic from controllers to here.
@@ -246,6 +249,7 @@ func (r *restoreReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
}

// patch to update status and persist to API
// This patches from "" or New; no retry is needed
err = kubeutil.PatchResource(original, restore, r.kbClient)
if err != nil {
// return the error so the restore can be re-processed; it's currently
@@ -274,10 +278,15 @@ func (r *restoreReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
restore.Status.Phase == api.RestorePhaseCompleted {
restore.Status.CompletionTimestamp = &metav1.Time{Time: r.clock.Now()}
}
log.Debug("Updating restore's final status")

if err = kubeutil.PatchResource(original, restore, r.kbClient); err != nil {
log.WithError(errors.WithStack(err)).Info("Error updating restore's final status")
log.Debug("Updating restore's status")
// Phases were updated in runValidatedRestore
// This patch, with retries, updates the phase from InProgress to one of:
// WaitingForPluginOperations
// WaitingForPluginOperationsPartiallyFailed
// Finalizing
// FinalizingPartiallyFailed
if err = kubeutil.PatchResourceWithRetriesOnErrors(r.resourceTimeout, original, restore, r.kbClient); err != nil {
log.WithError(errors.WithStack(err)).Infof("Error updating restore's status from %v to %v", original.Status.Phase, restore.Status.Phase)
// No need to re-enqueue here, because restore's already set to InProgress before.
// Controller only handle New restore.
}
5 changes: 5 additions & 0 deletions pkg/controller/restore_controller_test.go
@@ -116,6 +116,7 @@ func TestFetchBackupInfo(t *testing.T) {
60*time.Minute,
false,
fakeGlobalClient,
10*time.Minute,
)

if test.backupStoreError == nil {
Expand Down Expand Up @@ -196,6 +197,7 @@ func TestProcessQueueItemSkips(t *testing.T) {
60*time.Minute,
false,
fakeGlobalClient,
10*time.Minute,
)

_, err := r.Reconcile(context.Background(), ctrl.Request{NamespacedName: types.NamespacedName{
@@ -498,6 +500,7 @@ func TestRestoreReconcile(t *testing.T) {
60*time.Minute,
false,
fakeGlobalClient,
10*time.Minute,
)

r.clock = clocktesting.NewFakeClock(now)
@@ -681,6 +684,7 @@ func TestValidateAndCompleteWhenScheduleNameSpecified(t *testing.T) {
60*time.Minute,
false,
fakeGlobalClient,
10*time.Minute,
)

restore := &velerov1api.Restore{
@@ -776,6 +780,7 @@ func TestValidateAndCompleteWithResourceModifierSpecified(t *testing.T) {
60*time.Minute,
false,
fakeGlobalClient,
10*time.Minute,
)

restore := &velerov1api.Restore{
6 changes: 4 additions & 2 deletions pkg/controller/restore_finalizer_controller.go
@@ -233,8 +233,10 @@ func (r *restoreFinalizerReconciler) finishProcessing(restorePhase velerov1api.R
r.metrics.RegisterRestoreSuccess(restore.Spec.ScheduleName)
}
restore.Status.CompletionTimestamp = &metav1.Time{Time: r.clock.Now()}

return kubeutil.PatchResource(original, restore, r.Client)
// retry patching from `Finalizing`/`FinalizingPartiallyFailed` to
// - `Completed`
// - `PartiallyFailed`
return kubeutil.PatchResourceWithRetriesOnErrors(r.resourceTimeout, original, restore, r.Client)
}

// finalizerContext includes all the dependencies required by finalization tasks and