From 78890e711d0f78c92703439377cc0ce9baea2902 Mon Sep 17 00:00:00 2001 From: Cem Mergenci Date: Wed, 21 Aug 2024 17:08:11 +0300 Subject: [PATCH 1/2] Recover from panics in async external clients. Signed-off-by: Cem Mergenci --- pkg/controller/external_async_tfpluginfw.go | 95 +++++++++++++++------ 1 file changed, 68 insertions(+), 27 deletions(-) diff --git a/pkg/controller/external_async_tfpluginfw.go b/pkg/controller/external_async_tfpluginfw.go index a90f6787..a01bd248 100644 --- a/pkg/controller/external_async_tfpluginfw.go +++ b/pkg/controller/external_async_tfpluginfw.go @@ -6,6 +6,7 @@ package controller import ( "context" + "fmt" xpv1 "github.com/crossplane/crossplane-runtime/apis/common/v1" "github.com/crossplane/crossplane-runtime/pkg/logging" @@ -13,6 +14,7 @@ import ( "github.com/crossplane/crossplane-runtime/pkg/reconciler/managed" xpresource "github.com/crossplane/crossplane-runtime/pkg/resource" "github.com/pkg/errors" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" "sigs.k8s.io/controller-runtime/pkg/client" "github.com/crossplane/upjet/pkg/config" @@ -131,6 +133,34 @@ func (n *terraformPluginFrameworkAsyncExternalClient) Observe(ctx context.Contex return o, err } +// recoverIfPanic recovers from panics, if any. On recovery, API +// machinery panic handlers are run first. Then, the custom panic +// handler given as argument is called with the panic message. The +// implementation follows the outline of panic recovery mechanism in +// controller-runtime: +// https://github.com/kubernetes-sigs/controller-runtime/blob/v0.17.3/pkg/internal/controller/controller.go#L105-L112 +func recoverIfPanic(panicHandler func(panicErr error)) { + if r := recover(); r != nil { + for _, fn := range utilruntime.PanicHandlers { + fn(r) + } + + err := fmt.Errorf("panic: %v [recovered]", r) + panicHandler(err) + } +} + +func (n *terraformPluginFrameworkAsyncExternalClient) finishCreate(ctx context.Context, mg xpresource.Managed, errInCreate error) { + err := tferrors.NewAsyncCreateFailed(errInCreate) + n.opTracker.LastOperation.SetError(err) + n.opTracker.logger.Debug("Async create ended.", "error", err) + + n.opTracker.LastOperation.MarkEnd() + if cErr := n.callback.Create(mg.GetName())(err, ctx); cErr != nil { + n.opTracker.logger.Info("Async create callback failed", "error", cErr.Error()) + } +} + func (n *terraformPluginFrameworkAsyncExternalClient) Create(_ context.Context, mg xpresource.Managed) (managed.ExternalCreation, error) { if !n.opTracker.LastOperation.MarkStart("create") { return managed.ExternalCreation{}, errors.Errorf("%s operation that started at %s is still running", n.opTracker.LastOperation.Type, n.opTracker.LastOperation.StartTime().String()) @@ -138,23 +168,30 @@ func (n *terraformPluginFrameworkAsyncExternalClient) Create(_ context.Context, ctx, cancel := context.WithDeadline(context.Background(), n.opTracker.LastOperation.StartTime().Add(defaultAsyncTimeout)) go func() { - defer cancel() + defer cancel() // Cancel the context after panic recovery, because the context is used in the custom panic handler below. + defer recoverIfPanic(func(panicErr error) { + n.finishCreate(ctx, mg, panicErr) + }) n.opTracker.logger.Debug("Async create starting...") _, err := n.terraformPluginFrameworkExternalClient.Create(ctx, mg) - err = tferrors.NewAsyncCreateFailed(err) - n.opTracker.LastOperation.SetError(err) - n.opTracker.logger.Debug("Async create ended.", "error", err) - - n.opTracker.LastOperation.MarkEnd() - if cErr := n.callback.Create(mg.GetName())(err, ctx); cErr != nil { - n.opTracker.logger.Info("Async create callback failed", "error", cErr.Error()) - } + n.finishCreate(ctx, mg, err) }() return managed.ExternalCreation{}, n.opTracker.LastOperation.Error() } +func (n *terraformPluginFrameworkAsyncExternalClient) finishUpdate(ctx context.Context, mg xpresource.Managed, errInUpdate error) { + err := tferrors.NewAsyncUpdateFailed(errInUpdate) + n.opTracker.LastOperation.SetError(err) + n.opTracker.logger.Debug("Async update ended.", "error", err) + + n.opTracker.LastOperation.MarkEnd() + if cErr := n.callback.Update(mg.GetName())(err, ctx); cErr != nil { + n.opTracker.logger.Info("Async update callback failed", "error", cErr.Error()) + } +} + func (n *terraformPluginFrameworkAsyncExternalClient) Update(_ context.Context, mg xpresource.Managed) (managed.ExternalUpdate, error) { if !n.opTracker.LastOperation.MarkStart("update") { return managed.ExternalUpdate{}, errors.Errorf("%s operation that started at %s is still running", n.opTracker.LastOperation.Type, n.opTracker.LastOperation.StartTime().String()) @@ -162,23 +199,30 @@ func (n *terraformPluginFrameworkAsyncExternalClient) Update(_ context.Context, ctx, cancel := context.WithDeadline(context.Background(), n.opTracker.LastOperation.StartTime().Add(defaultAsyncTimeout)) go func() { - defer cancel() + defer cancel() // Cancel the context after panic recovery, because the context is used in the custom panic handler below. + defer recoverIfPanic(func(panicErr error) { + n.finishUpdate(ctx, mg, panicErr) + }) n.opTracker.logger.Debug("Async update starting...") _, err := n.terraformPluginFrameworkExternalClient.Update(ctx, mg) - err = tferrors.NewAsyncUpdateFailed(err) - n.opTracker.LastOperation.SetError(err) - n.opTracker.logger.Debug("Async update ended.", "error", err) - - n.opTracker.LastOperation.MarkEnd() - if cErr := n.callback.Update(mg.GetName())(err, ctx); cErr != nil { - n.opTracker.logger.Info("Async update callback failed", "error", cErr.Error()) - } + n.finishUpdate(ctx, mg, err) }() return managed.ExternalUpdate{}, n.opTracker.LastOperation.Error() } +func (n *terraformPluginFrameworkAsyncExternalClient) finishDelete(ctx context.Context, mg xpresource.Managed, errInDelete error) { + err := tferrors.NewAsyncDeleteFailed(errInDelete) + n.opTracker.LastOperation.SetError(err) + n.opTracker.logger.Debug("Async delete ended.", "error", err) + + n.opTracker.LastOperation.MarkEnd() + if cErr := n.callback.Destroy(mg.GetName())(err, ctx); cErr != nil { + n.opTracker.logger.Info("Async delete callback failed", "error", cErr.Error()) + } +} + func (n *terraformPluginFrameworkAsyncExternalClient) Delete(_ context.Context, mg xpresource.Managed) error { switch { case n.opTracker.LastOperation.Type == "delete": @@ -190,17 +234,14 @@ func (n *terraformPluginFrameworkAsyncExternalClient) Delete(_ context.Context, ctx, cancel := context.WithDeadline(context.Background(), n.opTracker.LastOperation.StartTime().Add(defaultAsyncTimeout)) go func() { - defer cancel() + defer cancel() // Cancel the context after panic recovery, because the context is used in the custom panic handler below. + defer recoverIfPanic(func(panicErr error) { + n.finishDelete(ctx, mg, panicErr) + }) n.opTracker.logger.Debug("Async delete starting...") - err := tferrors.NewAsyncDeleteFailed(n.terraformPluginFrameworkExternalClient.Delete(ctx, mg)) - n.opTracker.LastOperation.SetError(err) - n.opTracker.logger.Debug("Async delete ended.", "error", err) - - n.opTracker.LastOperation.MarkEnd() - if cErr := n.callback.Destroy(mg.GetName())(err, ctx); cErr != nil { - n.opTracker.logger.Info("Async delete callback failed", "error", cErr.Error()) - } + err := n.terraformPluginFrameworkExternalClient.Delete(ctx, mg) + n.finishDelete(ctx, mg, err) }() return n.opTracker.LastOperation.Error() From 3dc4f0f69c2ef6203efdcea68509b5e38c46213c Mon Sep 17 00:00:00 2001 From: Cem Mergenci Date: Thu, 22 Aug 2024 01:49:06 +0300 Subject: [PATCH 2/2] Streamline async panic handler implementation. Signed-off-by: Cem Mergenci --- pkg/controller/external_async_tfpluginfw.go | 130 ++++++++++--------- pkg/controller/external_async_tfpluginsdk.go | 80 ++++++++---- 2 files changed, 126 insertions(+), 84 deletions(-) diff --git a/pkg/controller/external_async_tfpluginfw.go b/pkg/controller/external_async_tfpluginfw.go index a01bd248..44845c67 100644 --- a/pkg/controller/external_async_tfpluginfw.go +++ b/pkg/controller/external_async_tfpluginfw.go @@ -133,31 +133,28 @@ func (n *terraformPluginFrameworkAsyncExternalClient) Observe(ctx context.Contex return o, err } -// recoverIfPanic recovers from panics, if any. On recovery, API -// machinery panic handlers are run first. Then, the custom panic -// handler given as argument is called with the panic message. The -// implementation follows the outline of panic recovery mechanism in +// panicHandler wraps an error, so that deferred functions that will +// be executed on a panic can access the error more conveniently. +type panicHandler struct { + err error +} + +// recoverIfPanic recovers from panics, if any. Calls to this function +// should be defferred directly: `defer ph.recoverIfPanic()`. Panic +// recovery won't work if the call is wrapped in another function +// call, such as `defer func() { ph.recoverIfPanic() }()`. On +// recovery, API machinery panic handlers run. The implementation +// follows the outline of panic recovery mechanism in // controller-runtime: // https://github.com/kubernetes-sigs/controller-runtime/blob/v0.17.3/pkg/internal/controller/controller.go#L105-L112 -func recoverIfPanic(panicHandler func(panicErr error)) { +func (ph *panicHandler) recoverIfPanic() { + ph.err = nil if r := recover(); r != nil { for _, fn := range utilruntime.PanicHandlers { fn(r) } - err := fmt.Errorf("panic: %v [recovered]", r) - panicHandler(err) - } -} - -func (n *terraformPluginFrameworkAsyncExternalClient) finishCreate(ctx context.Context, mg xpresource.Managed, errInCreate error) { - err := tferrors.NewAsyncCreateFailed(errInCreate) - n.opTracker.LastOperation.SetError(err) - n.opTracker.logger.Debug("Async create ended.", "error", err) - - n.opTracker.LastOperation.MarkEnd() - if cErr := n.callback.Create(mg.GetName())(err, ctx); cErr != nil { - n.opTracker.logger.Info("Async create callback failed", "error", cErr.Error()) + ph.err = fmt.Errorf("recovered from panic: %v", r) } } @@ -168,30 +165,32 @@ func (n *terraformPluginFrameworkAsyncExternalClient) Create(_ context.Context, ctx, cancel := context.WithDeadline(context.Background(), n.opTracker.LastOperation.StartTime().Add(defaultAsyncTimeout)) go func() { - defer cancel() // Cancel the context after panic recovery, because the context is used in the custom panic handler below. - defer recoverIfPanic(func(panicErr error) { - n.finishCreate(ctx, mg, panicErr) - }) + // The order of deferred functions, executed last-in-first-out, is + // significant. The context should be canceled last, because it is + // used by the finishing operations. Panic recovery should execute + // first, because the finishing operations report the panic error, + // if any. + var ph panicHandler + defer cancel() + defer func() { // Finishing operations + err := tferrors.NewAsyncCreateFailed(ph.err) + n.opTracker.LastOperation.SetError(err) + n.opTracker.logger.Debug("Async create ended.", "error", err) + + n.opTracker.LastOperation.MarkEnd() + if cErr := n.callback.Create(mg.GetName())(err, ctx); cErr != nil { + n.opTracker.logger.Info("Async create callback failed", "error", cErr.Error()) + } + }() + defer ph.recoverIfPanic() n.opTracker.logger.Debug("Async create starting...") - _, err := n.terraformPluginFrameworkExternalClient.Create(ctx, mg) - n.finishCreate(ctx, mg, err) + _, ph.err = n.terraformPluginFrameworkExternalClient.Create(ctx, mg) }() return managed.ExternalCreation{}, n.opTracker.LastOperation.Error() } -func (n *terraformPluginFrameworkAsyncExternalClient) finishUpdate(ctx context.Context, mg xpresource.Managed, errInUpdate error) { - err := tferrors.NewAsyncUpdateFailed(errInUpdate) - n.opTracker.LastOperation.SetError(err) - n.opTracker.logger.Debug("Async update ended.", "error", err) - - n.opTracker.LastOperation.MarkEnd() - if cErr := n.callback.Update(mg.GetName())(err, ctx); cErr != nil { - n.opTracker.logger.Info("Async update callback failed", "error", cErr.Error()) - } -} - func (n *terraformPluginFrameworkAsyncExternalClient) Update(_ context.Context, mg xpresource.Managed) (managed.ExternalUpdate, error) { if !n.opTracker.LastOperation.MarkStart("update") { return managed.ExternalUpdate{}, errors.Errorf("%s operation that started at %s is still running", n.opTracker.LastOperation.Type, n.opTracker.LastOperation.StartTime().String()) @@ -199,30 +198,32 @@ func (n *terraformPluginFrameworkAsyncExternalClient) Update(_ context.Context, ctx, cancel := context.WithDeadline(context.Background(), n.opTracker.LastOperation.StartTime().Add(defaultAsyncTimeout)) go func() { - defer cancel() // Cancel the context after panic recovery, because the context is used in the custom panic handler below. - defer recoverIfPanic(func(panicErr error) { - n.finishUpdate(ctx, mg, panicErr) - }) + // The order of deferred functions, executed last-in-first-out, is + // significant. The context should be canceled last, because it is + // used by the finishing operations. Panic recovery should execute + // first, because the finishing operations report the panic error, + // if any. + var ph panicHandler + defer cancel() + defer func() { // Finishing operations + err := tferrors.NewAsyncUpdateFailed(ph.err) + n.opTracker.LastOperation.SetError(err) + n.opTracker.logger.Debug("Async update ended.", "error", err) + + n.opTracker.LastOperation.MarkEnd() + if cErr := n.callback.Update(mg.GetName())(err, ctx); cErr != nil { + n.opTracker.logger.Info("Async update callback failed", "error", cErr.Error()) + } + }() + defer ph.recoverIfPanic() n.opTracker.logger.Debug("Async update starting...") - _, err := n.terraformPluginFrameworkExternalClient.Update(ctx, mg) - n.finishUpdate(ctx, mg, err) + _, ph.err = n.terraformPluginFrameworkExternalClient.Update(ctx, mg) }() return managed.ExternalUpdate{}, n.opTracker.LastOperation.Error() } -func (n *terraformPluginFrameworkAsyncExternalClient) finishDelete(ctx context.Context, mg xpresource.Managed, errInDelete error) { - err := tferrors.NewAsyncDeleteFailed(errInDelete) - n.opTracker.LastOperation.SetError(err) - n.opTracker.logger.Debug("Async delete ended.", "error", err) - - n.opTracker.LastOperation.MarkEnd() - if cErr := n.callback.Destroy(mg.GetName())(err, ctx); cErr != nil { - n.opTracker.logger.Info("Async delete callback failed", "error", cErr.Error()) - } -} - func (n *terraformPluginFrameworkAsyncExternalClient) Delete(_ context.Context, mg xpresource.Managed) error { switch { case n.opTracker.LastOperation.Type == "delete": @@ -234,14 +235,27 @@ func (n *terraformPluginFrameworkAsyncExternalClient) Delete(_ context.Context, ctx, cancel := context.WithDeadline(context.Background(), n.opTracker.LastOperation.StartTime().Add(defaultAsyncTimeout)) go func() { - defer cancel() // Cancel the context after panic recovery, because the context is used in the custom panic handler below. - defer recoverIfPanic(func(panicErr error) { - n.finishDelete(ctx, mg, panicErr) - }) + // The order of deferred functions, executed last-in-first-out, is + // significant. The context should be canceled last, because it is + // used by the finishing operations. Panic recovery should execute + // first, because the finishing operations report the panic error, + // if any. + var ph panicHandler + defer cancel() + defer func() { // Finishing operations + err := tferrors.NewAsyncDeleteFailed(ph.err) + n.opTracker.LastOperation.SetError(err) + n.opTracker.logger.Debug("Async delete ended.", "error", err) + + n.opTracker.LastOperation.MarkEnd() + if cErr := n.callback.Destroy(mg.GetName())(err, ctx); cErr != nil { + n.opTracker.logger.Info("Async delete callback failed", "error", cErr.Error()) + } + }() + defer ph.recoverIfPanic() n.opTracker.logger.Debug("Async delete starting...") - err := n.terraformPluginFrameworkExternalClient.Delete(ctx, mg) - n.finishDelete(ctx, mg, err) + ph.err = n.terraformPluginFrameworkExternalClient.Delete(ctx, mg) }() return n.opTracker.LastOperation.Error() diff --git a/pkg/controller/external_async_tfpluginsdk.go b/pkg/controller/external_async_tfpluginsdk.go index 2071c1c3..6967b9dd 100644 --- a/pkg/controller/external_async_tfpluginsdk.go +++ b/pkg/controller/external_async_tfpluginsdk.go @@ -143,18 +143,27 @@ func (n *terraformPluginSDKAsyncExternal) Create(_ context.Context, mg xpresourc ctx, cancel := context.WithDeadline(context.Background(), n.opTracker.LastOperation.StartTime().Add(defaultAsyncTimeout)) go func() { + // The order of deferred functions, executed last-in-first-out, is + // significant. The context should be canceled last, because it is + // used by the finishing operations. Panic recovery should execute + // first, because the finishing operations report the panic error, + // if any. + var ph panicHandler defer cancel() + defer func() { // Finishing operations + err := tferrors.NewAsyncCreateFailed(ph.err) + n.opTracker.LastOperation.SetError(err) + n.opTracker.logger.Debug("Async create ended.", "error", err, "tfID", n.opTracker.GetTfID()) + + n.opTracker.LastOperation.MarkEnd() + if cErr := n.callback.Create(mg.GetName())(err, ctx); cErr != nil { + n.opTracker.logger.Info("Async create callback failed", "error", cErr.Error()) + } + }() + defer ph.recoverIfPanic() n.opTracker.logger.Debug("Async create starting...", "tfID", n.opTracker.GetTfID()) - _, err := n.terraformPluginSDKExternal.Create(ctx, mg) - err = tferrors.NewAsyncCreateFailed(err) - n.opTracker.LastOperation.SetError(err) - n.opTracker.logger.Debug("Async create ended.", "error", err, "tfID", n.opTracker.GetTfID()) - - n.opTracker.LastOperation.MarkEnd() - if cErr := n.callback.Create(mg.GetName())(err, ctx); cErr != nil { - n.opTracker.logger.Info("Async create callback failed", "error", cErr.Error()) - } + _, ph.err = n.terraformPluginSDKExternal.Create(ctx, mg) }() return managed.ExternalCreation{}, n.opTracker.LastOperation.Error() @@ -167,18 +176,27 @@ func (n *terraformPluginSDKAsyncExternal) Update(_ context.Context, mg xpresourc ctx, cancel := context.WithDeadline(context.Background(), n.opTracker.LastOperation.StartTime().Add(defaultAsyncTimeout)) go func() { + // The order of deferred functions, executed last-in-first-out, is + // significant. The context should be canceled last, because it is + // used by the finishing operations. Panic recovery should execute + // first, because the finishing operations report the panic error, + // if any. + var ph panicHandler defer cancel() + defer func() { // Finishing operations + err := tferrors.NewAsyncUpdateFailed(ph.err) + n.opTracker.LastOperation.SetError(err) + n.opTracker.logger.Debug("Async update ended.", "error", err, "tfID", n.opTracker.GetTfID()) + + n.opTracker.LastOperation.MarkEnd() + if cErr := n.callback.Update(mg.GetName())(err, ctx); cErr != nil { + n.opTracker.logger.Info("Async update callback failed", "error", cErr.Error()) + } + }() + defer ph.recoverIfPanic() n.opTracker.logger.Debug("Async update starting...", "tfID", n.opTracker.GetTfID()) - _, err := n.terraformPluginSDKExternal.Update(ctx, mg) - err = tferrors.NewAsyncUpdateFailed(err) - n.opTracker.LastOperation.SetError(err) - n.opTracker.logger.Debug("Async update ended.", "error", err, "tfID", n.opTracker.GetTfID()) - - n.opTracker.LastOperation.MarkEnd() - if cErr := n.callback.Update(mg.GetName())(err, ctx); cErr != nil { - n.opTracker.logger.Info("Async update callback failed", "error", cErr.Error()) - } + _, ph.err = n.terraformPluginSDKExternal.Update(ctx, mg) }() return managed.ExternalUpdate{}, n.opTracker.LastOperation.Error() @@ -195,17 +213,27 @@ func (n *terraformPluginSDKAsyncExternal) Delete(_ context.Context, mg xpresourc ctx, cancel := context.WithDeadline(context.Background(), n.opTracker.LastOperation.StartTime().Add(defaultAsyncTimeout)) go func() { + // The order of deferred functions, executed last-in-first-out, is + // significant. The context should be canceled last, because it is + // used by the finishing operations. Panic recovery should execute + // first, because the finishing operations report the panic error, + // if any. + var ph panicHandler defer cancel() + defer func() { // Finishing operations + err := tferrors.NewAsyncDeleteFailed(ph.err) + n.opTracker.LastOperation.SetError(err) + n.opTracker.logger.Debug("Async delete ended.", "error", err, "tfID", n.opTracker.GetTfID()) + + n.opTracker.LastOperation.MarkEnd() + if cErr := n.callback.Destroy(mg.GetName())(err, ctx); cErr != nil { + n.opTracker.logger.Info("Async delete callback failed", "error", cErr.Error()) + } + }() + defer ph.recoverIfPanic() n.opTracker.logger.Debug("Async delete starting...", "tfID", n.opTracker.GetTfID()) - err := tferrors.NewAsyncDeleteFailed(n.terraformPluginSDKExternal.Delete(ctx, mg)) - n.opTracker.LastOperation.SetError(err) - n.opTracker.logger.Debug("Async delete ended.", "error", err, "tfID", n.opTracker.GetTfID()) - - n.opTracker.LastOperation.MarkEnd() - if cErr := n.callback.Destroy(mg.GetName())(err, ctx); cErr != nil { - n.opTracker.logger.Info("Async delete callback failed", "error", cErr.Error()) - } + ph.err = n.terraformPluginSDKExternal.Delete(ctx, mg) }() return n.opTracker.LastOperation.Error()