Skip to content

Commit

Permalink
Improve polling loop for readiness (#63)
Browse files Browse the repository at this point in the history
- Exit polling loop early if resource is in Failed state for several
  iterations. We have to use a failure threshold because the resource
  may appear as Failed immediately after a configuration change that
  should eventually return it to a healthy state, since reconciliation
  is asynchronous.
- Suppress network errors in polling loop.
- Collect Terraform logging.
  • Loading branch information
adriansuarez authored Sep 17, 2024
1 parent ebc23d7 commit 32ead32
Show file tree
Hide file tree
Showing 7 changed files with 126 additions and 9 deletions.
5 changes: 5 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ commands:
name: "Run tests"
command: |
make testacc
no_output_timeout: 30m
- run:
name: "Generate coverage report"
command: |
Expand Down Expand Up @@ -66,6 +67,8 @@ jobs:
executor: ubuntu_vm
environment:
NUODB_CP_VERSION: << parameters.nuodb-cp-version >>
TF_LOG: DEBUG
TF_LOG_PATH: /tmp/test-artifacts/tf-debug.log
steps:
- checkout
- run_test:
Expand All @@ -85,6 +88,8 @@ jobs:
NUODB_CP_IMAGE: nuodb/nuodb-control-plane:latest
USING_LATEST_API: "true"
USE_TOFU: "<< parameters.use-tofu >>"
TF_LOG: DEBUG
TF_LOG_PATH: /tmp/test-artifacts/tf-debug.log
steps:
- checkout
- add_ssh_keys:
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/nuodb/terraform-provider-nuodbaas
go 1.19

require (
github.com/Masterminds/semver/v3 v3.2.1
github.com/deepmap/oapi-codegen/v2 v2.1.0
github.com/getkin/kin-openapi v0.123.0
github.com/hashicorp/terraform-plugin-docs v0.18.0
Expand Down Expand Up @@ -95,7 +96,6 @@ require (
github.com/GaijinEntertainment/go-exhaustruct/v3 v3.1.0 // indirect
github.com/Masterminds/goutils v1.1.1 // indirect
github.com/Masterminds/semver v1.5.0 // indirect
github.com/Masterminds/semver/v3 v3.2.1 // indirect
github.com/Masterminds/sprig/v3 v3.2.3 // indirect
github.com/OpenPeeDeeP/depguard/v2 v2.1.0 // indirect
github.com/ProtonMail/go-crypto v1.1.0-alpha.1-proton // indirect
Expand Down
53 changes: 50 additions & 3 deletions internal/framework/generic_resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ import (
"context"
"errors"
"fmt"
"io"
"net/url"
"os"
"strings"
"time"
Expand Down Expand Up @@ -235,6 +237,7 @@ const (
READINESS_TIMEOUT = 10 * time.Minute
DELETION_TIMEOUT = 1 * time.Minute
POLLING_INTERVAL = 5 * time.Second
FAILURE_THRESHOLD = 3
DEFAULT_RESOURCE = "default"
CREATE_OPERATION = "create"
UPDATE_OPERATION = "update"
Expand Down Expand Up @@ -310,13 +313,34 @@ func (r *GenericResource) GetTimeout(operation string, defaultTimeout time.Durat
return defaultTimeout
}

type resourceFailedError struct {
message string
}

func ResourceFailed(format string, args ...any) error {
return &resourceFailedError{message: fmt.Sprintf(format, args...)}
}

func (err *resourceFailedError) Error() string {
return err.message
}

func isNetworkError(err error) bool {
if errors.Is(err, io.ErrUnexpectedEOF) {
return true
}
urlErr, ok := err.(*url.Error)
return ok && urlErr.Temporary()
}

func (r *GenericResource) AwaitReady(ctx context.Context, state ResourceState, operation string) error {
timeout := r.GetTimeout(operation, READINESS_TIMEOUT)
if timeout == 0 {
tflog.Info(ctx, "Not waiting for resource to achieve desired state because timeout is 0",
map[string]any{"resourceType": r.TypeName, "operation": operation})
return nil
}
consecutiveFailed := 0
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
for {
Expand All @@ -325,6 +349,15 @@ func (r *GenericResource) AwaitReady(ctx context.Context, state ResourceState, o
if readyErr == nil {
return nil
}
// Return early if resource in failed state for several iterations
if _, ok := readyErr.(*resourceFailedError); ok {
consecutiveFailed++
if consecutiveFailed >= FAILURE_THRESHOLD {
return readyErr
}
} else {
consecutiveFailed = 0
}
time.Sleep(POLLING_INTERVAL)

// Re-read resource state and check for timeout error
Expand All @@ -333,8 +366,13 @@ func (r *GenericResource) AwaitReady(ctx context.Context, state ResourceState, o
if os.IsTimeout(err) && ctx.Err() == context.DeadlineExceeded {
return fmt.Errorf("Timed out after %s: %s", timeout, readyErr.Error())
}
// Return verbatim error if not timeout
return err
// Suppress network errors, which may be transient and retriable
if isNetworkError(err) {
tflog.Info(ctx, "Suppressing network error while awaiting readiness",
map[string]any{"resourceType": r.TypeName, "error": err.Error()})
} else {
return err
}
}
}
}
Expand All @@ -356,7 +394,16 @@ func (r *GenericResource) AwaitDeleted(ctx context.Context, state ResourceState)
if helper.IsNotFound(err) {
return nil
}
return err
if os.IsTimeout(err) && ctx.Err() == context.DeadlineExceeded {
return fmt.Errorf("Timed out after %s", timeout)
}
// Suppress network errors, which may be transient and retriable
if isNetworkError(err) {
tflog.Info(ctx, "Suppressing network error while awaiting deletion",
map[string]any{"resourceType": r.TypeName, "error": err.Error()})
} else {
return err
}
}

time.Sleep(POLLING_INTERVAL)
Expand Down
13 changes: 11 additions & 2 deletions internal/provider/backup/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,21 @@ func (state *BackupResourceModel) CheckReady(ctx context.Context, client openapi
return fmt.Errorf("Backup %s/%s/%s/%s has no status information",
state.Organization, state.Project, state.Database, state.Name)
}
if *state.Status.State != openapi.BackupStatusModelStateSucceeded {
switch *state.Status.State {
case openapi.BackupStatusModelStateSucceeded:
return nil
case openapi.BackupStatusModelStateFailed:
message := "unknown reason"
if state.Status.Message != nil {
message = *state.Status.Message
}
return framework.ResourceFailed("Backup %s/%s/%s/%s failed: %s",
state.Organization, state.Project, state.Database, state.Name, message)
default:
return fmt.Errorf("Backup %s/%s/%s/%s has an unexpected state: expected=%s, found=%s",
state.Organization, state.Project, state.Database, state.Name,
openapi.BackupStatusModelStateSucceeded, *state.Status.State)
}
return nil
}

func (state *BackupResourceModel) Create(ctx context.Context, client openapi.ClientInterface) error {
Expand Down
12 changes: 11 additions & 1 deletion internal/provider/database/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,17 @@ func (state *DatabaseResourceModel) CheckReady(ctx context.Context, client opena
if state.Maintenance != nil && state.Maintenance.IsDisabled != nil && *state.Maintenance.IsDisabled {
expectedState = openapi.DatabaseStatusModelStateStopped
}
if *state.Status.State != expectedState {
switch *state.Status.State {
case expectedState:
break
case openapi.DatabaseStatusModelStateFailed:
message := "unknown reason"
if state.Status.Message != nil {
message = *state.Status.Message
}
return framework.ResourceFailed("Database %s/%s/%s failed: %s",
state.Organization, state.Project, state.Name, message)
default:
return fmt.Errorf("Database %s/%s/%s has an unexpected state: expected=%s, found=%s",
state.Organization, state.Project, state.Name, expectedState, *state.Status.State)
}
Expand Down
13 changes: 11 additions & 2 deletions internal/provider/project/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,20 @@ func (state *ProjectResourceModel) CheckReady(ctx context.Context, client openap
if state.Maintenance != nil && state.Maintenance.IsDisabled != nil && *state.Maintenance.IsDisabled {
expectedState = openapi.ProjectStatusModelStateStopped
}
if *state.Status.State != expectedState {
switch *state.Status.State {
case expectedState:
return nil
case openapi.ProjectStatusModelStateFailed:
message := "unknown reason"
if state.Status.Message != nil {
message = *state.Status.Message
}
return framework.ResourceFailed("Project %s/%s failed: %s",
state.Organization, state.Name, message)
default:
return fmt.Errorf("Project %s/%s has an unexpected state: expected=%s, found=%s",
state.Organization, state.Name, expectedState, *state.Status.State)
}
return nil
}

func (state *ProjectResourceModel) Create(ctx context.Context, client openapi.ClientInterface) error {
Expand Down
37 changes: 37 additions & 0 deletions internal/provider_test/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
. "github.com/nuodb/terraform-provider-nuodbaas/internal/provider/project"
"github.com/nuodb/terraform-provider-nuodbaas/openapi"

semver "github.com/Masterminds/semver/v3"
"github.com/stretchr/testify/require"
)

Expand All @@ -35,6 +36,7 @@ const (
RESOURCE_CREATE_TIMEOUT TestOption = "RESOURCE_CREATE_TIMEOUT"
RESOURCE_UPDATE_TIMEOUT TestOption = "RESOURCE_UPDATE_TIMEOUT"
RESOURCE_DELETE_TIMEOUT TestOption = "RESOURCE_DELETE_TIMEOUT"
NUODB_CP_VERSION TestOption = "NUODB_CP_VERSION"
)

func (option TestOption) Get() string {
Expand All @@ -49,6 +51,19 @@ func (option TestOption) IsFalse() bool {
return option.Get() == "false"
}

func (option TestOption) GetSemver() (*semver.Version, error) {
return semver.NewVersion(option.Get())
}

func (option TestOption) IsVersionLessThan(version string) bool {
thisVersion, err := option.GetSemver()
if err != nil {
return false
}
suppliedVersion, err := semver.NewVersion(version)
return err == nil && thisVersion.LessThan(suppliedVersion)
}

func PauseOperator(t *testing.T) {
// Pausing Operator when webhooks are enabled prevents CRUD operations
// from being performed
Expand Down Expand Up @@ -1034,8 +1049,30 @@ func TestNegative(t *testing.T) {
require.Contains(t, string(out), "409 Conflict")
})

if WEBHOOKS_ENABLED.IsFalse() && !NUODB_CP_VERSION.IsVersionLessThan("2.5.0") {
t.Run("failedProjectAndDatabase", func(t *testing.T) {
// Specify non-existent project tier and check that it goes into failed state
vars.project.Tier = "n5.huge"
tf.WriteConfigT(t, vars.builder.Build())
out, err = tf.Apply()
require.Error(t, err)
require.Contains(t, string(out), fmt.Sprintf("Project %s/%s failed: ", vars.project.Organization, vars.project.Name))

// Specify non-existent database tier and check that it goes into failed state
vars.project.Tier = "n0.nano"
vars.database.Tier = ptr("n5.huge")
tf.WriteConfigT(t, vars.builder.Build())
out, err = tf.Apply()
require.Error(t, err)
require.Contains(t, string(out), fmt.Sprintf("Database %s/%s/%s failed: ", vars.database.Organization, vars.database.Project, vars.database.Name))
// Revert tier change
vars.database.Tier = nil
})
}

// Run `terraform destroy` again, which should succeed now that
// unmanaged database has been deleted
tf.WriteConfigT(t, vars.builder.Build())
out, err = tf.Destroy()
require.NoError(t, err)
}
Expand Down

0 comments on commit 32ead32

Please sign in to comment.