Skip to content

Commit

Permalink
testing framework: refactor interrupt logic for immediate exits (#33532)
Browse files Browse the repository at this point in the history
* testing framework: refactor interrupt logic

* fix formatting
  • Loading branch information
liamcervante authored Jul 19, 2023
1 parent 6882dd9 commit 6c7db16
Show file tree
Hide file tree
Showing 7 changed files with 816 additions and 97 deletions.
175 changes: 87 additions & 88 deletions internal/command/test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"path"
"sort"
"strings"
"time"

"github.com/hashicorp/terraform/internal/addrs"
"github.com/hashicorp/terraform/internal/backend"
Expand Down Expand Up @@ -173,6 +174,14 @@ func (c *TestCommand) Run(rawArgs []string) int {
return 1
}

// We have two levels of interrupt here. A 'stop' and a 'cancel'. A 'stop'
// is a soft request to stop. We'll finish the current test, do the tidy up,
// but then skip all remaining tests and run blocks. A 'cancel' is a hard
// request to stop now. We'll cancel the current operation immediately
// even if it's a delete operation, and we won't clean up any infrastructure
// if we're halfway through a test. We'll print details explaining what was
// stopped so the user can do their best to recover from it.

runningCtx, done := context.WithCancel(context.Background())
stopCtx, stop := context.WithCancel(runningCtx)
cancelCtx, cancel := context.WithCancel(context.Background())
Expand All @@ -199,7 +208,7 @@ func (c *TestCommand) Run(rawArgs []string) int {

go func() {
defer logging.PanicHandler()
defer done() // We completed successfully.
defer done()
defer stop()
defer cancel()

Expand All @@ -224,10 +233,12 @@ func (c *TestCommand) Run(rawArgs []string) int {
runner.Cancelled = true
cancel()

// TODO(liamcervante): Should we add a timer here? That would mean
// after 5 seconds we just give up and don't even print out the
// lists of resources left behind?
<-runningCtx.Done() // Nothing left to do now but wait.
// We'll wait 5 seconds for this operation to finish now, regardless
// of whether it finishes successfully or not.
select {
case <-runningCtx.Done():
case <-time.After(5 * time.Second):
}

case <-runningCtx.Done():
// The application finished nicely after the request was stopped.
Expand Down Expand Up @@ -331,13 +342,13 @@ func (runner *TestRunner) ExecuteTestFile(file *moduletest.File, globals map[str
if run.Config.ConfigUnderTest != nil {
// Then we want to execute a different module under a kind of
// sandbox.
state := runner.ExecuteTestRun(run, file, states.NewState(), run.Config.ConfigUnderTest, globals)
state := runner.ExecuteTestRun(mgr, run, file, states.NewState(), run.Config.ConfigUnderTest, globals)
mgr.States = append(mgr.States, &TestModuleState{
State: state,
Run: run,
})
} else {
mgr.State = runner.ExecuteTestRun(run, file, mgr.State, runner.Config, globals)
mgr.State = runner.ExecuteTestRun(mgr, run, file, mgr.State, runner.Config, globals)
}
file.Status = file.Status.Merge(run.Status)
}
Expand All @@ -348,7 +359,7 @@ func (runner *TestRunner) ExecuteTestFile(file *moduletest.File, globals map[str
}
}

func (runner *TestRunner) ExecuteTestRun(run *moduletest.Run, file *moduletest.File, state *states.State, config *configs.Config, globals map[string]backend.UnparsedVariableValue) *states.State {
func (runner *TestRunner) ExecuteTestRun(mgr *TestStateManager, run *moduletest.Run, file *moduletest.File, state *states.State, config *configs.Config, globals map[string]backend.UnparsedVariableValue) *states.State {
if runner.Cancelled {
// Don't do anything, just give up and return immediately.
// The surrounding functions should stop this even being called, but in
Expand Down Expand Up @@ -376,7 +387,7 @@ func (runner *TestRunner) ExecuteTestRun(run *moduletest.Run, file *moduletest.F
return state
}

ctx, plan, state, diags := runner.execute(run, file, config, state, &terraform.PlanOpts{
ctx, plan, state, diags := runner.execute(mgr, run, file, config, state, &terraform.PlanOpts{
Mode: func() plans.Mode {
switch run.Config.Options.Mode {
case configs.RefreshOnlyTestMode:
Expand Down Expand Up @@ -458,17 +469,12 @@ func (runner *TestRunner) ExecuteTestRun(run *moduletest.Run, file *moduletest.F
//
// The command argument decides whether it executes only a plan or also applies
// the plan it creates during the planning.
func (runner *TestRunner) execute(run *moduletest.Run, file *moduletest.File, config *configs.Config, state *states.State, opts *terraform.PlanOpts, command configs.TestCommand, globals map[string]backend.UnparsedVariableValue) (*terraform.Context, *plans.Plan, *states.State, tfdiags.Diagnostics) {
func (runner *TestRunner) execute(mgr *TestStateManager, run *moduletest.Run, file *moduletest.File, config *configs.Config, state *states.State, opts *terraform.PlanOpts, command configs.TestCommand, globals map[string]backend.UnparsedVariableValue) (*terraform.Context, *plans.Plan, *states.State, tfdiags.Diagnostics) {
if opts.Mode == plans.DestroyMode && state.Empty() {
// Nothing to do!
return nil, nil, state, nil
}

identifier := file.Name
if run != nil {
identifier = fmt.Sprintf("%s/%s", identifier, run.Name)
}

// First, transform the config for the given test run and test file.

var diags tfdiags.Diagnostics
Expand Down Expand Up @@ -517,7 +523,7 @@ func (runner *TestRunner) execute(run *moduletest.Run, file *moduletest.File, co
defer done()
plan, planDiags = tfCtx.Plan(config, state, opts)
}()
waitDiags, cancelled := runner.wait(tfCtx, runningCtx, opts, identifier)
waitDiags, cancelled := runner.wait(tfCtx, runningCtx, mgr, run, file, nil)
planDiags = planDiags.Append(waitDiags)

diags = diags.Append(planDiags)
Expand Down Expand Up @@ -556,6 +562,25 @@ func (runner *TestRunner) execute(run *moduletest.Run, file *moduletest.File, co

runningCtx, done = context.WithCancel(context.Background())

// If things get cancelled while we are executing the apply operation below
// we want to print out all the objects that we were creating so the user
// can verify we managed to tidy everything up possibly.
//
// Unfortunately, this creates a race condition as the apply operation can
// edit the plan (by removing changes once they are applied) while at the
// same time our cancellation process will try to read the plan.
//
// We take a quick copy of the changes we care about here, which will then
// be used in place of the plan when we print out the objects to be created
// as part of the cancellation process.
var created []*plans.ResourceInstanceChangeSrc
for _, change := range plan.Changes.Resources {
if change.Action != plans.Create {
continue
}
created = append(created, change)
}

var updated *states.State
var applyDiags tfdiags.Diagnostics

Expand All @@ -564,77 +589,58 @@ func (runner *TestRunner) execute(run *moduletest.Run, file *moduletest.File, co
defer done()
updated, applyDiags = tfCtx.Apply(plan, config)
}()
waitDiags, _ = runner.wait(tfCtx, runningCtx, opts, identifier)
waitDiags, _ = runner.wait(tfCtx, runningCtx, mgr, run, file, created)
applyDiags = applyDiags.Append(waitDiags)

diags = diags.Append(applyDiags)
return tfCtx, plan, updated, diags
}

func (runner *TestRunner) wait(ctx *terraform.Context, runningCtx context.Context, opts *terraform.PlanOpts, identifier string) (diags tfdiags.Diagnostics, cancelled bool) {
select {
case <-runner.StoppedCtx.Done():
func (runner *TestRunner) wait(ctx *terraform.Context, runningCtx context.Context, mgr *TestStateManager, run *moduletest.Run, file *moduletest.File, created []*plans.ResourceInstanceChangeSrc) (diags tfdiags.Diagnostics, cancelled bool) {

if opts.Mode != plans.DestroyMode {
// It takes more impetus from the user to cancel the cleanup
// operations, so we only do this during the actual tests.
cancelled = true
go ctx.Stop()
}
// This function handles what happens when the user presses the second
// interrupt. This is a "hard cancel", we are going to stop doing whatever
// it is we're doing. This means even if we're halfway through creating or
// destroying infrastructure we just give up.
handleCancelled := func() {

select {
case <-runner.CancelledCtx.Done():
states := make(map[*moduletest.Run]*states.State)
states[nil] = mgr.State
for _, module := range mgr.States {
states[module.Run] = module.State
}
runner.View.FatalInterruptSummary(run, file, states, created)

// If the user still really wants to cancel, then we'll oblige
// even during the destroy mode at this point.
if opts.Mode == plans.DestroyMode {
cancelled = true
go ctx.Stop()
}
cancelled = true
go ctx.Stop()

diags = diags.Append(tfdiags.Sourceless(
tfdiags.Error,
"Terraform Test Interrupted",
fmt.Sprintf("Terraform test was interrupted while executing %s. This means resources that were created during the test may have been left active, please monitor the rest of the output closely as any dangling resources will be listed.", identifier)))

// It is actually quite disastrous if we exist early at this
// point as it means we'll have created resources that we
// haven't tracked at all. So for now, we won't ever actually
// forcibly terminate the test. When cancelled, we make the
// clean up faster by not performing it but we should still
// always manage it give an accurate list of resources left
// alive.
// TODO(liamcervante): Consider adding a timer here, so that we
// exit early even if that means some resources are just lost
// forever.
<-runningCtx.Done() // Just wait for things to finish now.
// Just wait for things to finish now, the overall test execution will
// exit early if this takes too long.
<-runningCtx.Done()
}

// This function handles what happens when the user presses the first
// interrupt. This is essentially a "soft cancel", we're not going to do
// anything but just wait for things to finish safely. But, we do listen
// for the crucial second interrupt which will prompt a hard stop / cancel.
handleStopped := func() {
select {
case <-runner.CancelledCtx.Done():
// We've been asked again. This time we stop whatever we're doing
// and abandon all attempts to do anything reasonable.
handleCancelled()
case <-runningCtx.Done():
// The operation exited nicely when asked!
// Do nothing, we finished safely and skipping the remaining tests
// will be handled elsewhere.
}
case <-runner.CancelledCtx.Done():
// This shouldn't really happen, as we'd expect to see the StoppedCtx
// being triggered first. But, just in case.
cancelled = true
go ctx.Stop()

diags = diags.Append(tfdiags.Sourceless(
tfdiags.Error,
"Terraform Test Interrupted",
fmt.Sprintf("Terraform test was interrupted while executing %s. This means resources that were created during the test may have been left active, please monitor the rest of the output closely as any dangling resources will be listed.", identifier)))

// It is actually quite disastrous if we exist early at this
// point as it means we'll have created resources that we
// haven't tracked at all. So for now, we won't ever actually
// forcibly terminate the test. When cancelled, we make the
// clean up faster by not performing it but we should still
// always manage it give an accurate list of resources left
// alive.
// TODO(liamcervante): Consider adding a timer here, so that we
// exit early even if that means some resources are just lost
// forever.
<-runningCtx.Done() // Just wait for things to finish now.
}

select {
case <-runner.StoppedCtx.Done():
handleStopped()
case <-runner.CancelledCtx.Done():
handleCancelled()
case <-runningCtx.Done():
// The operation exited normally.
}
Expand Down Expand Up @@ -675,27 +681,21 @@ type TestModuleState struct {

func (manager *TestStateManager) cleanupStates(file *moduletest.File, globals map[string]backend.UnparsedVariableValue) {
if manager.runner.Cancelled {

// We are still going to print out the resources that we have left
// even though the user asked for an immediate exit.

var diags tfdiags.Diagnostics
diags = diags.Append(tfdiags.Sourceless(tfdiags.Error, "Test cleanup skipped due to immediate exit", "Terraform could not clean up the state left behind due to immediate interrupt."))
manager.runner.View.DestroySummary(diags, nil, file, manager.State)

for _, module := range manager.States {
manager.runner.View.DestroySummary(diags, module.Run, file, module.State)
}

// Don't try and clean anything up if the execution has been cancelled.
return
}

// First, we'll clean up the main state.
_, _, state, diags := manager.runner.execute(nil, file, manager.runner.Config, manager.State, &terraform.PlanOpts{
_, _, state, diags := manager.runner.execute(manager, nil, file, manager.runner.Config, manager.State, &terraform.PlanOpts{
Mode: plans.DestroyMode,
}, configs.ApplyTestCommand, globals)
manager.runner.View.DestroySummary(diags, nil, file, state)

if manager.runner.Cancelled {
// In case things were cancelled during the last execution.
return
}

// Then we'll clean up the additional states for custom modules in reverse
// order.
for ix := len(manager.States); ix > 0; ix-- {
Expand All @@ -704,11 +704,10 @@ func (manager *TestStateManager) cleanupStates(file *moduletest.File, globals ma
if manager.runner.Cancelled {
// In case the cancellation came while a previous state was being
// destroyed.
manager.runner.View.DestroySummary(diags, module.Run, file, module.State)
continue
return
}

_, _, state, diags := manager.runner.execute(module.Run, file, module.Run.Config.ConfigUnderTest, module.State, &terraform.PlanOpts{
_, _, state, diags := manager.runner.execute(manager, module.Run, file, module.Run.Config.ConfigUnderTest, module.State, &terraform.PlanOpts{
Mode: plans.DestroyMode,
}, configs.ApplyTestCommand, globals)
manager.runner.View.DestroySummary(diags, module.Run, file, state)
Expand Down
13 changes: 13 additions & 0 deletions internal/command/test_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,19 @@ func TestTest_DoubleInterrupt(t *testing.T) {
t.Errorf("output didn't produce the right output:\n\n%s", output)
}

cleanupMessage := `Terraform was interrupted while executing main.tftest, and may not have performed the expected cleanup operations.
Terraform has already created the following resources from the module under test:
- test_resource.primary
- test_resource.secondary
- test_resource.tertiary`

// It's really important that the above message is printed, so we're testing
// for it specifically and making sure it contains all the resources.
if !strings.Contains(output, cleanupMessage) {
t.Errorf("output didn't produce the right output:\n\n%s", output)
}

// This time the test command shouldn't have cleaned up the resource because
// of the hard interrupt.
if provider.ResourceCount() != 3 {
Expand Down
6 changes: 6 additions & 0 deletions internal/command/testing/test_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"path"
"strings"
"time"

"github.com/hashicorp/go-uuid"
"github.com/zclconf/go-cty/cty"
Expand Down Expand Up @@ -217,6 +218,11 @@ func (provider *TestProvider) ApplyResourceChange(request providers.ApplyResourc
for ix := 0; ix < int(count); ix++ {
provider.Interrupt <- struct{}{}
}

// Wait for a second to make sure the interrupts are processed by
// Terraform before the provider finishes. This is an attempt to ensure
// the output of any tests that rely on this behaviour is deterministic.
time.Sleep(time.Second)
}

provider.Store.Put(provider.GetResourceKey(id.AsString()), resource)
Expand Down
15 changes: 8 additions & 7 deletions internal/command/views/json/message_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,12 @@ const (
MessageRefreshComplete MessageType = "refresh_complete"

// Test messages
MessageTestAbstract MessageType = "test_abstract"
MessageTestFile MessageType = "test_file"
MessageTestRun MessageType = "test_run"
MessageTestPlan MessageType = "test_plan"
MessageTestState MessageType = "test_state"
MessageTestSummary MessageType = "test_summary"
MessageTestCleanup MessageType = "test_cleanup"
MessageTestAbstract MessageType = "test_abstract"
MessageTestFile MessageType = "test_file"
MessageTestRun MessageType = "test_run"
MessageTestPlan MessageType = "test_plan"
MessageTestState MessageType = "test_state"
MessageTestSummary MessageType = "test_summary"
MessageTestCleanup MessageType = "test_cleanup"
MessageTestInterrupt MessageType = "test_interrupt"
)
6 changes: 6 additions & 0 deletions internal/command/views/json/test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ type TestFailedResource struct {
DeposedKey string `json:"deposed_key,omitempty"`
}

type TestFatalInterrupt struct {
State []TestFailedResource `json:"state,omitempty"`
States map[string][]TestFailedResource `json:"states,omitempty"`
Planned []string `json:"planned,omitempty"`
}

func ToTestStatus(status moduletest.Status) TestStatus {
return TestStatus(strings.ToLower(status.String()))
}
Loading

0 comments on commit 6c7db16

Please sign in to comment.