From d834c4ff32e7f7885b5fc551d1efe66953a17722 Mon Sep 17 00:00:00 2001
From: "Deomid \"rojer\" Ryabkov" <rojer9@fb.com>
Date: Thu, 10 Dec 2020 10:24:03 +0000
Subject: [PATCH 1/5] New test runner implementation

Simplified, added support for serializing state and resuming.
Not used yet but will be.
---
 pkg/cerrors/cerrors.go                        |  55 ++
 pkg/config/timeouts.go                        |  29 +-
 pkg/event/testevent/test.go                   |  16 +-
 pkg/jobmanager/jobmanager.go                  |   2 +-
 pkg/pluginregistry/bundles.go                 |   2 +-
 pkg/runner/job_runner.go                      |  13 +-
 pkg/runner/test_runner.go                     | 882 +++++++++++++-----
 pkg/runner/test_runner_pipeline.go            | 542 -----------
 pkg/runner/test_runner_route.go               | 302 ------
 pkg/runner/test_runner_route_test.go          | 400 --------
 pkg/runner/test_runner_test.go                | 548 +++++++++++
 pkg/runner/wait_for_first_target.go           |  58 --
 pkg/runner/wait_for_first_target_test.go      | 142 ---
 pkg/statectx/context.go                       |  31 +-
 pkg/storage/storage.go                        |   1 +
 pkg/target/target.go                          |   7 +-
 plugins/storage/memory/memory.go              |   2 +-
 plugins/teststeps/example/example.go          | 142 ++-
 plugins/teststeps/teststeps.go                |  11 +-
 tests/integ/jobmanager/common.go              |   1 -
 tests/integ/test/testrunner_test.go           | 218 +----
 .../teststeps/badtargets/badtargets.go        |  79 ++
 tests/plugins/teststeps/channels/channels.go  |   6 +-
 tests/plugins/teststeps/hanging/hanging.go    |   2 +-
 tests/plugins/teststeps/noreturn/noreturn.go  |   2 +-
 25 files changed, 1517 insertions(+), 1976 deletions(-)
 delete mode 100644 pkg/runner/test_runner_pipeline.go
 delete mode 100644 pkg/runner/test_runner_route.go
 delete mode 100644 pkg/runner/test_runner_route_test.go
 create mode 100644 pkg/runner/test_runner_test.go
 delete mode 100644 pkg/runner/wait_for_first_target.go
 delete mode 100644 pkg/runner/wait_for_first_target_test.go
 create mode 100644 tests/plugins/teststeps/badtargets/badtargets.go

diff --git a/pkg/cerrors/cerrors.go b/pkg/cerrors/cerrors.go
index 4fa3211e..d0a1dfdd 100644
--- a/pkg/cerrors/cerrors.go
+++ b/pkg/cerrors/cerrors.go
@@ -43,6 +43,17 @@ func (e *ErrTestStepsNeverReturned) Error() string {
 	return fmt.Sprintf("test step [%s] did not return", strings.Join(e.StepNames, ", "))
 }
 
+// ErrTestTargetInjectionTimedOut indicates that test step did not ingest a target
+// within allotted time.
+type ErrTestTargetInjectionTimedOut struct {
+	StepName string
+}
+
+// Error returns the error string associated with the error
+func (e *ErrTestTargetInjectionTimedOut) Error() string {
+	return fmt.Sprintf("test step %v failed to ingest a target", e.StepName)
+}
+
 // ErrTestStepClosedChannels indicates that the test step returned after
 // closing its output channels, which constitutes an API violation
 type ErrTestStepClosedChannels struct {
@@ -53,3 +64,47 @@ type ErrTestStepClosedChannels struct {
 func (e *ErrTestStepClosedChannels) Error() string {
 	return fmt.Sprintf("test step %v closed output channels (api violation)", e.StepName)
 }
+
+// ErrTestStepPaniced indicates that a test step's method paniced.
+type ErrTestStepPaniced struct {
+	StepName   string
+	StackTrace string
+}
+
+// Error returns the error string associated with the error
+func (e *ErrTestStepPaniced) Error() string {
+	return fmt.Sprintf("test step %s paniced, trace: %q", e.StepName, e.StackTrace)
+}
+
+// ErrTestStepReturnedDuplicateResult indicates that a test step's method paniced.
+type ErrTestStepReturnedDuplicateResult struct {
+	StepName string
+	Target   string
+}
+
+// Error returns the error string associated with the error
+func (e *ErrTestStepReturnedDuplicateResult) Error() string {
+	return fmt.Sprintf("test step %s returned duplicate result for %s", e.StepName, e.Target)
+}
+
+// ErrTestStepReturnedUnexpectedResult indicates that a test step's method paniced.
+type ErrTestStepReturnedUnexpectedResult struct {
+	StepName string
+	Target   string
+}
+
+// Error returns the error string associated with the error
+func (e *ErrTestStepReturnedUnexpectedResult) Error() string {
+	return fmt.Sprintf("test step %s returned unexpected result for %s", e.StepName, e.Target)
+}
+
+// ErrTestStepLostTargets indicates that targets have been lost during test run.
+type ErrTestStepLostTargets struct {
+	StepName string
+	Target   string
+}
+
+// Error returns the error string associated with the error
+func (e *ErrTestStepLostTargets) Error() string {
+	return fmt.Sprintf("test step %s lost target %s", e.StepName, e.Target)
+}
diff --git a/pkg/config/timeouts.go b/pkg/config/timeouts.go
index 0fb66f58..5a06a8e8 100644
--- a/pkg/config/timeouts.go
+++ b/pkg/config/timeouts.go
@@ -12,34 +12,13 @@ import "time"
 var TargetManagerTimeout = 5 * time.Minute
 
 // StepInjectTimeout represents the maximum time that TestRunner will wait for
-// the first TestStep of the pipeline to accept a Target
+// a TestStep to accept a Target
 var StepInjectTimeout = 30 * time.Second
 
-// TestRunnerMsgTimeout represents the maximum time that any component of the
-// TestRunner will wait for the delivery of a message to any other subsystem
-// of the TestRunner
-var TestRunnerMsgTimeout = 5 * time.Second
-
-// TestRunnerShutdownTimeout represents the maximum time that the TestRunner will
-// wait for all the TestStep to complete after a cancellation signal has been
-// delivered
-
-// TestRunnerShutdownTimeout controls a block of the TestRunner which works as a
-// watchdog, i.e. if there are multiple steps that need to return, the timeout is
-// reset every time a step returns. The timeout should be handled so that it
-// doesn't reset when a TestStep returns.
-var TestRunnerShutdownTimeout = 30 * time.Second
-
-// TestRunnerStepShutdownTimeout represents the maximum time that the TestRunner
+// TestRunnerShutdownTimeout represents the maximum time that the TestRunner
 // will wait for all TestSteps to complete after all Targets have reached the end
-// of the pipeline. This timeout is only relevant if a cancellation signal is *not*
-// delivered.
-
-// TestRunnerStepShutdownTimeout controls a block of the TestRunner which worksas
-// a watchdog, i.e. if there are multiple steps that need to return, the timeout
-// is reset every time a step returns. The timeout should be handled so that it
-// doesn't reset when a TestStep returns.
-var TestRunnerStepShutdownTimeout = 5 * time.Second
+// of the pipeline.
+var TestRunnerShutdownTimeout = 30 * time.Second
 
 // LockRefreshTimeout is the amount of time by which a target lock is extended
 // periodically while a job is running.
diff --git a/pkg/event/testevent/test.go b/pkg/event/testevent/test.go
index 5d28c1b8..3baa3232 100644
--- a/pkg/event/testevent/test.go
+++ b/pkg/event/testevent/test.go
@@ -16,7 +16,7 @@ import (
 	"github.com/facebookincubator/contest/pkg/types"
 )
 
-// Header models the header of a test event, which consists in metadatat hat defines the
+// Header models the header of a test event, which consists in metadata that defines the
 // emitter of the events. The Header is under ConTest control and cannot be manipulated
 // by the TestStep
 type Header struct {
@@ -28,8 +28,8 @@ type Header struct {
 
 // Data models the data of a test event. It is populated by the TestStep
 type Data struct {
-	EventName event.Name
 	Target    *target.Target
+	EventName event.Name
 	Payload   *json.RawMessage
 }
 
@@ -152,3 +152,15 @@ type EmitterFetcher interface {
 	Emitter
 	Fetcher
 }
+
+func (h *Header) String() string {
+	return fmt.Sprintf("[%d %d %s %s]", h.JobID, h.RunID, h.TestName, h.TestStepLabel)
+}
+
+func (d *Data) String() string {
+	ps := ""
+	if d.Payload != nil {
+		ps = fmt.Sprintf(" %q", d.Payload) //nolint SA5009 - works fine
+	}
+	return fmt.Sprintf("[%s %s%s]", d.Target, d.EventName, ps)
+}
diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go
index d831a85e..78479db2 100644
--- a/pkg/jobmanager/jobmanager.go
+++ b/pkg/jobmanager/jobmanager.go
@@ -160,7 +160,7 @@ func newPartialJobFromDescriptor(pr *pluginregistry.PluginRegistry, jd *job.JobD
 			}
 			// test step index is incremented by 1 so we can use 0 to signal an
 			// anomaly.
-			tsb, err := pr.NewTestStepBundle(*testStepDesc, uint(idx)+1, tse)
+			tsb, err := pr.NewTestStepBundle(*testStepDesc, tse)
 			if err != nil {
 				return nil, fmt.Errorf("NewTestStepBundle for test step '%s' with index %d failed: %w", testStepDesc.Name, idx, err)
 			}
diff --git a/pkg/pluginregistry/bundles.go b/pkg/pluginregistry/bundles.go
index 91038ce4..4a18a146 100644
--- a/pkg/pluginregistry/bundles.go
+++ b/pkg/pluginregistry/bundles.go
@@ -15,7 +15,7 @@ import (
 )
 
 // NewTestStepBundle creates a TestStepBundle from a TestStepDescriptor
-func (r *PluginRegistry) NewTestStepBundle(testStepDescriptor test.TestStepDescriptor, stepIndex uint, allowedEvents map[event.Name]bool) (*test.TestStepBundle, error) {
+func (r *PluginRegistry) NewTestStepBundle(testStepDescriptor test.TestStepDescriptor, allowedEvents map[event.Name]bool) (*test.TestStepBundle, error) {
 	testStep, err := r.NewTestStep(testStepDescriptor.Name)
 	if err != nil {
 		return nil, fmt.Errorf("could not get the desired TestStep (%s): %v", testStepDescriptor.Name, err)
diff --git a/pkg/runner/job_runner.go b/pkg/runner/job_runner.go
index 94b18c7a..efb0c4a9 100644
--- a/pkg/runner/job_runner.go
+++ b/pkg/runner/job_runner.go
@@ -17,6 +17,7 @@ import (
 	"github.com/facebookincubator/contest/pkg/event/testevent"
 	"github.com/facebookincubator/contest/pkg/job"
 	"github.com/facebookincubator/contest/pkg/logging"
+	"github.com/facebookincubator/contest/pkg/statectx"
 	"github.com/facebookincubator/contest/pkg/storage"
 	"github.com/facebookincubator/contest/pkg/target"
 	"github.com/facebookincubator/contest/pkg/types"
@@ -157,9 +158,8 @@ func (jr *JobRunner) Run(j *job.Job) ([][]*job.Report, []*job.Report, error) {
 						if err := tl.Unlock(j.ID, targets); err != nil {
 							jobLog.Warningf("Failed to unlock targets (%v) for job ID %d: %v", targets, j.ID, err)
 						}
-					case <-j.StateCtx.Paused():
-						jobLog.Debugf("Received pause request, NOT releasing targets so the job can be resumed")
 						return
+					// Ignore the pause signal, continue to refresh targets.
 					case <-done:
 						if err := tl.Unlock(j.ID, targets); err != nil {
 							jobLog.Warningf("Failed to unlock %d target(s) (%v): %v", len(targets), targets, err)
@@ -183,7 +183,14 @@ func (jr *JobRunner) Run(j *job.Job) ([][]*job.Report, []*job.Report, error) {
 			if runErr = jr.emitAcquiredTargets(testEventEmitter, targets); runErr == nil {
 				jobLog.Infof("Run #%d: running test #%d for job '%s' (job ID: %d) on %d targets", run+1, idx, j.Name, j.ID, len(targets))
 				testRunner := NewTestRunner()
-				runErr = testRunner.Run(j.StateCtx, t, targets, j.ID, types.RunID(run+1))
+				var resumeState []byte
+				resumeState, err := testRunner.Run(j.StateCtx, t, targets, j.ID, types.RunID(run+1), resumeState)
+				if err == statectx.ErrPaused {
+					jobLog.Debugf("Runner paused, state: %s", string(resumeState))
+					// TODO(rojer): Persist the state.
+				} else {
+					runErr = err
+				}
 			}
 
 			// Job is done, release all the targets
diff --git a/pkg/runner/test_runner.go b/pkg/runner/test_runner.go
index b37bfb4d..f3a5b8d3 100644
--- a/pkg/runner/test_runner.go
+++ b/pkg/runner/test_runner.go
@@ -1,287 +1,753 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-//
-// This source code is licensed under the MIT license found in the
-// LICENSE file in the root directory of this source tree.
-
 package runner
 
 import (
-	"context"
+	"encoding/json"
 	"fmt"
+	"runtime/debug"
+	"sync"
 	"time"
 
+	"github.com/sirupsen/logrus"
+
 	"github.com/facebookincubator/contest/pkg/cerrors"
 	"github.com/facebookincubator/contest/pkg/config"
+	"github.com/facebookincubator/contest/pkg/event/testevent"
 	"github.com/facebookincubator/contest/pkg/logging"
 	"github.com/facebookincubator/contest/pkg/statectx"
+	"github.com/facebookincubator/contest/pkg/storage"
 	"github.com/facebookincubator/contest/pkg/target"
 	"github.com/facebookincubator/contest/pkg/test"
 	"github.com/facebookincubator/contest/pkg/types"
-	"github.com/sirupsen/logrus"
 )
 
-// TestRunnerTimeouts collects all the timeouts values that the test runner uses
-type TestRunnerTimeouts struct {
-	StepInjectTimeout   time.Duration
-	MessageTimeout      time.Duration
-	ShutdownTimeout     time.Duration
-	StepShutdownTimeout time.Duration
+// TestRunner is the interface test runner implements
+type TestRunner interface {
+	// ErrPaused will be returned when runner was able to pause successfully.
+	// When this error is returned from Run(), it is accompanied by valid serialized state
+	// that can be passed later to Run() to continue from where we left off.
+	Run(ctx statectx.Context, test *test.Test, targets []*target.Target, jobID types.JobID, runID types.RunID, resumeState []byte) ([]byte, error)
 }
 
-// routingCh represents a set of unidirectional channels used by the routing subsystem.
-// There is a routing block for each TestStep of the pipeline, which is responsible for
-// the following actions:
-// * Targets in egress from the previous routing block are injected into the
-// current TestStep
-// * Targets in egress from the current TestStep are injected into the next
-// routing block
-type routingCh struct {
-	// routeIn and routeOut connect the routing block to other routing blocks
-	routeIn  <-chan *target.Target
-	routeOut chan<- *target.Target
-	// Channels that connect the routing block to the TestStep
-	stepIn  chan<- *target.Target
-	stepOut <-chan *target.Target
-	stepErr <-chan cerrors.TargetError
-	// targetErr connects the routing block directly to the TestRunner. Failing
-	// targets are acquired by the TestRunner via this channel
-	targetErr chan<- cerrors.TargetError
+// testRunner is the state associated with a test run.
+// Here's how a test run works:
+//  * Each target gets a targetState and a "target runner" - a goroutine that takes that particular
+//    target through each step of the pipeline in sequence. It injects the target, waits for the result,
+//    then mves on to the next step.
+//  * Each step of the pipeline gets a stepState and:
+//    - A "step runner" - a goroutine that is responsible for running the step's Run() method
+//    - A "step reader" - a goroutine that processes results and sends them on to target runners that await them.
+//  * After starting all of the above, the main goroutine goes into "monitor" mode
+//    that checks on the pipeline's progress and is responsible for closing step input channels
+//    when all the targets have been injected.
+//  * Monitor loop finishes when all the targets have been injected into the last step
+//    or if a step has encountered and error.
+//  * We then wait for all the step runners and readers to shut down.
+//  * Once all the activity has died down, resulting state is examined and an error is returned, if any.
+type testRunner struct {
+	stepInjectTimeout time.Duration // Time to wait for steps to accept each target
+	shutdownTimeout   time.Duration // Time to wait for steps runners to finish a the end of the run
+
+	steps     []*stepState            // The pipeline, in order of execution
+	targets   map[string]*targetState // Target state lookup map
+	targetsWg sync.WaitGroup          // Tracks all the target runners
+	log       *logrus.Entry           // Logger
+
+	// One mutex to rule them all, used to serialize access to all the state above.
+	// Could probably be split into several if necessary.
+	mu   sync.Mutex
+	cond *sync.Cond // Used to notify the monitor about changes
 }
 
-// stepCh represents a set of bidirectional channels that a TestStep and its associated
-// routing block use to communicate. The TestRunner forces the direction of each
-// channel when connecting the TestStep to the routing block.
-type stepCh struct {
-	stepIn  chan *target.Target
-	stepOut chan *target.Target
-	stepErr chan cerrors.TargetError
-}
+// stepState contains state associated with one state of the pipeline:
+type stepState struct {
+	stepIndex int                 // Index of this step in the pipeline.
+	sb        test.TestStepBundle // The test bundle.
 
-type injectionCh struct {
-	stepIn   chan<- *target.Target
-	resultCh chan<- injectionResult
-}
+	// Channels used to communicate with the plugin.
+	inCh  chan *target.Target
+	outCh chan *target.Target
+	errCh chan cerrors.TargetError
+	ev    testevent.Emitter
 
-// injectionResult represents the result of an injection goroutine
-type injectionResult struct {
-	target *target.Target
-	err    error
-}
+	numInjected int                     // Number of targets injected.
+	tgtDone     map[*target.Target]bool // Targets for which results have been received.
 
-// routeResult represents the result of routing block, possibly carrying error information
-type routeResult struct {
-	bundle test.TestStepBundle
-	err    error
-}
+	stepRunning   bool  // testStep.Run() is currently running.
+	readerRunning bool  // Result reader is running.
+	runErr        error // Runner error, returned from Run() or an error condition detected by the reader.
 
-// stepResult represents the result of a TestStep, possibly carrying error information
-type stepResult struct {
-	jobID  types.JobID
-	runID  types.RunID
-	bundle test.TestStepBundle
-	err    error
+	log *logrus.Entry // Logger
 }
 
-// pipelineCtrlCh represents multiple result and control channels that the pipeline uses
-// to collect results from routing blocks, steps and target completing the test and to
-//  signa cancellation to various pipeline subsystems
-type pipelineCtrlCh struct {
-	routingResultCh <-chan routeResult
-	stepResultCh    <-chan stepResult
-	targetOut       <-chan *target.Target
-	targetErr       <-chan cerrors.TargetError
-
-	// ctx  is a control context used to cancel/pause the steps of the pipeline
-	ctx    statectx.Context
-	pause  func()
-	cancel func()
+// targetStepPhase denotes progression of a target through a step
+type targetStepPhase int
+
+const (
+	targetStepPhaseInit  targetStepPhase = 0
+	targetStepPhaseBegin targetStepPhase = 1 // Picked up for execution.
+	targetStepPhaseRun   targetStepPhase = 2 // Injected into step.
+	targetStepPhaseEnd   targetStepPhase = 3 // Finished running a step.
+)
+
+// targetState contains state associated with one target progressing through the pipeline.
+type targetState struct {
+	tgt *target.Target
+
+	// This part of state gets serialized into JSON for resumption.
+	CurStep  int             `json:"cur_step"`  // Current step number.
+	CurPhase targetStepPhase `json:"cur_phase"` // Current phase of step execution.
+
+	res   error      // Final result, if reached the end state.
+	resCh chan error // Channel used to communicate result by the step runner.
 }
 
-// TestRunner is the main runner of TestSteps in ConTest. `results` collects
-// the results of the run. It is not safe to access `results` concurrently.
-type TestRunner struct {
-	timeouts TestRunnerTimeouts
+// resumeStateStruct is used to serialize runner state to be resumed in the future.
+type resumeStateStruct struct {
+	Version int                     `json:"version"`
+	JobID   types.JobID             `json:"job_id"`
+	RunID   types.RunID             `json:"run_id"`
+	Targets map[string]*targetState `json:"targets"`
 }
 
-// targetWriter is a helper object which exposes methods to write targets into step channels
-type targetWriter struct {
-	log      *logrus.Entry
-	timeouts TestRunnerTimeouts
+// Resume state version we are compatible with.
+// When imcompatible changes are made to the state format, bump this.
+// Restoring incompatible state will abort the job.
+const resumeStateStructVersion = 1
+
+// Run is the main enty point of the runner.
+func (tr *testRunner) Run(
+	ctx statectx.Context,
+	t *test.Test, targets []*target.Target,
+	jobID types.JobID, runID types.RunID,
+	resumeState []byte) ([]byte, error) {
+
+	// Set up logger
+	rootLog := logging.GetLogger("pkg/runner")
+	fields := make(map[string]interface{})
+	fields["jobid"] = jobID
+	fields["runid"] = runID
+	rootLog = logging.AddFields(rootLog, fields)
+	tr.log = logging.AddField(rootLog, "phase", "run")
+
+	tr.log.Debugf("== test runner starting job %d, run %d", jobID, runID)
+	resumeState, err := tr.run(ctx, t, targets, jobID, runID, resumeState)
+	tr.log.Debugf("== test runner finished job %d, run %d, err: %v", jobID, runID, err)
+	return resumeState, err
 }
 
-func (w *targetWriter) writeTimeout(ctx context.Context, ch chan<- *target.Target, target *target.Target, timeout time.Duration) error {
-	w.log.Debugf("writing target %+v, timeout %v", target, timeout)
-	start := time.Now()
+func (tr *testRunner) run(
+	ctx statectx.Context,
+	t *test.Test, targets []*target.Target,
+	jobID types.JobID, runID types.RunID,
+	resumeState []byte) ([]byte, error) {
+
+	// Peel off contexts used for steps and target handlers.
+	stepCtx, _, stepCancel := statectx.WithParent(ctx)
+	defer stepCancel()
+	targetCtx, _, targetCancel := statectx.WithParent(ctx)
+	defer targetCancel()
+
+	// Set up the pipeline
+	for i, sb := range t.TestStepsBundles {
+		tr.steps = append(tr.steps, &stepState{
+			stepIndex: i,
+			sb:        sb,
+			inCh:      make(chan *target.Target),
+			outCh:     make(chan *target.Target),
+			errCh:     make(chan cerrors.TargetError),
+			ev: storage.NewTestEventEmitter(testevent.Header{
+				JobID:         jobID,
+				RunID:         runID,
+				TestName:      t.Name,
+				TestStepLabel: sb.TestStepLabel,
+			}),
+			tgtDone: make(map[*target.Target]bool),
+			log:     logging.AddField(tr.log, "step", sb.TestStepLabel),
+		})
+	}
+
+	// Set up the targets
+	tr.targets = make(map[string]*targetState)
+	// If we have target state to resume, do it now.
+	if len(resumeState) > 0 {
+		tr.log.Debugf("Attempting to resume from state: %s", string(resumeState))
+		var rs resumeStateStruct
+		if err := json.Unmarshal(resumeState, &rs); err != nil {
+			return nil, fmt.Errorf("invalid resume state: %w", err)
+		}
+		if rs.Version != resumeStateStructVersion {
+			return nil, fmt.Errorf("incompatible resume state version %d (want %d)",
+				rs.Version, resumeStateStructVersion)
+		}
+		if rs.JobID != jobID {
+			return nil, fmt.Errorf("wrong resume state, job id %d (want %d)", rs.JobID, jobID)
+		}
+		tr.targets = rs.Targets
+	}
+	// Initialize remaining fields of the target structures,
+	// build the map and kick off target processing.
+	for _, tgt := range targets {
+		ts := tr.targets[tgt.ID]
+		if ts == nil {
+			ts = &targetState{}
+		}
+		ts.tgt = tgt
+		ts.resCh = make(chan error)
+		tr.mu.Lock()
+		tr.targets[tgt.ID] = ts
+		tr.mu.Unlock()
+		tr.targetsWg.Add(1)
+		go tr.targetRunner(targetCtx, stepCtx, ts)
+	}
+
+	// Run until no more progress can be made.
+	runErr := tr.runMonitor()
+	if runErr != nil {
+		tr.log.Errorf("monitor returned error: %q, canceling", runErr)
+		stepCancel()
+	}
+
+	// Wait for step runners and readers to exit.
+	if err := tr.waitStepRunners(ctx); err != nil {
+		tr.log.Errorf("step runner error: %q, canceling", err)
+		stepCancel()
+		if runErr == nil {
+			runErr = err
+		}
+	}
+	// There will be no more results, reel in all the target runners (if any).
+	tr.log.Debugf("waiting for target runners to finish")
+	targetCancel()
+	tr.targetsWg.Wait()
+
+	// Examine the resulting state.
+	tr.log.Debugf("leaving, err %v, target states:", runErr)
+	tr.mu.Lock()
+	defer tr.mu.Unlock()
+	resumeOk := (runErr == nil)
+	var inFlightTargets []*targetState
+	for i, tgt := range targets {
+		ts := tr.targets[tgt.ID]
+		stepErr := tr.steps[ts.CurStep].runErr
+		tr.log.Debugf("  %d %s %v", i, ts, stepErr)
+		if ts.CurPhase == targetStepPhaseRun {
+			inFlightTargets = append(inFlightTargets, ts)
+			if stepErr != statectx.ErrPaused {
+				resumeOk = false
+			}
+		}
+	}
+	tr.log.Debugf("- %d in flight, ok to resume? %t", len(inFlightTargets), resumeOk)
+	tr.log.Debugf("step states:")
+	for i, ss := range tr.steps {
+		tr.log.Debugf("  %d %s %t %t %v", i, ss, ss.stepRunning, ss.readerRunning, ss.runErr)
+	}
+
+	// Is there a useful error to report?
+	if runErr != nil {
+		return nil, runErr
+	}
+
+	// Has the run been canceled?
 	select {
 	case <-ctx.Done():
-		w.log.Debugf("terminate requested while writing target %+v", target)
-	case ch <- target:
-	case <-time.After(timeout):
-		return fmt.Errorf("timeout (%v) while writing target %+v", timeout, target)
+		return nil, statectx.ErrCanceled
+	default:
 	}
-	w.log.Debugf("done writing target %+v, spent %v", target, time.Since(start))
-	return nil
-}
 
-// writeTargetWithResult attempts to deliver a Target on the input channel of a step,
-// returning the result of the operation on the result channel wrapped in the
-// injectionCh argument
-func (w *targetWriter) writeTargetWithResult(ctx context.Context, target *target.Target, ch injectionCh) {
-	err := w.writeTimeout(ctx, ch.stepIn, target, w.timeouts.StepInjectTimeout)
+	// Have we been asked to pause? If yes, is it safe to do so?
 	select {
-	case <-ctx.Done():
-	case ch.resultCh <- injectionResult{target: target, err: err}:
-	case <-time.After(w.timeouts.MessageTimeout):
-		w.log.Panicf("timeout while writing result for target %+v after %v", target, w.timeouts.MessageTimeout)
+	case <-ctx.Paused():
+		if !resumeOk {
+			tr.log.Warningf("paused but not ok to resume")
+			break
+		}
+		rs := &resumeStateStruct{
+			Version: resumeStateStructVersion,
+			JobID:   jobID, RunID: runID,
+			Targets: tr.targets,
+		}
+		resumeState, runErr = json.Marshal(rs)
+		if runErr != nil {
+			tr.log.Errorf("unable to serialize the state: %s", runErr)
+		} else {
+			runErr = statectx.ErrPaused
+		}
+	default:
+		// We are not pausing and yet some targets were left in flight.
+		if len(inFlightTargets) > 0 {
+			ts := inFlightTargets[0]
+			runErr = &cerrors.ErrTestStepLostTargets{
+				StepName: tr.steps[ts.CurStep].sb.TestStepLabel,
+				Target:   ts.tgt.ID,
+			}
+		}
 	}
+
+	return resumeState, runErr
 }
 
-// writeTargetError writes a TargetError object to a TargetError channel with timeout
-func (w *targetWriter) writeTargetError(ctx context.Context, ch chan<- cerrors.TargetError, targetError cerrors.TargetError, timeout time.Duration) error {
+func (tr *testRunner) waitStepRunners(ctx statectx.Context) error {
+	tr.log.Debugf("waiting for step runners to finish")
+	swch := make(chan struct{})
+	go func() {
+		tr.mu.Lock()
+		defer tr.mu.Unlock()
+		for {
+			ok := true
+			for _, ss := range tr.steps {
+				// numRunning == 1 is also acceptable: we allow the Run() goroutine
+				// to continue in case of error, if the result processor decided
+				// to abandon its runner, there's nothing we can do.
+				switch {
+				case !ss.stepRunning && !ss.readerRunning:
+				// Done
+				case ss.stepRunning && ss.readerRunning:
+					// Still active
+					ok = false
+				case !ss.stepRunning && ss.readerRunning:
+					// Transient state, let it finish
+					ok = false
+				case ss.stepRunning && !ss.readerRunning:
+					// This is possible if plugin got stuck and result processor gave up on it.
+					// If so, it should have left an error.
+					if ss.runErr == nil {
+						tr.log.Errorf("%s: result processor left runner with no error", ss)
+						// There's nothing we can do at this point, fall through.
+					}
+				}
+			}
+			if ok {
+				close(swch)
+				return
+			}
+			tr.cond.Wait()
+		}
+	}()
 	select {
-	case <-ctx.Done():
-	case ch <- targetError:
-	case <-time.After(timeout):
-		return fmt.Errorf("timeout while writing targetError %+v", targetError)
+	case <-swch:
+		tr.log.Debugf("step runners finished")
+		tr.mu.Lock()
+		defer tr.mu.Unlock()
+		return tr.checkStepRunners()
+	case <-time.After(tr.shutdownTimeout):
+		tr.log.Errorf("step runners failed to shut down correctly")
+		// If there is a step with an error set, use that.
+		if err := tr.checkStepRunners(); err != nil {
+			return err
+		}
+		// If there isn't, enumerate ones that were still running at the time.
+		err := &cerrors.ErrTestStepsNeverReturned{}
+		tr.mu.Lock()
+		defer tr.mu.Unlock()
+		for _, ss := range tr.steps {
+			if ss.stepRunning {
+				err.StepNames = append(err.StepNames, ss.sb.TestStepLabel)
+			}
+		}
+		return err
 	}
-	return nil
 }
 
-func newTargetWriter(log *logrus.Entry, timeouts TestRunnerTimeouts) *targetWriter {
-	return &targetWriter{log: log, timeouts: timeouts}
+// targetRunner runs one target through all the steps of the pipeline.
+func (tr *testRunner) targetRunner(ctx, stepCtx statectx.Context, ts *targetState) {
+	log := logging.AddField(tr.log, "target", ts.tgt.ID)
+	log.Debugf("%s: target runner active", ts)
+	// NB: CurStep may be non-zero on entry if resumed
+loop:
+	for i := ts.CurStep; i < len(tr.steps); {
+		// Early check for pause of cancelation.
+		select {
+		case <-ctx.Paused():
+			log.Debugf("%s: paused 0", ts)
+			break loop
+		case <-ctx.Done():
+			log.Debugf("%s: canceled 0", ts)
+			break loop
+		default:
+		}
+		tr.mu.Lock()
+		ss := tr.steps[i]
+		if ts.CurPhase == targetStepPhaseEnd {
+			// This target already terminated.
+			// Can happen if resumed from terminal state.
+			tr.mu.Unlock()
+			break loop
+		}
+		ts.CurStep = i
+		ts.CurPhase = targetStepPhaseBegin
+		tr.mu.Unlock()
+		// Make sure we have a step runner active.
+		// These are started on-demand.
+		tr.runStepIfNeeded(stepCtx, ss)
+		// Inject the target.
+		log.Debugf("%s: injecting into %s", ts, ss)
+		select {
+		case ss.inCh <- ts.tgt:
+			// Injected successfully.
+			tr.mu.Lock()
+			ts.CurPhase = targetStepPhaseRun
+			ss.numInjected++
+			tr.mu.Unlock()
+			tr.cond.Signal()
+		case <-time.After(tr.stepInjectTimeout):
+			tr.mu.Lock()
+			ss.log.Errorf("timed out while injecting a target")
+			ss.runErr = &cerrors.ErrTestTargetInjectionTimedOut{StepName: ss.sb.TestStepLabel}
+			tr.mu.Unlock()
+			break loop
+		case <-ctx.Done():
+			log.Debugf("%s: canceled 1", ts)
+			break loop
+		}
+		// Await result. It will be communicated to us by the step runner.
+		select {
+		case res, ok := <-ts.resCh:
+			if !ok {
+				log.Debugf("%s: result channel closed", ts)
+				break loop
+			}
+			log.Debugf("%s: result for %s recd", ts, ss)
+			tr.mu.Lock()
+			ts.CurPhase = targetStepPhaseEnd
+			ts.res = res
+			tr.cond.Signal()
+			if res != nil {
+				tr.mu.Unlock()
+				break loop
+			}
+			i++
+			if i < len(tr.steps) {
+				ts.CurStep = i
+				ts.CurPhase = targetStepPhaseInit
+			}
+			tr.mu.Unlock()
+			// Check for cancellation.
+			// Notably we are not checking for the pause condition here:
+			// when paused, we want to let all the injected targets to finish
+			// and collect all the results they produce. If that doesn't happen,
+			// step runner will close resCh on its way out and unlock us.
+		case <-ctx.Done():
+			log.Debugf("%s: canceled 2", ts)
+			break loop
+		}
+	}
+	log.Debugf("%s: target runner finished", ts)
+	tr.mu.Lock()
+	ts.resCh = nil
+	tr.cond.Signal()
+	tr.mu.Unlock()
+	tr.targetsWg.Done()
 }
 
-// Run implements the main logic of the TestRunner, i.e. the instantiation and
-// connection of the TestSteps, routing blocks and pipeline runner.
-func (tr *TestRunner) Run(ctx statectx.Context, test *test.Test, targets []*target.Target, jobID types.JobID, runID types.RunID) error {
-
-	if len(test.TestStepsBundles) == 0 {
-		return fmt.Errorf("no steps to run for test")
+// runStepIfNeeded starts the step runner goroutine if not already running.
+func (tr *testRunner) runStepIfNeeded(ctx statectx.Context, ss *stepState) {
+	tr.mu.Lock()
+	defer tr.mu.Unlock()
+	if ss.stepRunning {
+		return
+	}
+	if ss.runErr != nil {
+		return
 	}
+	ss.stepRunning = true
+	ss.readerRunning = true
+	go tr.stepRunner(ctx, ss)
+	go tr.stepReader(ctx, ss)
+}
 
-	// rootLog is propagated to all the subsystems of the pipeline
-	rootLog := logging.GetLogger("pkg/runner")
-	fields := make(map[string]interface{})
-	fields["jobid"] = jobID
-	fields["runid"] = runID
-	rootLog = logging.AddFields(rootLog, fields)
+// emitStepEvent emits an error event if step resulted in an error.
+func (ss *stepState) emitStepEvent(tgt *target.Target, err error) error {
+	if err == nil {
+		return nil
+	}
+	payload, jmErr := json.Marshal(err.Error())
+	if jmErr != nil {
+		return fmt.Errorf("failed to marshal event: %w", err)
+	}
+	rm := json.RawMessage(payload)
+	errEv := testevent.Data{
+		EventName: EventTestError,
+		Target:    tgt,
+		Payload:   &rm,
+	}
+	return ss.ev.Emit(errEv)
+}
 
-	log := logging.AddField(rootLog, "phase", "run")
-	testPipeline := newPipeline(logging.AddField(rootLog, "entity", "test_pipeline"), test.TestStepsBundles, test, jobID, runID, tr.timeouts)
-
-	log.Infof("setting up pipeline")
-	completedTargets := make(chan *target.Target, 1)
-	inCh := testPipeline.init(ctx)
-
-	// inject targets in the step
-	terminateInjectionCtx, terminateInjection := context.WithCancel(context.Background())
-	go func(ctx context.Context, inputChannel chan<- *target.Target) {
-		defer close(inputChannel)
-		log := logging.AddField(log, "step", "injection")
-		writer := newTargetWriter(log, tr.timeouts)
-		for _, t := range targets {
-			if err := writer.writeTimeout(ctx, inputChannel, t, tr.timeouts.MessageTimeout); err != nil {
-				log.Debugf("could not inject target %+v into first routing block: %+v", t, err)
+// stepRunner runs a test pipeline's step (the Run() method).
+func (tr *testRunner) stepRunner(ctx statectx.Context, ss *stepState) {
+	ss.log.Debugf("%s: step runner active", ss)
+	defer func() {
+		if r := recover(); r != nil {
+			tr.mu.Lock()
+			ss.stepRunning = false
+			ss.runErr = &cerrors.ErrTestStepPaniced{
+				StepName:   ss.sb.TestStepLabel,
+				StackTrace: string(debug.Stack()),
 			}
+			tr.mu.Unlock()
+			tr.safeCloseOutCh(ss)
 		}
-	}(terminateInjectionCtx, inCh)
-
-	errCh := make(chan error, 1)
-	go func() {
-		log.Infof("running pipeline")
-		errCh <- testPipeline.run(completedTargets)
 	}()
-
-	defer terminateInjection()
-	// Receive targets from the completed channel controlled by the pipeline, while
-	// waiting for termination signals or fatal errors encountered while running
-	// the pipeline.
-	for {
-		select {
-		case err := <-errCh:
-			log.Debugf("test runner terminated, returning %v", err)
-			return err
-		case t := <-completedTargets:
-			log.Infof("test runner completed target: %v", t)
+	chans := test.TestStepChannels{In: ss.inCh, Out: ss.outCh, Err: ss.errCh}
+	runErr := ss.sb.TestStep.Run(ctx, chans, ss.sb.Parameters, ss.ev)
+	if err := ss.emitStepEvent(nil, runErr); err != nil {
+		if ss.runErr == nil {
+			ss.runErr = err
 		}
 	}
+	tr.mu.Lock()
+	ss.stepRunning = false
+	if runErr != nil {
+		ss.runErr = runErr
+	}
+	tr.mu.Unlock()
+	// Signal to the result processor that no more will be coming.
+	tr.safeCloseOutCh(ss)
+	ss.log.Debugf("%s: step runner finished", ss)
 }
 
-// NewTestRunner initializes and returns a new TestRunner object. This test
-// runner will use default timeout values
-func NewTestRunner() TestRunner {
-	return TestRunner{
-		timeouts: TestRunnerTimeouts{
-			StepInjectTimeout:   config.StepInjectTimeout,
-			MessageTimeout:      config.TestRunnerMsgTimeout,
-			ShutdownTimeout:     config.TestRunnerShutdownTimeout,
-			StepShutdownTimeout: config.TestRunnerStepShutdownTimeout,
-		},
+// reportTargetResult reports result of executing a step to the appropriate target runner.
+func (tr *testRunner) reportTargetResult(ctx statectx.Context, ss *stepState, tgt *target.Target, res error) error {
+	resCh, err := func() (chan error, error) {
+		tr.mu.Lock()
+		defer tr.mu.Unlock()
+		ts := tr.targets[tgt.ID]
+		if ts == nil {
+			return nil, fmt.Errorf("%s: result for nonexistent target %s %v", ss, tgt, res)
+		}
+		if ss.tgtDone[tgt] {
+			return nil, &cerrors.ErrTestStepReturnedDuplicateResult{
+				StepName: ss.sb.TestStepLabel,
+				Target:   tgt.ID,
+			}
+		}
+		ss.tgtDone[tgt] = true
+		// Begin is also allowed here because it may happen that we get a result before target runner updates phase.
+		if ts.CurStep != ss.stepIndex || (ts.CurPhase != targetStepPhaseBegin && ts.CurPhase != targetStepPhaseRun) {
+			return nil, &cerrors.ErrTestStepReturnedUnexpectedResult{
+				StepName: ss.sb.TestStepLabel,
+				Target:   tgt.ID,
+			}
+		}
+		if ts.resCh == nil {
+			// This should not happen, must be an internal error.
+			return nil, fmt.Errorf("%s: target runner %s is not there, dropping result on the floor", ss, ts)
+		}
+		ss.log.Debugf("%s: result for %s: %v", ss, ts, res)
+		return ts.resCh, nil
+	}()
+	if err != nil {
+		return err
+	}
+	if res != nil {
+		if err := ss.emitStepEvent(tgt, res); err != nil {
+			return err
+		}
+	}
+	select {
+	case resCh <- res:
+		break
+	case <-ctx.Done():
+		break
 	}
+	return nil
 }
 
-// NewTestRunnerWithTimeouts initializes and returns a new TestRunner object with
-// custom timeouts
-func NewTestRunnerWithTimeouts(timeouts TestRunnerTimeouts) TestRunner {
-	return TestRunner{timeouts: timeouts}
+func (tr *testRunner) safeCloseOutCh(ss *stepState) {
+	defer func() {
+		if r := recover(); r != nil {
+			tr.mu.Lock()
+			ss.runErr = &cerrors.ErrTestStepClosedChannels{StepName: ss.sb.TestStepLabel}
+			tr.mu.Unlock()
+		}
+	}()
+	close(ss.outCh)
 }
 
-// State is a structure that models the current state of the test runner
-type State struct {
-	completedSteps   map[string]error
-	completedRouting map[string]error
-	completedTargets map[*target.Target]error
+// safeCloseErrCh closes error channel safely, even if it has already been closed.
+func (tr *testRunner) safeCloseErrCh(ss *stepState) {
+	defer func() {
+		if r := recover(); r != nil {
+			tr.mu.Lock()
+			ss.runErr = &cerrors.ErrTestStepClosedChannels{StepName: ss.sb.TestStepLabel}
+			tr.mu.Unlock()
+		}
+	}()
+	close(ss.errCh)
 }
 
-// NewState initializes a State object.
-func NewState() *State {
-	r := State{}
-	r.completedSteps = make(map[string]error)
-	r.completedRouting = make(map[string]error)
-	r.completedTargets = make(map[*target.Target]error)
-	return &r
+// stepReader receives results from the step's output channel and forwards them to the appropriate target runners.
+func (tr *testRunner) stepReader(ctx statectx.Context, ss *stepState) {
+	ss.log.Debugf("%s: step reader active", ss)
+	var err error
+	outCh := ss.outCh
+loop:
+	for {
+		select {
+		case tgt, ok := <-outCh:
+			if !ok {
+				ss.log.Debugf("%s: out chan closed", ss)
+				// At this point we may still have an error to report,
+				// wait until error channel is emptied too.
+				outCh = make(chan *target.Target)
+				tr.safeCloseErrCh(ss)
+				break
+			}
+			if err = tr.reportTargetResult(ctx, ss, tgt, nil); err != nil {
+				break loop
+			}
+		case res, ok := <-ss.errCh:
+			if !ok {
+				ss.log.Debugf("%s: err chan closed", ss)
+				break loop
+			}
+			if err = tr.reportTargetResult(ctx, ss, res.Target, res.Err); err != nil {
+				break loop
+			}
+		case <-ctx.Done():
+			ss.log.Debugf("%s: canceled 3, draining", ss)
+			for {
+				select {
+				case <-outCh:
+					break loop
+				case <-ss.errCh:
+					break loop
+				case <-time.After(tr.shutdownTimeout):
+					tr.mu.Lock()
+					if ss.runErr == nil {
+						ss.runErr = &cerrors.ErrTestStepsNeverReturned{}
+					}
+					tr.mu.Unlock()
+				}
+			}
+		}
+	}
+	tr.mu.Lock()
+	defer tr.mu.Unlock()
+	if ss.runErr == nil && err != nil {
+		ss.runErr = err
+	}
+	if ss.stepRunning && ss.runErr == nil {
+		// This means that plugin closed its channels before leaving.
+		ss.runErr = &cerrors.ErrTestStepClosedChannels{StepName: ss.sb.TestStepLabel}
+	}
+	ss.readerRunning = false
+	ss.log.Debugf("%s: step reader finished, %t %t %v", ss, ss.stepRunning, ss.readerRunning, ss.runErr)
+	tr.cond.Signal()
 }
 
-// CompletedTargets returns a map that associates each target with its returning error.
-// If the target succeeded, the error will be nil
-func (r *State) CompletedTargets() map[*target.Target]error {
-	return r.completedTargets
+// checkStepRunnersLocked checks if any step runner has encountered an error.
+func (tr *testRunner) checkStepRunners() error {
+	for _, ss := range tr.steps {
+		if ss.runErr != nil {
+			return ss.runErr
+		}
+	}
+	return nil
 }
 
-// CompletedRouting returns a map that associates each routing block with its returning error.
-// If the routing block succeeded, the error will be nil
-func (r *State) CompletedRouting() map[string]error {
-	return r.completedRouting
+// runMonitor monitors progress of targets through the pipeline
+// and closes input channels of the steps to indicate that no more are expected.
+// It also monitors steps for critical errors and cancels the whole run.
+// Note: input channels remain open when cancellation is requested,
+// plugins are expected to handle it explicitly.
+func (tr *testRunner) runMonitor() error {
+	tr.log.Debugf("monitor: active")
+	tr.mu.Lock()
+	defer tr.mu.Unlock()
+	// First, compute the starting step of the pipeline (it may be non-zero
+	// if the pipleine was resumed).
+	minStep := len(tr.steps)
+	for _, ts := range tr.targets {
+		if ts.CurStep < minStep {
+			minStep = ts.CurStep
+		}
+	}
+	if minStep < len(tr.steps) {
+		tr.log.Debugf("monitor: starting at step %s", tr.steps[minStep])
+	}
+
+	// Run the main loop.
+	pass := 1
+	var runErr error
+loop:
+	for step := minStep; step < len(tr.steps); pass++ {
+		ss := tr.steps[step]
+		tr.log.Debugf("monitor pass %d: current step %s", pass, ss)
+		// Check if all the targets have either made it past the injection phase or terminated.
+		ok := true
+		for _, ts := range tr.targets {
+			tr.log.Debugf("monitor pass %d: %s: %s", pass, ss, ts)
+			if ts.resCh == nil { // Not running anymore
+				continue
+			}
+			if ok && (ts.CurStep < step || ts.CurPhase < targetStepPhaseRun) {
+				tr.log.Debugf("monitor pass %d: %s: not all targets injected yet (%s)", pass, ss, ts)
+				ok = false
+				break
+			}
+		}
+		if runErr = tr.checkStepRunners(); runErr != nil {
+			break loop
+		}
+		if !ok {
+			// Wait for notification: as progress is being made, we get notified.
+			tr.cond.Wait()
+			continue
+		}
+		// All targets ok, close the step's input channel.
+		tr.log.Debugf("monitor pass %d: %s: no more targets, closing input channel", pass, ss)
+		close(ss.inCh)
+		step++
+	}
+	tr.log.Debugf("monitor: finished, %v", runErr)
+	return runErr
 }
 
-// CompletedSteps returns a map that associates each step with its returning error.
-// If the step succeeded, the error will be nil
-func (r *State) CompletedSteps() map[string]error {
-	return r.completedSteps
+func NewTestRunnerWithTimeouts(stepInjectTimeout, shutdownTimeout time.Duration) TestRunner {
+	tr := &testRunner{
+		stepInjectTimeout: stepInjectTimeout,
+		shutdownTimeout:   shutdownTimeout,
+	}
+	tr.cond = sync.NewCond(&tr.mu)
+	return tr
 }
 
-// SetRouting sets the error associated with a routing block
-func (r *State) SetRouting(testStepLabel string, err error) {
-	r.completedRouting[testStepLabel] = err
+func NewTestRunner() TestRunner {
+	return NewTestRunnerWithTimeouts(config.StepInjectTimeout, config.TestRunnerShutdownTimeout)
 }
 
-// SetTarget sets the error associated with a target
-func (r *State) SetTarget(target *target.Target, err error) {
-	r.completedTargets[target] = err
+func (tph targetStepPhase) String() string {
+	switch tph {
+	case targetStepPhaseInit:
+		return "init"
+	case targetStepPhaseBegin:
+		return "begin"
+	case targetStepPhaseRun:
+		return "run"
+	case targetStepPhaseEnd:
+		return "end"
+	}
+	return fmt.Sprintf("???(%d)", tph)
 }
 
-// SetStep sets the error associated with a step
-func (r *State) SetStep(testStepLabel string, err error) {
-	r.completedSteps[testStepLabel] = err
+func (ss *stepState) String() string {
+	return fmt.Sprintf("[#%d %s]", ss.stepIndex, ss.sb.TestStepLabel)
 }
 
-// IncompleteSteps returns a slice of step names for which the result hasn't been set yet
-func (r *State) IncompleteSteps(bundles []test.TestStepBundle) []string {
-	var incompleteSteps []string
-	for _, bundle := range bundles {
-		if _, ok := r.completedSteps[bundle.TestStepLabel]; !ok {
-			incompleteSteps = append(incompleteSteps, bundle.TestStepLabel)
+func (ts *targetState) String() string {
+	var resText string
+	if ts.res != nil {
+		resStr := fmt.Sprintf("%s", ts.res)
+		if len(resStr) > 20 {
+			resStr = resStr[:20] + "..."
 		}
+		resText = fmt.Sprintf("%q", resStr)
+	} else {
+		resText = "<nil>"
 	}
-	return incompleteSteps
+	finished := ts.resCh == nil
+	return fmt.Sprintf("[%s %d %s %t %s]",
+		ts.tgt, ts.CurStep, ts.CurPhase, finished, resText)
 }
diff --git a/pkg/runner/test_runner_pipeline.go b/pkg/runner/test_runner_pipeline.go
deleted file mode 100644
index bfdaa435..00000000
--- a/pkg/runner/test_runner_pipeline.go
+++ /dev/null
@@ -1,542 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-//
-// This source code is licensed under the MIT license found in the
-// LICENSE file in the root directory of this source tree.
-
-package runner
-
-import (
-	"context"
-	"encoding/json"
-	"fmt"
-	"runtime/debug"
-	"sync/atomic"
-	"time"
-
-	"github.com/facebookincubator/contest/pkg/cerrors"
-	"github.com/facebookincubator/contest/pkg/event/testevent"
-	"github.com/facebookincubator/contest/pkg/logging"
-	"github.com/facebookincubator/contest/pkg/statectx"
-	"github.com/facebookincubator/contest/pkg/storage"
-	"github.com/facebookincubator/contest/pkg/target"
-	"github.com/facebookincubator/contest/pkg/test"
-	"github.com/facebookincubator/contest/pkg/types"
-	"github.com/sirupsen/logrus"
-)
-
-// pipeline represents a sequence of steps through which targets flow. A pipeline could implement
-// either a test sequence or a cleanup sequence
-type pipeline struct {
-	log *logrus.Entry
-
-	bundles []test.TestStepBundle
-
-	jobID types.JobID
-	runID types.RunID
-
-	state *State
-	test  *test.Test
-
-	timeouts TestRunnerTimeouts
-
-	// ctrlChannels represents a set of result and completion channels for this pipeline,
-	// used to collect the results of routing blocks, steps and targets completing
-	// the pipeline. It's available only after having initialized the pipeline
-	ctrlChannels *pipelineCtrlCh
-
-	// numIngress represents the total number of targets that have been seen entering
-	// the pipeline. This number is set by the first routing block as soon as the injection
-	// terminates
-	numIngress uint64
-}
-
-// runStep runs synchronously a TestStep and peforms sanity checks on the status
-// of the input/output channels on the defer control path. When the TestStep returns,
-// the associated output channels are closed. This signals to the routing subsytem
-// that this step of the pipeline will not be producing any more targets. The API
-// dictates that the TestStep can only return if its input channel has been closed,
-// which indicates that no more targets will be submitted to that step. If the
-// TestStep does not comply with this rule, an error is sent downstream to the pipeline
-// runner, which will in turn shutdown the whole pipeline and flag the TestStep
-// as misbehaving. All attempts to send downstream a result after a TestStep complete
-// are protected by a timeout and fail open. This because it's not guaranteed that
-// the TestRunner will still be listening on the result channels. If the TestStep hangs
-// indefinitely and does not respond to cancellation signals, the TestRunner will
-// flag it as misbehaving and return. If the TestStep returns once the TestRunner
-// has completed, it will timeout trying to write on the result channel.
-func (p *pipeline) runStep(ctx statectx.Context, jobID types.JobID, runID types.RunID, bundle test.TestStepBundle, stepCh stepCh, resultCh chan<- stepResult, ev testevent.EmitterFetcher) {
-
-	stepLabel := bundle.TestStepLabel
-	log := logging.AddField(p.log, "step", stepLabel)
-	log = logging.AddField(log, "phase", "runStep")
-
-	log.Debugf("initializing step")
-	timeout := p.timeouts.MessageTimeout
-	defer func() {
-		if r := recover(); r != nil {
-			err := fmt.Errorf("step %s paniced (%v): %s", stepLabel, r, debug.Stack())
-			select {
-			case resultCh <- stepResult{jobID: jobID, runID: runID, bundle: bundle, err: err}:
-			case <-time.After(p.timeouts.MessageTimeout):
-				log.Warningf("sending error back from test step runner timed out after %v: %v", timeout, err)
-				return
-			}
-			return
-		}
-	}()
-
-	// We should not run a test step if there're no targets in stepCh.stepIn
-	// First we check if there's at least one incoming target, and only
-	// then we call `bundle.TestStep.Run()`.
-	//
-	// ITS: https://github.com/facebookincubator/contest/issues/101
-	stepIn, onFirstTargetChan, onNoTargetsChan := waitForFirstTarget(ctx.PausedOrDoneCtx(), stepCh.stepIn)
-
-	haveTargets := false
-	select {
-	case <-onFirstTargetChan:
-		log.Debugf("first target received, will start step")
-		haveTargets = true
-	case <-ctx.Done():
-		log.Debugf("cancelled")
-	case <-ctx.Paused():
-		log.Debugf("paused")
-	case <-onNoTargetsChan:
-		log.Debugf("no targets")
-	}
-
-	var err error
-	if haveTargets {
-		// Run the TestStep and defer to the return path the handling of panic
-		// conditions. If multiple error conditions occur, send downstream only
-		// the first error encountered.
-		channels := test.TestStepChannels{
-			In:  stepIn,
-			Out: stepCh.stepOut,
-			Err: stepCh.stepErr,
-		}
-		err = bundle.TestStep.Run(ctx, channels, bundle.Parameters, ev)
-	}
-
-	log.Debugf("step %s returned", bundle.TestStepLabel)
-
-	// Check if we are shutting down. If so, do not perform sanity checks on the
-	// channels but return immediately as the TestStep itself probably returned
-	// because it honored the termination signal.
-
-	cancellationAsserted := ctx.PausedOrDoneCtx().Err() == statectx.ErrCanceled
-	pauseAsserted := ctx.PausedOrDoneCtx().Err() == statectx.ErrPaused
-
-	if cancellationAsserted && err == nil {
-		err = fmt.Errorf("test step cancelled")
-	}
-
-	if cancellationAsserted || pauseAsserted {
-		select {
-		case resultCh <- stepResult{jobID: jobID, runID: runID, bundle: bundle, err: err}:
-		case <-time.After(timeout):
-			log.Warningf("sending error back from TestStep runner timed out after %v: %v", timeout, err)
-		}
-		return
-	}
-
-	// If the TestStep has crashed with an error, return immediately the result to the TestRunner
-	// which in turn will issue a cancellation signal to the pipeline
-	if err != nil {
-		select {
-		case resultCh <- stepResult{jobID: jobID, runID: runID, bundle: bundle, err: err}:
-		case <-time.After(timeout):
-			log.Warningf("sending error back from test step runner (%s) timed out after %v: %v", stepLabel, timeout, err)
-		}
-		return
-	}
-
-	// Perform sanity checks on the status of the channels. The TestStep API
-	// mandates that output and error channels shall not be closed by the
-	// TestStep itself. If the TestStep does not comply with the API, it is
-	// flagged as misbehaving.
-	select {
-	case _, ok := <-stepCh.stepIn:
-		if !ok {
-			break
-		}
-		// stepCh.stepIn is not closed, but the TestStep returned, which is a violation
-		// of the API. Record the error if no other error condition has been seen.
-		err = fmt.Errorf("step %s returned, but input channel is not closed (api violation; case 0)", stepLabel)
-
-	default:
-		// stepCh.stepIn is not closed, and a read operation would block. The TestStep
-		// does not comply with the API (see above).
-		err = fmt.Errorf("step %s returned, but input channel is not closed (api violation; case 1)", stepLabel)
-	}
-
-	select {
-	case _, ok := <-stepCh.stepOut:
-		if !ok {
-			// stepOutCh has been closed. This is a violation of the API. Record the error
-			// if no other error condition has been seen.
-			if err == nil {
-				err = &cerrors.ErrTestStepClosedChannels{StepName: stepLabel}
-			}
-		}
-	default:
-		// stepCh.stepOut is open. Flag it for closure to signal to the routing subsystem that
-		// no more Targets will go through from this channel.
-		close(stepCh.stepOut)
-	}
-
-	select {
-	case _, ok := <-stepCh.stepErr:
-		if !ok {
-			// stepErrCh has been closed. This is a violation of the API. Record the error
-			// if no other error condition has been seen.
-			if err == nil {
-				err = &cerrors.ErrTestStepClosedChannels{StepName: stepLabel}
-			}
-		}
-	default:
-		// stepCh.stepErr is open. Flag it for closure to signal to the routing subsystem that
-		// no more Targets will go through from this channel.
-		close(stepCh.stepErr)
-	}
-
-	select {
-	case resultCh <- stepResult{jobID: jobID, runID: runID, bundle: bundle, err: err}:
-	case <-time.After(timeout):
-		log.Warningf("sending error back from step runner (%s) timed out after %v: %v", stepLabel, timeout, err)
-		return
-	}
-}
-
-// waitTargets reads results coming from results channels until all Targets
-// have completed or an error occurs. If all Targets complete successfully, it checks
-// whether TestSteps and routing blocks have completed as well. If not, returns an
-// error. Termination is signalled via terminate channel.
-func (p *pipeline) waitTargets(ctx context.Context, completedCh chan<- *target.Target) error {
-
-	log := logging.AddField(p.log, "phase", "waitTargets")
-
-	var (
-		err                  error
-		completedTarget      *target.Target
-		completedTargetError error
-	)
-
-	outChannel := p.ctrlChannels.targetOut
-
-	writer := newTargetWriter(log, p.timeouts)
-	for {
-		select {
-		case t, ok := <-outChannel:
-			if !ok {
-				log.Debugf("pipeline output channel was closed, no more targets will come through")
-				outChannel = nil
-				break
-			}
-			completedTarget = t
-		case <-ctx.Done():
-			// When termination is signaled just stop wait. It is up
-			// to the caller to decide how to further handle pipeline termination.
-			log.Debugf("termination requested")
-			return nil
-		case res := <-p.ctrlChannels.routingResultCh:
-			err = res.err
-			p.state.SetRouting(res.bundle.TestStepLabel, res.err)
-		case res := <-p.ctrlChannels.stepResultCh:
-			err = res.err
-			if err != nil {
-				payload, jmErr := json.Marshal(err.Error())
-				if jmErr != nil {
-					log.Warningf("failed to marshal error string to JSON: %v", jmErr)
-					continue
-				}
-				rm := json.RawMessage(payload)
-				header := testevent.Header{
-					JobID:         res.jobID,
-					RunID:         res.runID,
-					TestName:      p.test.Name,
-					TestStepLabel: res.bundle.TestStepLabel,
-				}
-				ev := storage.NewTestEventEmitterFetcher(header)
-				// this event is not associated to any target, e.g. a plugin has returned an error.
-				errEv := testevent.Data{EventName: EventTestError, Target: nil, Payload: &rm}
-				// emit test event containing the completion error
-				if err := ev.Emit(errEv); err != nil {
-					log.Warningf("could not emit completion error event %v", errEv)
-				}
-			}
-			p.state.SetStep(res.bundle.TestStepLabel, res.err)
-
-		case targetErr := <-p.ctrlChannels.targetErr:
-			completedTarget = targetErr.Target
-			completedTargetError = targetErr.Err
-		}
-
-		if err != nil {
-			return err
-		}
-
-		if outChannel == nil {
-			log.Debugf("no more targets to wait, output channel is closed")
-			break
-		}
-
-		if completedTarget != nil {
-			p.state.SetTarget(completedTarget, completedTargetError)
-			log.Debugf("writing target %+v on the completed channel", completedTarget)
-			if err := writer.writeTimeout(ctx, completedCh, completedTarget, p.timeouts.MessageTimeout); err != nil {
-				log.Panicf("could not write completed target: %v", err)
-			}
-			completedTarget = nil
-			completedTargetError = nil
-		}
-
-		numIngress := atomic.LoadUint64(&p.numIngress)
-		numCompleted := uint64(len(p.state.CompletedTargets()))
-		log.Debugf("targets completed: %d, expected: %d", numCompleted, numIngress)
-		if numIngress != 0 && numCompleted == numIngress {
-			log.Debugf("no more targets to wait, all targets (%d) completed", numCompleted)
-			break
-		}
-	}
-	// The test run completed, we have collected all Targets. TestSteps might have already
-	// closed `ch.out`, in which case the pipeline terminated correctly (channels are closed
-	// in a "domino" sequence, so seeing the last channel closed indicates that the
-	// sequence of close operations has completed). If `ch.out` is still open,
-	// there are still TestSteps that might have not returned. Wait for all
-	// TestSteps to complete or `StepShutdownTimeout` to occur.
-	log.Infof("waiting for all steps to complete")
-	return p.waitSteps()
-}
-
-// waitTermination reads results coming from result channels waiting
-// for the pipeline to completely shutdown before `ShutdownTimeout` occurs. A
-// "complete shutdown" means that all TestSteps and routing blocks have sent
-// downstream their results.
-func (p *pipeline) waitTermination() error {
-
-	if len(p.bundles) == 0 {
-		return fmt.Errorf("no bundles specified for waitTermination")
-	}
-
-	log := logging.AddField(p.log, "phase", "waitTermination")
-	log.Printf("waiting for pipeline to terminate")
-
-	for {
-
-		log.Debugf("steps completed: %d", len(p.state.CompletedSteps()))
-		log.Debugf("routing completed: %d", len(p.state.CompletedRouting()))
-		stepsCompleted := len(p.state.CompletedSteps()) == len(p.bundles)
-		routingCompleted := len(p.state.CompletedRouting()) == len(p.bundles)
-		if stepsCompleted && routingCompleted {
-			return nil
-		}
-
-		select {
-		case <-time.After(p.timeouts.ShutdownTimeout):
-			incompleteSteps := p.state.IncompleteSteps(p.bundles)
-			if len(incompleteSteps) > 0 {
-				return &cerrors.ErrTestStepsNeverReturned{StepNames: incompleteSteps}
-			}
-			return fmt.Errorf("pipeline did not return but all test steps completed")
-		case res := <-p.ctrlChannels.routingResultCh:
-			p.state.SetRouting(res.bundle.TestStepLabel, res.err)
-		case res := <-p.ctrlChannels.stepResultCh:
-			p.state.SetStep(res.bundle.TestStepLabel, res.err)
-		}
-	}
-}
-
-// waitSteps reads results coming from result channels until `StepShutdownTimeout`
-// occurs or an error is encountered. It then checks whether TestSteps and routing
-// blocks have all returned correctly. If not, it returns an error.
-func (p *pipeline) waitSteps() error {
-
-	if len(p.bundles) == 0 {
-		return fmt.Errorf("no bundles specified for waitSteps")
-	}
-
-	log := logging.AddField(p.log, "phase", "waitSteps")
-
-	var err error
-
-	log.Debugf("waiting for test steps to terminate")
-	for {
-		select {
-		case <-time.After(p.timeouts.StepShutdownTimeout):
-			log.Warningf("timed out waiting for steps to complete after %v", p.timeouts.StepShutdownTimeout)
-			incompleteSteps := p.state.IncompleteSteps(p.bundles)
-			if len(incompleteSteps) > 0 {
-				err = &cerrors.ErrTestStepsNeverReturned{StepNames: incompleteSteps}
-				break
-			}
-			if len(p.state.CompletedRouting()) != len(p.bundles) {
-				err = fmt.Errorf("not all routing completed: %d!=%d", len(p.state.CompletedRouting()), len(p.bundles))
-			}
-		case res := <-p.ctrlChannels.routingResultCh:
-			log.Debugf("received routing block result for %s", res.bundle.TestStepLabel)
-			p.state.SetRouting(res.bundle.TestStepLabel, res.err)
-			err = res.err
-		case res := <-p.ctrlChannels.stepResultCh:
-			log.Debugf("received step result for %s", res.bundle.TestStepLabel)
-			p.state.SetStep(res.bundle.TestStepLabel, res.err)
-			err = res.err
-		}
-		if err != nil {
-			return err
-		}
-		log.Debugf("steps completed: %d, expected: %d", len(p.state.CompletedSteps()), len(p.bundles))
-		log.Debugf("routing completed: %d, expected: %d", len(p.state.CompletedRouting()), len(p.bundles))
-		stepsCompleted := len(p.state.CompletedSteps()) == len(p.bundles)
-		routingCompleted := len(p.state.CompletedRouting()) == len(p.bundles)
-		if stepsCompleted && routingCompleted {
-			break
-		}
-	}
-
-	return nil
-}
-
-// init initializes the pipeline by connecting steps and routing blocks. The result of pipeline
-// initialization is a set of control/result channels assigned to the pipeline object. The pipeline
-// input channel is returned.
-func (p *pipeline) init(ctx statectx.Context) (routeInFirst chan *target.Target) {
-	p.log.Debugf("starting")
-
-	if p.ctrlChannels != nil {
-		p.log.Panicf("pipeline is already initialized, control channel are already configured")
-	}
-
-	var (
-		routeOut chan *target.Target
-		routeIn  chan *target.Target
-	)
-
-	pipelineCtx, pause, cancel := statectx.WithParent(ctx)
-
-	// result channels used to communicate result information from the routing blocks
-	// and step executors
-	routingResultCh := make(chan routeResult)
-	stepResultCh := make(chan stepResult)
-	targetErrCh := make(chan cerrors.TargetError)
-
-	routeIn = make(chan *target.Target)
-	for position, testStepBundle := range p.bundles {
-
-		// Input and output channels for the TestStep
-		stepInCh := make(chan *target.Target)
-		stepOutCh := make(chan *target.Target)
-		stepErrCh := make(chan cerrors.TargetError)
-
-		routeOut = make(chan *target.Target)
-
-		// First step of the pipeline, keep track of the routeIn channel as this is
-		// going to be used to injects targets into the pipeline from outside. Also
-		// add an intermediate goroutine which keeps track of how many targets have
-		// been injected into the pipeline
-		if position == 0 {
-			routeInFirst = make(chan *target.Target)
-			routeInStep := routeIn
-			go func() {
-				defer close(routeInStep)
-				numIngress := uint64(0)
-				for t := range routeInFirst {
-					routeInStep <- t
-					numIngress++
-				}
-				atomic.StoreUint64(&p.numIngress, numIngress)
-			}()
-		}
-
-		stepChannels := stepCh{stepIn: stepInCh, stepErr: stepErrCh, stepOut: stepOutCh}
-		routingChannels := routingCh{
-			routeIn:   routeIn,
-			routeOut:  routeOut,
-			stepIn:    stepInCh,
-			stepErr:   stepErrCh,
-			stepOut:   stepOutCh,
-			targetErr: targetErrCh,
-		}
-
-		// Build the Header that the the TestStep will be using for emitting events
-		Header := testevent.Header{
-			JobID:         p.jobID,
-			RunID:         p.runID,
-			TestName:      p.test.Name,
-			TestStepLabel: testStepBundle.TestStepLabel,
-		}
-		ev := storage.NewTestEventEmitterFetcherWithAllowedEvents(Header, &testStepBundle.AllowedEvents)
-
-		router := newStepRouter(p.log, testStepBundle, routingChannels, ev, p.timeouts)
-		go router.route(pipelineCtx.PausedOrDoneCtx(), routingResultCh)
-		go p.runStep(pipelineCtx, p.jobID, p.runID, testStepBundle, stepChannels, stepResultCh, ev)
-		// The input of the next routing block is the output of the current routing block
-		routeIn = routeOut
-	}
-
-	p.ctrlChannels = &pipelineCtrlCh{
-		routingResultCh: routingResultCh,
-		stepResultCh:    stepResultCh,
-		targetErr:       targetErrCh,
-		targetOut:       routeOut,
-
-		ctx:    pipelineCtx,
-		cancel: cancel,
-		pause:  pause,
-	}
-
-	return
-}
-
-// run is a blocking method which executes the pipeline until successful or failed termination
-func (p *pipeline) run(completedTargetsCh chan<- *target.Target) error {
-	p.log.Debugf("run")
-	if p.ctrlChannels == nil {
-		p.log.Panicf("pipeline is not initialized, control channels are not available")
-	}
-	ctx := p.ctrlChannels.ctx
-
-	// Wait for the pipeline to complete. If an error occurrs, cancel all TestSteps
-	// and routing blocks and wait again for completion until shutdown timeout occurrs.
-	p.log.Infof("waiting for pipeline to complete")
-	completionError := p.waitTargets(ctx.PausedOrDoneCtx(), completedTargetsCh)
-
-	pauseAsserted := ctx.PausedOrDoneCtx().Err() == statectx.ErrPaused
-	cancellationAsserted := ctx.PausedOrDoneCtx().Err() == statectx.ErrCanceled
-
-	if completionError != nil {
-		p.log.Warningf("test failed to complete: %v. Forcing cancellation.", completionError)
-		p.ctrlChannels.cancel() // terminate routing and and propagate cancellation to the steps
-	} else if cancellationAsserted {
-		p.log.Infof("cancellation was asserted")
-	} else if pauseAsserted {
-		p.log.Warningf("received pause request")
-	}
-
-	// If either cancellation or pause have been asserted, we need to wait for the
-	// pipeline to terminate
-	if cancellationAsserted || pauseAsserted || completionError != nil {
-		signal := "cancellation"
-		if pauseAsserted {
-			signal = "pause"
-		}
-		terminationError := p.waitTermination()
-		if terminationError != nil {
-			p.log.Infof("test did not terminate correctly after %s signal: %v", signal, terminationError)
-		} else {
-			p.log.Infof("test terminated correctly after %s signal", signal)
-		}
-		if completionError != nil {
-			return completionError
-		}
-		return terminationError
-	}
-	p.log.Infof("completed")
-	return nil
-}
-
-func newPipeline(log *logrus.Entry, bundles []test.TestStepBundle, test *test.Test, jobID types.JobID, runID types.RunID, timeouts TestRunnerTimeouts) *pipeline {
-	p := pipeline{log: log, bundles: bundles, jobID: jobID, runID: runID, test: test, timeouts: timeouts}
-	p.state = NewState()
-	return &p
-}
diff --git a/pkg/runner/test_runner_route.go b/pkg/runner/test_runner_route.go
deleted file mode 100644
index 4f5a880d..00000000
--- a/pkg/runner/test_runner_route.go
+++ /dev/null
@@ -1,302 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-//
-// This source code is licensed under the MIT license found in the
-// LICENSE file in the root directory of this source tree.
-
-package runner
-
-import (
-	"container/list"
-	"context"
-	"encoding/json"
-	"fmt"
-	"log"
-	"sync"
-	"time"
-
-	"github.com/facebookincubator/contest/pkg/event/testevent"
-	"github.com/facebookincubator/contest/pkg/logging"
-	"github.com/facebookincubator/contest/pkg/target"
-	"github.com/facebookincubator/contest/pkg/test"
-	"github.com/sirupsen/logrus"
-)
-
-// router implements the routing logic that injects targets into a test step and consumes
-// targets in output from another test step
-type stepRouter struct {
-	log *logrus.Entry
-
-	routingChannels routingCh
-	bundle          test.TestStepBundle
-	ev              testevent.EmitterFetcher
-
-	timeouts TestRunnerTimeouts
-}
-
-// routeIn is responsible for accepting a target from the previous routing block
-// and injecting it into the associated test step. Returns the numer of targets
-// injected into the test step or an error upon failure
-func (r *stepRouter) routeIn(ctx context.Context) (int, error) {
-
-	stepLabel := r.bundle.TestStepLabel
-	log := logging.AddField(r.log, "step", stepLabel)
-	log = logging.AddField(log, "phase", "routeIn")
-
-	var (
-		err             error
-		injectionWg     sync.WaitGroup
-		routeInProgress bool
-	)
-
-	// terminateTargetWriter is a control channel used to signal termination to
-	// the writer object which injects a target into the test step
-	terminateTargetWriterCtx, terminateTargetWriter := context.WithCancel(context.Background())
-	defer terminateTargetWriter() // avoids possible goroutine deadlock in context.WithCancel implementation
-
-	// `targets` is used to buffer targets coming from the previous routing blocks,
-	// queueing them for injection into the TestStep. The list is accessed
-	// synchronously by a single goroutine.
-	targets := list.New()
-
-	// `ingressTarget` is used to keep track of ingress times of a target into a test step
-	ingressTarget := make(map[string]time.Time)
-
-	// Channel that the injection goroutine uses to communicate back to `routeIn` the results
-	// of asynchronous injection
-	injectResultCh := make(chan injectionResult)
-
-	// injectionChannels are used to inject targets into test step and return results to `routeIn`
-	injectionChannels := injectionCh{stepIn: r.routingChannels.stepIn, resultCh: injectResultCh}
-
-	log.Debugf("initializing routeIn for %s", stepLabel)
-	targetWriter := newTargetWriter(log, r.timeouts)
-
-	for {
-		select {
-		case <-ctx.Done():
-			err = fmt.Errorf("termination requested for routing into %s", stepLabel)
-		case injectionResult := <-injectResultCh:
-			log.Debugf("received injection result for %v", injectionResult.target)
-			routeInProgress = false
-			if injectionResult.err != nil {
-				err = fmt.Errorf("routing failed while injecting target %+v into %s", injectionResult.target, stepLabel)
-				targetInErrEv := testevent.Data{EventName: target.EventTargetInErr, Target: injectionResult.target}
-				if err := r.ev.Emit(targetInErrEv); err != nil {
-					log.Warningf("could not emit %v event for target %+v: %v", targetInErrEv, *injectionResult.target, err)
-				}
-			} else {
-				targetInEv := testevent.Data{EventName: target.EventTargetIn, Target: injectionResult.target}
-				if err := r.ev.Emit(targetInEv); err != nil {
-					log.Warningf("could not emit %v event for Target: %+v", targetInEv, *injectionResult.target)
-				}
-			}
-		case t, chanIsOpen := <-r.routingChannels.routeIn:
-			if !chanIsOpen {
-				log.Debugf("routing input channel closed")
-				r.routingChannels.routeIn = nil
-			} else {
-				log.Debugf("received target %v in input", t)
-				targets.PushFront(t)
-			}
-		}
-
-		if err != nil {
-			break
-		}
-
-		if routeInProgress {
-			continue
-		}
-
-		// no targets currently being injected in the test step
-		if targets.Len() == 0 {
-			if r.routingChannels.routeIn == nil {
-				log.Debugf("input channel is closed and no more targets are available, closing step input channel")
-				close(r.routingChannels.stepIn)
-				break
-			}
-			continue
-		}
-
-		t := targets.Back().Value.(*target.Target)
-		ingressTarget[t.ID] = time.Now()
-		targets.Remove(targets.Back())
-		log.Debugf("writing target %v into test step", t)
-		routeInProgress = true
-		injectionWg.Add(1)
-		go func() {
-			defer injectionWg.Done()
-			targetWriter.writeTargetWithResult(terminateTargetWriterCtx, t, injectionChannels)
-		}()
-	}
-	// Signal termination to the injection routines regardless of the result of the
-	// routing. If the routing completed successfully, this is a no-op. If there is an
-	// injection goroutine running, wait for it to terminate, as we might have gotten
-	// here after a cancellation signal.
-	terminateTargetWriter()
-	injectionWg.Wait()
-
-	if err != nil {
-		log.Debugf("routeIn failed: %v", err)
-		return 0, err
-	}
-	return len(ingressTarget), nil
-}
-
-func (r *stepRouter) emitOutEvent(t *target.Target, err error) error {
-
-	log := logging.AddField(r.log, "step", r.bundle.TestStepLabel)
-	log = logging.AddField(log, "phase", "emitOutEvent")
-
-	if err != nil {
-		targetErrPayload := target.ErrPayload{Error: err.Error()}
-		payloadEncoded, err := json.Marshal(targetErrPayload)
-		if err != nil {
-			log.Warningf("could not encode target error ('%s'): %v", targetErrPayload, err)
-		}
-		rawPayload := json.RawMessage(payloadEncoded)
-		targetErrEv := testevent.Data{EventName: target.EventTargetErr, Target: t, Payload: &rawPayload}
-		if err := r.ev.Emit(targetErrEv); err != nil {
-			return err
-		}
-	} else {
-		targetOutEv := testevent.Data{EventName: target.EventTargetOut, Target: t}
-		if err := r.ev.Emit(targetOutEv); err != nil {
-			log.Warningf("could not emit %v event for target: %v", targetOutEv, *t)
-		}
-	}
-	return nil
-}
-
-// routeOut is responsible for accepting a target from the associated test step
-// and forward it to the next routing block. Returns the number of targets
-// received from the test step or an error upon failure
-func (r *stepRouter) routeOut(ctx context.Context) (int, error) {
-
-	stepLabel := r.bundle.TestStepLabel
-	log := logging.AddField(r.log, "step", stepLabel)
-	log = logging.AddField(log, "phase", "routeOut")
-
-	targetWriter := newTargetWriter(log, r.timeouts)
-
-	var err error
-
-	log.Debugf("initializing routeOut for %s", stepLabel)
-	// `egressTarget` is used to keep track of egress times of a target from a test step
-	egressTarget := make(map[string]time.Time)
-
-	for {
-		select {
-		case <-ctx.Done():
-			err = fmt.Errorf("termination requested for routing into %s", r.bundle.TestStepLabel)
-		case t, chanIsOpen := <-r.routingChannels.stepOut:
-			if !chanIsOpen {
-				log.Debugf("step output closed")
-				r.routingChannels.stepOut = nil
-				break
-			}
-
-			if _, targetPresent := egressTarget[t.ID]; targetPresent {
-				err = fmt.Errorf("step %s returned target %+v multiple times", r.bundle.TestStepLabel, t)
-				break
-			}
-			// Emit an event signaling that the target has left the TestStep
-			if err := r.emitOutEvent(t, nil); err != nil {
-				log.Warningf("could not emit out event for target %v: %v", *t, err)
-			}
-			// Register egress time and forward target to the next routing block
-			egressTarget[t.ID] = time.Now()
-			if err := targetWriter.writeTimeout(ctx, r.routingChannels.routeOut, t, r.timeouts.MessageTimeout); err != nil {
-				log.Panicf("could not forward target to the test runner: %+v", err)
-			}
-		case targetError, chanIsOpen := <-r.routingChannels.stepErr:
-			if !chanIsOpen {
-				log.Debugf("step error closed")
-				r.routingChannels.stepErr = nil
-				break
-			}
-
-			if _, targetPresent := egressTarget[targetError.Target.ID]; targetPresent {
-				err = fmt.Errorf("step %s returned target %+v multiple times", r.bundle.TestStepLabel, targetError.Target)
-			} else {
-				if err := r.emitOutEvent(targetError.Target, targetError.Err); err != nil {
-					log.Warningf("could not emit err event for target: %v", *targetError.Target)
-				}
-				egressTarget[targetError.Target.ID] = time.Now()
-				if err := targetWriter.writeTargetError(ctx, r.routingChannels.targetErr, targetError, r.timeouts.MessageTimeout); err != nil {
-					log.Panicf("could not forward target (%+v) to the test runner: %v", targetError.Target, err)
-				}
-			}
-		}
-		if err != nil {
-			break
-		}
-		if r.routingChannels.stepErr == nil && r.routingChannels.stepOut == nil {
-			log.Debugf("output and error channel from step are closed, routeOut should terminate")
-			close(r.routingChannels.routeOut)
-			break
-		}
-	}
-
-	if err != nil {
-		log.Debugf("routeOut failed: %v", err)
-		return 0, err
-	}
-	return len(egressTarget), nil
-
-}
-
-// route implements the routing logic from the previous routing block to the test step
-// and from the test step to the next routing block
-func (r *stepRouter) route(ctx context.Context, resultCh chan<- routeResult) {
-
-	var (
-		inTargets, outTargets   int
-		errRouteIn, errRouteOut error
-	)
-
-	terminateInternalCtx, terminateInternal := context.WithCancel(ctx)
-	defer terminateInternal() // avoids possible goroutine deadlock in context.WithCancel implementation
-
-	wg := sync.WaitGroup{}
-	wg.Add(2)
-	go func() {
-		defer wg.Done()
-		inTargets, errRouteIn = r.routeIn(terminateInternalCtx)
-		if errRouteIn != nil {
-			terminateInternal()
-		}
-	}()
-
-	go func() {
-		defer wg.Done()
-		outTargets, errRouteOut = r.routeOut(terminateInternalCtx)
-		if errRouteOut != nil {
-			terminateInternal()
-		}
-	}()
-	wg.Wait()
-
-	routingErr := errRouteIn
-	if routingErr == nil {
-		routingErr = errRouteOut
-	}
-	if routingErr == nil && inTargets != outTargets {
-		routingErr = fmt.Errorf("step %s completed but did not return all injected Targets (%d!=%d)", r.bundle.TestStepLabel, inTargets, outTargets)
-	}
-
-	// Send the result to the test runner, which is expected to be listening
-	// within `MessageTimeout`. If that's not the case, we hit an unrecovrable
-	// condition.
-	select {
-	case resultCh <- routeResult{bundle: r.bundle, err: routingErr}:
-	case <-time.After(r.timeouts.MessageTimeout):
-		log.Panicf("could not send routing block result")
-	}
-}
-
-func newStepRouter(log *logrus.Entry, bundle test.TestStepBundle, routingChannels routingCh, ev testevent.EmitterFetcher, timeouts TestRunnerTimeouts) *stepRouter {
-	routerLogger := logging.AddField(log, "step", bundle.TestStepLabel)
-	r := stepRouter{log: routerLogger, bundle: bundle, routingChannels: routingChannels, ev: ev, timeouts: timeouts}
-	return &r
-}
diff --git a/pkg/runner/test_runner_route_test.go b/pkg/runner/test_runner_route_test.go
deleted file mode 100644
index d34d18ae..00000000
--- a/pkg/runner/test_runner_route_test.go
+++ /dev/null
@@ -1,400 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-//
-// This source code is licensed under the MIT license found in the
-// LICENSE file in the root directory of this source tree.
-
-package runner
-
-import (
-	"context"
-	"fmt"
-	"testing"
-	"time"
-
-	"github.com/facebookincubator/contest/pkg/cerrors"
-	"github.com/facebookincubator/contest/pkg/event"
-	"github.com/facebookincubator/contest/pkg/event/testevent"
-	"github.com/facebookincubator/contest/pkg/logging"
-	"github.com/facebookincubator/contest/pkg/pluginregistry"
-	"github.com/facebookincubator/contest/pkg/storage"
-	"github.com/facebookincubator/contest/pkg/target"
-	"github.com/facebookincubator/contest/pkg/test"
-	"github.com/facebookincubator/contest/plugins/storage/memory"
-	"github.com/facebookincubator/contest/plugins/teststeps/example"
-
-	"github.com/stretchr/testify/require"
-	"github.com/stretchr/testify/suite"
-)
-
-var testSteps = map[string]test.TestStepFactory{
-	example.Name: example.New,
-}
-
-var testStepsEvents = map[string][]event.Name{
-	example.Name: example.Events,
-}
-
-type TestRunnerSuite struct {
-	suite.Suite
-	routingChannels routingCh
-	router          *stepRouter
-
-	// the following channels are the same channels wrapped within `routingChannels`,
-	// but used in the opposite direction compared to the routing block. When the
-	// routing block reads results, the corresponding channel of the test suite has
-	// write direction, when the routing block writes results, the corresponding channel
-	// of the test suite has read direction
-	routeInCh  chan<- *target.Target
-	routeOutCh <-chan *target.Target
-
-	stepInCh  <-chan *target.Target
-	stepOutCh chan<- *target.Target
-	stepErrCh chan<- cerrors.TargetError
-
-	targetErrCh <-chan cerrors.TargetError
-}
-
-func (suite *TestRunnerSuite) SetupTest() {
-
-	log := logging.GetLogger("TestRunnerSuite")
-
-	pluginRegistry := pluginregistry.NewPluginRegistry()
-	// Setup the PluginRegistry by registering TestSteps
-	for name, tsfactory := range testSteps {
-		if _, ok := testStepsEvents[name]; !ok {
-			suite.T().Errorf("test step %s does not define any associated event", name)
-		}
-		err := pluginRegistry.RegisterTestStep(name, tsfactory, testStepsEvents[name])
-		require.NoError(suite.T(), err)
-	}
-	ts1, err := pluginRegistry.NewTestStep("Example")
-	require.NoError(suite.T(), err)
-
-	params := make(test.TestStepParameters)
-	bundle := test.TestStepBundle{TestStep: ts1, TestStepLabel: "FirstStage", Parameters: params}
-
-	routeInCh := make(chan *target.Target)
-	routeOutCh := make(chan *target.Target)
-	stepInCh := make(chan *target.Target)
-	stepOutCh := make(chan *target.Target)
-	stepErrCh := make(chan cerrors.TargetError)
-	targetErrCh := make(chan cerrors.TargetError)
-
-	suite.routeInCh = routeInCh
-	suite.routeOutCh = routeOutCh
-	suite.stepOutCh = stepOutCh
-	suite.stepInCh = stepInCh
-	suite.stepErrCh = stepErrCh
-	suite.targetErrCh = targetErrCh
-
-	suite.routingChannels = routingCh{
-		routeIn:   routeInCh,
-		routeOut:  routeOutCh,
-		stepIn:    stepInCh,
-		stepErr:   stepErrCh,
-		stepOut:   stepOutCh,
-		targetErr: targetErrCh,
-	}
-
-	s, err := memory.New()
-	require.NoError(suite.T(), err)
-	err = storage.SetStorage(s)
-	require.NoError(suite.T(), err)
-
-	header := testevent.Header{
-		JobID:         1,
-		RunID:         1,
-		TestName:      "TestRunnerSuite",
-		TestStepLabel: "TestRunnerSuite",
-	}
-	ev := storage.NewTestEventEmitterFetcher(header)
-
-	timeouts := TestRunnerTimeouts{
-		StepInjectTimeout:   30 * time.Second,
-		MessageTimeout:      25 * time.Second,
-		ShutdownTimeout:     5 * time.Second,
-		StepShutdownTimeout: 5 * time.Second,
-	}
-
-	suite.router = newStepRouter(log, bundle, suite.routingChannels, ev, timeouts)
-}
-
-func (suite *TestRunnerSuite) TestRouteInRoutesAllTargets() {
-
-	// test that all targets are routed to step input channel without delay, in order
-	targets := []*target.Target{
-		{ID: "001", FQDN: "host001.facebook.com"},
-		{ID: "002", FQDN: "host002.facebook.com"},
-		{ID: "003", FQDN: "host003.facebook.com"},
-	}
-
-	routeInResult := make(chan error)
-	stepInResult := make(chan error)
-
-	go func() {
-		// start routing
-		_, _ = suite.router.routeIn(context.Background())
-	}()
-
-	// inject targets
-	go func() {
-		defer close(suite.routeInCh)
-		for _, target := range targets {
-			select {
-			case <-time.After(2 * time.Second):
-				routeInResult <- fmt.Errorf("target should be accepted by routeIn block within timeout")
-				return
-			case suite.routeInCh <- target:
-			}
-		}
-		routeInResult <- nil
-	}()
-
-	// mimic the test step consuming targets in input
-	go func() {
-		numTargets := 0
-		for {
-			select {
-			case t, ok := <-suite.stepInCh:
-				if !ok {
-					if numTargets != len(targets) {
-						stepInResult <- fmt.Errorf("not all targets received by teste step")
-					} else {
-						stepInResult <- nil
-					}
-					return
-				}
-				if numTargets+1 > len(targets) {
-					stepInResult <- fmt.Errorf("more targets returned than injected")
-					return
-				}
-				if t != targets[numTargets] {
-					stepInResult <- fmt.Errorf("targets returned in wrong order")
-					return
-				}
-				numTargets++
-			case <-time.After(2 * time.Second):
-				stepInResult <- fmt.Errorf("expected target on channel within timeout")
-			}
-		}
-	}()
-
-	for {
-		select {
-		case <-time.After(5 * time.Second):
-			suite.T().Errorf("test should return within timeout")
-		case err := <-stepInResult:
-			stepInResult = nil
-			if err != nil {
-				suite.T().Errorf("step in returned error: %v", err)
-			}
-		case err := <-routeInResult:
-			routeInResult = nil
-			if err != nil {
-				suite.T().Errorf("route in returned error: %v", err)
-			}
-		}
-		if stepInResult == nil && routeInResult == nil {
-			return
-		}
-	}
-}
-
-func (suite *TestRunnerSuite) TestRouteOutRoutesAllSuccessfulTargets() {
-
-	// test that all targets are routed in output from a test step are received by
-	// the routing logic and forwarded to the next routing block
-	targets := []*target.Target{
-		{ID: "001", FQDN: "host001.facebook.com"},
-		{ID: "002", FQDN: "host002.facebook.com"},
-		{ID: "003", FQDN: "host003.facebook.com"},
-	}
-
-	go func() {
-		// start routing
-		_, _ = suite.router.routeOut(context.Background())
-	}()
-
-	stepResult := make(chan error)
-	routeResult := make(chan error)
-
-	// mimic the test step returning targets on the output channel. This test exercise
-	// the path where all targets complete successfully
-	go func() {
-		// it's expected that the step closes both channels
-		defer close(suite.stepOutCh)
-		defer close(suite.stepErrCh)
-		for _, target := range targets {
-			select {
-			case <-time.After(2 * time.Second):
-				stepResult <- fmt.Errorf("target should be accepted by routing block within timeout")
-				return
-			case suite.stepOutCh <- target:
-			}
-		}
-		stepResult <- nil
-	}()
-
-	// mimic the next routing block and the pipeline reading from the routeOut channel and the targetErr
-	// channel. We don't expect any target coming through targetErr, we expect all targets to be successful
-	go func() {
-		numTargets := 0
-		for {
-			select {
-			case _, ok := <-suite.targetErrCh:
-				if !ok {
-					suite.targetErrCh = nil
-				} else {
-					routeResult <- fmt.Errorf("no targets expected on the error channel")
-					return
-				}
-			case t, ok := <-suite.routeOutCh:
-				if !ok {
-					if numTargets != len(targets) {
-						routeResult <- fmt.Errorf("not all targets have been returned")
-					} else {
-						routeResult <- nil
-					}
-					return
-				}
-				if numTargets+1 > len(targets) {
-					routeResult <- fmt.Errorf("more targets returned than injected")
-					return
-				}
-				if t.ID != targets[numTargets].ID {
-					routeResult <- fmt.Errorf("targets returned in wrong order")
-					return
-				}
-				numTargets++
-			case <-time.After(2 * time.Second):
-				routeResult <- fmt.Errorf("expected target on channel within timeout")
-				return
-			}
-		}
-	}()
-
-	for {
-		select {
-		case <-time.After(5 * time.Second):
-			suite.T().Errorf("test should return within timeout")
-		case err := <-stepResult:
-			stepResult = nil
-			if err != nil {
-				suite.T().Errorf("step output returned error: %v", err)
-			}
-		case err := <-routeResult:
-			routeResult = nil
-			if err != nil {
-				suite.T().Errorf("router returned error: %v", err)
-			}
-		}
-		if stepResult == nil && routeResult == nil {
-			return
-		}
-	}
-}
-
-func (suite *TestRunnerSuite) TestRouteOutRoutesAllFailedTargets() {
-
-	// test that all targets are routed in output from a test step are received by
-	// the routing logic and forwarded to the next routing block
-	targets := []*target.Target{
-		{ID: "001", FQDN: "host001.facebook.com"},
-		{ID: "002", FQDN: "host002.facebook.com"},
-		{ID: "003", FQDN: "host003.facebook.com"},
-	}
-
-	go func() {
-		// start routing
-		_, _ = suite.router.routeOut(context.Background())
-	}()
-
-	stepResult := make(chan error)
-	routeResult := make(chan error)
-
-	// mimic the test step returning targets on the output channel. This test exercise
-	// the path where all targets complete successfully
-	go func() {
-		// it's expected that the step closes both channels
-		defer close(suite.stepOutCh)
-		defer close(suite.stepErrCh)
-		for _, target := range targets {
-			targetErr := cerrors.TargetError{Target: target, Err: fmt.Errorf("test error")}
-			select {
-			case <-time.After(2 * time.Second):
-				stepResult <- fmt.Errorf("target should be accepted by routing block within timeout")
-				return
-			case suite.stepErrCh <- targetErr:
-			}
-		}
-		stepResult <- nil
-	}()
-
-	// mimic the next routing block and the pipeline reading from the routeOut channel and the targetErr
-	// channel. We don't expect any target coming through targetErr, we expect all targets to be successful
-	go func() {
-		numTargets := 0
-		for {
-			select {
-			case targetErr, ok := <-suite.targetErrCh:
-				if !ok {
-					routeResult <- fmt.Errorf("target error channel should not be closed by routing block")
-					return
-				}
-				if targetErr.Err == nil {
-					routeResult <- fmt.Errorf("expected error associated to the target")
-					return
-				}
-				if numTargets+1 > len(targets) {
-					routeResult <- fmt.Errorf("more targets returned than injected")
-					return
-				}
-				if targetErr.Target.ID != targets[numTargets].ID {
-					routeResult <- fmt.Errorf("targets returned in wrong order")
-					return
-				}
-
-				numTargets++
-				if numTargets == len(targets) {
-					routeResult <- nil
-					return
-				}
-			case _, ok := <-suite.routeOutCh:
-				if !ok {
-					suite.routeOutCh = nil
-				} else {
-					routeResult <- fmt.Errorf("no targets expected on the output channel")
-					return
-				}
-			case <-time.After(2 * time.Second):
-				routeResult <- fmt.Errorf("expected target on channel within timeout")
-				return
-			}
-		}
-	}()
-
-	for {
-		select {
-		case <-time.After(5 * time.Second):
-			suite.T().Errorf("test should return within timeout")
-		case err := <-stepResult:
-			stepResult = nil
-			if err != nil {
-				suite.T().Errorf("step output routine returned error: %v", err)
-			}
-		case err := <-routeResult:
-			routeResult = nil
-			if err != nil {
-				suite.T().Errorf("router returned error: %v", err)
-			}
-		}
-		if stepResult == nil && routeResult == nil {
-			return
-		}
-	}
-}
-
-func TestTestRunnerSuite(t *testing.T) {
-	TestRunnerSuite := &TestRunnerSuite{}
-	suite.Run(t, TestRunnerSuite)
-}
diff --git a/pkg/runner/test_runner_test.go b/pkg/runner/test_runner_test.go
new file mode 100644
index 00000000..70c590cf
--- /dev/null
+++ b/pkg/runner/test_runner_test.go
@@ -0,0 +1,548 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+package runner
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/facebookincubator/contest/pkg/cerrors"
+	"github.com/facebookincubator/contest/pkg/event"
+	"github.com/facebookincubator/contest/pkg/event/testevent"
+	"github.com/facebookincubator/contest/pkg/logging"
+	"github.com/facebookincubator/contest/pkg/pluginregistry"
+	"github.com/facebookincubator/contest/pkg/statectx"
+	"github.com/facebookincubator/contest/pkg/storage"
+	"github.com/facebookincubator/contest/pkg/target"
+	"github.com/facebookincubator/contest/pkg/test"
+	"github.com/facebookincubator/contest/pkg/types"
+	"github.com/facebookincubator/contest/plugins/storage/memory"
+	"github.com/facebookincubator/contest/plugins/teststeps/example"
+	"github.com/facebookincubator/contest/tests/plugins/teststeps/badtargets"
+	"github.com/facebookincubator/contest/tests/plugins/teststeps/channels"
+	"github.com/facebookincubator/contest/tests/plugins/teststeps/hanging"
+	"github.com/facebookincubator/contest/tests/plugins/teststeps/noreturn"
+	"github.com/facebookincubator/contest/tests/plugins/teststeps/panicstep"
+)
+
+const testName = "SimpleTest"
+
+var (
+	evs            storage.ResettableStorage
+	pluginRegistry *pluginregistry.PluginRegistry
+)
+
+func TestMain(m *testing.M) {
+	flag.Parse()
+	logging.Debug()
+	if ms, err := memory.New(); err == nil {
+		evs = ms
+		if err := storage.SetStorage(ms); err != nil {
+			panic(err.Error())
+		}
+	} else {
+		panic(fmt.Sprintf("could not initialize in-memory storage layer: %v", err))
+	}
+	pluginRegistry = pluginregistry.NewPluginRegistry()
+	for _, e := range []struct {
+		name    string
+		factory test.TestStepFactory
+		events  []event.Name
+	}{
+		{badtargets.Name, badtargets.New, badtargets.Events},
+		{channels.Name, channels.New, channels.Events},
+		{example.Name, example.New, example.Events},
+		{hanging.Name, hanging.New, hanging.Events},
+		{noreturn.Name, noreturn.New, noreturn.Events},
+		{panicstep.Name, panicstep.New, panicstep.Events},
+	} {
+		if err := pluginRegistry.RegisterTestStep(e.name, e.factory, e.events); err != nil {
+			panic(fmt.Sprintf("could not register TestStep: %v", err))
+		}
+	}
+	os.Exit(m.Run())
+}
+
+func eventToStringNoTime(ev testevent.Event) string {
+	// Omit the timestamp to make output stable.
+	return fmt.Sprintf("{%s%s}", ev.Header, ev.Data)
+}
+
+func resetEventStorage() {
+	if err := evs.Reset(); err != nil {
+		panic(err.Error())
+	}
+}
+
+func tgt(id string) *target.Target {
+	return &target.Target{ID: id}
+}
+
+func getEvents(targetID, stepLabel *string) string {
+	q, _ := testevent.BuildQuery(testevent.QueryTestName(testName))
+	results, _ := evs.GetTestEvents(q)
+	var resultsForTarget []string
+	for _, r := range results {
+		if targetID != nil {
+			if r.Data.Target == nil {
+				continue
+			}
+			if *targetID != "" && r.Data.Target.ID != *targetID {
+				continue
+			}
+		}
+		if stepLabel != nil {
+			if *stepLabel != "" && r.Header.TestStepLabel != *stepLabel {
+				continue
+			}
+			if targetID == nil && r.Data.Target != nil {
+				continue
+			}
+		}
+		resultsForTarget = append(resultsForTarget, eventToStringNoTime(r))
+	}
+	return "\n" + strings.Join(resultsForTarget, "\n") + "\n"
+}
+
+func getStepEvents(stepLabel string) string {
+	return getEvents(nil, &stepLabel)
+}
+
+func getTargetEvents(targetID string) string {
+	return getEvents(&targetID, nil)
+}
+
+func newStep(label, name string, params *test.TestStepParameters) test.TestStepBundle {
+	td := test.TestStepDescriptor{
+		Name:  name,
+		Label: label,
+	}
+	if params != nil {
+		td.Parameters = *params
+	}
+	sb, err := pluginRegistry.NewTestStepBundle(td, nil)
+	if err != nil {
+		panic(fmt.Sprintf("failed to create test step bundle: %v", err))
+	}
+	return *sb
+}
+
+func newExampleStep(label string, failPct int, failTargets string, delayTargets string) test.TestStepBundle {
+	return newStep(label, example.Name, &test.TestStepParameters{
+		example.FailPctParam:      []test.Param{*test.NewParam(fmt.Sprintf("%d", failPct))},
+		example.FailTargetsParam:  []test.Param{*test.NewParam(failTargets)},
+		example.DelayTargetsParam: []test.Param{*test.NewParam(delayTargets)},
+	})
+}
+
+type runRes struct {
+	res []byte
+	err error
+}
+
+func runWithTimeout(t *testing.T, tr TestRunner, ctx statectx.Context, resumeState []byte, runID types.RunID, timeout time.Duration, targets []*target.Target, bundles []test.TestStepBundle) ([]byte, error) {
+	newCtx, _, cancel := statectx.WithParent(ctx)
+	test := &test.Test{
+		Name:             testName,
+		TestStepsBundles: bundles,
+	}
+	resCh := make(chan runRes)
+	go func() {
+		res, err := tr.Run(newCtx, test, targets, 1, runID, resumeState)
+		resCh <- runRes{res: res, err: err}
+	}()
+	var res runRes
+	select {
+	case res = <-resCh:
+	case <-time.After(timeout):
+		cancel()
+		assert.FailNow(t, "TestRunner should not time out")
+	}
+	return res.res, res.err
+}
+
+// Simple case: one target, one step, success.
+func Test1Step1Success(t *testing.T) {
+	resetEventStorage()
+	tr := NewTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1")},
+		[]test.TestStepBundle{
+			newExampleStep("Step 1", 0, "", ""),
+		},
+	)
+	require.NoError(t, err)
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepRunningEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepFinishedEvent]}
+`, getStepEvents(""))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleFinishedEvent]}
+`, getTargetEvents("T1"))
+}
+
+// Simple case: one target, one step, failure.
+func Test1Step1Fail(t *testing.T) {
+	resetEventStorage()
+	tr := NewTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1")},
+		[]test.TestStepBundle{
+			newExampleStep("Step 1", 100, "", ""),
+		},
+	)
+	require.NoError(t, err)
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepRunningEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepFinishedEvent]}
+`, getStepEvents("Step 1"))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleFailedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestError &"\"target failed\""]}
+`, getTargetEvents("T1"))
+}
+
+// One step pipeline with two targets - one fails, one succeeds.
+func Test1Step1Success1Fail(t *testing.T) {
+	resetEventStorage()
+	tr := NewTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1"), tgt("T2")},
+		[]test.TestStepBundle{
+			newExampleStep("Step 1", 0, "T1", "T2=100"),
+		},
+	)
+	require.NoError(t, err)
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepRunningEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepFinishedEvent]}
+`, getStepEvents(""))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleFailedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestError &"\"target failed\""]}
+`, getTargetEvents("T1"))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} ExampleStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} ExampleFinishedEvent]}
+`, getTargetEvents("T2"))
+}
+
+// Three-step pipeline, two targets: T1 fails at step 1, T2 fails at step 2,
+// step 3 is not reached and not even run.
+func Test3StepsNotReachedStepNotRun(t *testing.T) {
+	resetEventStorage()
+	tr := NewTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1"), tgt("T2")},
+		[]test.TestStepBundle{
+			newExampleStep("Step 1", 0, "T1", ""),
+			newExampleStep("Step 2", 0, "T2", ""),
+			newExampleStep("Step 3", 0, "", ""),
+		},
+	)
+	require.NoError(t, err)
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepRunningEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepFinishedEvent]}
+`, getStepEvents("Step 1"))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 2][(*Target)(nil) ExampleStepRunningEvent]}
+{[1 1 SimpleTest Step 2][(*Target)(nil) ExampleStepFinishedEvent]}
+`, getStepEvents("Step 2"))
+	require.Equal(t, "\n\n", getStepEvents("Step 3"))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleFailedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestError &"\"target failed\""]}
+`, getTargetEvents("T1"))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} ExampleStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} ExampleFinishedEvent]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} ExampleStartedEvent]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} ExampleFailedEvent]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} TestError &"\"target failed\""]}
+`, getTargetEvents("T2"))
+}
+
+// A misbehaving step that fails to shut down properly after processing targets
+// and does not return.
+func TestNoReturnStepWithCorrectTargetForwarding(t *testing.T) {
+	resetEventStorage()
+	tr := NewTestRunnerWithTimeouts(100*time.Millisecond, 200*time.Millisecond)
+	ctx, _, cancel := statectx.New()
+	defer cancel()
+	_, err := runWithTimeout(t, tr, ctx, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1")},
+		[]test.TestStepBundle{
+			newStep("Step 1", noreturn.Name, nil),
+		},
+	)
+	require.Error(t, err)
+	require.IsType(t, &cerrors.ErrTestStepsNeverReturned{}, err)
+}
+
+// A misbehaving step that does not process any targets.
+func TestNoReturnStepWithoutTargetForwarding(t *testing.T) {
+	resetEventStorage()
+	tr := NewTestRunnerWithTimeouts(100*time.Millisecond, 200*time.Millisecond)
+	ctx, _, cancel := statectx.New()
+	defer cancel()
+	_, err := runWithTimeout(t, tr, ctx, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1")},
+		[]test.TestStepBundle{
+			newStep("Step 1", hanging.Name, nil),
+		},
+	)
+	require.Error(t, err)
+	require.IsType(t, &cerrors.ErrTestTargetInjectionTimedOut{}, err)
+}
+
+// A misbehaving step that panics.
+func TestStepPanics(t *testing.T) {
+	resetEventStorage()
+	tr := NewTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1")},
+		[]test.TestStepBundle{
+			newStep("Step 1", panicstep.Name, nil),
+		},
+	)
+	require.Error(t, err)
+	require.IsType(t, &cerrors.ErrTestStepPaniced{}, err)
+	require.Equal(t, "\n\n", getTargetEvents("T1"))
+}
+
+// A misbehaving step that closes its output channel.
+func TestStepClosesChannels(t *testing.T) {
+	resetEventStorage()
+	tr := NewTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1")},
+		[]test.TestStepBundle{
+			newStep("Step 1", channels.Name, nil),
+		},
+	)
+	require.Error(t, err)
+	require.IsType(t, &cerrors.ErrTestStepClosedChannels{}, err)
+	require.Equal(t, "\n\n", getTargetEvents("T1"))
+}
+
+// A misbehaving step that yields a result for a target that does not exist.
+func TestStepYieldsResultForNonexistentTarget(t *testing.T) {
+	resetEventStorage()
+	tr := NewTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1")},
+		[]test.TestStepBundle{
+			newStep("Step 1", badtargets.Name, nil),
+		},
+	)
+	require.Error(t, err)
+}
+
+// A misbehaving step that yields a result for a target that does not exist.
+func TestStepYieldsDuplicateResult(t *testing.T) {
+	resetEventStorage()
+	tr := NewTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("TGood"), tgt("TDup")},
+		[]test.TestStepBundle{
+			// TGood makes it past here unscathed and gets delayed in Step 2,
+			// TDup also emerges fine at first but is then returned again, and that's bad.
+			newStep("Step 1", badtargets.Name, nil),
+			newExampleStep("Step 2", 0, "", "T2=100"),
+		},
+	)
+	require.Error(t, err)
+	require.IsType(t, &cerrors.ErrTestStepReturnedDuplicateResult{}, err)
+}
+
+// A misbehaving step that loses targets.
+func TestStepLosesTargets(t *testing.T) {
+	resetEventStorage()
+	tr := NewTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("TGood"), tgt("TDrop")},
+		[]test.TestStepBundle{
+			newStep("Step 1", badtargets.Name, nil),
+		},
+	)
+	require.Error(t, err)
+	require.IsType(t, &cerrors.ErrTestStepLostTargets{}, err)
+}
+
+// A misbehaving step that yields a result for a target that does exist
+// but is not currently waiting for it.
+func TestStepYieldsResultForUnexpectedTarget(t *testing.T) {
+	resetEventStorage()
+	tr := NewTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1"), tgt("T1XXX")},
+		[]test.TestStepBundle{
+			// T1XXX fails here.
+			newExampleStep("Step 1", 0, "T1XXX", ""),
+			// Yet, a result for it is returned here, which we did not expect.
+			newStep("Step 2", badtargets.Name, nil),
+		},
+	)
+	require.Error(t, err)
+	require.IsType(t, &cerrors.ErrTestStepReturnedUnexpectedResult{}, err)
+}
+
+// Larger, randomized test - a number of steps, some targets failing, some succeeding.
+func TestRandomizedMultiStep(t *testing.T) {
+	resetEventStorage()
+	tr := NewTestRunner()
+	var targets []*target.Target
+	for i := 1; i <= 100; i++ {
+		targets = append(targets, tgt(fmt.Sprintf("T%d", i)))
+	}
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		targets,
+		[]test.TestStepBundle{
+			newExampleStep("Step 1", 0, "", "*=10"),  // All targets pass the first step, with a slight delay
+			newExampleStep("Step 2", 25, "", ""),     // 25% don't make it past the second step
+			newExampleStep("Step 3", 25, "", "*=10"), // Another 25% fail at the third step
+		},
+	)
+	require.NoError(t, err)
+	// Every target mush have started and finished the first step.
+	numFinished := 0
+	for _, tgt := range targets {
+		s1n := "Step 1"
+		require.Equal(t, fmt.Sprintf(`
+{[1 1 SimpleTest Step 1][Target{ID: "%s"} ExampleStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "%s"} ExampleFinishedEvent]}
+`, tgt.ID, tgt.ID),
+			getEvents(&tgt.ID, &s1n))
+		s3n := "Step 3"
+		if strings.Contains(getEvents(&tgt.ID, &s3n), "ExampleFinishedEvent") {
+			numFinished++
+		}
+	}
+	// At least some must have finished.
+	require.Greater(t, numFinished, 0)
+}
+
+// Test pausing/resuming a naive step that does not cooperate.
+// In this case we drain input, wait for all targets to emerge and exit gracefully.
+func TestPauseResumeSimple(t *testing.T) {
+	resetEventStorage()
+	log := logging.GetLogger("TestPauseResumeSimple")
+	var err error
+	var resumeState []byte
+	targets := []*target.Target{tgt("T1"), tgt("T2"), tgt("T3")}
+	steps := []test.TestStepBundle{
+		newExampleStep("Step 1", 0, "T1", ""),
+		// T2 and T3 will be paused here, the step will be given time to finish.
+		newExampleStep("Step 2", 0, "", "T2=200,T3=200"),
+		newExampleStep("Step 3", 0, "", ""),
+	}
+	{
+		tr1 := NewTestRunner()
+		ctx1, pause, cancel := statectx.New()
+		defer cancel()
+		go func() {
+			time.Sleep(100 * time.Millisecond)
+			log.Infof("TestPauseResumeNaive: pausing")
+			pause()
+		}()
+		resumeState, err = runWithTimeout(t, tr1, ctx1, nil, 1, 2*time.Second, targets, steps)
+		require.Error(t, err)
+		require.IsType(t, statectx.ErrPaused, err)
+		require.NotNil(t, resumeState)
+	}
+	log.Debugf("Resume state: %s", string(resumeState))
+	// Make sure that resume state is validated.
+	{
+		tr := NewTestRunner()
+		ctx, _, cancel := statectx.New()
+		defer cancel()
+		resumeState2, err := runWithTimeout(
+			t, tr, ctx, []byte("FOO"), 2, 2*time.Second, targets, steps)
+		require.Error(t, err)
+		require.Contains(t, err.Error(), "invalid resume state")
+		require.Nil(t, resumeState2)
+	}
+	{
+		tr := NewTestRunner()
+		ctx, _, cancel := statectx.New()
+		defer cancel()
+		resumeState2 := strings.Replace(string(resumeState), `"version"`, `"Xversion"`, 1)
+		_, err := runWithTimeout(
+			t, tr, ctx, []byte(resumeState2), 3, 2*time.Second, targets, steps)
+		require.Error(t, err)
+		require.Contains(t, err.Error(), "incompatible resume state")
+	}
+	{
+		tr := NewTestRunner()
+		ctx, _, cancel := statectx.New()
+		defer cancel()
+		resumeState2 := strings.Replace(string(resumeState), `"job_id":1`, `"job_id":2`, 1)
+		_, err := runWithTimeout(
+			t, tr, ctx, []byte(resumeState2), 4, 2*time.Second, targets, steps)
+		require.Error(t, err)
+		require.Contains(t, err.Error(), "wrong resume state")
+	}
+	// Finally, resume and finish the job.
+	{
+		tr2 := NewTestRunner()
+		ctx2, _, cancel := statectx.New()
+		defer cancel()
+		_, err := runWithTimeout(t, tr2, ctx2, resumeState, 5, 2*time.Second,
+			// Pass exactly the same targets and pipeline to resume properly.
+			// Don't use the same pointers ot make sure there is no reliance on that.
+			[]*target.Target{tgt("T1"), tgt("T2"), tgt("T3")},
+			[]test.TestStepBundle{
+				newExampleStep("Step 1", 0, "T1", ""),
+				newExampleStep("Step 2", 0, "", "T2=200,T3=200"),
+				newExampleStep("Step 3", 0, "", ""),
+			},
+		)
+		require.NoError(t, err)
+	}
+	// Verify step events.
+	// Steps 1 and 2 are executed entirely within the first runner instance
+	// and never started in the second.
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepRunningEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepFinishedEvent]}
+`, getStepEvents("Step 1"))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 2][(*Target)(nil) ExampleStepRunningEvent]}
+{[1 1 SimpleTest Step 2][(*Target)(nil) ExampleStepFinishedEvent]}
+`, getStepEvents("Step 2"))
+	// Step 3 did not get to start in the first instance and ran in the second.
+	require.Equal(t, `
+{[1 5 SimpleTest Step 3][(*Target)(nil) ExampleStepRunningEvent]}
+{[1 5 SimpleTest Step 3][(*Target)(nil) ExampleStepFinishedEvent]}
+`, getStepEvents("Step 3"))
+	// T1 failed entirely within the first run.
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleFailedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestError &"\"target failed\""]}
+`, getTargetEvents("T1"))
+	// T2 and T3 ran in both.
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} ExampleStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} ExampleFinishedEvent]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} ExampleStartedEvent]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} ExampleFinishedEvent]}
+{[1 5 SimpleTest Step 3][Target{ID: "T2"} ExampleStartedEvent]}
+{[1 5 SimpleTest Step 3][Target{ID: "T2"} ExampleFinishedEvent]}
+`, getTargetEvents("T2"))
+}
diff --git a/pkg/runner/wait_for_first_target.go b/pkg/runner/wait_for_first_target.go
deleted file mode 100644
index 8d0602ff..00000000
--- a/pkg/runner/wait_for_first_target.go
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-//
-// This source code is licensed under the MIT license found in the
-// LICENSE file in the root directory of this source tree.
-
-package runner
-
-import (
-	"context"
-	"github.com/facebookincubator/contest/pkg/target"
-)
-
-// waitForFirstTarget copies targets from `in` to `out` and also
-// signals on first target or if `in` is closed and there was no targets
-// at all.
-//
-// To solve the initial problem there're two approaches:
-// * Just wait for a target in `stepCh.stepIn` and then insert
-// it back to the channel.
-// * Create substitute `stepCh.stepIn` with a new channel and just
-// copy all targets from original `stepCh.stepIn` to this new channel.
-// And we may detect when we receive the first target in the original
-// channel.
-//
-// The first approach is much simpler, but the second preserves
-// the order of targets (which is handy and not misleading
-// while reading logs). Here we implement the second one:
-func waitForFirstTarget(
-	ctx context.Context,
-	in <-chan *target.Target,
-) (out chan *target.Target, onFirstTarget, onNoTargets <-chan struct{}) {
-	onFirstTargetCh := make(chan struct{})
-	onNoTargetsCh := make(chan struct{})
-	onFirstTarget, onNoTargets = onFirstTargetCh, onNoTargetsCh
-
-	out = make(chan *target.Target)
-	go func() {
-		select {
-		case t, ok := <-in:
-			if !ok {
-				close(out)
-				close(onNoTargetsCh)
-				return
-			}
-			close(onFirstTargetCh)
-			out <- t
-		case <-ctx.Done():
-			return
-		}
-
-		for t := range in {
-			out <- t
-		}
-		close(out)
-	}()
-
-	return
-}
diff --git a/pkg/runner/wait_for_first_target_test.go b/pkg/runner/wait_for_first_target_test.go
deleted file mode 100644
index c5439035..00000000
--- a/pkg/runner/wait_for_first_target_test.go
+++ /dev/null
@@ -1,142 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-//
-// This source code is licensed under the MIT license found in the
-// LICENSE file in the root directory of this source tree.
-
-package runner
-
-import (
-	"context"
-	"fmt"
-	"runtime"
-	"sync"
-	"testing"
-
-	"github.com/facebookincubator/contest/pkg/target"
-	"github.com/stretchr/testify/require"
-)
-
-func TestWaitForFirstTarget(t *testing.T) {
-	t.Run("100targets", func(t *testing.T) {
-		ch0 := make(chan *target.Target)
-		ch1, onFirstTargetChan, onNoTargetsChan := waitForFirstTarget(context.Background(), ch0)
-
-		var wgBeforeFirstTarget, wgAfterSecondTarget sync.WaitGroup
-		wgBeforeFirstTarget.Add(1)
-		wgAfterSecondTarget.Add(1)
-
-		go func() {
-			wgBeforeFirstTarget.Wait()
-			for i := 0; i < 100; i++ {
-				ch0 <- &target.Target{
-					ID: fmt.Sprintf("%d", i),
-				}
-				if i == 1 {
-					// to avoid race conditions in this test we verify
-					// if there's a "onFirstTarget" signal after the second
-					// target
-					wgAfterSecondTarget.Done()
-				}
-			}
-		}()
-
-		runtime.Gosched()
-		select {
-		case <-onFirstTargetChan:
-			t.Fatal("should not happen")
-		case <-onNoTargetsChan:
-			t.Fatal("should not happen")
-		default:
-		}
-
-		wgBeforeFirstTarget.Done()
-
-		var wgReader sync.WaitGroup
-		wgReader.Add(1)
-		go func() {
-			defer wgReader.Done()
-			// Checking quantity and order
-			for i := 0; i < 100; i++ {
-				_target := <-ch1
-				require.Equal(t, &target.Target{
-					ID: fmt.Sprintf("%d", i),
-				}, _target)
-			}
-		}()
-
-		wgAfterSecondTarget.Wait()
-
-		select {
-		case <-onFirstTargetChan:
-		case <-onNoTargetsChan:
-			t.Fatal("should not happen")
-		default:
-			t.Fatal("should not happen") // see wgAfterSecondTarget.Wait() above
-		}
-
-		wgReader.Wait()
-
-		runtime.Gosched()
-		require.Len(t, ch1, 0)
-	})
-
-	t.Run("no_target", func(t *testing.T) {
-		ch0 := make(chan *target.Target)
-		ch1, onFirstTargetChan, onNoTargetsChan := waitForFirstTarget(context.Background(), ch0)
-
-		runtime.Gosched()
-		select {
-		case <-onFirstTargetChan:
-			t.Fatal("should not happen")
-		case <-onNoTargetsChan:
-			t.Fatal("should not happen")
-		default:
-		}
-
-		close(ch0)
-
-		select {
-		case <-onFirstTargetChan:
-			t.Fatal("should not happen")
-		case <-onNoTargetsChan:
-		}
-
-		runtime.Gosched()
-		require.Len(t, ch1, 0)
-		_, isOpened := <-ch1
-		require.False(t, isOpened)
-	})
-
-	t.Run("cancel", func(t *testing.T) {
-		cancelCh := make(chan struct{})
-		ch0 := make(chan *target.Target)
-		ch1, onFirstTargetChan, onNoTargetsChan := waitForFirstTarget(context.Background(), ch0)
-
-		runtime.Gosched()
-		select {
-		case <-onFirstTargetChan:
-			t.Fatal("should not happen") // see wgBeforeFirstTarget.Wait() above
-		case <-onNoTargetsChan:
-			t.Fatal("should not happen")
-		default:
-		}
-
-		close(cancelCh)
-
-		runtime.Gosched()
-		select {
-		case <-onFirstTargetChan:
-			t.Fatal("should not happen")
-		case <-onNoTargetsChan:
-			t.Fatal("should not happen")
-		default:
-		}
-
-		require.Len(t, ch1, 0)
-		select {
-		case <-ch1:
-			t.Fatal("should not happen")
-		default:
-		}
-	})
-}
diff --git a/pkg/statectx/context.go b/pkg/statectx/context.go
index a22456cc..1dbcc36d 100644
--- a/pkg/statectx/context.go
+++ b/pkg/statectx/context.go
@@ -39,10 +39,10 @@ func Background() Context {
 	}
 }
 
-func New() (Context, func(), func()) {
-	cancelCtx, cancel := newCancelContext(context.Background())
-	pauseCtx, pause := newCancelContext(context.Background())
-	pauseOrDoneCtx, pauseOrDone := newCancelContext(context.Background())
+func newInternal(cctx, pctx, cpctx context.Context) (Context, func(), func()) {
+	cancelCtx, cancel := newCancelContext(cctx)
+	pauseCtx, pause := newCancelContext(pctx)
+	pauseOrDoneCtx, pauseOrDone := newCancelContext(cpctx)
 
 	resCtx := &stateCtx{
 		cancelCtx:      cancelCtx,
@@ -59,24 +59,15 @@ func New() (Context, func(), func()) {
 	return resCtx, wrap(pause, ErrPaused), wrap(cancel, ErrCanceled)
 }
 
-func WithParent(ctx Context) (Context, func(), func()) {
-	cancelCtx, cancel := newCancelContext(ctx)
-	pauseCtx, pause := newCancelContext(ctx.PausedCtx())
-	pauseOrDoneCtx, pauseOrDone := newCancelContext(ctx.PausedOrDoneCtx())
-
-	resCtx := &stateCtx{
-		cancelCtx:      cancelCtx,
-		pauseCtx:       pauseCtx,
-		pauseOrDoneCtx: pauseOrDoneCtx,
-	}
+func New() (Context, func(), func()) {
+	return newInternal(context.Background(), context.Background(), context.Background())
+}
 
-	wrap := func(action func(err error), err error) func() {
-		return func() {
-			pauseOrDone(err)
-			action(err)
-		}
+func WithParent(ctx Context) (Context, func(), func()) {
+	if ctx == nil {
+		return New()
 	}
-	return resCtx, wrap(pause, ErrPaused), wrap(cancel, ErrCanceled)
+	return newInternal(ctx, ctx.PausedCtx(), ctx.PausedOrDoneCtx())
 }
 
 type stateCtx struct {
diff --git a/pkg/storage/storage.go b/pkg/storage/storage.go
index fe95b017..b56687bd 100644
--- a/pkg/storage/storage.go
+++ b/pkg/storage/storage.go
@@ -58,6 +58,7 @@ type TransactionalStorage interface {
 
 // ResettableStorage is implemented by storage engines that support reset operation
 type ResettableStorage interface {
+	Storage
 	Reset() error
 }
 
diff --git a/pkg/target/target.go b/pkg/target/target.go
index 5593ff3f..7600c5a2 100644
--- a/pkg/target/target.go
+++ b/pkg/target/target.go
@@ -40,9 +40,10 @@ type ErrPayload struct {
 // FQDN, PrimaryIPv4, and PrimaryIPv6 are used by plugins to contact the target, set as many as possible for maximum plugin compatibility.
 // Plugins are generally expected to attempt contacting devices via FQDN, IPv4, and IPv6. Note there is no way to enforce this and more specialized plugins might only support a subset.
 type Target struct {
-	ID string
-    FQDN string
-    PrimaryIPv4, PrimaryIPv6 net.IP
+	ID          string
+	FQDN        string
+	PrimaryIPv4 net.IP
+	PrimaryIPv6 net.IP
 }
 
 func (t *Target) String() string {
diff --git a/plugins/storage/memory/memory.go b/plugins/storage/memory/memory.go
index e29aa0d6..a8a7f588 100644
--- a/plugins/storage/memory/memory.go
+++ b/plugins/storage/memory/memory.go
@@ -225,7 +225,7 @@ func (m *Memory) Version() (uint64, error) {
 }
 
 // New create a new Memory events storage backend
-func New() (storage.Storage, error) {
+func New() (storage.ResettableStorage, error) {
 	m := Memory{lock: &sync.Mutex{}}
 	m.jobRequests = make(map[types.JobID]*job.Request)
 	m.jobReports = make(map[types.JobID]*job.JobReport)
diff --git a/plugins/teststeps/example/example.go b/plugins/teststeps/example/example.go
index a3c17142..c56b150f 100644
--- a/plugins/teststeps/example/example.go
+++ b/plugins/teststeps/example/example.go
@@ -8,14 +8,18 @@ package example
 import (
 	"fmt"
 	"math/rand"
+	"strconv"
 	"strings"
+	"time"
 
 	"github.com/facebookincubator/contest/pkg/cerrors"
 	"github.com/facebookincubator/contest/pkg/event"
 	"github.com/facebookincubator/contest/pkg/event/testevent"
 	"github.com/facebookincubator/contest/pkg/logging"
 	"github.com/facebookincubator/contest/pkg/statectx"
+	"github.com/facebookincubator/contest/pkg/target"
 	"github.com/facebookincubator/contest/pkg/test"
+	"github.com/facebookincubator/contest/plugins/teststeps"
 )
 
 // Name is the name used to look this plugin up.
@@ -23,13 +27,25 @@ var Name = "Example"
 
 var log = logging.GetLogger("teststeps/" + strings.ToLower(Name))
 
+// Params this step accepts.
+const (
+	// A comma-delimited list of target IDs to fail on.
+	FailTargetsParam = "FailTargets"
+	// Alternatively, fail this percentage of targets at random.
+	FailPctParam = "FailPct"
+	// A comma-delimited list of target IDs to delay and by how much, ID=delay_ms.
+	DelayTargetsParam = "DelayTargets"
+)
+
 // events that we may emit during the plugin's lifecycle. This is used in Events below.
 // Note that you don't normally need to emit start/finish/cancellation events as
 // these are emitted automatically by the framework.
 const (
-	StartedEvent  = event.Name("ExampleStartedEvent")
-	FinishedEvent = event.Name("ExampleFinishedEvent")
-	FailedEvent   = event.Name("ExampleFailedEvent")
+	StartedEvent      = event.Name("ExampleStartedEvent")
+	FinishedEvent     = event.Name("ExampleFinishedEvent")
+	FailedEvent       = event.Name("ExampleFailedEvent")
+	StepRunningEvent  = event.Name("ExampleStepRunningEvent")
+	StepFinishedEvent = event.Name("ExampleStepFinishedEvent")
 )
 
 // Events defines the events that a TestStep is allow to emit. Emitting an event
@@ -37,10 +53,12 @@ const (
 var Events = []event.Name{StartedEvent, FinishedEvent, FailedEvent}
 
 // Step is an example implementation of a TestStep which simply
-// consumes Targets in input and pipes them to the output channel with intermediate
-// buffering. It randomly decides if a Target has failed and forwards it on
-// the err channel.
+// consumes Targets in input and pipes them to the output or error channel
+// with intermediate buffering.
 type Step struct {
+	failPct      int64
+	failTargets  map[string]bool
+	delayTargets map[string]time.Duration
 }
 
 // Name returns the name of the Step
@@ -48,50 +66,87 @@ func (ts Step) Name() string {
 	return Name
 }
 
-// Run executes the example step.
-func (ts *Step) Run(ctx statectx.Context, ch test.TestStepChannels, _ test.TestStepParameters, ev testevent.Emitter) error {
-	for {
+func (ts *Step) shouldFail(t *target.Target, params test.TestStepParameters) bool {
+	if ts.failTargets[t.ID] {
+		return true
+	}
+	if ts.failPct > 0 {
+		roll := rand.Int63n(101)
+		return (roll <= ts.failPct)
+	}
+	return false
+}
 
-		r := rand.Intn(3)
+// Run executes the example step.
+func (ts *Step) Run(ctx statectx.Context, ch test.TestStepChannels, params test.TestStepParameters, ev testevent.Emitter) error {
+	f := func(ctx statectx.Context, target *target.Target) error {
+		log.Infof("Executing on target %s", target)
+		// NOTE: you may want more robust error handling here, possibly just
+		//       logging the error, or a retry mechanism. Returning an error
+		//       here means failing the entire job.
+		if err := ev.Emit(testevent.Data{EventName: StartedEvent, Target: target, Payload: nil}); err != nil {
+			return fmt.Errorf("failed to emit start event: %v", err)
+		}
+		delay := ts.delayTargets[target.ID]
+		if delay == 0 {
+			delay = ts.delayTargets["*"]
+		}
 		select {
-		case target := <-ch.In:
-			if target == nil {
-				return nil
-			}
-			log.Infof("Executing on target %s", target)
-			// NOTE: you may want more robust error handling here, possibly just
-			//       logging the error, or a retry mechanism. Returning an error
-			//       here means failing the entire job.
-			if err := ev.Emit(testevent.Data{EventName: StartedEvent, Target: target, Payload: nil}); err != nil {
-				return fmt.Errorf("failed to emit start event: %v", err)
+		case <-time.After(delay):
+		case <-ctx.Done():
+			return statectx.ErrCanceled
+		}
+		if ts.shouldFail(target, params) {
+			if err := ev.Emit(testevent.Data{EventName: FailedEvent, Target: target, Payload: nil}); err != nil {
+				return fmt.Errorf("failed to emit finished event: %v", err)
 			}
-			if r == 1 {
-				select {
-				case <-ctx.PausedOrDone():
-					return nil
-				case ch.Err <- cerrors.TargetError{Target: target, Err: fmt.Errorf("target failed")}:
-					if err := ev.Emit(testevent.Data{EventName: FinishedEvent, Target: target, Payload: nil}); err != nil {
-						return fmt.Errorf("failed to emit finished event: %v", err)
-					}
-				}
-			} else {
-				select {
-				case <-ctx.PausedOrDone():
-					return nil
-				case ch.Out <- target:
-					if err := ev.Emit(testevent.Data{EventName: FailedEvent, Target: target, Payload: nil}); err != nil {
-						return fmt.Errorf("failed to emit failed event: %v", err)
-					}
-				}
+			return fmt.Errorf("target failed")
+		} else {
+			if err := ev.Emit(testevent.Data{EventName: FinishedEvent, Target: target, Payload: nil}); err != nil {
+				return fmt.Errorf("failed to emit failed event: %v", err)
 			}
-		case <-ctx.PausedOrDone():
-			return nil
 		}
+		return nil
+	}
+	if err := ev.Emit(testevent.Data{EventName: StepRunningEvent}); err != nil {
+		return fmt.Errorf("failed to emit failed event: %v", err)
+	}
+	res := teststeps.ForEachTarget(Name, ctx, ch, f)
+	if err := ev.Emit(testevent.Data{EventName: StepFinishedEvent}); err != nil {
+		return fmt.Errorf("failed to emit failed event: %v", err)
 	}
+	return res
 }
 
 // ValidateParameters validates the parameters associated to the TestStep
-func (ts *Step) ValidateParameters(_ test.TestStepParameters) error {
+func (ts *Step) ValidateParameters(params test.TestStepParameters) error {
+	targetsToFail := params.GetOne(FailTargetsParam).String()
+	if len(targetsToFail) > 0 {
+		for _, t := range strings.Split(targetsToFail, ",") {
+			ts.failTargets[t] = true
+		}
+	}
+	targetsToDelay := params.GetOne(DelayTargetsParam).String()
+	if len(targetsToDelay) > 0 {
+		for _, e := range strings.Split(targetsToDelay, ",") {
+			kv := strings.Split(e, "=")
+			if len(kv) != 2 {
+				continue
+			}
+			v, err := strconv.Atoi(kv[1])
+			if err != nil {
+				return fmt.Errorf("invalid FailTargets: %w", err)
+			}
+			ts.delayTargets[kv[0]] = time.Duration(v) * time.Millisecond
+		}
+	}
+	if params.GetOne(FailPctParam).String() != "" {
+		if pct, err := params.GetInt(FailPctParam); err == nil {
+			ts.failPct = pct
+		} else {
+			return fmt.Errorf("invalid FailPct: %w", err)
+		}
+	}
 	return nil
 }
 
@@ -108,7 +163,10 @@ func (ts *Step) CanResume() bool {
 
 // New initializes and returns a new ExampleTestStep.
 func New() test.TestStep {
-	return &Step{}
+	return &Step{
+		failTargets:  make(map[string]bool),
+		delayTargets: make(map[string]time.Duration),
+	}
 }
 
 // Load returns the name, factory and events which are needed to register the step.
diff --git a/plugins/teststeps/teststeps.go b/plugins/teststeps/teststeps.go
index 7e763a8a..b37adf1f 100644
--- a/plugins/teststeps/teststeps.go
+++ b/plugins/teststeps/teststeps.go
@@ -35,15 +35,15 @@ func ForEachTarget(pluginName string, ctx statectx.Context, ch test.TestStepChan
 			log.Errorf("%s: ForEachTarget: failed to apply test step function on target %s: %v", pluginName, t, err)
 			select {
 			case ch.Err <- cerrors.TargetError{Target: t, Err: err}:
-			case <-ctx.PausedOrDone():
-				log.Debugf("%s: ForEachTarget: received cancellation/pause signal while reporting error", pluginName)
+			case <-ctx.Done():
+				log.Debugf("%s: ForEachTarget: received cancellation signal while reporting error", pluginName)
 			}
 		} else {
 			log.Debugf("%s: ForEachTarget: target %s completed successfully", pluginName, t)
 			select {
 			case ch.Out <- t:
-			case <-ctx.PausedOrDone():
-				log.Debugf("%s: ForEachTarget: received cancellation/pause signal while reporting success", pluginName)
+			case <-ctx.Done():
+				log.Debugf("%s: ForEachTarget: received pause signal while reporting success", pluginName)
 			}
 		}
 	}
@@ -69,9 +69,6 @@ func ForEachTarget(pluginName string, ctx statectx.Context, ch test.TestStepChan
 			case <-ctx.Done():
 				log.Debugf("%s: ForEachTarget: incoming loop canceled", pluginName)
 				return
-			case <-ctx.Paused():
-				log.Debugf("%s: ForEachTarget: incoming loop paused", pluginName)
-				return
 			}
 		}
 	}()
diff --git a/tests/integ/jobmanager/common.go b/tests/integ/jobmanager/common.go
index aabbdb25..1c7f4170 100644
--- a/tests/integ/jobmanager/common.go
+++ b/tests/integ/jobmanager/common.go
@@ -457,7 +457,6 @@ func (suite *TestJobManagerSuite) TestJobManagerJobCrash() {
 func (suite *TestJobManagerSuite) TestJobManagerJobCancellationFailure() {
 
 	config.TestRunnerShutdownTimeout = 1 * time.Second
-	config.TestRunnerStepShutdownTimeout = 1 * time.Second
 
 	go func() {
 		suite.jm.Start(suite.sigs)
diff --git a/tests/integ/test/testrunner_test.go b/tests/integ/test/testrunner_test.go
index 5e258991..27e44469 100644
--- a/tests/integ/test/testrunner_test.go
+++ b/tests/integ/test/testrunner_test.go
@@ -13,7 +13,8 @@ import (
 	"testing"
 	"time"
 
-	"github.com/facebookincubator/contest/pkg/cerrors"
+	"github.com/stretchr/testify/require"
+
 	"github.com/facebookincubator/contest/pkg/event"
 	"github.com/facebookincubator/contest/pkg/logging"
 	"github.com/facebookincubator/contest/pkg/pluginregistry"
@@ -23,7 +24,6 @@ import (
 	"github.com/facebookincubator/contest/pkg/target"
 	"github.com/facebookincubator/contest/pkg/test"
 	"github.com/facebookincubator/contest/pkg/types"
-
 	"github.com/facebookincubator/contest/plugins/storage/memory"
 	"github.com/facebookincubator/contest/plugins/teststeps/cmd"
 	"github.com/facebookincubator/contest/plugins/teststeps/echo"
@@ -34,8 +34,6 @@ import (
 	"github.com/facebookincubator/contest/tests/plugins/teststeps/hanging"
 	"github.com/facebookincubator/contest/tests/plugins/teststeps/noreturn"
 	"github.com/facebookincubator/contest/tests/plugins/teststeps/panicstep"
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
 )
 
 var (
@@ -130,7 +128,7 @@ func TestSuccessfulCompletion(t *testing.T) {
 
 	go func() {
 		tr := runner.NewTestRunner()
-		err := tr.Run(stateCtx, &test.Test{TestStepsBundles: testSteps}, targets, jobID, runID)
+		_, err := tr.Run(stateCtx, &test.Test{TestStepsBundles: testSteps}, targets, jobID, runID, nil)
 		errCh <- err
 	}()
 	select {
@@ -141,180 +139,6 @@ func TestSuccessfulCompletion(t *testing.T) {
 	}
 }
 
-func TestPanicStep(t *testing.T) {
-
-	jobID := types.JobID(1)
-	runID := types.RunID(1)
-
-	ts1, err := pluginRegistry.NewTestStep("Panic")
-	require.NoError(t, err)
-	ts2, err := pluginRegistry.NewTestStep("Example")
-	require.NoError(t, err)
-
-	params := make(test.TestStepParameters)
-	testSteps := []test.TestStepBundle{
-		test.TestStepBundle{TestStep: ts1, TestStepLabel: "StageOne", Parameters: params},
-		test.TestStepBundle{TestStep: ts2, TestStepLabel: "StageTwo", Parameters: params},
-	}
-
-	errCh := make(chan error, 1)
-	stateCtx, _, _ := statectx.New()
-
-	go func() {
-		tr := runner.NewTestRunner()
-		err := tr.Run(stateCtx, &test.Test{TestStepsBundles: testSteps}, targets, jobID, runID)
-		errCh <- err
-	}()
-	select {
-	case err = <-errCh:
-		require.Error(t, err)
-	case <-time.After(successTimeout):
-		t.Errorf("test should return within timeout: %+v", successTimeout)
-	}
-}
-
-func TestNoReturnStepWithCorrectTargetForwarding(t *testing.T) {
-
-	jobID := types.JobID(1)
-	runID := types.RunID(1)
-
-	ts1, err := pluginRegistry.NewTestStep("NoReturn")
-	require.NoError(t, err)
-	ts2, err := pluginRegistry.NewTestStep("Example")
-	require.NoError(t, err)
-
-	params := make(test.TestStepParameters)
-	testSteps := []test.TestStepBundle{
-		test.TestStepBundle{TestStep: ts1, Parameters: params, TestStepLabel: "NoReturn"},
-		test.TestStepBundle{TestStep: ts2, Parameters: params, TestStepLabel: "Example"},
-	}
-
-	stateCtx, _, _ := statectx.New()
-	errCh := make(chan error, 1)
-
-	timeouts := runner.TestRunnerTimeouts{
-		StepInjectTimeout:   30 * time.Second,
-		MessageTimeout:      5 * time.Second,
-		ShutdownTimeout:     1 * time.Second,
-		StepShutdownTimeout: 1 * time.Second,
-	}
-	go func() {
-		tr := runner.NewTestRunnerWithTimeouts(timeouts)
-		err := tr.Run(stateCtx, &test.Test{TestStepsBundles: testSteps}, targets, jobID, runID)
-		errCh <- err
-	}()
-	select {
-	case err = <-errCh:
-		require.Error(t, err)
-		if _, ok := err.(*cerrors.ErrTestStepsNeverReturned); !ok {
-			errString := fmt.Sprintf("Error returned by TestRunner should be of type ErrTestStepsNeverReturned: %v", err)
-			assert.FailNow(t, errString)
-		}
-		assert.NotNil(t, err.(*cerrors.ErrTestStepsNeverReturned))
-	case <-time.After(successTimeout):
-		t.Errorf("test should return within timeout: %+v", successTimeout)
-	}
-}
-
-func TestNoReturnStepWithoutTargetForwarding(t *testing.T) {
-
-	jobID := types.JobID(1)
-	runID := types.RunID(1)
-
-	ts1, err := pluginRegistry.NewTestStep("Hanging")
-	require.NoError(t, err)
-	ts2, err := pluginRegistry.NewTestStep("Example")
-	require.NoError(t, err)
-
-	params := make(test.TestStepParameters)
-	testSteps := []test.TestStepBundle{
-		test.TestStepBundle{TestStep: ts1, TestStepLabel: "StageOne", Parameters: params},
-		test.TestStepBundle{TestStep: ts2, TestStepLabel: "StageTwo", Parameters: params},
-	}
-
-	stateCtx, _, cancel := statectx.New()
-	errCh := make(chan error, 1)
-
-	var (
-		StepInjectTimeout   = 30 * time.Second
-		MessageTimeout      = 5 * time.Second
-		ShutdownTimeout     = 1 * time.Second
-		StepShutdownTimeout = 1 * time.Second
-	)
-	timeouts := runner.TestRunnerTimeouts{
-		StepInjectTimeout:   StepInjectTimeout,
-		MessageTimeout:      MessageTimeout,
-		ShutdownTimeout:     ShutdownTimeout,
-		StepShutdownTimeout: StepShutdownTimeout,
-	}
-
-	go func() {
-		tr := runner.NewTestRunnerWithTimeouts(timeouts)
-		err := tr.Run(stateCtx, &test.Test{TestStepsBundles: testSteps}, targets, jobID, runID)
-		errCh <- err
-	}()
-
-	testTimeout := 2 * time.Second
-	testShutdownTimeout := ShutdownTimeout + StepShutdownTimeout + 2
-	select {
-	case err = <-errCh:
-		// The TestRunner should never return and should instead time out
-		assert.FailNow(t, "TestRunner should not return, received an error instead: %v", err)
-	case <-time.After(testTimeout):
-		// Assert cancellation signal and expect the TestRunner to return within
-		// testShutdownTimeout
-		cancel()
-		select {
-		case err = <-errCh:
-			// The test timed out, which is an error from the perspective of the JobManager
-			require.Error(t, err)
-			if _, ok := err.(*cerrors.ErrTestStepsNeverReturned); !ok {
-				errString := fmt.Sprintf("Error returned by TestRunner should be of type ErrTestStepsNeverReturned: %v", err)
-				assert.FailNow(t, errString)
-			}
-		case <-time.After(testShutdownTimeout):
-			assert.FailNow(t, "TestRunner should return after cancellation before timeout")
-		}
-	}
-}
-
-func TestStepClosesChannels(t *testing.T) {
-
-	jobID := types.JobID(1)
-	runID := types.RunID(1)
-
-	ts1, err := pluginRegistry.NewTestStep("Channels")
-	require.NoError(t, err)
-	ts2, err := pluginRegistry.NewTestStep("Example")
-	require.NoError(t, err)
-
-	params := make(test.TestStepParameters)
-	testSteps := []test.TestStepBundle{
-		test.TestStepBundle{TestStep: ts1, TestStepLabel: "StageOne", Parameters: params},
-		test.TestStepBundle{TestStep: ts2, TestStepLabel: "StageTwo", Parameters: params},
-	}
-
-	stateCtx, _, _ := statectx.New()
-	errCh := make(chan error, 1)
-
-	go func() {
-		tr := runner.NewTestRunner()
-		err := tr.Run(stateCtx, &test.Test{TestStepsBundles: testSteps}, targets, jobID, runID)
-		errCh <- err
-	}()
-
-	testTimeout := 2 * time.Second
-	select {
-	case err = <-errCh:
-		if _, ok := err.(*cerrors.ErrTestStepClosedChannels); !ok {
-			errString := fmt.Sprintf("Error returned by TestRunner should be of type ErrTestStepClosedChannels, got %T(%v)", err, err)
-			assert.FailNow(t, errString)
-		}
-	case <-time.After(testTimeout):
-		assert.FailNow(t, "TestRunner should not time out")
-	}
-}
-
 func TestCmdPlugin(t *testing.T) {
 
 	jobID := types.JobID(1)
@@ -340,7 +164,7 @@ func TestCmdPlugin(t *testing.T) {
 
 	go func() {
 		tr := runner.NewTestRunner()
-		err := tr.Run(stateCtx, &test.Test{TestStepsBundles: testSteps}, targets, jobID, runID)
+		_, err := tr.Run(stateCtx, &test.Test{TestStepsBundles: testSteps}, targets, jobID, runID, nil)
 		errCh <- err
 	}()
 
@@ -355,37 +179,3 @@ func TestCmdPlugin(t *testing.T) {
 		t.Errorf("test should return within timeout: %+v", successTimeout)
 	}
 }
-
-func TestNoRunTestStepIfNoTargets(t *testing.T) {
-
-	jobID := types.JobID(1)
-	runID := types.RunID(1)
-
-	ts1, err := pluginRegistry.NewTestStep("Fail")
-	require.NoError(t, err)
-	ts2, err := pluginRegistry.NewTestStep("Crash")
-	require.NoError(t, err)
-
-	params := make(test.TestStepParameters)
-	testSteps := []test.TestStepBundle{
-		{TestStep: ts1, TestStepLabel: "StageOne", Parameters: params},
-		{TestStep: ts2, TestStepLabel: "StageTwo", Parameters: params},
-	}
-
-	stateCtx, _, _ := statectx.New()
-	errCh := make(chan error, 1)
-
-	go func() {
-		tr := runner.NewTestRunner()
-		err := tr.Run(stateCtx, &test.Test{TestStepsBundles: testSteps}, targets, jobID, runID)
-		errCh <- err
-	}()
-
-	testTimeout := 2 * time.Second
-	select {
-	case err = <-errCh:
-		assert.Nil(t, err, "the Crash TestStep shouldn't be ran")
-	case <-time.After(testTimeout):
-		assert.FailNow(t, "TestRunner should not time out")
-	}
-}
diff --git a/tests/plugins/teststeps/badtargets/badtargets.go b/tests/plugins/teststeps/badtargets/badtargets.go
new file mode 100644
index 00000000..72413ccc
--- /dev/null
+++ b/tests/plugins/teststeps/badtargets/badtargets.go
@@ -0,0 +1,79 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+package badtargets
+
+import (
+	"github.com/facebookincubator/contest/pkg/cerrors"
+	"github.com/facebookincubator/contest/pkg/event"
+	"github.com/facebookincubator/contest/pkg/event/testevent"
+	"github.com/facebookincubator/contest/pkg/statectx"
+	"github.com/facebookincubator/contest/pkg/test"
+)
+
+// Name is the name used to look this plugin up.
+const Name = "BadTargets"
+
+// Events defines the events that a TestStep is allow to emit
+var Events = []event.Name{}
+
+type badTargets struct {
+}
+
+// Name returns the name of the Step
+func (ts *badTargets) Name() string {
+	return Name
+}
+
+// Run executes a step that messes up the flow of targets.
+func (ts *badTargets) Run(ctx statectx.Context, ch test.TestStepChannels, params test.TestStepParameters, ev testevent.Emitter) error {
+	for {
+		select {
+		case target, ok := <-ch.In:
+			if !ok {
+				return nil
+			}
+			switch target.ID {
+			case "TDrop":
+				// ... crickets ...
+			case "TGood":
+				// We should not depend on pointer matching, so emit a copy.
+				target2 := *target
+				ch.Out <- &target2
+			case "TDup":
+				ch.Out <- target
+				ch.Out <- target
+			default:
+				// Mangle the returned target name.
+				target2 := *target
+				target2.ID = target2.ID + "XXX"
+				ch.Out <- &target2
+			}
+		case <-ctx.Done():
+			return nil
+		}
+	}
+}
+
+// ValidateParameters validates the parameters associated to the TestStep
+func (ts *badTargets) ValidateParameters(params test.TestStepParameters) error {
+	return nil
+}
+
+// Resume tries to resume a previously interrupted test step. ExampleTestStep
+// cannot resume.
+func (ts *badTargets) Resume(ctx statectx.Context, ch test.TestStepChannels, params test.TestStepParameters, ev testevent.EmitterFetcher) error {
+	return &cerrors.ErrResumeNotSupported{StepName: Name}
+}
+
+// CanResume tells whether this step is able to resume.
+func (ts *badTargets) CanResume() bool {
+	return false
+}
+
+// New creates a new badTargets step
+func New() test.TestStep {
+	return &badTargets{}
+}
diff --git a/tests/plugins/teststeps/channels/channels.go b/tests/plugins/teststeps/channels/channels.go
index 0d2702e8..a86ad8cf 100644
--- a/tests/plugins/teststeps/channels/channels.go
+++ b/tests/plugins/teststeps/channels/channels.go
@@ -16,7 +16,7 @@ import (
 // Name is the name used to look this plugin up.
 var Name = "Channels"
 
-// Events defines the events that a TestStep is allow to emit
+// Events defines the events that a TestStep is allowed to emit
 var Events = []event.Name{}
 
 type channels struct {
@@ -27,12 +27,14 @@ func (ts *channels) Name() string {
 	return Name
 }
 
-// Run executes a step which does never return.s
+// Run executes a step that runs fine but closes its output channels on exit.
 func (ts *channels) Run(ctx statectx.Context, ch test.TestStepChannels, params test.TestStepParameters, ev testevent.Emitter) error {
 	for target := range ch.In {
 		ch.Out <- target
 	}
+	// This is bad, do not do this.
 	close(ch.Out)
+	close(ch.Err)
 	return nil
 }
 
diff --git a/tests/plugins/teststeps/hanging/hanging.go b/tests/plugins/teststeps/hanging/hanging.go
index 86503d32..583ea7c0 100644
--- a/tests/plugins/teststeps/hanging/hanging.go
+++ b/tests/plugins/teststeps/hanging/hanging.go
@@ -27,7 +27,7 @@ func (ts *hanging) Name() string {
 	return Name
 }
 
-// Run executes a step which does never return.
+// Run executes a step that does not process any targets and never returns.
 func (ts *hanging) Run(ctx statectx.Context, ch test.TestStepChannels, params test.TestStepParameters, ev testevent.Emitter) error {
 	channel := make(chan struct{})
 	<-channel
diff --git a/tests/plugins/teststeps/noreturn/noreturn.go b/tests/plugins/teststeps/noreturn/noreturn.go
index 9d4a5aac..645847e0 100644
--- a/tests/plugins/teststeps/noreturn/noreturn.go
+++ b/tests/plugins/teststeps/noreturn/noreturn.go
@@ -27,7 +27,7 @@ func (ts *noreturnStep) Name() string {
 	return Name
 }
 
-// Run executes a step which does never return.
+// Run executes a step that never returns.
 func (ts *noreturnStep) Run(ctx statectx.Context, ch test.TestStepChannels, params test.TestStepParameters, ev testevent.Emitter) error {
 	for target := range ch.In {
 		ch.Out <- target

From bf5b42e0a49bcd28135a5d73c2abfccc4afa7358 Mon Sep 17 00:00:00 2001
From: "Deomid \"rojer\" Ryabkov" <rojer9@fb.com>
Date: Fri, 18 Dec 2020 23:20:56 +0000
Subject: [PATCH 2/5] Add goroutine leak checker

---
 pkg/runner/goroutine_leak_check.go | 103 +++++++++++++++++++++++++++++
 pkg/runner/test_runner_test.go     |  57 ++++++++++------
 2 files changed, 141 insertions(+), 19 deletions(-)
 create mode 100644 pkg/runner/goroutine_leak_check.go

diff --git a/pkg/runner/goroutine_leak_check.go b/pkg/runner/goroutine_leak_check.go
new file mode 100644
index 00000000..d7ca9d71
--- /dev/null
+++ b/pkg/runner/goroutine_leak_check.go
@@ -0,0 +1,103 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+package runner
+
+import (
+	"fmt"
+	"path"
+	"path/filepath"
+	"regexp"
+	"runtime"
+	"sort"
+	"strconv"
+	"strings"
+)
+
+var (
+	goidRegex     = regexp.MustCompile(`^\S+\s+(\d+)`)
+	funcNameRegex = regexp.MustCompile(`^(\S+)\(\S+`)
+	fileLineRegex = regexp.MustCompile(`^\s*(\S+):(\d+) \S+`)
+)
+
+type stackTraceEntry struct {
+	GoID  int
+	File  string
+	Line  int
+	Func  string
+	Trace []string
+}
+
+func (e *stackTraceEntry) String() string {
+	return fmt.Sprintf("%s:%d %s %d", e.File, e.Line, e.Func, e.GoID)
+}
+
+func CheckLeakedGoRoutines(funcWhitelist []string) error {
+	// Get goroutine stacks
+	buf := make([]byte, 1000000)
+	n := runtime.Stack(buf, true /* all */)
+	// First goroutine is always the running one, we skip it.
+	ph := 0
+	var e *stackTraceEntry
+	var badEntries []string
+	for _, line := range strings.Split(string(buf[:n]), "\n") {
+		switch ph {
+		case 0: // Look for an empty line
+			if len(line) == 0 {
+				if e != nil {
+					found := false
+					for _, wle := range funcWhitelist {
+						if wle == e.Func {
+							found = true
+							break
+						}
+					}
+					if !found {
+						badEntries = append(badEntries, e.String())
+					}
+				}
+				e = &stackTraceEntry{}
+				ph++
+			} else {
+				if e != nil {
+					e.Trace = append(e.Trace, line)
+				}
+			}
+		case 1: // Extract goroutine id
+			if m := goidRegex.FindStringSubmatch(line); m != nil {
+				if goid, err := strconv.Atoi(m[1]); err == nil {
+					e.GoID = goid
+					ph++
+					break
+				}
+			}
+			panic(fmt.Sprintf("Cannot parse backtrace (goid) %q", line))
+		case 2: // Extract function name
+			e.Trace = append(e.Trace, line)
+			if m := funcNameRegex.FindStringSubmatch(line); m != nil {
+				e.Func = path.Base(m[1])
+				ph++
+				break
+			}
+			panic(fmt.Sprintf("Cannot parse backtrace (func) %q", line))
+		case 3: // Extract file name
+			e.Trace = append(e.Trace, line)
+			if m := fileLineRegex.FindStringSubmatch(line); m != nil {
+				e.File = filepath.Base(m[1])
+				if ln, err := strconv.Atoi(m[2]); err == nil {
+					e.Line = ln
+				}
+				ph = 0
+				break
+			}
+			panic(fmt.Sprintf("Cannot parse backtrace (file) %q", line))
+		}
+	}
+	if len(badEntries) > 0 {
+		sort.Strings(badEntries)
+		return fmt.Errorf("leaked goroutines:\n  %s\n", strings.Join(badEntries, "\n  "))
+	}
+	return nil
+}
diff --git a/pkg/runner/test_runner_test.go b/pkg/runner/test_runner_test.go
index 70c590cf..444b4676 100644
--- a/pkg/runner/test_runner_test.go
+++ b/pkg/runner/test_runner_test.go
@@ -35,7 +35,11 @@ import (
 	"github.com/facebookincubator/contest/tests/plugins/teststeps/panicstep"
 )
 
-const testName = "SimpleTest"
+const (
+	testName          = "SimpleTest"
+	stepInjectTimeout = 3 * time.Second
+	shutdownTimeout   = 3 * time.Second
+)
 
 var (
 	evs            storage.ResettableStorage
@@ -70,7 +74,22 @@ func TestMain(m *testing.M) {
 			panic(fmt.Sprintf("could not register TestStep: %v", err))
 		}
 	}
-	os.Exit(m.Run())
+	ret := m.Run()
+	if ret == 0 {
+		if err := CheckLeakedGoRoutines([]string{
+			// We expect these to leak.
+			"hanging.(*hanging).Run",
+			"noreturn.(*noreturnStep).Run",
+		}); err != nil {
+			fmt.Fprintf(os.Stderr, "%s", err)
+			ret = 1
+		}
+	}
+	os.Exit(ret)
+}
+
+func newTestRunner() TestRunner {
+	return NewTestRunnerWithTimeouts(stepInjectTimeout, shutdownTimeout)
 }
 
 func eventToStringNoTime(ev testevent.Event) string {
@@ -174,7 +193,7 @@ func runWithTimeout(t *testing.T, tr TestRunner, ctx statectx.Context, resumeSta
 // Simple case: one target, one step, success.
 func Test1Step1Success(t *testing.T) {
 	resetEventStorage()
-	tr := NewTestRunner()
+	tr := newTestRunner()
 	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
 		[]*target.Target{tgt("T1")},
 		[]test.TestStepBundle{
@@ -195,7 +214,7 @@ func Test1Step1Success(t *testing.T) {
 // Simple case: one target, one step, failure.
 func Test1Step1Fail(t *testing.T) {
 	resetEventStorage()
-	tr := NewTestRunner()
+	tr := newTestRunner()
 	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
 		[]*target.Target{tgt("T1")},
 		[]test.TestStepBundle{
@@ -217,7 +236,7 @@ func Test1Step1Fail(t *testing.T) {
 // One step pipeline with two targets - one fails, one succeeds.
 func Test1Step1Success1Fail(t *testing.T) {
 	resetEventStorage()
-	tr := NewTestRunner()
+	tr := newTestRunner()
 	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
 		[]*target.Target{tgt("T1"), tgt("T2")},
 		[]test.TestStepBundle{
@@ -244,7 +263,7 @@ func Test1Step1Success1Fail(t *testing.T) {
 // step 3 is not reached and not even run.
 func Test3StepsNotReachedStepNotRun(t *testing.T) {
 	resetEventStorage()
-	tr := NewTestRunner()
+	tr := newTestRunner()
 	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
 		[]*target.Target{tgt("T1"), tgt("T2")},
 		[]test.TestStepBundle{
@@ -313,7 +332,7 @@ func TestNoReturnStepWithoutTargetForwarding(t *testing.T) {
 // A misbehaving step that panics.
 func TestStepPanics(t *testing.T) {
 	resetEventStorage()
-	tr := NewTestRunner()
+	tr := newTestRunner()
 	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
 		[]*target.Target{tgt("T1")},
 		[]test.TestStepBundle{
@@ -328,7 +347,7 @@ func TestStepPanics(t *testing.T) {
 // A misbehaving step that closes its output channel.
 func TestStepClosesChannels(t *testing.T) {
 	resetEventStorage()
-	tr := NewTestRunner()
+	tr := newTestRunner()
 	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
 		[]*target.Target{tgt("T1")},
 		[]test.TestStepBundle{
@@ -343,7 +362,7 @@ func TestStepClosesChannels(t *testing.T) {
 // A misbehaving step that yields a result for a target that does not exist.
 func TestStepYieldsResultForNonexistentTarget(t *testing.T) {
 	resetEventStorage()
-	tr := NewTestRunner()
+	tr := newTestRunner()
 	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
 		[]*target.Target{tgt("T1")},
 		[]test.TestStepBundle{
@@ -356,14 +375,14 @@ func TestStepYieldsResultForNonexistentTarget(t *testing.T) {
 // A misbehaving step that yields a result for a target that does not exist.
 func TestStepYieldsDuplicateResult(t *testing.T) {
 	resetEventStorage()
-	tr := NewTestRunner()
+	tr := newTestRunner()
 	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
 		[]*target.Target{tgt("TGood"), tgt("TDup")},
 		[]test.TestStepBundle{
 			// TGood makes it past here unscathed and gets delayed in Step 2,
 			// TDup also emerges fine at first but is then returned again, and that's bad.
 			newStep("Step 1", badtargets.Name, nil),
-			newExampleStep("Step 2", 0, "", "T2=100"),
+			newExampleStep("Step 2", 0, "", "TGood=100"),
 		},
 	)
 	require.Error(t, err)
@@ -373,7 +392,7 @@ func TestStepYieldsDuplicateResult(t *testing.T) {
 // A misbehaving step that loses targets.
 func TestStepLosesTargets(t *testing.T) {
 	resetEventStorage()
-	tr := NewTestRunner()
+	tr := newTestRunner()
 	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
 		[]*target.Target{tgt("TGood"), tgt("TDrop")},
 		[]test.TestStepBundle{
@@ -388,7 +407,7 @@ func TestStepLosesTargets(t *testing.T) {
 // but is not currently waiting for it.
 func TestStepYieldsResultForUnexpectedTarget(t *testing.T) {
 	resetEventStorage()
-	tr := NewTestRunner()
+	tr := newTestRunner()
 	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
 		[]*target.Target{tgt("T1"), tgt("T1XXX")},
 		[]test.TestStepBundle{
@@ -405,7 +424,7 @@ func TestStepYieldsResultForUnexpectedTarget(t *testing.T) {
 // Larger, randomized test - a number of steps, some targets failing, some succeeding.
 func TestRandomizedMultiStep(t *testing.T) {
 	resetEventStorage()
-	tr := NewTestRunner()
+	tr := newTestRunner()
 	var targets []*target.Target
 	for i := 1; i <= 100; i++ {
 		targets = append(targets, tgt(fmt.Sprintf("T%d", i)))
@@ -452,7 +471,7 @@ func TestPauseResumeSimple(t *testing.T) {
 		newExampleStep("Step 3", 0, "", ""),
 	}
 	{
-		tr1 := NewTestRunner()
+		tr1 := newTestRunner()
 		ctx1, pause, cancel := statectx.New()
 		defer cancel()
 		go func() {
@@ -468,7 +487,7 @@ func TestPauseResumeSimple(t *testing.T) {
 	log.Debugf("Resume state: %s", string(resumeState))
 	// Make sure that resume state is validated.
 	{
-		tr := NewTestRunner()
+		tr := newTestRunner()
 		ctx, _, cancel := statectx.New()
 		defer cancel()
 		resumeState2, err := runWithTimeout(
@@ -478,7 +497,7 @@ func TestPauseResumeSimple(t *testing.T) {
 		require.Nil(t, resumeState2)
 	}
 	{
-		tr := NewTestRunner()
+		tr := newTestRunner()
 		ctx, _, cancel := statectx.New()
 		defer cancel()
 		resumeState2 := strings.Replace(string(resumeState), `"version"`, `"Xversion"`, 1)
@@ -488,7 +507,7 @@ func TestPauseResumeSimple(t *testing.T) {
 		require.Contains(t, err.Error(), "incompatible resume state")
 	}
 	{
-		tr := NewTestRunner()
+		tr := newTestRunner()
 		ctx, _, cancel := statectx.New()
 		defer cancel()
 		resumeState2 := strings.Replace(string(resumeState), `"job_id":1`, `"job_id":2`, 1)
@@ -499,7 +518,7 @@ func TestPauseResumeSimple(t *testing.T) {
 	}
 	// Finally, resume and finish the job.
 	{
-		tr2 := NewTestRunner()
+		tr2 := newTestRunner()
 		ctx2, _, cancel := statectx.New()
 		defer cancel()
 		_, err := runWithTimeout(t, tr2, ctx2, resumeState, 5, 2*time.Second,

From f40a7c8d57dd50973e5d50ed3343eb94e2c6b66d Mon Sep 17 00:00:00 2001
From: "Deomid \"rojer\" Ryabkov" <rojer9@fb.com>
Date: Fri, 18 Dec 2020 23:22:39 +0000
Subject: [PATCH 3/5] Fix goroutine leak

---
 pkg/runner/test_runner.go                     | 18 ++++++++++----
 .../teststeps/badtargets/badtargets.go        | 24 +++++++++++++++----
 2 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/pkg/runner/test_runner.go b/pkg/runner/test_runner.go
index f3a5b8d3..b62f5633 100644
--- a/pkg/runner/test_runner.go
+++ b/pkg/runner/test_runner.go
@@ -1,3 +1,8 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
 package runner
 
 import (
@@ -333,16 +338,19 @@ func (tr *testRunner) waitStepRunners(ctx statectx.Context) error {
 	case <-time.After(tr.shutdownTimeout):
 		tr.log.Errorf("step runners failed to shut down correctly")
 		// If there is a step with an error set, use that.
-		if err := tr.checkStepRunners(); err != nil {
-			return err
-		}
+		err := tr.checkStepRunners()
 		// If there isn't, enumerate ones that were still running at the time.
-		err := &cerrors.ErrTestStepsNeverReturned{}
+		nrerr := &cerrors.ErrTestStepsNeverReturned{}
+		if err == nil {
+			err = nrerr
+		}
 		tr.mu.Lock()
 		defer tr.mu.Unlock()
 		for _, ss := range tr.steps {
 			if ss.stepRunning {
-				err.StepNames = append(err.StepNames, ss.sb.TestStepLabel)
+				nrerr.StepNames = append(nrerr.StepNames, ss.sb.TestStepLabel)
+				// We cannot make the step itself return but we can at least release the reader.
+				tr.safeCloseOutCh(ss)
 			}
 		}
 		return err
diff --git a/tests/plugins/teststeps/badtargets/badtargets.go b/tests/plugins/teststeps/badtargets/badtargets.go
index 72413ccc..3f62b36d 100644
--- a/tests/plugins/teststeps/badtargets/badtargets.go
+++ b/tests/plugins/teststeps/badtargets/badtargets.go
@@ -41,15 +41,31 @@ func (ts *badTargets) Run(ctx statectx.Context, ch test.TestStepChannels, params
 			case "TGood":
 				// We should not depend on pointer matching, so emit a copy.
 				target2 := *target
-				ch.Out <- &target2
+				select {
+				case ch.Out <- &target2:
+				case <-ctx.Done():
+					return statectx.ErrCanceled
+				}
 			case "TDup":
-				ch.Out <- target
-				ch.Out <- target
+				select {
+				case ch.Out <- target:
+				case <-ctx.Done():
+					return statectx.ErrCanceled
+				}
+				select {
+				case ch.Out <- target:
+				case <-ctx.Done():
+					return statectx.ErrCanceled
+				}
 			default:
 				// Mangle the returned target name.
 				target2 := *target
 				target2.ID = target2.ID + "XXX"
-				ch.Out <- &target2
+				select {
+				case ch.Out <- &target2:
+				case <-ctx.Done():
+					return statectx.ErrCanceled
+				}
 			}
 		case <-ctx.Done():
 			return nil

From 5c930322e9a9efba1ec5ccbffa906dd7a9b7070a Mon Sep 17 00:00:00 2001
From: "Deomid \"rojer\" Ryabkov" <rojer9@fb.com>
Date: Mon, 21 Dec 2020 13:41:29 +0000
Subject: [PATCH 4/5] make sure correct events are emitted

---
 pkg/runner/test_runner.go                    |  76 +++++---
 pkg/runner/test_runner_test.go               | 175 +++++++++++--------
 plugins/teststeps/example/example.go         |  70 +-------
 tests/plugins/teststeps/teststep/teststep.go | 160 +++++++++++++++++
 4 files changed, 323 insertions(+), 158 deletions(-)
 create mode 100644 tests/plugins/teststeps/teststep/teststep.go

diff --git a/pkg/runner/test_runner.go b/pkg/runner/test_runner.go
index b62f5633..b47db202 100644
--- a/pkg/runner/test_runner.go
+++ b/pkg/runner/test_runner.go
@@ -16,6 +16,7 @@ import (
 
 	"github.com/facebookincubator/contest/pkg/cerrors"
 	"github.com/facebookincubator/contest/pkg/config"
+	"github.com/facebookincubator/contest/pkg/event"
 	"github.com/facebookincubator/contest/pkg/event/testevent"
 	"github.com/facebookincubator/contest/pkg/logging"
 	"github.com/facebookincubator/contest/pkg/statectx"
@@ -329,23 +330,24 @@ func (tr *testRunner) waitStepRunners(ctx statectx.Context) error {
 			tr.cond.Wait()
 		}
 	}()
+	var err error
 	select {
 	case <-swch:
 		tr.log.Debugf("step runners finished")
 		tr.mu.Lock()
 		defer tr.mu.Unlock()
-		return tr.checkStepRunners()
+		err = tr.checkStepRunners()
 	case <-time.After(tr.shutdownTimeout):
 		tr.log.Errorf("step runners failed to shut down correctly")
+		tr.mu.Lock()
+		defer tr.mu.Unlock()
 		// If there is a step with an error set, use that.
-		err := tr.checkStepRunners()
+		err = tr.checkStepRunners()
 		// If there isn't, enumerate ones that were still running at the time.
 		nrerr := &cerrors.ErrTestStepsNeverReturned{}
 		if err == nil {
 			err = nrerr
 		}
-		tr.mu.Lock()
-		defer tr.mu.Unlock()
 		for _, ss := range tr.steps {
 			if ss.stepRunning {
 				nrerr.StepNames = append(nrerr.StepNames, ss.sb.TestStepLabel)
@@ -353,8 +355,16 @@ func (tr *testRunner) waitStepRunners(ctx statectx.Context) error {
 				tr.safeCloseOutCh(ss)
 			}
 		}
-		return err
 	}
+	// Emit step error events.
+	for _, ss := range tr.steps {
+		if ss.runErr != nil && ss.runErr != statectx.ErrPaused && ss.runErr != statectx.ErrCanceled {
+			if err := ss.emitEvent(EventTestError, nil, ss.runErr.Error()); err != nil {
+				tr.log.Errorf("failed to emit event: %s", err)
+			}
+		}
+	}
+	return err
 }
 
 // targetRunner runs one target through all the steps of the pipeline.
@@ -393,16 +403,28 @@ loop:
 		select {
 		case ss.inCh <- ts.tgt:
 			// Injected successfully.
+			err := ss.ev.Emit(testevent.Data{EventName: target.EventTargetIn, Target: ts.tgt})
 			tr.mu.Lock()
 			ts.CurPhase = targetStepPhaseRun
 			ss.numInjected++
+			if err != nil {
+				ss.runErr = fmt.Errorf("failed to report target injection: %w", err)
+				ss.log.Errorf("%s", ss.runErr)
+			}
 			tr.mu.Unlock()
 			tr.cond.Signal()
+			if err != nil {
+				break loop
+			}
 		case <-time.After(tr.stepInjectTimeout):
 			tr.mu.Lock()
 			ss.log.Errorf("timed out while injecting a target")
 			ss.runErr = &cerrors.ErrTestTargetInjectionTimedOut{StepName: ss.sb.TestStepLabel}
 			tr.mu.Unlock()
+			err := ss.ev.Emit(testevent.Data{EventName: target.EventTargetInErr, Target: ts.tgt})
+			if err != nil {
+				ss.log.Errorf("failed to emit event: %s", err)
+			}
 			break loop
 		case <-ctx.Done():
 			log.Debugf("%s: canceled 1", ts)
@@ -416,6 +438,15 @@ loop:
 				break loop
 			}
 			log.Debugf("%s: result for %s recd", ts, ss)
+			var err error
+			if res == nil {
+				err = ss.emitEvent(target.EventTargetOut, ts.tgt, nil)
+			} else {
+				err = ss.emitEvent(target.EventTargetErr, ts.tgt, target.ErrPayload{Error: res.Error()})
+			}
+			if err != nil {
+				ss.log.Errorf("failed to emit event: %s", err)
+			}
 			tr.mu.Lock()
 			ts.CurPhase = targetStepPhaseEnd
 			ts.res = res
@@ -464,20 +495,21 @@ func (tr *testRunner) runStepIfNeeded(ctx statectx.Context, ss *stepState) {
 	go tr.stepReader(ctx, ss)
 }
 
-// emitStepEvent emits an error event if step resulted in an error.
-func (ss *stepState) emitStepEvent(tgt *target.Target, err error) error {
-	if err == nil {
-		return nil
-	}
-	payload, jmErr := json.Marshal(err.Error())
-	if jmErr != nil {
-		return fmt.Errorf("failed to marshal event: %w", err)
+// emitEvent emits the specified event with the specified JSON payload (if any).
+func (ss *stepState) emitEvent(name event.Name, tgt *target.Target, payload interface{}) error {
+	var payloadJSON *json.RawMessage
+	if payload != nil {
+		payloadBytes, jmErr := json.Marshal(payload)
+		if jmErr != nil {
+			return fmt.Errorf("failed to marshal event: %w", jmErr)
+		}
+		pj := json.RawMessage(payloadBytes)
+		payloadJSON = &pj
 	}
-	rm := json.RawMessage(payload)
 	errEv := testevent.Data{
-		EventName: EventTestError,
+		EventName: name,
 		Target:    tgt,
-		Payload:   &rm,
+		Payload:   payloadJSON,
 	}
 	return ss.ev.Emit(errEv)
 }
@@ -491,7 +523,7 @@ func (tr *testRunner) stepRunner(ctx statectx.Context, ss *stepState) {
 			ss.stepRunning = false
 			ss.runErr = &cerrors.ErrTestStepPaniced{
 				StepName:   ss.sb.TestStepLabel,
-				StackTrace: string(debug.Stack()),
+				StackTrace: fmt.Sprintf("%s / %s", r, debug.Stack()),
 			}
 			tr.mu.Unlock()
 			tr.safeCloseOutCh(ss)
@@ -499,11 +531,6 @@ func (tr *testRunner) stepRunner(ctx statectx.Context, ss *stepState) {
 	}()
 	chans := test.TestStepChannels{In: ss.inCh, Out: ss.outCh, Err: ss.errCh}
 	runErr := ss.sb.TestStep.Run(ctx, chans, ss.sb.Parameters, ss.ev)
-	if err := ss.emitStepEvent(nil, runErr); err != nil {
-		if ss.runErr == nil {
-			ss.runErr = err
-		}
-	}
 	tr.mu.Lock()
 	ss.stepRunning = false
 	if runErr != nil {
@@ -548,11 +575,6 @@ func (tr *testRunner) reportTargetResult(ctx statectx.Context, ss *stepState, tg
 	if err != nil {
 		return err
 	}
-	if res != nil {
-		if err := ss.emitStepEvent(tgt, res); err != nil {
-			return err
-		}
-	}
 	select {
 	case resCh <- res:
 		break
diff --git a/pkg/runner/test_runner_test.go b/pkg/runner/test_runner_test.go
index 444b4676..d14f7b94 100644
--- a/pkg/runner/test_runner_test.go
+++ b/pkg/runner/test_runner_test.go
@@ -33,6 +33,7 @@ import (
 	"github.com/facebookincubator/contest/tests/plugins/teststeps/hanging"
 	"github.com/facebookincubator/contest/tests/plugins/teststeps/noreturn"
 	"github.com/facebookincubator/contest/tests/plugins/teststeps/panicstep"
+	"github.com/facebookincubator/contest/tests/plugins/teststeps/teststep"
 )
 
 const (
@@ -69,6 +70,7 @@ func TestMain(m *testing.M) {
 		{hanging.Name, hanging.New, hanging.Events},
 		{noreturn.Name, noreturn.New, noreturn.Events},
 		{panicstep.Name, panicstep.New, panicstep.Events},
+		{teststep.Name, teststep.New, teststep.Events},
 	} {
 		if err := pluginRegistry.RegisterTestStep(e.name, e.factory, e.events); err != nil {
 			panic(fmt.Sprintf("could not register TestStep: %v", err))
@@ -156,11 +158,11 @@ func newStep(label, name string, params *test.TestStepParameters) test.TestStepB
 	return *sb
 }
 
-func newExampleStep(label string, failPct int, failTargets string, delayTargets string) test.TestStepBundle {
-	return newStep(label, example.Name, &test.TestStepParameters{
-		example.FailPctParam:      []test.Param{*test.NewParam(fmt.Sprintf("%d", failPct))},
-		example.FailTargetsParam:  []test.Param{*test.NewParam(failTargets)},
-		example.DelayTargetsParam: []test.Param{*test.NewParam(delayTargets)},
+func newTestStep(label string, failPct int, failTargets string, delayTargets string) test.TestStepBundle {
+	return newStep(label, teststep.Name, &test.TestStepParameters{
+		teststep.FailPctParam:      []test.Param{*test.NewParam(fmt.Sprintf("%d", failPct))},
+		teststep.FailTargetsParam:  []test.Param{*test.NewParam(failTargets)},
+		teststep.DelayTargetsParam: []test.Param{*test.NewParam(delayTargets)},
 	})
 }
 
@@ -197,17 +199,19 @@ func Test1Step1Success(t *testing.T) {
 	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
 		[]*target.Target{tgt("T1")},
 		[]test.TestStepBundle{
-			newExampleStep("Step 1", 0, "", ""),
+			newTestStep("Step 1", 0, "", ""),
 		},
 	)
 	require.NoError(t, err)
 	require.Equal(t, `
-{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepRunningEvent]}
-{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepFinishedEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepRunningEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepFinishedEvent]}
 `, getStepEvents(""))
 	require.Equal(t, `
-{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleStartedEvent]}
-{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleFinishedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestFinishedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetOut]}
 `, getTargetEvents("T1"))
 }
 
@@ -218,18 +222,19 @@ func Test1Step1Fail(t *testing.T) {
 	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
 		[]*target.Target{tgt("T1")},
 		[]test.TestStepBundle{
-			newExampleStep("Step 1", 100, "", ""),
+			newTestStep("Step 1", 100, "", ""),
 		},
 	)
 	require.NoError(t, err)
 	require.Equal(t, `
-{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepRunningEvent]}
-{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepFinishedEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepRunningEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepFinishedEvent]}
 `, getStepEvents("Step 1"))
 	require.Equal(t, `
-{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleStartedEvent]}
-{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleFailedEvent]}
-{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestError &"\"target failed\""]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestFailedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetErr &"{\"Error\":\"target failed\"}"]}
 `, getTargetEvents("T1"))
 }
 
@@ -240,22 +245,25 @@ func Test1Step1Success1Fail(t *testing.T) {
 	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
 		[]*target.Target{tgt("T1"), tgt("T2")},
 		[]test.TestStepBundle{
-			newExampleStep("Step 1", 0, "T1", "T2=100"),
+			newTestStep("Step 1", 0, "T1", "T2=100"),
 		},
 	)
 	require.NoError(t, err)
 	require.Equal(t, `
-{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepRunningEvent]}
-{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepFinishedEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepRunningEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepFinishedEvent]}
 `, getStepEvents(""))
 	require.Equal(t, `
-{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleStartedEvent]}
-{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleFailedEvent]}
-{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestError &"\"target failed\""]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestFailedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetErr &"{\"Error\":\"target failed\"}"]}
 `, getTargetEvents("T1"))
 	require.Equal(t, `
-{[1 1 SimpleTest Step 1][Target{ID: "T2"} ExampleStartedEvent]}
-{[1 1 SimpleTest Step 1][Target{ID: "T2"} ExampleFinishedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TestFinishedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TargetOut]}
 `, getTargetEvents("T2"))
 }
 
@@ -267,32 +275,36 @@ func Test3StepsNotReachedStepNotRun(t *testing.T) {
 	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
 		[]*target.Target{tgt("T1"), tgt("T2")},
 		[]test.TestStepBundle{
-			newExampleStep("Step 1", 0, "T1", ""),
-			newExampleStep("Step 2", 0, "T2", ""),
-			newExampleStep("Step 3", 0, "", ""),
+			newTestStep("Step 1", 0, "T1", ""),
+			newTestStep("Step 2", 0, "T2", ""),
+			newTestStep("Step 3", 0, "", ""),
 		},
 	)
 	require.NoError(t, err)
 	require.Equal(t, `
-{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepRunningEvent]}
-{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepFinishedEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepRunningEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepFinishedEvent]}
 `, getStepEvents("Step 1"))
 	require.Equal(t, `
-{[1 1 SimpleTest Step 2][(*Target)(nil) ExampleStepRunningEvent]}
-{[1 1 SimpleTest Step 2][(*Target)(nil) ExampleStepFinishedEvent]}
+{[1 1 SimpleTest Step 2][(*Target)(nil) TestStepRunningEvent]}
+{[1 1 SimpleTest Step 2][(*Target)(nil) TestStepFinishedEvent]}
 `, getStepEvents("Step 2"))
 	require.Equal(t, "\n\n", getStepEvents("Step 3"))
 	require.Equal(t, `
-{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleStartedEvent]}
-{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleFailedEvent]}
-{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestError &"\"target failed\""]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestFailedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetErr &"{\"Error\":\"target failed\"}"]}
 `, getTargetEvents("T1"))
 	require.Equal(t, `
-{[1 1 SimpleTest Step 1][Target{ID: "T2"} ExampleStartedEvent]}
-{[1 1 SimpleTest Step 1][Target{ID: "T2"} ExampleFinishedEvent]}
-{[1 1 SimpleTest Step 2][Target{ID: "T2"} ExampleStartedEvent]}
-{[1 1 SimpleTest Step 2][Target{ID: "T2"} ExampleFailedEvent]}
-{[1 1 SimpleTest Step 2][Target{ID: "T2"} TestError &"\"target failed\""]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TestFinishedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TargetOut]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} TargetIn]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} TestStartedEvent]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} TestFailedEvent]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} TargetErr &"{\"Error\":\"target failed\"}"]}
 `, getTargetEvents("T2"))
 }
 
@@ -342,6 +354,7 @@ func TestStepPanics(t *testing.T) {
 	require.Error(t, err)
 	require.IsType(t, &cerrors.ErrTestStepPaniced{}, err)
 	require.Equal(t, "\n\n", getTargetEvents("T1"))
+	require.Contains(t, getStepEvents("Step 1"), "step Step 1 paniced")
 }
 
 // A misbehaving step that closes its output channel.
@@ -356,7 +369,13 @@ func TestStepClosesChannels(t *testing.T) {
 	)
 	require.Error(t, err)
 	require.IsType(t, &cerrors.ErrTestStepClosedChannels{}, err)
-	require.Equal(t, "\n\n", getTargetEvents("T1"))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetOut]}
+`, getTargetEvents("T1"))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestError &"\"test step Step 1 closed output channels (api violation)\""]}
+`, getStepEvents("Step 1"))
 }
 
 // A misbehaving step that yields a result for a target that does not exist.
@@ -370,6 +389,13 @@ func TestStepYieldsResultForNonexistentTarget(t *testing.T) {
 		},
 	)
 	require.Error(t, err)
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetIn]}
+`, getTargetEvents("T1"))
+	require.Equal(t, "\n\n", getTargetEvents("T1XXX"))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestError &"\"[#0 Step 1]: result for nonexistent target Target{ID: \\\"T1XXX\\\"} \\u003cnil\\u003e\""]}
+`, getStepEvents("Step 1"))
 }
 
 // A misbehaving step that yields a result for a target that does not exist.
@@ -382,7 +408,7 @@ func TestStepYieldsDuplicateResult(t *testing.T) {
 			// TGood makes it past here unscathed and gets delayed in Step 2,
 			// TDup also emerges fine at first but is then returned again, and that's bad.
 			newStep("Step 1", badtargets.Name, nil),
-			newExampleStep("Step 2", 0, "", "TGood=100"),
+			newTestStep("Step 2", 0, "", "TGood=100"),
 		},
 	)
 	require.Error(t, err)
@@ -412,7 +438,7 @@ func TestStepYieldsResultForUnexpectedTarget(t *testing.T) {
 		[]*target.Target{tgt("T1"), tgt("T1XXX")},
 		[]test.TestStepBundle{
 			// T1XXX fails here.
-			newExampleStep("Step 1", 0, "T1XXX", ""),
+			newTestStep("Step 1", 0, "T1XXX", ""),
 			// Yet, a result for it is returned here, which we did not expect.
 			newStep("Step 2", badtargets.Name, nil),
 		},
@@ -432,9 +458,9 @@ func TestRandomizedMultiStep(t *testing.T) {
 	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
 		targets,
 		[]test.TestStepBundle{
-			newExampleStep("Step 1", 0, "", "*=10"),  // All targets pass the first step, with a slight delay
-			newExampleStep("Step 2", 25, "", ""),     // 25% don't make it past the second step
-			newExampleStep("Step 3", 25, "", "*=10"), // Another 25% fail at the third step
+			newTestStep("Step 1", 0, "", "*=10"),  // All targets pass the first step, with a slight delay
+			newTestStep("Step 2", 25, "", ""),     // 25% don't make it past the second step
+			newTestStep("Step 3", 25, "", "*=10"), // Another 25% fail at the third step
 		},
 	)
 	require.NoError(t, err)
@@ -443,12 +469,14 @@ func TestRandomizedMultiStep(t *testing.T) {
 	for _, tgt := range targets {
 		s1n := "Step 1"
 		require.Equal(t, fmt.Sprintf(`
-{[1 1 SimpleTest Step 1][Target{ID: "%s"} ExampleStartedEvent]}
-{[1 1 SimpleTest Step 1][Target{ID: "%s"} ExampleFinishedEvent]}
-`, tgt.ID, tgt.ID),
+{[1 1 SimpleTest Step 1][Target{ID: "%s"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "%s"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "%s"} TestFinishedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "%s"} TargetOut]}
+`, tgt.ID, tgt.ID, tgt.ID, tgt.ID),
 			getEvents(&tgt.ID, &s1n))
 		s3n := "Step 3"
-		if strings.Contains(getEvents(&tgt.ID, &s3n), "ExampleFinishedEvent") {
+		if strings.Contains(getEvents(&tgt.ID, &s3n), "TestFinishedEvent") {
 			numFinished++
 		}
 	}
@@ -465,10 +493,10 @@ func TestPauseResumeSimple(t *testing.T) {
 	var resumeState []byte
 	targets := []*target.Target{tgt("T1"), tgt("T2"), tgt("T3")}
 	steps := []test.TestStepBundle{
-		newExampleStep("Step 1", 0, "T1", ""),
+		newTestStep("Step 1", 0, "T1", ""),
 		// T2 and T3 will be paused here, the step will be given time to finish.
-		newExampleStep("Step 2", 0, "", "T2=200,T3=200"),
-		newExampleStep("Step 3", 0, "", ""),
+		newTestStep("Step 2", 0, "", "T2=200,T3=200"),
+		newTestStep("Step 3", 0, "", ""),
 	}
 	{
 		tr1 := newTestRunner()
@@ -526,9 +554,9 @@ func TestPauseResumeSimple(t *testing.T) {
 			// Don't use the same pointers ot make sure there is no reliance on that.
 			[]*target.Target{tgt("T1"), tgt("T2"), tgt("T3")},
 			[]test.TestStepBundle{
-				newExampleStep("Step 1", 0, "T1", ""),
-				newExampleStep("Step 2", 0, "", "T2=200,T3=200"),
-				newExampleStep("Step 3", 0, "", ""),
+				newTestStep("Step 1", 0, "T1", ""),
+				newTestStep("Step 2", 0, "", "T2=200,T3=200"),
+				newTestStep("Step 3", 0, "", ""),
 			},
 		)
 		require.NoError(t, err)
@@ -537,31 +565,38 @@ func TestPauseResumeSimple(t *testing.T) {
 	// Steps 1 and 2 are executed entirely within the first runner instance
 	// and never started in the second.
 	require.Equal(t, `
-{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepRunningEvent]}
-{[1 1 SimpleTest Step 1][(*Target)(nil) ExampleStepFinishedEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepRunningEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepFinishedEvent]}
 `, getStepEvents("Step 1"))
 	require.Equal(t, `
-{[1 1 SimpleTest Step 2][(*Target)(nil) ExampleStepRunningEvent]}
-{[1 1 SimpleTest Step 2][(*Target)(nil) ExampleStepFinishedEvent]}
+{[1 1 SimpleTest Step 2][(*Target)(nil) TestStepRunningEvent]}
+{[1 1 SimpleTest Step 2][(*Target)(nil) TestStepFinishedEvent]}
 `, getStepEvents("Step 2"))
 	// Step 3 did not get to start in the first instance and ran in the second.
 	require.Equal(t, `
-{[1 5 SimpleTest Step 3][(*Target)(nil) ExampleStepRunningEvent]}
-{[1 5 SimpleTest Step 3][(*Target)(nil) ExampleStepFinishedEvent]}
+{[1 5 SimpleTest Step 3][(*Target)(nil) TestStepRunningEvent]}
+{[1 5 SimpleTest Step 3][(*Target)(nil) TestStepFinishedEvent]}
 `, getStepEvents("Step 3"))
 	// T1 failed entirely within the first run.
 	require.Equal(t, `
-{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleStartedEvent]}
-{[1 1 SimpleTest Step 1][Target{ID: "T1"} ExampleFailedEvent]}
-{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestError &"\"target failed\""]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestFailedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetErr &"{\"Error\":\"target failed\"}"]}
 `, getTargetEvents("T1"))
 	// T2 and T3 ran in both.
 	require.Equal(t, `
-{[1 1 SimpleTest Step 1][Target{ID: "T2"} ExampleStartedEvent]}
-{[1 1 SimpleTest Step 1][Target{ID: "T2"} ExampleFinishedEvent]}
-{[1 1 SimpleTest Step 2][Target{ID: "T2"} ExampleStartedEvent]}
-{[1 1 SimpleTest Step 2][Target{ID: "T2"} ExampleFinishedEvent]}
-{[1 5 SimpleTest Step 3][Target{ID: "T2"} ExampleStartedEvent]}
-{[1 5 SimpleTest Step 3][Target{ID: "T2"} ExampleFinishedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TestFinishedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TargetOut]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} TargetIn]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} TestStartedEvent]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} TestFinishedEvent]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} TargetOut]}
+{[1 5 SimpleTest Step 3][Target{ID: "T2"} TargetIn]}
+{[1 5 SimpleTest Step 3][Target{ID: "T2"} TestStartedEvent]}
+{[1 5 SimpleTest Step 3][Target{ID: "T2"} TestFinishedEvent]}
+{[1 5 SimpleTest Step 3][Target{ID: "T2"} TargetOut]}
 `, getTargetEvents("T2"))
 }
diff --git a/plugins/teststeps/example/example.go b/plugins/teststeps/example/example.go
index c56b150f..3cd7f2a7 100644
--- a/plugins/teststeps/example/example.go
+++ b/plugins/teststeps/example/example.go
@@ -8,9 +8,7 @@ package example
 import (
 	"fmt"
 	"math/rand"
-	"strconv"
 	"strings"
-	"time"
 
 	"github.com/facebookincubator/contest/pkg/cerrors"
 	"github.com/facebookincubator/contest/pkg/event"
@@ -29,23 +27,17 @@ var log = logging.GetLogger("teststeps/" + strings.ToLower(Name))
 
 // Params this step accepts.
 const (
-	// A comma-delimited list of target IDs to fail on.
-	FailTargetsParam = "FailTargets"
-	// Alternatively, fail this percentage of targets at random.
+	// Fail this percentage of targets at random.
 	FailPctParam = "FailPct"
-	// A comma-delimited list of target IDs to delay and by how much, ID=delay_ms.
-	DelayTargetsParam = "DelayTargets"
 )
 
 // events that we may emit during the plugin's lifecycle. This is used in Events below.
 // Note that you don't normally need to emit start/finish/cancellation events as
 // these are emitted automatically by the framework.
 const (
-	StartedEvent      = event.Name("ExampleStartedEvent")
-	FinishedEvent     = event.Name("ExampleFinishedEvent")
-	FailedEvent       = event.Name("ExampleFailedEvent")
-	StepRunningEvent  = event.Name("ExampleStepRunningEvent")
-	StepFinishedEvent = event.Name("ExampleStepFinishedEvent")
+	StartedEvent  = event.Name("ExampleStartedEvent")
+	FinishedEvent = event.Name("ExampleFinishedEvent")
+	FailedEvent   = event.Name("ExampleFailedEvent")
 )
 
 // Events defines the events that a TestStep is allow to emit. Emitting an event
@@ -56,9 +48,7 @@ var Events = []event.Name{StartedEvent, FinishedEvent, FailedEvent}
 // consumes Targets in input and pipes them to the output or error channel
 // with intermediate buffering.
 type Step struct {
-	failPct      int64
-	failTargets  map[string]bool
-	delayTargets map[string]time.Duration
+	failPct int64
 }
 
 // Name returns the name of the Step
@@ -66,10 +56,7 @@ func (ts Step) Name() string {
 	return Name
 }
 
-func (ts *Step) shouldFail(t *target.Target, params test.TestStepParameters) bool {
-	if ts.failTargets[t.ID] {
-		return true
-	}
+func (ts *Step) shouldFail(t *target.Target) bool {
 	if ts.failPct > 0 {
 		roll := rand.Int63n(101)
 		return (roll <= ts.failPct)
@@ -87,16 +74,7 @@ func (ts *Step) Run(ctx statectx.Context, ch test.TestStepChannels, params test.
 		if err := ev.Emit(testevent.Data{EventName: StartedEvent, Target: target, Payload: nil}); err != nil {
 			return fmt.Errorf("failed to emit start event: %v", err)
 		}
-		delay := ts.delayTargets[target.ID]
-		if delay == 0 {
-			delay = ts.delayTargets["*"]
-		}
-		select {
-		case <-time.After(delay):
-		case <-ctx.Done():
-			return statectx.ErrCanceled
-		}
-		if ts.shouldFail(target, params) {
+		if ts.shouldFail(target) {
 			if err := ev.Emit(testevent.Data{EventName: FailedEvent, Target: target, Payload: nil}); err != nil {
 				return fmt.Errorf("failed to emit finished event: %v", err)
 			}
@@ -108,38 +86,11 @@ func (ts *Step) Run(ctx statectx.Context, ch test.TestStepChannels, params test.
 		}
 		return nil
 	}
-	if err := ev.Emit(testevent.Data{EventName: StepRunningEvent}); err != nil {
-		return fmt.Errorf("failed to emit failed event: %v", err)
-	}
-	res := teststeps.ForEachTarget(Name, ctx, ch, f)
-	if err := ev.Emit(testevent.Data{EventName: StepFinishedEvent}); err != nil {
-		return fmt.Errorf("failed to emit failed event: %v", err)
-	}
-	return res
+	return teststeps.ForEachTarget(Name, ctx, ch, f)
 }
 
 // ValidateParameters validates the parameters associated to the TestStep
 func (ts *Step) ValidateParameters(params test.TestStepParameters) error {
-	targetsToFail := params.GetOne(FailTargetsParam).String()
-	if len(targetsToFail) > 0 {
-		for _, t := range strings.Split(targetsToFail, ",") {
-			ts.failTargets[t] = true
-		}
-	}
-	targetsToDelay := params.GetOne(DelayTargetsParam).String()
-	if len(targetsToDelay) > 0 {
-		for _, e := range strings.Split(targetsToDelay, ",") {
-			kv := strings.Split(e, "=")
-			if len(kv) != 2 {
-				continue
-			}
-			v, err := strconv.Atoi(kv[1])
-			if err != nil {
-				return fmt.Errorf("invalid FailTargets: %w", err)
-			}
-			ts.delayTargets[kv[0]] = time.Duration(v) * time.Millisecond
-		}
-	}
 	if params.GetOne(FailPctParam).String() != "" {
 		if pct, err := params.GetInt(FailPctParam); err == nil {
 			ts.failPct = pct
@@ -163,10 +114,7 @@ func (ts *Step) CanResume() bool {
 
 // New initializes and returns a new ExampleTestStep.
 func New() test.TestStep {
-	return &Step{
-		failTargets:  make(map[string]bool),
-		delayTargets: make(map[string]time.Duration),
-	}
+	return &Step{}
 }
 
 // Load returns the name, factory and events which are needed to register the step.
diff --git a/tests/plugins/teststeps/teststep/teststep.go b/tests/plugins/teststeps/teststep/teststep.go
new file mode 100644
index 00000000..32db0907
--- /dev/null
+++ b/tests/plugins/teststeps/teststep/teststep.go
@@ -0,0 +1,160 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+package teststep
+
+import (
+	"fmt"
+	"math/rand"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/facebookincubator/contest/pkg/cerrors"
+	"github.com/facebookincubator/contest/pkg/event"
+	"github.com/facebookincubator/contest/pkg/event/testevent"
+	"github.com/facebookincubator/contest/pkg/statectx"
+	"github.com/facebookincubator/contest/pkg/target"
+	"github.com/facebookincubator/contest/pkg/test"
+	"github.com/facebookincubator/contest/plugins/teststeps"
+)
+
+var Name = "Test"
+
+const (
+	// A comma-delimited list of target IDs to fail on.
+	FailTargetsParam = "FailTargets"
+	// Alternatively, fail this percentage of targets at random.
+	FailPctParam = "FailPct"
+	// A comma-delimited list of target IDs to delay and by how much, ID=delay_ms.
+	DelayTargetsParam = "DelayTargets"
+)
+
+const (
+	StartedEvent      = event.Name("TestStartedEvent")
+	FinishedEvent     = event.Name("TestFinishedEvent")
+	FailedEvent       = event.Name("TestFailedEvent")
+	StepRunningEvent  = event.Name("TestStepRunningEvent")
+	StepFinishedEvent = event.Name("TestStepFinishedEvent")
+)
+
+var Events = []event.Name{StartedEvent, FinishedEvent, FailedEvent, StepRunningEvent, StepFinishedEvent}
+
+type Step struct {
+	failPct      int64
+	failTargets  map[string]bool
+	delayTargets map[string]time.Duration
+}
+
+// Name returns the name of the Step
+func (ts Step) Name() string {
+	return Name
+}
+
+func (ts *Step) shouldFail(t *target.Target, params test.TestStepParameters) bool {
+	if ts.failTargets[t.ID] {
+		return true
+	}
+	if ts.failPct > 0 {
+		roll := rand.Int63n(101)
+		return (roll <= ts.failPct)
+	}
+	return false
+}
+
+// Run executes the example step.
+func (ts *Step) Run(ctx statectx.Context, ch test.TestStepChannels, params test.TestStepParameters, ev testevent.Emitter) error {
+	f := func(ctx statectx.Context, target *target.Target) error {
+		// Sleep to ensure argetIn fires first. This simplifies test assertions.
+		time.Sleep(10 * time.Millisecond)
+		if err := ev.Emit(testevent.Data{EventName: StartedEvent, Target: target, Payload: nil}); err != nil {
+			return fmt.Errorf("failed to emit start event: %v", err)
+		}
+		delay := ts.delayTargets[target.ID]
+		if delay == 0 {
+			delay = ts.delayTargets["*"]
+		}
+		select {
+		case <-time.After(delay):
+		case <-ctx.Done():
+			return statectx.ErrCanceled
+		}
+		if ts.shouldFail(target, params) {
+			if err := ev.Emit(testevent.Data{EventName: FailedEvent, Target: target, Payload: nil}); err != nil {
+				return fmt.Errorf("failed to emit finished event: %v", err)
+			}
+			return fmt.Errorf("target failed")
+		} else {
+			if err := ev.Emit(testevent.Data{EventName: FinishedEvent, Target: target, Payload: nil}); err != nil {
+				return fmt.Errorf("failed to emit failed event: %v", err)
+			}
+		}
+		return nil
+	}
+	if err := ev.Emit(testevent.Data{EventName: StepRunningEvent}); err != nil {
+		return fmt.Errorf("failed to emit failed event: %v", err)
+	}
+	res := teststeps.ForEachTarget(Name, ctx, ch, f)
+	if err := ev.Emit(testevent.Data{EventName: StepFinishedEvent}); err != nil {
+		return fmt.Errorf("failed to emit failed event: %v", err)
+	}
+	return res
+}
+
+// ValidateParameters validates the parameters associated to the TestStep
+func (ts *Step) ValidateParameters(params test.TestStepParameters) error {
+	targetsToFail := params.GetOne(FailTargetsParam).String()
+	if len(targetsToFail) > 0 {
+		for _, t := range strings.Split(targetsToFail, ",") {
+			ts.failTargets[t] = true
+		}
+	}
+	targetsToDelay := params.GetOne(DelayTargetsParam).String()
+	if len(targetsToDelay) > 0 {
+		for _, e := range strings.Split(targetsToDelay, ",") {
+			kv := strings.Split(e, "=")
+			if len(kv) != 2 {
+				continue
+			}
+			v, err := strconv.Atoi(kv[1])
+			if err != nil {
+				return fmt.Errorf("invalid FailTargets: %w", err)
+			}
+			ts.delayTargets[kv[0]] = time.Duration(v) * time.Millisecond
+		}
+	}
+	if params.GetOne(FailPctParam).String() != "" {
+		if pct, err := params.GetInt(FailPctParam); err == nil {
+			ts.failPct = pct
+		} else {
+			return fmt.Errorf("invalid FailPct: %w", err)
+		}
+	}
+	return nil
+}
+
+// Resume tries to resume a previously interrupted test step. TestTestStep
+// cannot resume.
+func (ts *Step) Resume(ctx statectx.Context, ch test.TestStepChannels, _ test.TestStepParameters, ev testevent.EmitterFetcher) error {
+	return &cerrors.ErrResumeNotSupported{StepName: Name}
+}
+
+// CanResume tells whether this step is able to resume.
+func (ts *Step) CanResume() bool {
+	return false
+}
+
+// New initializes and returns a new TestStep.
+func New() test.TestStep {
+	return &Step{
+		failTargets:  make(map[string]bool),
+		delayTargets: make(map[string]time.Duration),
+	}
+}
+
+// Load returns the name, factory and events which are needed to register the step.
+func Load() (string, test.TestStepFactory, []event.Name) {
+	return Name, New, Events
+}

From 86b113dfdad0794e2bc8ee43f9cc9facb19ce5ac Mon Sep 17 00:00:00 2001
From: Deomid Ryabkov <rojer@rojer.me>
Date: Sun, 17 Jan 2021 05:15:25 +0000
Subject: [PATCH 5/5] Address review comments

---
 pkg/cerrors/cerrors.go                        |   8 +-
 pkg/jobmanager/jobmanager.go                  |   2 -
 pkg/runner/job_runner.go                      |   2 +-
 pkg/runner/test_runner.go                     | 238 ++++++++++--------
 pkg/runner/test_runner_test.go                |  13 +-
 .../goroutine_leak_check.go                   |  26 +-
 .../goroutine_leak_check_test.go              |  53 ++++
 7 files changed, 214 insertions(+), 128 deletions(-)
 rename {pkg/runner => tests/goroutine_leak_check}/goroutine_leak_check.go (73%)
 create mode 100644 tests/goroutine_leak_check/goroutine_leak_check_test.go

diff --git a/pkg/cerrors/cerrors.go b/pkg/cerrors/cerrors.go
index d0a1dfdd..649bbb4a 100644
--- a/pkg/cerrors/cerrors.go
+++ b/pkg/cerrors/cerrors.go
@@ -65,7 +65,7 @@ func (e *ErrTestStepClosedChannels) Error() string {
 	return fmt.Sprintf("test step %v closed output channels (api violation)", e.StepName)
 }
 
-// ErrTestStepPaniced indicates that a test step's method paniced.
+// ErrTestStepPaniced indicates that a test step's method panicked.
 type ErrTestStepPaniced struct {
 	StepName   string
 	StackTrace string
@@ -76,7 +76,8 @@ func (e *ErrTestStepPaniced) Error() string {
 	return fmt.Sprintf("test step %s paniced, trace: %q", e.StepName, e.StackTrace)
 }
 
-// ErrTestStepReturnedDuplicateResult indicates that a test step's method paniced.
+// ErrTestStepReturnedDuplicateResult indicates that a test step returned result
+// twice for the same target.
 type ErrTestStepReturnedDuplicateResult struct {
 	StepName string
 	Target   string
@@ -87,7 +88,8 @@ func (e *ErrTestStepReturnedDuplicateResult) Error() string {
 	return fmt.Sprintf("test step %s returned duplicate result for %s", e.StepName, e.Target)
 }
 
-// ErrTestStepReturnedUnexpectedResult indicates that a test step's method paniced.
+// ErrTestStepReturnedUnexpectedResult indicates that a test step returned result
+// for a target that was not given to it.
 type ErrTestStepReturnedUnexpectedResult struct {
 	StepName string
 	Target   string
diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go
index 78479db2..411001f3 100644
--- a/pkg/jobmanager/jobmanager.go
+++ b/pkg/jobmanager/jobmanager.go
@@ -158,8 +158,6 @@ func newPartialJobFromDescriptor(pr *pluginregistry.PluginRegistry, jd *job.JobD
 			if err != nil {
 				return nil, err
 			}
-			// test step index is incremented by 1 so we can use 0 to signal an
-			// anomaly.
 			tsb, err := pr.NewTestStepBundle(*testStepDesc, tse)
 			if err != nil {
 				return nil, fmt.Errorf("NewTestStepBundle for test step '%s' with index %d failed: %w", testStepDesc.Name, idx, err)
diff --git a/pkg/runner/job_runner.go b/pkg/runner/job_runner.go
index efb0c4a9..c76d3b87 100644
--- a/pkg/runner/job_runner.go
+++ b/pkg/runner/job_runner.go
@@ -186,7 +186,7 @@ func (jr *JobRunner) Run(j *job.Job) ([][]*job.Report, []*job.Report, error) {
 				var resumeState []byte
 				resumeState, err := testRunner.Run(j.StateCtx, t, targets, j.ID, types.RunID(run+1), resumeState)
 				if err == statectx.ErrPaused {
-					jobLog.Debugf("Runner paused, state: %s", string(resumeState))
+					jobLog.Errorf("Runner paused, state: %s", string(resumeState))
 					// TODO(rojer): Persist the state.
 				} else {
 					runErr = err
diff --git a/pkg/runner/test_runner.go b/pkg/runner/test_runner.go
index b47db202..465c68f9 100644
--- a/pkg/runner/test_runner.go
+++ b/pkg/runner/test_runner.go
@@ -26,19 +26,11 @@ import (
 	"github.com/facebookincubator/contest/pkg/types"
 )
 
-// TestRunner is the interface test runner implements
-type TestRunner interface {
-	// ErrPaused will be returned when runner was able to pause successfully.
-	// When this error is returned from Run(), it is accompanied by valid serialized state
-	// that can be passed later to Run() to continue from where we left off.
-	Run(ctx statectx.Context, test *test.Test, targets []*target.Target, jobID types.JobID, runID types.RunID, resumeState []byte) ([]byte, error)
-}
-
-// testRunner is the state associated with a test run.
+// TestRunner is the state associated with a test run.
 // Here's how a test run works:
-//  * Each target gets a targetState and a "target runner" - a goroutine that takes that particular
+//  * Each target gets a targetState and a "target handler" - a goroutine that takes that particular
 //    target through each step of the pipeline in sequence. It injects the target, waits for the result,
-//    then mves on to the next step.
+//    then moves on to the next step.
 //  * Each step of the pipeline gets a stepState and:
 //    - A "step runner" - a goroutine that is responsible for running the step's Run() method
 //    - A "step reader" - a goroutine that processes results and sends them on to target runners that await them.
@@ -46,10 +38,10 @@ type TestRunner interface {
 //    that checks on the pipeline's progress and is responsible for closing step input channels
 //    when all the targets have been injected.
 //  * Monitor loop finishes when all the targets have been injected into the last step
-//    or if a step has encountered and error.
+//    or if a step has encountered an error.
 //  * We then wait for all the step runners and readers to shut down.
 //  * Once all the activity has died down, resulting state is examined and an error is returned, if any.
-type testRunner struct {
+type TestRunner struct {
 	stepInjectTimeout time.Duration // Time to wait for steps to accept each target
 	shutdownTimeout   time.Duration // Time to wait for steps runners to finish a the end of the run
 
@@ -75,8 +67,7 @@ type stepState struct {
 	errCh chan cerrors.TargetError
 	ev    testevent.Emitter
 
-	numInjected int                     // Number of targets injected.
-	tgtDone     map[*target.Target]bool // Targets for which results have been received.
+	tgtDone map[*target.Target]bool // Targets for which results have been received.
 
 	stepRunning   bool  // testStep.Run() is currently running.
 	readerRunning bool  // Result reader is running.
@@ -89,10 +80,11 @@ type stepState struct {
 type targetStepPhase int
 
 const (
-	targetStepPhaseInit  targetStepPhase = 0
-	targetStepPhaseBegin targetStepPhase = 1 // Picked up for execution.
-	targetStepPhaseRun   targetStepPhase = 2 // Injected into step.
-	targetStepPhaseEnd   targetStepPhase = 3 // Finished running a step.
+	targetStepPhaseInvalid targetStepPhase = iota
+	targetStepPhaseInit                    // (1) Created
+	targetStepPhaseBegin                   // (2) Picked up for execution.
+	targetStepPhaseRun                     // (3) Injected into step.
+	targetStepPhaseEnd                     // (4) Finished running a step.
 )
 
 // targetState contains state associated with one target progressing through the pipeline.
@@ -121,7 +113,7 @@ type resumeStateStruct struct {
 const resumeStateStructVersion = 1
 
 // Run is the main enty point of the runner.
-func (tr *testRunner) Run(
+func (tr *TestRunner) Run(
 	ctx statectx.Context,
 	t *test.Test, targets []*target.Target,
 	jobID types.JobID, runID types.RunID,
@@ -141,7 +133,7 @@ func (tr *testRunner) Run(
 	return resumeState, err
 }
 
-func (tr *testRunner) run(
+func (tr *TestRunner) run(
 	ctx statectx.Context,
 	t *test.Test, targets []*target.Target,
 	jobID types.JobID, runID types.RunID,
@@ -170,6 +162,7 @@ func (tr *testRunner) run(
 			tgtDone: make(map[*target.Target]bool),
 			log:     logging.AddField(tr.log, "step", sb.TestStepLabel),
 		})
+		// Step handlers will be started from target handlers as targets reach them.
 	}
 
 	// Set up the targets
@@ -195,7 +188,9 @@ func (tr *testRunner) run(
 	for _, tgt := range targets {
 		ts := tr.targets[tgt.ID]
 		if ts == nil {
-			ts = &targetState{}
+			ts = &targetState{
+				CurPhase: targetStepPhaseInit,
+			}
 		}
 		ts.tgt = tgt
 		ts.resCh = make(chan error)
@@ -203,7 +198,10 @@ func (tr *testRunner) run(
 		tr.targets[tgt.ID] = ts
 		tr.mu.Unlock()
 		tr.targetsWg.Add(1)
-		go tr.targetRunner(targetCtx, stepCtx, ts)
+		go func() {
+			tr.targetHandler(targetCtx, stepCtx, ts)
+			tr.targetsWg.Done()
+		}()
 	}
 
 	// Run until no more progress can be made.
@@ -293,7 +291,7 @@ func (tr *testRunner) run(
 	return resumeState, runErr
 }
 
-func (tr *testRunner) waitStepRunners(ctx statectx.Context) error {
+func (tr *TestRunner) waitStepRunners(ctx statectx.Context) error {
 	tr.log.Debugf("waiting for step runners to finish")
 	swch := make(chan struct{})
 	go func() {
@@ -367,14 +365,77 @@ func (tr *testRunner) waitStepRunners(ctx statectx.Context) error {
 	return err
 }
 
-// targetRunner runs one target through all the steps of the pipeline.
-func (tr *testRunner) targetRunner(ctx, stepCtx statectx.Context, ts *targetState) {
+func (tr *TestRunner) injectTarget(ctx statectx.Context, ts *targetState, ss *stepState, log *logrus.Entry) error {
+	log.Debugf("%s: injecting into %s", ts, ss)
+	select {
+	case ss.inCh <- ts.tgt:
+		// Injected successfully.
+		err := ss.ev.Emit(testevent.Data{EventName: target.EventTargetIn, Target: ts.tgt})
+		tr.mu.Lock()
+		ts.CurPhase = targetStepPhaseRun
+		if err != nil {
+			return fmt.Errorf("failed to report target injection: %w", err)
+		}
+		tr.mu.Unlock()
+		tr.cond.Signal()
+		if err != nil {
+			return err
+		}
+	case <-time.After(tr.stepInjectTimeout):
+		ss.log.Errorf("timed out while injecting a target")
+		if err := ss.ev.Emit(testevent.Data{EventName: target.EventTargetInErr, Target: ts.tgt}); err != nil {
+			ss.log.Errorf("failed to emit event: %s", err)
+		}
+		return &cerrors.ErrTestTargetInjectionTimedOut{StepName: ss.sb.TestStepLabel}
+	case <-ctx.Done():
+		return statectx.ErrCanceled
+	}
+	return nil
+}
+
+func (tr *TestRunner) awaitTargetResult(ctx statectx.Context, ts *targetState, ss *stepState, log *logrus.Entry) error {
+	select {
+	case res, ok := <-ts.resCh:
+		if !ok {
+			log.Debugf("%s: result channel closed", ts)
+			return statectx.ErrCanceled
+		}
+		log.Debugf("%s: result for %s recd", ts, ss)
+		var err error
+		if res == nil {
+			err = ss.emitEvent(target.EventTargetOut, ts.tgt, nil)
+		} else {
+			err = ss.emitEvent(target.EventTargetErr, ts.tgt, target.ErrPayload{Error: res.Error()})
+		}
+		if err != nil {
+			ss.log.Errorf("failed to emit event: %s", err)
+		}
+		tr.mu.Lock()
+		ts.res = res
+		ts.CurPhase = targetStepPhaseEnd
+		tr.mu.Unlock()
+		tr.cond.Signal()
+		return err
+		// Check for cancellation.
+		// Notably we are not checking for the pause condition here:
+		// when paused, we want to let all the injected targets to finish
+		// and collect all the results they produce. If that doesn't happen,
+		// step runner will close resCh on its way out and unlock us.
+	case <-ctx.Done():
+		log.Debugf("%s: canceled 2", ts)
+		return statectx.ErrCanceled
+	}
+}
+
+// targetHandler takes a single target through each step of the pipeline in sequence.
+// It injects the target, waits for the result, then moves on to the next step.
+func (tr *TestRunner) targetHandler(ctx, stepCtx statectx.Context, ts *targetState) {
 	log := logging.AddField(tr.log, "target", ts.tgt.ID)
 	log.Debugf("%s: target runner active", ts)
 	// NB: CurStep may be non-zero on entry if resumed
 loop:
 	for i := ts.CurStep; i < len(tr.steps); {
-		// Early check for pause of cancelation.
+		// Early check for pause or cancelation.
 		select {
 		case <-ctx.Paused():
 			log.Debugf("%s: paused 0", ts)
@@ -395,92 +456,47 @@ loop:
 		ts.CurStep = i
 		ts.CurPhase = targetStepPhaseBegin
 		tr.mu.Unlock()
-		// Make sure we have a step runner active.
-		// These are started on-demand.
+		// Make sure we have a step runner active. If not, start one.
 		tr.runStepIfNeeded(stepCtx, ss)
 		// Inject the target.
-		log.Debugf("%s: injecting into %s", ts, ss)
-		select {
-		case ss.inCh <- ts.tgt:
-			// Injected successfully.
-			err := ss.ev.Emit(testevent.Data{EventName: target.EventTargetIn, Target: ts.tgt})
-			tr.mu.Lock()
-			ts.CurPhase = targetStepPhaseRun
-			ss.numInjected++
-			if err != nil {
-				ss.runErr = fmt.Errorf("failed to report target injection: %w", err)
-				ss.log.Errorf("%s", ss.runErr)
-			}
-			tr.mu.Unlock()
-			tr.cond.Signal()
-			if err != nil {
-				break loop
-			}
-		case <-time.After(tr.stepInjectTimeout):
-			tr.mu.Lock()
-			ss.log.Errorf("timed out while injecting a target")
-			ss.runErr = &cerrors.ErrTestTargetInjectionTimedOut{StepName: ss.sb.TestStepLabel}
-			tr.mu.Unlock()
-			err := ss.ev.Emit(testevent.Data{EventName: target.EventTargetInErr, Target: ts.tgt})
-			if err != nil {
-				ss.log.Errorf("failed to emit event: %s", err)
-			}
-			break loop
-		case <-ctx.Done():
-			log.Debugf("%s: canceled 1", ts)
-			break loop
+		err := tr.injectTarget(ctx, ts, ss, log)
+		// Await result. It will be communicated to us by the step runner
+		// and returned in ts.res.
+		if err == nil {
+			err = tr.awaitTargetResult(ctx, ts, ss, log)
 		}
-		// Await result. It will be communicated to us by the step runner.
-		select {
-		case res, ok := <-ts.resCh:
-			if !ok {
-				log.Debugf("%s: result channel closed", ts)
-				break loop
-			}
-			log.Debugf("%s: result for %s recd", ts, ss)
-			var err error
-			if res == nil {
-				err = ss.emitEvent(target.EventTargetOut, ts.tgt, nil)
-			} else {
-				err = ss.emitEvent(target.EventTargetErr, ts.tgt, target.ErrPayload{Error: res.Error()})
-			}
-			if err != nil {
-				ss.log.Errorf("failed to emit event: %s", err)
-			}
-			tr.mu.Lock()
-			ts.CurPhase = targetStepPhaseEnd
-			ts.res = res
-			tr.cond.Signal()
-			if res != nil {
+		if err != nil {
+			ss.log.Errorf("%s", err)
+			if err != statectx.ErrCanceled {
+				tr.mu.Lock()
+				ss.runErr = err
 				tr.mu.Unlock()
-				break loop
-			}
-			i++
-			if i < len(tr.steps) {
-				ts.CurStep = i
-				ts.CurPhase = targetStepPhaseInit
+			} else {
+				log.Debugf("%s: canceled 1", ts)
 			}
+			break
+		}
+		tr.mu.Lock()
+		if ts.res != nil {
 			tr.mu.Unlock()
-			// Check for cancellation.
-			// Notably we are not checking for the pause condition here:
-			// when paused, we want to let all the injected targets to finish
-			// and collect all the results they produce. If that doesn't happen,
-			// step runner will close resCh on its way out and unlock us.
-		case <-ctx.Done():
-			log.Debugf("%s: canceled 2", ts)
-			break loop
+			break
 		}
+		i++
+		if i < len(tr.steps) {
+			ts.CurStep = i
+			ts.CurPhase = targetStepPhaseInit
+		}
+		tr.mu.Unlock()
 	}
 	log.Debugf("%s: target runner finished", ts)
 	tr.mu.Lock()
 	ts.resCh = nil
 	tr.cond.Signal()
 	tr.mu.Unlock()
-	tr.targetsWg.Done()
 }
 
 // runStepIfNeeded starts the step runner goroutine if not already running.
-func (tr *testRunner) runStepIfNeeded(ctx statectx.Context, ss *stepState) {
+func (tr *TestRunner) runStepIfNeeded(ctx statectx.Context, ss *stepState) {
 	tr.mu.Lock()
 	defer tr.mu.Unlock()
 	if ss.stepRunning {
@@ -515,7 +531,7 @@ func (ss *stepState) emitEvent(name event.Name, tgt *target.Target, payload inte
 }
 
 // stepRunner runs a test pipeline's step (the Run() method).
-func (tr *testRunner) stepRunner(ctx statectx.Context, ss *stepState) {
+func (tr *TestRunner) stepRunner(ctx statectx.Context, ss *stepState) {
 	ss.log.Debugf("%s: step runner active", ss)
 	defer func() {
 		if r := recover(); r != nil {
@@ -543,7 +559,7 @@ func (tr *testRunner) stepRunner(ctx statectx.Context, ss *stepState) {
 }
 
 // reportTargetResult reports result of executing a step to the appropriate target runner.
-func (tr *testRunner) reportTargetResult(ctx statectx.Context, ss *stepState, tgt *target.Target, res error) error {
+func (tr *TestRunner) reportTargetResult(ctx statectx.Context, ss *stepState, tgt *target.Target, res error) error {
 	resCh, err := func() (chan error, error) {
 		tr.mu.Lock()
 		defer tr.mu.Unlock()
@@ -584,7 +600,7 @@ func (tr *testRunner) reportTargetResult(ctx statectx.Context, ss *stepState, tg
 	return nil
 }
 
-func (tr *testRunner) safeCloseOutCh(ss *stepState) {
+func (tr *TestRunner) safeCloseOutCh(ss *stepState) {
 	defer func() {
 		if r := recover(); r != nil {
 			tr.mu.Lock()
@@ -596,7 +612,7 @@ func (tr *testRunner) safeCloseOutCh(ss *stepState) {
 }
 
 // safeCloseErrCh closes error channel safely, even if it has already been closed.
-func (tr *testRunner) safeCloseErrCh(ss *stepState) {
+func (tr *TestRunner) safeCloseErrCh(ss *stepState) {
 	defer func() {
 		if r := recover(); r != nil {
 			tr.mu.Lock()
@@ -608,7 +624,7 @@ func (tr *testRunner) safeCloseErrCh(ss *stepState) {
 }
 
 // stepReader receives results from the step's output channel and forwards them to the appropriate target runners.
-func (tr *testRunner) stepReader(ctx statectx.Context, ss *stepState) {
+func (tr *TestRunner) stepReader(ctx statectx.Context, ss *stepState) {
 	ss.log.Debugf("%s: step reader active", ss)
 	var err error
 	outCh := ss.outCh
@@ -620,9 +636,9 @@ loop:
 				ss.log.Debugf("%s: out chan closed", ss)
 				// At this point we may still have an error to report,
 				// wait until error channel is emptied too.
-				outCh = make(chan *target.Target)
+				outCh = nil
 				tr.safeCloseErrCh(ss)
-				break
+				continue loop
 			}
 			if err = tr.reportTargetResult(ctx, ss, tgt, nil); err != nil {
 				break loop
@@ -668,7 +684,7 @@ loop:
 }
 
 // checkStepRunnersLocked checks if any step runner has encountered an error.
-func (tr *testRunner) checkStepRunners() error {
+func (tr *TestRunner) checkStepRunners() error {
 	for _, ss := range tr.steps {
 		if ss.runErr != nil {
 			return ss.runErr
@@ -682,12 +698,12 @@ func (tr *testRunner) checkStepRunners() error {
 // It also monitors steps for critical errors and cancels the whole run.
 // Note: input channels remain open when cancellation is requested,
 // plugins are expected to handle it explicitly.
-func (tr *testRunner) runMonitor() error {
+func (tr *TestRunner) runMonitor() error {
 	tr.log.Debugf("monitor: active")
 	tr.mu.Lock()
 	defer tr.mu.Unlock()
 	// First, compute the starting step of the pipeline (it may be non-zero
-	// if the pipleine was resumed).
+	// if the pipeline was resumed).
 	minStep := len(tr.steps)
 	for _, ts := range tr.targets {
 		if ts.CurStep < minStep {
@@ -735,8 +751,8 @@ loop:
 	return runErr
 }
 
-func NewTestRunnerWithTimeouts(stepInjectTimeout, shutdownTimeout time.Duration) TestRunner {
-	tr := &testRunner{
+func NewTestRunnerWithTimeouts(stepInjectTimeout, shutdownTimeout time.Duration) *TestRunner {
+	tr := &TestRunner{
 		stepInjectTimeout: stepInjectTimeout,
 		shutdownTimeout:   shutdownTimeout,
 	}
@@ -744,12 +760,14 @@ func NewTestRunnerWithTimeouts(stepInjectTimeout, shutdownTimeout time.Duration)
 	return tr
 }
 
-func NewTestRunner() TestRunner {
+func NewTestRunner() *TestRunner {
 	return NewTestRunnerWithTimeouts(config.StepInjectTimeout, config.TestRunnerShutdownTimeout)
 }
 
 func (tph targetStepPhase) String() string {
 	switch tph {
+	case targetStepPhaseInvalid:
+		return "INVALID"
 	case targetStepPhaseInit:
 		return "init"
 	case targetStepPhaseBegin:
diff --git a/pkg/runner/test_runner_test.go b/pkg/runner/test_runner_test.go
index d14f7b94..0f791a3e 100644
--- a/pkg/runner/test_runner_test.go
+++ b/pkg/runner/test_runner_test.go
@@ -28,6 +28,7 @@ import (
 	"github.com/facebookincubator/contest/pkg/types"
 	"github.com/facebookincubator/contest/plugins/storage/memory"
 	"github.com/facebookincubator/contest/plugins/teststeps/example"
+	"github.com/facebookincubator/contest/tests/goroutine_leak_check"
 	"github.com/facebookincubator/contest/tests/plugins/teststeps/badtargets"
 	"github.com/facebookincubator/contest/tests/plugins/teststeps/channels"
 	"github.com/facebookincubator/contest/tests/plugins/teststeps/hanging"
@@ -78,11 +79,11 @@ func TestMain(m *testing.M) {
 	}
 	ret := m.Run()
 	if ret == 0 {
-		if err := CheckLeakedGoRoutines([]string{
+		if err := goroutine_leak_check.CheckLeakedGoRoutines(
 			// We expect these to leak.
-			"hanging.(*hanging).Run",
-			"noreturn.(*noreturnStep).Run",
-		}); err != nil {
+			"github.com/facebookincubator/contest/tests/plugins/teststeps/hanging.(*hanging).Run",
+			"github.com/facebookincubator/contest/tests/plugins/teststeps/noreturn.(*noreturnStep).Run",
+		); err != nil {
 			fmt.Fprintf(os.Stderr, "%s", err)
 			ret = 1
 		}
@@ -90,7 +91,7 @@ func TestMain(m *testing.M) {
 	os.Exit(ret)
 }
 
-func newTestRunner() TestRunner {
+func newTestRunner() *TestRunner {
 	return NewTestRunnerWithTimeouts(stepInjectTimeout, shutdownTimeout)
 }
 
@@ -171,7 +172,7 @@ type runRes struct {
 	err error
 }
 
-func runWithTimeout(t *testing.T, tr TestRunner, ctx statectx.Context, resumeState []byte, runID types.RunID, timeout time.Duration, targets []*target.Target, bundles []test.TestStepBundle) ([]byte, error) {
+func runWithTimeout(t *testing.T, tr *TestRunner, ctx statectx.Context, resumeState []byte, runID types.RunID, timeout time.Duration, targets []*target.Target, bundles []test.TestStepBundle) ([]byte, error) {
 	newCtx, _, cancel := statectx.WithParent(ctx)
 	test := &test.Test{
 		Name:             testName,
diff --git a/pkg/runner/goroutine_leak_check.go b/tests/goroutine_leak_check/goroutine_leak_check.go
similarity index 73%
rename from pkg/runner/goroutine_leak_check.go
rename to tests/goroutine_leak_check/goroutine_leak_check.go
index d7ca9d71..4d364357 100644
--- a/pkg/runner/goroutine_leak_check.go
+++ b/tests/goroutine_leak_check/goroutine_leak_check.go
@@ -3,7 +3,7 @@
 // This source code is licensed under the MIT license found in the
 // LICENSE file in the root directory of this source tree.
 
-package runner
+package goroutine_leak_check
 
 import (
 	"fmt"
@@ -19,7 +19,7 @@ import (
 var (
 	goidRegex     = regexp.MustCompile(`^\S+\s+(\d+)`)
 	funcNameRegex = regexp.MustCompile(`^(\S+)\(\S+`)
-	fileLineRegex = regexp.MustCompile(`^\s*(\S+):(\d+) \S+`)
+	fileLineRegex = regexp.MustCompile(`^\s*(\S+):(\d+)`)
 )
 
 type stackTraceEntry struct {
@@ -34,7 +34,11 @@ func (e *stackTraceEntry) String() string {
 	return fmt.Sprintf("%s:%d %s %d", e.File, e.Line, e.Func, e.GoID)
 }
 
-func CheckLeakedGoRoutines(funcWhitelist []string) error {
+// CheckLeakedGoRoutines is used to check for goroutine leaks at the end of a test.
+// It is not uncommon to leave a go routine that will never finish,
+// e.g. blocked on a channel that is unreachable will never be closed.
+// This function enumerates goroutines and reports any goroutines that are left running.
+func CheckLeakedGoRoutines(funcWhitelist ...string) error {
 	// Get goroutine stacks
 	buf := make([]byte, 1000000)
 	n := runtime.Stack(buf, true /* all */)
@@ -74,13 +78,18 @@ func CheckLeakedGoRoutines(funcWhitelist []string) error {
 				}
 			}
 			panic(fmt.Sprintf("Cannot parse backtrace (goid) %q", line))
-		case 2: // Extract function name
+		case 2: // Extract function name.
 			e.Trace = append(e.Trace, line)
 			if m := funcNameRegex.FindStringSubmatch(line); m != nil {
-				e.Func = path.Base(m[1])
+				e.Func = m[1]
 				ph++
 				break
 			}
+			if e.Func != "" {
+				// This means entire routine is in stdlib, ignore it.
+				ph = 1
+				break
+			}
 			panic(fmt.Sprintf("Cannot parse backtrace (func) %q", line))
 		case 3: // Extract file name
 			e.Trace = append(e.Trace, line)
@@ -89,7 +98,12 @@ func CheckLeakedGoRoutines(funcWhitelist []string) error {
 				if ln, err := strconv.Atoi(m[2]); err == nil {
 					e.Line = ln
 				}
-				ph = 0
+				// We are looking for a non-stdlib function.
+				if !strings.Contains(e.Func, "/") || !strings.Contains(path.Dir(e.Func), ".") {
+					ph = 2
+				} else {
+					ph = 0
+				}
 				break
 			}
 			panic(fmt.Sprintf("Cannot parse backtrace (file) %q", line))
diff --git a/tests/goroutine_leak_check/goroutine_leak_check_test.go b/tests/goroutine_leak_check/goroutine_leak_check_test.go
new file mode 100644
index 00000000..b0ddfc68
--- /dev/null
+++ b/tests/goroutine_leak_check/goroutine_leak_check_test.go
@@ -0,0 +1,53 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+package goroutine_leak_check
+
+import (
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+)
+
+func Func1(c *sync.Cond) {
+	c.L.Lock()
+	c.Signal()
+	c.Wait()
+	c.L.Unlock()
+}
+
+func TestNoLeaks(t *testing.T) {
+	require.NoError(t, CheckLeakedGoRoutines())
+}
+
+func TestLeaksFunc1(t *testing.T) {
+	c := sync.NewCond(&sync.Mutex{})
+	c.L.Lock()
+	go Func1(c)
+	c.Wait() // Wait for the go routine to start.
+	c.L.Unlock()
+	err := CheckLeakedGoRoutines()
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "goroutine_leak_check_test.go")
+	require.Contains(t, err.Error(), "Func1")
+	c.Signal()
+	// Give some time for the goroutine to exit
+	// otherwise it causes flakes in case of multiple test runs (-count=N).
+	time.Sleep(10 * time.Millisecond)
+}
+
+func TestLeaksFunc1Whitelisted(t *testing.T) {
+	c := sync.NewCond(&sync.Mutex{})
+	c.L.Lock()
+	go Func1(c)
+	c.Wait()
+	c.L.Unlock()
+	require.NoError(t, CheckLeakedGoRoutines(
+		"github.com/facebookincubator/contest/tests/goroutine_leak_check.Func1"))
+	c.Signal()
+	time.Sleep(10 * time.Millisecond)
+}