From 5e608d39b7899f004ddd26a2aa013168c04d3b34 Mon Sep 17 00:00:00 2001
From: "Deomid \"rojer\" Ryabkov" <rojer9@fb.com>
Date: Mon, 22 Feb 2021 13:16:49 +0000
Subject: [PATCH] New TestRunner implementation

With inter-step state serialization

This is take 2, incorporating a fix for #228 and comments from #219
---
 pkg/api/api.go                                |   2 +-
 pkg/api/api_test.go                           |   6 +-
 pkg/cerrors/cerrors.go                        |  57 ++
 pkg/event/testevent/test.go                   |  16 +-
 pkg/jobmanager/bundles.go                     |   3 +-
 pkg/jobmanager/jobmanager.go                  |   6 +-
 pkg/pluginregistry/bundles.go                 |   2 +-
 pkg/runner/job_runner.go                      |  12 +-
 pkg/runner/test_runner.go                     | 281 ------
 pkg/runner/test_runner2.go                    | 842 ++++++++++++++++++
 pkg/runner/test_runner_pipeline.go            | 540 -----------
 pkg/runner/test_runner_route.go               | 292 ------
 pkg/runner/test_runner_route_test.go          | 400 ---------
 pkg/runner/test_runner_test.go                | 597 +++++++++++++
 pkg/runner/wait_for_first_target.go           |  57 --
 pkg/runner/wait_for_first_target_test.go      | 142 ---
 pkg/storage/limits/limits_test.go             |   2 +-
 pkg/storage/storage.go                        |   1 +
 .../listeners/httplistener/httplistener.go    |   3 +-
 plugins/storage/memory/memory.go              |   2 +-
 plugins/teststeps/echo/echo.go                |   2 +-
 plugins/teststeps/example/example.go          |  84 +-
 plugins/teststeps/randecho/randecho.go        |   2 +-
 plugins/teststeps/sshcmd/sshcmd.go            |   2 +-
 .../terminalexpect/terminalexpect.go          |   2 +-
 plugins/teststeps/teststeps.go                |  14 +-
 tests/common/get_events.go                    |  48 +
 .../goroutine_leak_check.go                   |   6 +-
 .../goroutine_leak_check_test.go              |   6 +-
 tests/integ/events/frameworkevents/common.go  |  20 +-
 tests/integ/events/testevents/common.go       |  24 +-
 tests/integ/job/common.go                     |  44 +-
 tests/integ/jobmanager/common.go              |  23 +-
 tests/integ/test/testrunner_test.go           | 224 +----
 .../teststeps/badtargets/badtargets.go        | 111 +++
 tests/plugins/teststeps/channels/channels.go  |   6 +-
 tests/plugins/teststeps/fail/fail.go          |   2 +-
 tests/plugins/teststeps/hanging/hanging.go    |   2 +-
 tests/plugins/teststeps/noreturn/noreturn.go  |   2 +-
 tests/plugins/teststeps/teststep/teststep.go  | 160 ++++
 40 files changed, 1997 insertions(+), 2050 deletions(-)
 delete mode 100644 pkg/runner/test_runner.go
 create mode 100644 pkg/runner/test_runner2.go
 delete mode 100644 pkg/runner/test_runner_pipeline.go
 delete mode 100644 pkg/runner/test_runner_route.go
 delete mode 100644 pkg/runner/test_runner_route_test.go
 create mode 100644 pkg/runner/test_runner_test.go
 delete mode 100644 pkg/runner/wait_for_first_target.go
 delete mode 100644 pkg/runner/wait_for_first_target_test.go
 create mode 100644 tests/common/get_events.go
 rename tests/common/{ => goroutine_leak_check}/goroutine_leak_check.go (97%)
 rename tests/common/{ => goroutine_leak_check}/goroutine_leak_check_test.go (86%)
 create mode 100644 tests/plugins/teststeps/badtargets/badtargets.go
 create mode 100644 tests/plugins/teststeps/teststep/teststep.go

diff --git a/pkg/api/api.go b/pkg/api/api.go
index 0286d453..e85aa43a 100644
--- a/pkg/api/api.go
+++ b/pkg/api/api.go
@@ -46,7 +46,7 @@ type API struct {
 
 // New returns an initialized instance of an API struct with the specified
 // server ID generation function.
-func New(ctx xcontext.Context, opts ...Option) (*API, error) {
+func New(opts ...Option) (*API, error) {
 	cfg := getConfig(opts...)
 	serverID, err := obtainServerID(cfg.ServerIDFunc)
 	if err != nil {
diff --git a/pkg/api/api_test.go b/pkg/api/api_test.go
index 6ed44a25..ab1d6e07 100644
--- a/pkg/api/api_test.go
+++ b/pkg/api/api_test.go
@@ -23,7 +23,7 @@ var ctx = logrusctx.NewContext(logger.LevelDebug)
 func TestOptions(t *testing.T) {
 	eventTimeout := 3141592654 * time.Second
 	serverID := "myUnitTestServerID"
-	api, err := New(ctx,
+	api, err := New(
 		OptionEventTimeout(eventTimeout),
 		OptionServerID(serverID),
 	)
@@ -40,7 +40,7 @@ func (d dummyEventMsg) Requestor() EventRequestor {
 
 func TestEventTimeout(t *testing.T) {
 	t.Run("timeout", func(t *testing.T) {
-		apiInstance, err := New(ctx, OptionServerID("unit-test"), OptionEventTimeout(time.Nanosecond))
+		apiInstance, err := New(OptionServerID("unit-test"), OptionEventTimeout(time.Nanosecond))
 		require.NoError(t, err)
 		t.Run("Status", func(t *testing.T) {
 			startTime := time.Now()
@@ -63,7 +63,7 @@ func TestEventTimeout(t *testing.T) {
 	})
 
 	t.Run("noTimeout", func(t *testing.T) {
-		apiInstance, err := New(ctx, OptionServerID("unit-test"))
+		apiInstance, err := New(OptionServerID("unit-test"))
 		require.NoError(t, err)
 
 		respExpected := &EventResponse{
diff --git a/pkg/cerrors/cerrors.go b/pkg/cerrors/cerrors.go
index 4fa3211e..e3c67e75 100644
--- a/pkg/cerrors/cerrors.go
+++ b/pkg/cerrors/cerrors.go
@@ -43,6 +43,17 @@ func (e *ErrTestStepsNeverReturned) Error() string {
 	return fmt.Sprintf("test step [%s] did not return", strings.Join(e.StepNames, ", "))
 }
 
+// ErrTestTargetInjectionTimedOut indicates that test step did not ingest a target
+// within allotted time.
+type ErrTestTargetInjectionTimedOut struct {
+	StepName string
+}
+
+// Error returns the error string associated with the error
+func (e *ErrTestTargetInjectionTimedOut) Error() string {
+	return fmt.Sprintf("test step %v failed to ingest a target", e.StepName)
+}
+
 // ErrTestStepClosedChannels indicates that the test step returned after
 // closing its output channels, which constitutes an API violation
 type ErrTestStepClosedChannels struct {
@@ -53,3 +64,49 @@ type ErrTestStepClosedChannels struct {
 func (e *ErrTestStepClosedChannels) Error() string {
 	return fmt.Sprintf("test step %v closed output channels (api violation)", e.StepName)
 }
+
+// ErrTestStepPaniced indicates that a test step's method panicked.
+type ErrTestStepPaniced struct {
+	StepName   string
+	StackTrace string
+}
+
+// Error returns the error string associated with the error
+func (e *ErrTestStepPaniced) Error() string {
+	return fmt.Sprintf("test step %s paniced, trace: %q", e.StepName, e.StackTrace)
+}
+
+// ErrTestStepReturnedDuplicateResult indicates that a test step returned result
+// twice for the same target.
+type ErrTestStepReturnedDuplicateResult struct {
+	StepName string
+	Target   string
+}
+
+// Error returns the error string associated with the error
+func (e *ErrTestStepReturnedDuplicateResult) Error() string {
+	return fmt.Sprintf("test step %s returned duplicate result for %s", e.StepName, e.Target)
+}
+
+// ErrTestStepReturnedUnexpectedResult indicates that a test step returned result
+// for a target that was not given to it.
+type ErrTestStepReturnedUnexpectedResult struct {
+	StepName string
+	Target   string
+}
+
+// Error returns the error string associated with the error
+func (e *ErrTestStepReturnedUnexpectedResult) Error() string {
+	return fmt.Sprintf("test step %s returned unexpected result for %s", e.StepName, e.Target)
+}
+
+// ErrTestStepLostTargets indicates that targets have been lost during test run.
+type ErrTestStepLostTargets struct {
+	StepName string
+	Targets  []string
+}
+
+// Error returns the error string associated with the error
+func (e *ErrTestStepLostTargets) Error() string {
+	return fmt.Sprintf("test step %s lost targets %v", e.StepName, e.Targets)
+}
diff --git a/pkg/event/testevent/test.go b/pkg/event/testevent/test.go
index 95ed859d..ead40539 100644
--- a/pkg/event/testevent/test.go
+++ b/pkg/event/testevent/test.go
@@ -17,7 +17,7 @@ import (
 	"github.com/facebookincubator/contest/pkg/xcontext"
 )
 
-// Header models the header of a test event, which consists in metadatat hat defines the
+// Header models the header of a test event, which consists in metadata that defines the
 // emitter of the events. The Header is under ConTest control and cannot be manipulated
 // by the TestStep
 type Header struct {
@@ -29,8 +29,8 @@ type Header struct {
 
 // Data models the data of a test event. It is populated by the TestStep
 type Data struct {
-	EventName event.Name
 	Target    *target.Target
+	EventName event.Name
 	Payload   *json.RawMessage
 }
 
@@ -153,3 +153,15 @@ type EmitterFetcher interface {
 	Emitter
 	Fetcher
 }
+
+func (h *Header) String() string {
+	return fmt.Sprintf("[%d %d %s %s]", h.JobID, h.RunID, h.TestName, h.TestStepLabel)
+}
+
+func (d *Data) String() string {
+	ps := ""
+	if d.Payload != nil {
+		ps = fmt.Sprintf(" %q", d.Payload) //nolint SA5009 - works fine
+	}
+	return fmt.Sprintf("[%s %s%s]", d.Target, d.EventName, ps)
+}
diff --git a/pkg/jobmanager/bundles.go b/pkg/jobmanager/bundles.go
index 4b1c0a5f..e5d77476 100644
--- a/pkg/jobmanager/bundles.go
+++ b/pkg/jobmanager/bundles.go
@@ -72,8 +72,7 @@ func newBundlesFromSteps(ctx xcontext.Context, descriptors []*test.TestStepDescr
 		if err != nil {
 			return nil, err
 		}
-		// test step index is incremented by 1 so we can use 0 to signal an anomaly
-		tsb, err := registry.NewTestStepBundle(ctx, *descriptor, uint(idx)+1, tse)
+		tsb, err := registry.NewTestStepBundle(ctx, *descriptor, tse)
 		if err != nil {
 			return nil, fmt.Errorf("NewStepBundle for test step '%s' with index %d failed: %w", descriptor.Name, idx, err)
 		}
diff --git a/pkg/jobmanager/jobmanager.go b/pkg/jobmanager/jobmanager.go
index 881de8e9..28915399 100644
--- a/pkg/jobmanager/jobmanager.go
+++ b/pkg/jobmanager/jobmanager.go
@@ -135,13 +135,15 @@ func (jm *JobManager) handleEvent(ev *api.Event) {
 // signals, propagating the signals downwards to all jobs.
 func (jm *JobManager) Start(ctx xcontext.Context, sigs chan os.Signal) error {
 	log := ctx.Logger()
+	log.Debugf("Starting JobManager")
 
-	a, err := api.New(ctx, jm.config.apiOptions...)
+	a, err := api.New(jm.config.apiOptions...)
 	if err != nil {
 		return fmt.Errorf("Cannot start JobManager: %w", err)
 	}
 
 	apiCtx, apiCancel := xcontext.WithCancel(ctx)
+	apiCtx, apiPause := xcontext.WithNotify(apiCtx, xcontext.Paused)
 	errCh := make(chan error, 1)
 	go func() {
 		lErr := jm.apiListener.Serve(apiCtx, a)
@@ -176,7 +178,7 @@ loop:
 				apiCancel()
 			} else {
 				log.Debugf("Interrupted by signal '%s': pause jobs and exit", sig)
-				apiCancel()
+				apiPause()
 				jm.PauseJobs(ctx)
 			}
 			select {
diff --git a/pkg/pluginregistry/bundles.go b/pkg/pluginregistry/bundles.go
index 923e8a3a..116f30df 100644
--- a/pkg/pluginregistry/bundles.go
+++ b/pkg/pluginregistry/bundles.go
@@ -16,7 +16,7 @@ import (
 )
 
 // NewTestStepBundle creates a TestStepBundle from a TestStepDescriptor
-func (r *PluginRegistry) NewTestStepBundle(ctx xcontext.Context, testStepDescriptor test.TestStepDescriptor, stepIndex uint, allowedEvents map[event.Name]bool) (*test.TestStepBundle, error) {
+func (r *PluginRegistry) NewTestStepBundle(ctx xcontext.Context, testStepDescriptor test.TestStepDescriptor, allowedEvents map[event.Name]bool) (*test.TestStepBundle, error) {
 	testStep, err := r.NewTestStep(testStepDescriptor.Name)
 	if err != nil {
 		return nil, fmt.Errorf("could not get the desired TestStep (%s): %v", testStepDescriptor.Name, err)
diff --git a/pkg/runner/job_runner.go b/pkg/runner/job_runner.go
index fc43273b..8ff92306 100644
--- a/pkg/runner/job_runner.go
+++ b/pkg/runner/job_runner.go
@@ -127,7 +127,7 @@ func (jr *JobRunner) Run(j *job.Job) ([][]*job.Report, []*job.Report, error) {
 				targets = <-targetsCh
 				if err != nil {
 					err = fmt.Errorf("run #%d: cannot fetch targets for test '%s': %v", run+1, t.Name, err)
-					j.StateCtx.Logger().Errorf("%v", err.Error())
+					j.StateCtx.Logger().Errorf(err.Error())
 					return nil, nil, err
 				}
 				// Associate the targets with the job for later retrievel
@@ -181,7 +181,13 @@ func (jr *JobRunner) Run(j *job.Job) ([][]*job.Report, []*job.Report, error) {
 			if runErr = jr.emitAcquiredTargets(j.StateCtx, testEventEmitter, targets); runErr == nil {
 				j.StateCtx.Logger().Infof("Run #%d: running test #%d for job '%s' (job ID: %d) on %d targets", run+1, idx, j.Name, j.ID, len(targets))
 				testRunner := NewTestRunner()
-				runErr = testRunner.Run(j.StateCtx, t, targets, j.ID, types.RunID(run+1))
+				resumeState, err := testRunner.Run(j.StateCtx, t, targets, j.ID, types.RunID(run+1), nil)
+				if err == xcontext.Paused {
+					j.StateCtx.Logger().Debugf("Runner paused, state: %s", string(resumeState))
+					// TODO(rojer): Persist the state.
+				} else {
+					runErr = err
+				}
 			}
 
 			// Job is done, release all the targets
@@ -232,7 +238,7 @@ func (jr *JobRunner) Run(j *job.Job) ([][]*job.Report, []*job.Report, error) {
 				j.StateCtx.Logger().Warnf("Run reporter failed while calculating run results, proceeding anyway: %v", err)
 			} else {
 				if success {
-					j.StateCtx.Logger().Warnf("Run #%d of job %d considered successful according to %s", run+1, j.ID, bundle.Reporter.Name())
+					j.StateCtx.Logger().Debugf("Run #%d of job %d considered successful according to %s", run+1, j.ID, bundle.Reporter.Name())
 				} else {
 					j.StateCtx.Logger().Errorf("Run #%d of job %d considered failed according to %s", run+1, j.ID, bundle.Reporter.Name())
 				}
diff --git a/pkg/runner/test_runner.go b/pkg/runner/test_runner.go
deleted file mode 100644
index afc47edc..00000000
--- a/pkg/runner/test_runner.go
+++ /dev/null
@@ -1,281 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-//
-// This source code is licensed under the MIT license found in the
-// LICENSE file in the root directory of this source tree.
-
-package runner
-
-import (
-	"context"
-	"fmt"
-	"time"
-
-	"github.com/facebookincubator/contest/pkg/cerrors"
-	"github.com/facebookincubator/contest/pkg/config"
-	"github.com/facebookincubator/contest/pkg/target"
-	"github.com/facebookincubator/contest/pkg/test"
-	"github.com/facebookincubator/contest/pkg/types"
-	"github.com/facebookincubator/contest/pkg/xcontext"
-)
-
-// TestRunnerTimeouts collects all the timeouts values that the test runner uses
-type TestRunnerTimeouts struct {
-	StepInjectTimeout   time.Duration
-	MessageTimeout      time.Duration
-	ShutdownTimeout     time.Duration
-	StepShutdownTimeout time.Duration
-}
-
-// routingCh represents a set of unidirectional channels used by the routing subsystem.
-// There is a routing block for each TestStep of the pipeline, which is responsible for
-// the following actions:
-// * Targets in egress from the previous routing block are injected into the
-// current TestStep
-// * Targets in egress from the current TestStep are injected into the next
-// routing block
-type routingCh struct {
-	// routeIn and routeOut connect the routing block to other routing blocks
-	routeIn  <-chan *target.Target
-	routeOut chan<- *target.Target
-	// Channels that connect the routing block to the TestStep
-	stepIn  chan<- *target.Target
-	stepOut <-chan *target.Target
-	stepErr <-chan cerrors.TargetError
-	// targetErr connects the routing block directly to the TestRunner. Failing
-	// targets are acquired by the TestRunner via this channel
-	targetErr chan<- cerrors.TargetError
-}
-
-// stepCh represents a set of bidirectional channels that a TestStep and its associated
-// routing block use to communicate. The TestRunner forces the direction of each
-// channel when connecting the TestStep to the routing block.
-type stepCh struct {
-	stepIn  chan *target.Target
-	stepOut chan *target.Target
-	stepErr chan cerrors.TargetError
-}
-
-type injectionCh struct {
-	stepIn   chan<- *target.Target
-	resultCh chan<- injectionResult
-}
-
-// injectionResult represents the result of an injection goroutine
-type injectionResult struct {
-	target *target.Target
-	err    error
-}
-
-// routeResult represents the result of routing block, possibly carrying error information
-type routeResult struct {
-	bundle test.TestStepBundle
-	err    error
-}
-
-// stepResult represents the result of a TestStep, possibly carrying error information
-type stepResult struct {
-	jobID  types.JobID
-	runID  types.RunID
-	bundle test.TestStepBundle
-	err    error
-}
-
-// pipelineCtrlCh represents multiple result and control channels that the pipeline uses
-// to collect results from routing blocks, steps and target completing the test and to
-//  signa cancellation to various pipeline subsystems
-type pipelineCtrlCh struct {
-	routingResultCh <-chan routeResult
-	stepResultCh    <-chan stepResult
-	targetOut       <-chan *target.Target
-	targetErr       <-chan cerrors.TargetError
-
-	// ctx  is a control context used to cancel/pause the steps of the pipeline
-	ctx    xcontext.Context
-	pause  func()
-	cancel func()
-}
-
-// TestRunner is the main runner of TestSteps in ConTest. `results` collects
-// the results of the run. It is not safe to access `results` concurrently.
-type TestRunner struct {
-	timeouts TestRunnerTimeouts
-}
-
-// targetWriter is a helper object which exposes methods to write targets into step channels
-type targetWriter struct {
-	timeouts TestRunnerTimeouts
-}
-
-func (w *targetWriter) writeTimeout(ctx xcontext.Context, ch chan<- *target.Target, target *target.Target, timeout time.Duration) error {
-	ctx.Logger().Debugf("writing target %+v, timeout %v", target, timeout)
-	start := time.Now()
-	select {
-	case <-ctx.Done():
-		ctx.Logger().Debugf("terminate requested while writing target %+v", target)
-	case ch <- target:
-	case <-time.After(timeout):
-		return fmt.Errorf("timeout (%v) while writing target %+v", timeout, target)
-	}
-	ctx.Logger().Debugf("done writing target %+v, spent %v", target, time.Since(start))
-	return nil
-}
-
-// writeTargetWithResult attempts to deliver a Target on the input channel of a step,
-// returning the result of the operation on the result channel wrapped in the
-// injectionCh argument
-func (w *targetWriter) writeTargetWithResult(ctx xcontext.Context, target *target.Target, ch injectionCh) {
-	err := w.writeTimeout(ctx, ch.stepIn, target, w.timeouts.StepInjectTimeout)
-	select {
-	case <-ctx.Done():
-	case ch.resultCh <- injectionResult{target: target, err: err}:
-	case <-time.After(w.timeouts.MessageTimeout):
-		ctx.Logger().Panicf("timeout while writing result for target %+v after %v", target, w.timeouts.MessageTimeout)
-	}
-}
-
-// writeTargetError writes a TargetError object to a TargetError channel with timeout
-func (w *targetWriter) writeTargetError(ctx context.Context, ch chan<- cerrors.TargetError, targetError cerrors.TargetError, timeout time.Duration) error {
-	select {
-	case <-ctx.Done():
-	case ch <- targetError:
-	case <-time.After(timeout):
-		return fmt.Errorf("timeout while writing targetError %+v", targetError)
-	}
-	return nil
-}
-
-func newTargetWriter(timeouts TestRunnerTimeouts) *targetWriter {
-	return &targetWriter{timeouts: timeouts}
-}
-
-// Run implements the main logic of the TestRunner, i.e. the instantiation and
-// connection of the TestSteps, routing blocks and pipeline runner.
-func (tr *TestRunner) Run(ctx xcontext.Context, test *test.Test, targets []*target.Target, jobID types.JobID, runID types.RunID) error {
-
-	if len(test.TestStepsBundles) == 0 {
-		return fmt.Errorf("no steps to run for test")
-	}
-
-	ctx = ctx.WithFields(xcontext.Fields{
-		"job_id": jobID,
-		"run_id": runID,
-	})
-
-	testPipeline := newPipeline(test.TestStepsBundles, test, jobID, runID, tr.timeouts)
-
-	ctx.Logger().Infof("setting up pipeline")
-	completedTargets := make(chan *target.Target, 1)
-	inCh := testPipeline.init(ctx)
-
-	// inject targets in the step
-	terminateInjectionCtx, terminateInjection := xcontext.WithCancel(xcontext.Background())
-	go func(ctx xcontext.Context, inputChannel chan<- *target.Target) {
-		defer close(inputChannel)
-		ctx = ctx.WithTag("step", "injection")
-		writer := newTargetWriter(tr.timeouts)
-		for _, t := range targets {
-			if err := writer.writeTimeout(ctx, inputChannel, t, tr.timeouts.MessageTimeout); err != nil {
-				ctx.Logger().Debugf("could not inject target %+v into first routing block: %+v", t, err)
-			}
-		}
-	}(terminateInjectionCtx.WithLogger(ctx.Logger()), inCh)
-
-	errCh := make(chan error, 1)
-	go func() {
-		ctx.Logger().Infof("running pipeline")
-		errCh <- testPipeline.run(ctx, completedTargets)
-	}()
-
-	defer terminateInjection()
-	// Receive targets from the completed channel controlled by the pipeline, while
-	// waiting for termination signals or fatal errors encountered while running
-	// the pipeline.
-	for {
-		select {
-		case err := <-errCh:
-			ctx.Logger().Debugf("test runner terminated, returning %v", err)
-			return err
-		case t := <-completedTargets:
-			ctx.Logger().Infof("test runner completed target: %v", t)
-		}
-	}
-}
-
-// NewTestRunner initializes and returns a new TestRunner object. This test
-// runner will use default timeout values
-func NewTestRunner() TestRunner {
-	return TestRunner{
-		timeouts: TestRunnerTimeouts{
-			StepInjectTimeout:   config.StepInjectTimeout,
-			MessageTimeout:      config.TestRunnerMsgTimeout,
-			ShutdownTimeout:     config.TestRunnerShutdownTimeout,
-			StepShutdownTimeout: config.TestRunnerStepShutdownTimeout,
-		},
-	}
-}
-
-// NewTestRunnerWithTimeouts initializes and returns a new TestRunner object with
-// custom timeouts
-func NewTestRunnerWithTimeouts(timeouts TestRunnerTimeouts) TestRunner {
-	return TestRunner{timeouts: timeouts}
-}
-
-// State is a structure that models the current state of the test runner
-type State struct {
-	completedSteps   map[string]error
-	completedRouting map[string]error
-	completedTargets map[*target.Target]error
-}
-
-// NewState initializes a State object.
-func NewState() *State {
-	r := State{}
-	r.completedSteps = make(map[string]error)
-	r.completedRouting = make(map[string]error)
-	r.completedTargets = make(map[*target.Target]error)
-	return &r
-}
-
-// CompletedTargets returns a map that associates each target with its returning error.
-// If the target succeeded, the error will be nil
-func (r *State) CompletedTargets() map[*target.Target]error {
-	return r.completedTargets
-}
-
-// CompletedRouting returns a map that associates each routing block with its returning error.
-// If the routing block succeeded, the error will be nil
-func (r *State) CompletedRouting() map[string]error {
-	return r.completedRouting
-}
-
-// CompletedSteps returns a map that associates each step with its returning error.
-// If the step succeeded, the error will be nil
-func (r *State) CompletedSteps() map[string]error {
-	return r.completedSteps
-}
-
-// SetRouting sets the error associated with a routing block
-func (r *State) SetRouting(testStepLabel string, err error) {
-	r.completedRouting[testStepLabel] = err
-}
-
-// SetTarget sets the error associated with a target
-func (r *State) SetTarget(target *target.Target, err error) {
-	r.completedTargets[target] = err
-}
-
-// SetStep sets the error associated with a step
-func (r *State) SetStep(testStepLabel string, err error) {
-	r.completedSteps[testStepLabel] = err
-}
-
-// IncompleteSteps returns a slice of step names for which the result hasn't been set yet
-func (r *State) IncompleteSteps(bundles []test.TestStepBundle) []string {
-	var incompleteSteps []string
-	for _, bundle := range bundles {
-		if _, ok := r.completedSteps[bundle.TestStepLabel]; !ok {
-			incompleteSteps = append(incompleteSteps, bundle.TestStepLabel)
-		}
-	}
-	return incompleteSteps
-}
diff --git a/pkg/runner/test_runner2.go b/pkg/runner/test_runner2.go
new file mode 100644
index 00000000..8e63ef8e
--- /dev/null
+++ b/pkg/runner/test_runner2.go
@@ -0,0 +1,842 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+package runner
+
+import (
+	"encoding/json"
+	"fmt"
+	"runtime/debug"
+	"sync"
+	"time"
+
+	"github.com/facebookincubator/contest/pkg/xcontext"
+	"github.com/insomniacslk/xjson"
+
+	"github.com/facebookincubator/contest/pkg/cerrors"
+	"github.com/facebookincubator/contest/pkg/config"
+	"github.com/facebookincubator/contest/pkg/event"
+	"github.com/facebookincubator/contest/pkg/event/testevent"
+	"github.com/facebookincubator/contest/pkg/storage"
+	"github.com/facebookincubator/contest/pkg/target"
+	"github.com/facebookincubator/contest/pkg/test"
+	"github.com/facebookincubator/contest/pkg/types"
+)
+
+// TestRunner is the state associated with a test run.
+// Here's how a test run works:
+//  * Each target gets a targetState and a "target handler" - a goroutine that takes that particular
+//    target through each step of the pipeline in sequence. It injects the target, waits for the result,
+//    then moves on to the next step.
+//  * Each step of the pipeline gets a stepState and:
+//    - A "step runner" - a goroutine that is responsible for running the step's Run() method
+//    - A "step reader" - a goroutine that processes results and sends them on to target handlers that await them.
+//  * After starting all of the above, the main goroutine goes into "monitor" mode
+//    that checks on the pipeline's progress and is responsible for closing step input channels
+//    when all the targets have been injected.
+//  * Monitor loop finishes when all the targets have been injected into the last step
+//    or if a step has encountered an error.
+//  * We then wait for all the step runners and readers to shut down.
+//  * Once all the activity has died down, resulting state is examined and an error is returned, if any.
+type TestRunner struct {
+	stepInjectTimeout time.Duration // Time to wait for steps to accept each target
+	shutdownTimeout   time.Duration // Time to wait for steps runners to finish a the end of the run
+
+	steps     []*stepState            // The pipeline, in order of execution
+	targets   map[string]*targetState // Target state lookup map
+	targetsWg sync.WaitGroup          // Tracks all the target handlers
+
+	// One mutex to rule them all, used to serialize access to all the state above.
+	// Could probably be split into several if necessary.
+	mu   sync.Mutex
+	cond *sync.Cond // Used to notify the monitor about changes
+}
+
+// stepState contains state associated with one state of the pipeline:
+type stepState struct {
+	ctx xcontext.Context
+
+	stepIndex int                 // Index of this step in the pipeline.
+	sb        test.TestStepBundle // The test bundle.
+
+	// Channels used to communicate with the plugin.
+	inCh  chan *target.Target
+	outCh chan *target.Target
+	errCh chan cerrors.TargetError
+	ev    testevent.Emitter
+
+	tgtDone map[*target.Target]bool // Targets for which results have been received.
+
+	stepRunning   bool  // testStep.Run() is currently running.
+	readerRunning bool  // Result reader is running.
+	runErr        error // Runner error, returned from Run() or an error condition detected by the reader.
+}
+
+// targetStepPhase denotes progression of a target through a step
+type targetStepPhase int
+
+const (
+	targetStepPhaseInvalid       targetStepPhase = iota
+	targetStepPhaseInit                          // (1) Created
+	targetStepPhaseBegin                         // (2) Picked up for execution.
+	targetStepPhaseRun                           // (3) Injected into step.
+	targetStepPhaseResultPending                 // (4) Result posted to the handler.
+	targetStepPhaseEnd                           // (5) Finished running a step.
+)
+
+// targetState contains state associated with one target progressing through the pipeline.
+type targetState struct {
+	tgt *target.Target
+
+	// This part of state gets serialized into JSON for resumption.
+	CurStep  int             `json:"cs"`            // Current step number.
+	CurPhase targetStepPhase `json:"cp"`            // Current phase of step execution.
+	Res      *xjson.Error    `json:"res,omitempty"` // Final result, if reached the end state.
+
+	resCh chan error // Channel used to communicate result by the step runner.
+}
+
+// resumeStateStruct is used to serialize runner state to be resumed in the future.
+type resumeStateStruct struct {
+	Version int                     `json:"v"`
+	JobID   types.JobID             `json:"job_id"`
+	RunID   types.RunID             `json:"run_id"`
+	Targets map[string]*targetState `json:"targets"`
+}
+
+// Resume state version we are compatible with.
+// When imcompatible changes are made to the state format, bump this.
+// Restoring incompatible state will abort the job.
+const resumeStateStructVersion = 2
+
+// Run is the main enty point of the runner.
+func (tr *TestRunner) Run(
+	ctx xcontext.Context,
+	t *test.Test, targets []*target.Target,
+	jobID types.JobID, runID types.RunID,
+	resumeState json.RawMessage,
+) (json.RawMessage, error) {
+
+	ctx = ctx.WithFields(xcontext.Fields{
+		"job_id": jobID,
+		"run_id": runID,
+	})
+
+	ctx.Logger().Debugf("== test runner starting job %d, run %d", jobID, runID)
+	resumeState, err := tr.run(ctx.WithTag("phase", "run"), t, targets, jobID, runID, resumeState)
+	ctx.Logger().Debugf("== test runner finished job %d, run %d, err: %v", jobID, runID, err)
+	return resumeState, err
+}
+
+func (tr *TestRunner) run(
+	ctx xcontext.Context,
+	t *test.Test, targets []*target.Target,
+	jobID types.JobID, runID types.RunID,
+	resumeState json.RawMessage,
+) (json.RawMessage, error) {
+
+	// Peel off contexts used for steps and target handlers.
+	stepCtx, stepCancel := xcontext.WithCancel(ctx)
+	defer stepCancel()
+	targetCtx, targetCancel := xcontext.WithCancel(ctx)
+	defer targetCancel()
+
+	// Set up the pipeline
+	for i, sb := range t.TestStepsBundles {
+		tr.steps = append(tr.steps, &stepState{
+			ctx:       stepCtx,
+			stepIndex: i,
+			sb:        sb,
+			inCh:      make(chan *target.Target),
+			outCh:     make(chan *target.Target),
+			errCh:     make(chan cerrors.TargetError),
+			ev: storage.NewTestEventEmitter(testevent.Header{
+				JobID:         jobID,
+				RunID:         runID,
+				TestName:      t.Name,
+				TestStepLabel: sb.TestStepLabel,
+			}),
+			tgtDone: make(map[*target.Target]bool),
+		})
+		// Step handlers will be started from target handlers as targets reach them.
+	}
+
+	// Set up the targets
+	tr.targets = make(map[string]*targetState)
+	// If we have target state to resume, do it now.
+	if len(resumeState) > 0 {
+		ctx.Logger().Debugf("Attempting to resume from state: %s", string(resumeState))
+		var rs resumeStateStruct
+		if err := json.Unmarshal(resumeState, &rs); err != nil {
+			return nil, fmt.Errorf("invalid resume state: %w", err)
+		}
+		if rs.Version != resumeStateStructVersion {
+			return nil, fmt.Errorf("incompatible resume state version %d (want %d)",
+				rs.Version, resumeStateStructVersion)
+		}
+		if rs.JobID != jobID {
+			return nil, fmt.Errorf("wrong resume state, job id %d (want %d)", rs.JobID, jobID)
+		}
+		tr.targets = rs.Targets
+	}
+	// Initialize remaining fields of the target structures,
+	// build the map and kick off target processing.
+	for _, tgt := range targets {
+		tr.mu.Lock()
+		tgs := tr.targets[tgt.ID]
+		if tgs == nil {
+			tgs = &targetState{
+				CurPhase: targetStepPhaseInit,
+			}
+		}
+		tgs.tgt = tgt
+		tgs.resCh = make(chan error)
+		tr.targets[tgt.ID] = tgs
+		tr.mu.Unlock()
+		tr.targetsWg.Add(1)
+		go func() {
+			tr.targetHandler(targetCtx, tgs)
+			tr.targetsWg.Done()
+		}()
+	}
+
+	// Run until no more progress can be made.
+	runErr := tr.runMonitor(ctx)
+	if runErr != nil {
+		ctx.Logger().Errorf("monitor returned error: %q, canceling", runErr)
+		stepCancel()
+	}
+
+	// Wait for step runners and readers to exit.
+	if err := tr.waitStepRunners(ctx); err != nil {
+		if runErr == nil {
+			runErr = err
+		}
+	}
+
+	// There will be no more results, reel in all the target handlers (if any).
+	ctx.Logger().Debugf("waiting for target handlers to finish")
+	targetCancel()
+	tr.targetsWg.Wait()
+
+	// Examine the resulting state.
+	ctx.Logger().Debugf("leaving, err %v, target states:", runErr)
+	tr.mu.Lock()
+	defer tr.mu.Unlock()
+	resumeOk := (runErr == nil)
+	numInFlightTargets := 0
+	for i, tgt := range targets {
+		tgs := tr.targets[tgt.ID]
+		stepErr := tr.steps[tgs.CurStep].runErr
+		ctx.Logger().Debugf("  %d %s %v", i, tgs, stepErr)
+		if tgs.CurPhase == targetStepPhaseRun {
+			numInFlightTargets++
+		}
+		if stepErr != nil && stepErr != xcontext.Paused {
+			resumeOk = false
+		}
+	}
+	ctx.Logger().Debugf("- %d in flight, ok to resume? %t", numInFlightTargets, resumeOk)
+	ctx.Logger().Debugf("step states:")
+	for i, ss := range tr.steps {
+		ctx.Logger().Debugf("  %d %s %t %t %v", i, ss, ss.stepRunning, ss.readerRunning, ss.runErr)
+	}
+
+	// Is there a useful error to report?
+	if runErr != nil {
+		return nil, runErr
+	}
+
+	// Has the run been canceled?
+	ctx.Logger().Debugf("ctx-cancel-is: %v; ctx-notifications-are: %v", ctx.Err(), ctx.Notifications())
+	select {
+	case <-ctx.Done():
+		return nil, xcontext.Canceled
+	default:
+	}
+
+	// Have we been asked to pause? If yes, is it safe to do so?
+	select {
+	case <-ctx.WaitFor(xcontext.Paused):
+		if !resumeOk {
+			ctx.Logger().Warnf("paused but not ok to resume")
+			break
+		}
+		rs := &resumeStateStruct{
+			Version: resumeStateStructVersion,
+			JobID:   jobID, RunID: runID,
+			Targets: tr.targets,
+		}
+		resumeState, runErr = json.Marshal(rs)
+		if runErr != nil {
+			ctx.Logger().Errorf("unable to serialize the state: %s", runErr)
+		} else {
+			runErr = xcontext.Paused
+		}
+	default:
+	}
+
+	return resumeState, runErr
+}
+
+func (tr *TestRunner) waitStepRunners(ctx xcontext.Context) error {
+	ctx.Logger().Debugf("waiting for step runners to finish")
+	swch := make(chan struct{})
+	go func() {
+		tr.mu.Lock()
+		defer tr.mu.Unlock()
+		for {
+			ok := true
+			for _, ss := range tr.steps {
+				// numRunning == 1 is also acceptable: we allow the Run() goroutine
+				// to continue in case of error, if the result processor decided
+				// to abandon its runner, there's nothing we can do.
+				switch {
+				case !ss.stepRunning && !ss.readerRunning:
+				// Done
+				case ss.stepRunning && ss.readerRunning:
+					// Still active
+					ok = false
+				case !ss.stepRunning && ss.readerRunning:
+					// Transient state, let it finish
+					ok = false
+				case ss.stepRunning && !ss.readerRunning:
+					// This is possible if plugin got stuck and result processor gave up on it.
+					// If so, it should have left an error.
+					if ss.runErr == nil {
+						ctx.Logger().Errorf("%s: result processor left runner with no error", ss)
+						// There's nothing we can do at this point, fall through.
+					}
+				}
+			}
+			if ok {
+				close(swch)
+				return
+			}
+			tr.cond.Wait()
+		}
+	}()
+	var err error
+	select {
+	case <-swch:
+		ctx.Logger().Debugf("step runners finished")
+		tr.mu.Lock()
+		defer tr.mu.Unlock()
+		err = tr.checkStepRunners()
+	case <-time.After(tr.shutdownTimeout):
+		ctx.Logger().Errorf("step runners failed to shut down correctly")
+		tr.mu.Lock()
+		defer tr.mu.Unlock()
+		// If there is a step with an error set, use that.
+		err = tr.checkStepRunners()
+		// If there isn't, enumerate ones that were still running at the time.
+		nrerr := &cerrors.ErrTestStepsNeverReturned{}
+		if err == nil {
+			err = nrerr
+		}
+		for _, ss := range tr.steps {
+			if ss.stepRunning {
+				nrerr.StepNames = append(nrerr.StepNames, ss.sb.TestStepLabel)
+				// We cannot make the step itself return but we can at least release the reader.
+				tr.safeCloseOutCh(ss)
+			}
+		}
+	}
+	// Emit step error events.
+	for _, ss := range tr.steps {
+		ctx.Logger().Debugf("%s %v", ss, ss.runErr)
+		if ss.runErr != nil && ss.runErr != xcontext.Paused && ss.runErr != xcontext.Canceled {
+			if err := ss.emitEvent(ctx, EventTestError, nil, ss.runErr.Error()); err != nil {
+				ctx.Logger().Errorf("failed to emit event: %s", err)
+			}
+		}
+	}
+	return err
+}
+
+func (tr *TestRunner) injectTarget(ctx xcontext.Context, tgs *targetState, ss *stepState) error {
+	ctx.Logger().Debugf("%s: injecting into %s", tgs, ss)
+	select {
+	case ss.inCh <- tgs.tgt:
+		// Injected successfully.
+		err := ss.ev.Emit(ctx, testevent.Data{EventName: target.EventTargetIn, Target: tgs.tgt})
+		tr.mu.Lock()
+		defer tr.mu.Unlock()
+		// By the time we get here the target could have been processed and result posted already, hence the check.
+		if tgs.CurPhase == targetStepPhaseBegin {
+			tgs.CurPhase = targetStepPhaseRun
+		}
+		if err != nil {
+			return fmt.Errorf("failed to report target injection: %w", err)
+		}
+		tr.cond.Signal()
+	case <-time.After(tr.stepInjectTimeout):
+		ctx.Logger().Errorf("timed out while injecting a target")
+		if err := ss.ev.Emit(ctx, testevent.Data{EventName: target.EventTargetInErr, Target: tgs.tgt}); err != nil {
+			ctx.Logger().Errorf("failed to emit event: %s", err)
+		}
+		return &cerrors.ErrTestTargetInjectionTimedOut{StepName: ss.sb.TestStepLabel}
+	case <-ctx.Done():
+		return xcontext.Canceled
+	}
+	return nil
+}
+
+func (tr *TestRunner) awaitTargetResult(ctx xcontext.Context, tgs *targetState, ss *stepState) error {
+	select {
+	case res, ok := <-tgs.resCh:
+		if !ok {
+			ctx.Logger().Debugf("%s: result channel closed", tgs)
+			return xcontext.Canceled
+		}
+		ctx.Logger().Debugf("%s: result recd for %s", tgs, ss)
+		var err error
+		if res == nil {
+			err = ss.emitEvent(ctx, target.EventTargetOut, tgs.tgt, nil)
+		} else {
+			err = ss.emitEvent(ctx, target.EventTargetErr, tgs.tgt, target.ErrPayload{Error: res.Error()})
+		}
+		if err != nil {
+			ctx.Logger().Errorf("failed to emit event: %s", err)
+		}
+		tr.mu.Lock()
+		if res != nil {
+			tgs.Res = xjson.NewError(res)
+		}
+		tgs.CurPhase = targetStepPhaseEnd
+		tr.mu.Unlock()
+		tr.cond.Signal()
+		return err
+		// Check for cancellation.
+		// Notably we are not checking for the pause condition here:
+		// when paused, we want to let all the injected targets to finish
+		// and collect all the results they produce. If that doesn't happen,
+		// step runner will close resCh on its way out and unlock us.
+	case <-ctx.Done():
+		ctx.Logger().Debugf("%s: canceled 2", tgs)
+		return xcontext.Canceled
+	}
+}
+
+// targetHandler takes a single target through each step of the pipeline in sequence.
+// It injects the target, waits for the result, then moves on to the next step.
+func (tr *TestRunner) targetHandler(ctx xcontext.Context, tgs *targetState) {
+	ctx = ctx.WithField("target", tgs.tgt.ID)
+	ctx.Logger().Debugf("%s: target handler active", tgs)
+	// NB: CurStep may be non-zero on entry if resumed
+loop:
+	for i := tgs.CurStep; i < len(tr.steps); {
+		// Early check for pause or cancelation.
+		select {
+		case <-ctx.WaitFor(xcontext.Paused):
+			ctx.Logger().Debugf("%s: paused 0", tgs)
+			break loop
+		case <-ctx.Done():
+			ctx.Logger().Debugf("%s: canceled 0", tgs)
+			break loop
+		default:
+		}
+		tr.mu.Lock()
+		ss := tr.steps[i]
+		if tgs.CurPhase == targetStepPhaseEnd {
+			// This target already terminated.
+			// Can happen if resumed from terminal state.
+			tr.mu.Unlock()
+			break loop
+		}
+		tgs.CurPhase = targetStepPhaseBegin
+		tr.mu.Unlock()
+		// Make sure we have a step runner active. If not, start one.
+		tr.runStepIfNeeded(ss)
+		// Inject the target.
+		err := tr.injectTarget(ctx, tgs, ss)
+		// Await result. It will be communicated to us by the step runner
+		// and returned in tgs.res.
+		if err == nil {
+			err = tr.awaitTargetResult(ctx, tgs, ss)
+		}
+		if err != nil {
+			ss.ctx.Logger().Errorf("%s", err)
+			if err != xcontext.Canceled {
+				ss.setErr(&tr.mu, err)
+			} else {
+				ss.ctx.Logger().Debugf("%s: canceled 1", tgs)
+			}
+			break
+		}
+		tr.mu.Lock()
+		if tgs.Res != nil {
+			tr.mu.Unlock()
+			break
+		}
+		i++
+		if i < len(tr.steps) {
+			tgs.CurStep = i
+			tgs.CurPhase = targetStepPhaseInit
+		}
+		tr.mu.Unlock()
+	}
+	ctx.Logger().Debugf("%s: target handler finished", tgs)
+	tr.mu.Lock()
+	tgs.resCh = nil
+	tr.cond.Signal()
+	tr.mu.Unlock()
+}
+
+// runStepIfNeeded starts the step runner goroutine if not already running.
+func (tr *TestRunner) runStepIfNeeded(ss *stepState) {
+	tr.mu.Lock()
+	defer tr.mu.Unlock()
+	if ss.stepRunning {
+		return
+	}
+	if ss.runErr != nil {
+		return
+	}
+	ss.stepRunning = true
+	ss.readerRunning = true
+	go tr.stepRunner(ss)
+	go tr.stepReader(ss)
+}
+
+func (ss *stepState) setErr(mu sync.Locker, err error) {
+	mu.Lock()
+	defer mu.Unlock()
+	ss.setErrLocked(err)
+}
+
+// setErrLocked sets step runner error unless already set.
+func (ss *stepState) setErrLocked(err error) {
+	if err == nil || ss.runErr != nil {
+		return
+	}
+	ss.ctx.Logger().Errorf("err: %v", err)
+	ss.runErr = err
+}
+
+// emitEvent emits the specified event with the specified JSON payload (if any).
+func (ss *stepState) emitEvent(ctx xcontext.Context, name event.Name, tgt *target.Target, payload interface{}) error {
+	var payloadJSON *json.RawMessage
+	if payload != nil {
+		payloadBytes, jmErr := json.Marshal(payload)
+		if jmErr != nil {
+			return fmt.Errorf("failed to marshal event: %w", jmErr)
+		}
+		pj := json.RawMessage(payloadBytes)
+		payloadJSON = &pj
+	}
+	errEv := testevent.Data{
+		EventName: name,
+		Target:    tgt,
+		Payload:   payloadJSON,
+	}
+	return ss.ev.Emit(ctx, errEv)
+}
+
+// stepRunner runs a test pipeline's step (the Run() method).
+func (tr *TestRunner) stepRunner(ss *stepState) {
+	ss.ctx.Logger().Debugf("%s: step runner active", ss)
+	defer func() {
+		if r := recover(); r != nil {
+			tr.mu.Lock()
+			ss.stepRunning = false
+			ss.setErrLocked(&cerrors.ErrTestStepPaniced{
+				StepName:   ss.sb.TestStepLabel,
+				StackTrace: fmt.Sprintf("%s / %s", r, debug.Stack()),
+			})
+			tr.mu.Unlock()
+			tr.safeCloseOutCh(ss)
+		}
+	}()
+	chans := test.TestStepChannels{In: ss.inCh, Out: ss.outCh, Err: ss.errCh}
+	runErr := ss.sb.TestStep.Run(ss.ctx, chans, ss.sb.Parameters, ss.ev)
+	ss.ctx.Logger().Debugf("%s: step runner finished %v", ss, runErr)
+	tr.mu.Lock()
+	ss.stepRunning = false
+	ss.setErrLocked(runErr)
+	tr.mu.Unlock()
+	// Signal to the result processor that no more will be coming.
+	tr.safeCloseOutCh(ss)
+}
+
+// reportTargetResult reports result of executing a step to the appropriate target handler.
+func (tr *TestRunner) reportTargetResult(ss *stepState, tgt *target.Target, res error) error {
+	resCh, err := func() (chan error, error) {
+		tr.mu.Lock()
+		defer tr.mu.Unlock()
+		tgs := tr.targets[tgt.ID]
+		if tgs == nil {
+			return nil, &cerrors.ErrTestStepReturnedUnexpectedResult{
+				StepName: ss.sb.TestStepLabel,
+				Target:   tgt.ID,
+			}
+		}
+		if ss.tgtDone[tgt] {
+			return nil, &cerrors.ErrTestStepReturnedDuplicateResult{
+				StepName: ss.sb.TestStepLabel,
+				Target:   tgt.ID,
+			}
+		}
+		ss.tgtDone[tgt] = true
+		// Begin is also allowed here because it may happen that we get a result before target handler updates phase.
+		if tgs.CurStep != ss.stepIndex ||
+			(tgs.CurPhase != targetStepPhaseBegin && tgs.CurPhase != targetStepPhaseRun) {
+			return nil, &cerrors.ErrTestStepReturnedUnexpectedResult{
+				StepName: ss.sb.TestStepLabel,
+				Target:   tgt.ID,
+			}
+		}
+		if tgs.resCh == nil {
+			select {
+			case <-ss.ctx.Done():
+				// If canceled, target handler may have left early. We don't care though.
+				return nil, xcontext.Canceled
+			default:
+				// This should not happen, must be an internal error.
+				return nil, fmt.Errorf("%s: target handler %s is not there, dropping result on the floor", ss, tgs)
+			}
+		}
+		tgs.CurPhase = targetStepPhaseResultPending
+		ss.ctx.Logger().Debugf("%s: result for %s: %v", ss, tgs, res)
+		return tgs.resCh, nil
+	}()
+	if err != nil {
+		return err
+	}
+	select {
+	case resCh <- res:
+		break
+	case <-ss.ctx.Done():
+		break
+	}
+	return nil
+}
+
+func (tr *TestRunner) safeCloseOutCh(ss *stepState) {
+	defer func() {
+		if r := recover(); r != nil {
+			ss.setErr(&tr.mu, &cerrors.ErrTestStepClosedChannels{StepName: ss.sb.TestStepLabel})
+		}
+	}()
+	close(ss.outCh)
+}
+
+// safeCloseErrCh closes error channel safely, even if it has already been closed.
+func (tr *TestRunner) safeCloseErrCh(ss *stepState) {
+	defer func() {
+		if r := recover(); r != nil {
+			ss.setErr(&tr.mu, &cerrors.ErrTestStepClosedChannels{StepName: ss.sb.TestStepLabel})
+		}
+	}()
+	close(ss.errCh)
+}
+
+// stepReader receives results from the step's output channel and forwards them to the appropriate target handlers.
+func (tr *TestRunner) stepReader(ss *stepState) {
+	ss.ctx.Logger().Debugf("%s: step reader active", ss)
+	var err error
+	outCh := ss.outCh
+	cancelCh := ss.ctx.Done()
+	var shutdownTimeoutCh <-chan time.Time
+loop:
+	for {
+		select {
+		case tgt, ok := <-outCh:
+			if !ok {
+				ss.ctx.Logger().Debugf("%s: out chan closed", ss)
+				// At this point we may still have an error to report,
+				// wait until error channel is emptied too.
+				outCh = nil
+				tr.safeCloseErrCh(ss)
+				continue loop
+			}
+			if err = tr.reportTargetResult(ss, tgt, nil); err != nil {
+				break loop
+			}
+		case res, ok := <-ss.errCh:
+			if !ok {
+				tr.mu.Lock()
+				if ss.stepRunning {
+					// Error channel is always closed after the output,
+					// which is only closed after runner goroutine has exited.
+					// If we got here, it means that the plugin closed the error channel.
+					ss.setErrLocked(&cerrors.ErrTestStepClosedChannels{StepName: ss.sb.TestStepLabel})
+				}
+				tr.mu.Unlock()
+				ss.ctx.Logger().Debugf("%s: err chan closed", ss)
+				break loop
+			}
+			if err = tr.reportTargetResult(ss, res.Target, res.Err); err != nil {
+				break loop
+			}
+		case <-cancelCh:
+			ss.ctx.Logger().Debugf("%s: canceled 3, draining", ss)
+			// Allow some time to drain
+			cancelCh = nil
+			shutdownTimeoutCh = time.After(tr.shutdownTimeout)
+		case <-shutdownTimeoutCh:
+			ss.setErr(&tr.mu, &cerrors.ErrTestStepsNeverReturned{})
+		}
+	}
+	tr.mu.Lock()
+	defer tr.mu.Unlock()
+	ss.setErrLocked(err)
+	if ss.stepRunning && ss.runErr == nil {
+		// This means that plugin closed its channels before leaving.
+		ss.setErrLocked(&cerrors.ErrTestStepClosedChannels{StepName: ss.sb.TestStepLabel})
+	}
+	ss.readerRunning = false
+	ss.ctx.Logger().Debugf("%s: step reader finished, %t %t %v", ss, ss.stepRunning, ss.readerRunning, ss.runErr)
+	tr.cond.Signal()
+}
+
+// checkStepRunnersLocked checks if any step runner has encountered an error.
+func (tr *TestRunner) checkStepRunners() error {
+	for _, ss := range tr.steps {
+		if ss.runErr != nil {
+			return ss.runErr
+		}
+	}
+	return nil
+}
+
+// runMonitor monitors progress of targets through the pipeline
+// and closes input channels of the steps to indicate that no more are expected.
+// It also monitors steps for critical errors and cancels the whole run.
+// Note: input channels remain open when cancellation is requested,
+// plugins are expected to handle it explicitly.
+func (tr *TestRunner) runMonitor(ctx xcontext.Context) error {
+	ctx.Logger().Debugf("monitor: active")
+	tr.mu.Lock()
+	defer tr.mu.Unlock()
+	// First, compute the starting step of the pipeline (it may be non-zero
+	// if the pipeline was resumed).
+	minStep := len(tr.steps)
+	for _, tgs := range tr.targets {
+		if tgs.CurStep < minStep {
+			minStep = tgs.CurStep
+		}
+	}
+	if minStep < len(tr.steps) {
+		ctx.Logger().Debugf("monitor: starting at step %s", tr.steps[minStep])
+	}
+
+	// Run the main loop.
+	pass := 1
+	var runErr error
+stepLoop:
+	for step := minStep; step < len(tr.steps); pass++ {
+		ss := tr.steps[step]
+		ctx.Logger().Debugf("monitor pass %d: current step %s", pass, ss)
+		// Check if all the targets have either made it past the injection phase or terminated.
+		ok := true
+		for _, tgs := range tr.targets {
+			ctx.Logger().Debugf("monitor pass %d: %s: %s", pass, ss, tgs)
+			if tgs.resCh == nil { // Not running anymore
+				continue
+			}
+			if tgs.CurStep < step || tgs.CurPhase < targetStepPhaseRun {
+				ctx.Logger().Debugf("monitor pass %d: %s: not all targets injected yet (%s)", pass, ss, tgs)
+				ok = false
+				break
+			}
+		}
+		if runErr = tr.checkStepRunners(); runErr != nil {
+			break stepLoop
+		}
+		if !ok {
+			// Wait for notification: as progress is being made, we get notified.
+			tr.cond.Wait()
+			continue
+		}
+		// All targets ok, close the step's input channel.
+		ctx.Logger().Debugf("monitor pass %d: %s: no more targets, closing input channel", pass, ss)
+		close(ss.inCh)
+		step++
+	}
+	// Wait for all the targets to finish.
+	ctx.Logger().Debugf("monitor: waiting for targets to finish")
+tgtLoop:
+	for ; runErr == nil; pass++ {
+		ok := true
+		for _, tgs := range tr.targets {
+			ctx.Logger().Debugf("monitor pass %d: %s", pass, tgs)
+			if runErr = tr.checkStepRunners(); runErr != nil {
+				break tgtLoop
+			}
+			if tgs.resCh != nil && tgs.CurStep < len(tr.steps) {
+				ss := tr.steps[tgs.CurStep]
+				if tgs.CurPhase == targetStepPhaseRun && !ss.readerRunning {
+					// Target has been injected but step runner has exited, this target has been lost.
+					runErr = &cerrors.ErrTestStepLostTargets{
+						StepName: ss.sb.TestStepLabel,
+						Targets:  []string{tgs.tgt.ID},
+					}
+					break tgtLoop
+				}
+				ok = false
+				break
+			}
+		}
+		if ok {
+			break
+		}
+		// Wait for notification: as progress is being made, we get notified.
+		tr.cond.Wait()
+	}
+	ctx.Logger().Debugf("monitor: finished, %v", runErr)
+	return runErr
+}
+
+func NewTestRunnerWithTimeouts(stepInjectTimeout, shutdownTimeout time.Duration) *TestRunner {
+	tr := &TestRunner{
+		stepInjectTimeout: stepInjectTimeout,
+		shutdownTimeout:   shutdownTimeout,
+	}
+	tr.cond = sync.NewCond(&tr.mu)
+	return tr
+}
+
+func NewTestRunner() *TestRunner {
+	return NewTestRunnerWithTimeouts(config.StepInjectTimeout, config.TestRunnerShutdownTimeout)
+}
+
+func (tph targetStepPhase) String() string {
+	switch tph {
+	case targetStepPhaseInvalid:
+		return "INVALID"
+	case targetStepPhaseInit:
+		return "init"
+	case targetStepPhaseBegin:
+		return "begin"
+	case targetStepPhaseRun:
+		return "run"
+	case targetStepPhaseResultPending:
+		return "result_pending"
+	case targetStepPhaseEnd:
+		return "end"
+	}
+	return fmt.Sprintf("???(%d)", tph)
+}
+
+func (ss *stepState) String() string {
+	return fmt.Sprintf("[#%d %s]", ss.stepIndex, ss.sb.TestStepLabel)
+}
+
+func (tgs *targetState) String() string {
+	var resText string
+	if tgs.Res != nil {
+		resStr := fmt.Sprintf("%v", tgs.Res)
+		if len(resStr) > 20 {
+			resStr = resStr[:20] + "..."
+		}
+		resText = fmt.Sprintf("%q", resStr)
+	} else {
+		resText = "<nil>"
+	}
+	finished := tgs.resCh == nil
+	return fmt.Sprintf("[%s %d %s %t %s]",
+		tgs.tgt, tgs.CurStep, tgs.CurPhase, finished, resText)
+}
diff --git a/pkg/runner/test_runner_pipeline.go b/pkg/runner/test_runner_pipeline.go
deleted file mode 100644
index 241dd932..00000000
--- a/pkg/runner/test_runner_pipeline.go
+++ /dev/null
@@ -1,540 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-//
-// This source code is licensed under the MIT license found in the
-// LICENSE file in the root directory of this source tree.
-
-package runner
-
-import (
-	"encoding/json"
-	"fmt"
-	"runtime/debug"
-	"sync/atomic"
-	"time"
-
-	"github.com/facebookincubator/contest/pkg/cerrors"
-	"github.com/facebookincubator/contest/pkg/event/testevent"
-	"github.com/facebookincubator/contest/pkg/storage"
-	"github.com/facebookincubator/contest/pkg/target"
-	"github.com/facebookincubator/contest/pkg/test"
-	"github.com/facebookincubator/contest/pkg/types"
-	"github.com/facebookincubator/contest/pkg/xcontext"
-)
-
-// pipeline represents a sequence of steps through which targets flow. A pipeline could implement
-// either a test sequence or a cleanup sequence
-type pipeline struct {
-	bundles []test.TestStepBundle
-
-	jobID types.JobID
-	runID types.RunID
-
-	state *State
-	test  *test.Test
-
-	timeouts TestRunnerTimeouts
-
-	// ctrlChannels represents a set of result and completion channels for this pipeline,
-	// used to collect the results of routing blocks, steps and targets completing
-	// the pipeline. It's available only after having initialized the pipeline
-	ctrlChannels *pipelineCtrlCh
-
-	// numIngress represents the total number of targets that have been seen entering
-	// the pipeline. This number is set by the first routing block as soon as the injection
-	// terminates
-	numIngress uint64
-}
-
-// runStep runs synchronously a TestStep and peforms sanity checks on the status
-// of the input/output channels on the defer control path. When the TestStep returns,
-// the associated output channels are closed. This signals to the routing subsytem
-// that this step of the pipeline will not be producing any more targets. The API
-// dictates that the TestStep can only return if its input channel has been closed,
-// which indicates that no more targets will be submitted to that step. If the
-// TestStep does not comply with this rule, an error is sent downstream to the pipeline
-// runner, which will in turn shutdown the whole pipeline and flag the TestStep
-// as misbehaving. All attempts to send downstream a result after a TestStep complete
-// are protected by a timeout and fail open. This because it's not guaranteed that
-// the TestRunner will still be listening on the result channels. If the TestStep hangs
-// indefinitely and does not respond to cancellation signals, the TestRunner will
-// flag it as misbehaving and return. If the TestStep returns once the TestRunner
-// has completed, it will timeout trying to write on the result channel.
-func (p *pipeline) runStep(ctx xcontext.Context, jobID types.JobID, runID types.RunID, bundle test.TestStepBundle, stepCh stepCh, resultCh chan<- stepResult, ev testevent.EmitterFetcher) {
-	stepLabel := bundle.TestStepLabel
-	ctx = ctx.WithField("step", stepLabel).WithTag("phase", "runStep")
-
-	ctx.Logger().Debugf("initializing step")
-	timeout := p.timeouts.MessageTimeout
-	defer func() {
-		if r := recover(); r != nil {
-			err := fmt.Errorf("step %s panicked (%v): %s", stepLabel, r, debug.Stack())
-			select {
-			case resultCh <- stepResult{jobID: jobID, runID: runID, bundle: bundle, err: err}:
-			case <-time.After(p.timeouts.MessageTimeout):
-				ctx.Logger().Warnf("sending error back from test step runner timed out after %v: %v", timeout, err)
-				return
-			}
-			return
-		}
-	}()
-
-	// We should not run a test step if there're no targets in stepCh.stepIn
-	// First we check if there's at least one incoming target, and only
-	// then we call `bundle.TestStep.Run()`.
-	//
-	// ITS: https://github.com/facebookincubator/contest/issues/101
-	stepIn, onFirstTargetChan, onNoTargetsChan := waitForFirstTarget(ctx.WaitFor(), stepCh.stepIn)
-
-	haveTargets := false
-	select {
-	case <-onFirstTargetChan:
-		ctx.Logger().Debugf("first target received, will start step")
-		haveTargets = true
-	case <-ctx.Done():
-		ctx.Logger().Debugf("cancelled")
-	case <-ctx.WaitFor(xcontext.Paused):
-		ctx.Logger().Debugf("paused")
-	case <-onNoTargetsChan:
-		ctx.Logger().Debugf("no targets")
-	}
-
-	var err error
-	if haveTargets {
-		// Run the TestStep and defer to the return path the handling of panic
-		// conditions. If multiple error conditions occur, send downstream only
-		// the first error encountered.
-		channels := test.TestStepChannels{
-			In:  stepIn,
-			Out: stepCh.stepOut,
-			Err: stepCh.stepErr,
-		}
-		err = bundle.TestStep.Run(ctx, channels, bundle.Parameters, ev)
-	}
-
-	ctx.Logger().Debugf("step %s returned", bundle.TestStepLabel)
-
-	// Check if we are shutting down. If so, do not perform sanity checks on the
-	// channels but return immediately as the TestStep itself probably returned
-	// because it honored the termination signal.
-
-	cancellationAsserted := ctx.Err() != nil
-	pauseAsserted := ctx.IsSignaledWith(xcontext.Paused)
-
-	if cancellationAsserted && err == nil {
-		err = fmt.Errorf("test step cancelled")
-	}
-
-	if cancellationAsserted || pauseAsserted {
-		select {
-		case resultCh <- stepResult{jobID: jobID, runID: runID, bundle: bundle, err: err}:
-		case <-time.After(timeout):
-			ctx.Logger().Warnf("sending error back from TestStep runner timed out after %v: %v", timeout, err)
-		}
-		return
-	}
-
-	// If the TestStep has crashed with an error, return immediately the result to the TestRunner
-	// which in turn will issue a cancellation signal to the pipeline
-	if err != nil {
-		select {
-		case resultCh <- stepResult{jobID: jobID, runID: runID, bundle: bundle, err: err}:
-		case <-time.After(timeout):
-			ctx.Logger().Warnf("sending error back from test step runner (%s) timed out after %v: %v", stepLabel, timeout, err)
-		}
-		return
-	}
-
-	// Perform sanity checks on the status of the channels. The TestStep API
-	// mandates that output and error channels shall not be closed by the
-	// TestStep itself. If the TestStep does not comply with the API, it is
-	// flagged as misbehaving.
-	select {
-	case _, ok := <-stepCh.stepIn:
-		if !ok {
-			break
-		}
-		// stepCh.stepIn is not closed, but the TestStep returned, which is a violation
-		// of the API. Record the error if no other error condition has been seen.
-		err = fmt.Errorf("step %s returned, but input channel is not closed (api violation; case 0)", stepLabel)
-
-	default:
-		// stepCh.stepIn is not closed, and a read operation would block. The TestStep
-		// does not comply with the API (see above).
-		err = fmt.Errorf("step %s returned, but input channel is not closed (api violation; case 1)", stepLabel)
-	}
-
-	select {
-	case _, ok := <-stepCh.stepOut:
-		if !ok {
-			// stepOutCh has been closed. This is a violation of the API. Record the error
-			// if no other error condition has been seen.
-			if err == nil {
-				err = &cerrors.ErrTestStepClosedChannels{StepName: stepLabel}
-			}
-		}
-	default:
-		// stepCh.stepOut is open. Flag it for closure to signal to the routing subsystem that
-		// no more Targets will go through from this channel.
-		close(stepCh.stepOut)
-	}
-
-	select {
-	case _, ok := <-stepCh.stepErr:
-		if !ok {
-			// stepErrCh has been closed. This is a violation of the API. Record the error
-			// if no other error condition has been seen.
-			if err == nil {
-				err = &cerrors.ErrTestStepClosedChannels{StepName: stepLabel}
-			}
-		}
-	default:
-		// stepCh.stepErr is open. Flag it for closure to signal to the routing subsystem that
-		// no more Targets will go through from this channel.
-		close(stepCh.stepErr)
-	}
-
-	select {
-	case resultCh <- stepResult{jobID: jobID, runID: runID, bundle: bundle, err: err}:
-	case <-time.After(timeout):
-		ctx.Logger().Warnf("sending error back from step runner (%s) timed out after %v: %v", stepLabel, timeout, err)
-		return
-	}
-}
-
-// waitTargets reads results coming from results channels until all Targets
-// have completed or an error occurs. If all Targets complete successfully, it checks
-// whether TestSteps and routing blocks have completed as well. If not, returns an
-// error. Termination is signalled via terminate channel.
-func (p *pipeline) waitTargets(ctx xcontext.Context, completedCh chan<- *target.Target) error {
-	ctx = ctx.WithTag("phase", "waitTargets")
-
-	var (
-		err                  error
-		completedTarget      *target.Target
-		completedTargetError error
-	)
-
-	outChannel := p.ctrlChannels.targetOut
-
-	writer := newTargetWriter(p.timeouts)
-	for {
-		select {
-		case t, ok := <-outChannel:
-			if !ok {
-				ctx.Logger().Debugf("pipeline output channel was closed, no more targets will come through")
-				outChannel = nil
-				break
-			}
-			completedTarget = t
-		case <-ctx.WaitFor():
-			// When termination is signaled just stop wait. It is up
-			// to the caller to decide how to further handle pipeline termination.
-			ctx.Logger().Debugf("termination requested")
-			return nil
-		case res := <-p.ctrlChannels.routingResultCh:
-			err = res.err
-			p.state.SetRouting(res.bundle.TestStepLabel, res.err)
-		case res := <-p.ctrlChannels.stepResultCh:
-			err = res.err
-			if err != nil {
-				payload, jmErr := json.Marshal(err.Error())
-				if jmErr != nil {
-					ctx.Logger().Warnf("failed to marshal error string to JSON: %v", jmErr)
-					continue
-				}
-				rm := json.RawMessage(payload)
-				header := testevent.Header{
-					JobID:         res.jobID,
-					RunID:         res.runID,
-					TestName:      p.test.Name,
-					TestStepLabel: res.bundle.TestStepLabel,
-				}
-				ev := storage.NewTestEventEmitterFetcher(header)
-				// this event is not associated to any target, e.g. a plugin has returned an error.
-				errEv := testevent.Data{EventName: EventTestError, Target: nil, Payload: &rm}
-				// emit test event containing the completion error
-				if err := ev.Emit(ctx, errEv); err != nil {
-					ctx.Logger().Warnf("could not emit completion error event %v", errEv)
-				}
-			}
-			p.state.SetStep(res.bundle.TestStepLabel, res.err)
-
-		case targetErr := <-p.ctrlChannels.targetErr:
-			completedTarget = targetErr.Target
-			completedTargetError = targetErr.Err
-		}
-
-		if err != nil {
-			return err
-		}
-
-		if outChannel == nil {
-			ctx.Logger().Debugf("no more targets to wait, output channel is closed")
-			break
-		}
-
-		if completedTarget != nil {
-			p.state.SetTarget(completedTarget, completedTargetError)
-			ctx.Logger().Debugf("writing target %+v on the completed channel", completedTarget)
-			if err := writer.writeTimeout(ctx, completedCh, completedTarget, p.timeouts.MessageTimeout); err != nil {
-				ctx.Logger().Panicf("could not write completed target: %v", err)
-			}
-			completedTarget = nil
-			completedTargetError = nil
-		}
-
-		numIngress := atomic.LoadUint64(&p.numIngress)
-		numCompleted := uint64(len(p.state.CompletedTargets()))
-		ctx.Logger().Debugf("targets completed: %d, expected: %d", numCompleted, numIngress)
-		if numIngress != 0 && numCompleted == numIngress {
-			ctx.Logger().Debugf("no more targets to wait, all targets (%d) completed", numCompleted)
-			break
-		}
-	}
-	// The test run completed, we have collected all Targets. TestSteps might have already
-	// closed `ch.out`, in which case the pipeline terminated correctly (channels are closed
-	// in a "domino" sequence, so seeing the last channel closed indicates that the
-	// sequence of close operations has completed). If `ch.out` is still open,
-	// there are still TestSteps that might have not returned. Wait for all
-	// TestSteps to complete or `StepShutdownTimeout` to occur.
-	ctx.Logger().Infof("waiting for all steps to complete")
-	return p.waitSteps(ctx)
-}
-
-// waitTermination reads results coming from result channels waiting
-// for the pipeline to completely shutdown before `ShutdownTimeout` occurs. A
-// "complete shutdown" means that all TestSteps and routing blocks have sent
-// downstream their results.
-func (p *pipeline) waitTermination(ctx xcontext.Context) error {
-
-	if len(p.bundles) == 0 {
-		return fmt.Errorf("no bundles specified for waitTermination")
-	}
-
-	ctx = ctx.WithTag("phase", "waitTermination")
-	ctx.Logger().Debugf("waiting for pipeline to terminate")
-
-	for {
-
-		ctx.Logger().Debugf("steps completed: %d", len(p.state.CompletedSteps()))
-		ctx.Logger().Debugf("routing completed: %d", len(p.state.CompletedRouting()))
-		stepsCompleted := len(p.state.CompletedSteps()) == len(p.bundles)
-		routingCompleted := len(p.state.CompletedRouting()) == len(p.bundles)
-		if stepsCompleted && routingCompleted {
-			return nil
-		}
-
-		select {
-		case <-time.After(p.timeouts.ShutdownTimeout):
-			incompleteSteps := p.state.IncompleteSteps(p.bundles)
-			if len(incompleteSteps) > 0 {
-				return &cerrors.ErrTestStepsNeverReturned{StepNames: incompleteSteps}
-			}
-			return fmt.Errorf("pipeline did not return but all test steps completed")
-		case res := <-p.ctrlChannels.routingResultCh:
-			p.state.SetRouting(res.bundle.TestStepLabel, res.err)
-		case res := <-p.ctrlChannels.stepResultCh:
-			p.state.SetStep(res.bundle.TestStepLabel, res.err)
-		}
-	}
-}
-
-// waitSteps reads results coming from result channels until `StepShutdownTimeout`
-// occurs or an error is encountered. It then checks whether TestSteps and routing
-// blocks have all returned correctly. If not, it returns an error.
-func (p *pipeline) waitSteps(ctx xcontext.Context) error {
-
-	if len(p.bundles) == 0 {
-		return fmt.Errorf("no bundles specified for waitSteps")
-	}
-
-	ctx = ctx.WithTag("phase", "waitSteps")
-
-	var err error
-
-	ctx.Logger().Debugf("waiting for test steps to terminate")
-	for {
-		select {
-		case <-time.After(p.timeouts.StepShutdownTimeout):
-			ctx.Logger().Warnf("timed out waiting for steps to complete after %v", p.timeouts.StepShutdownTimeout)
-			incompleteSteps := p.state.IncompleteSteps(p.bundles)
-			if len(incompleteSteps) > 0 {
-				err = &cerrors.ErrTestStepsNeverReturned{StepNames: incompleteSteps}
-				break
-			}
-			if len(p.state.CompletedRouting()) != len(p.bundles) {
-				err = fmt.Errorf("not all routing completed: %d!=%d", len(p.state.CompletedRouting()), len(p.bundles))
-			}
-		case res := <-p.ctrlChannels.routingResultCh:
-			ctx.Logger().Debugf("received routing block result for %s", res.bundle.TestStepLabel)
-			p.state.SetRouting(res.bundle.TestStepLabel, res.err)
-			err = res.err
-		case res := <-p.ctrlChannels.stepResultCh:
-			ctx.Logger().Debugf("received step result for %s", res.bundle.TestStepLabel)
-			p.state.SetStep(res.bundle.TestStepLabel, res.err)
-			err = res.err
-		}
-		if err != nil {
-			return err
-		}
-		ctx.Logger().Debugf("steps completed: %d, expected: %d", len(p.state.CompletedSteps()), len(p.bundles))
-		ctx.Logger().Debugf("routing completed: %d, expected: %d", len(p.state.CompletedRouting()), len(p.bundles))
-		stepsCompleted := len(p.state.CompletedSteps()) == len(p.bundles)
-		routingCompleted := len(p.state.CompletedRouting()) == len(p.bundles)
-		if stepsCompleted && routingCompleted {
-			break
-		}
-	}
-
-	return nil
-}
-
-// init initializes the pipeline by connecting steps and routing blocks. The result of pipeline
-// initialization is a set of control/result channels assigned to the pipeline object. The pipeline
-// input channel is returned.
-func (p *pipeline) init(ctx xcontext.Context) (routeInFirst chan *target.Target) {
-	ctx.Logger().Debugf("starting")
-
-	if p.ctrlChannels != nil {
-		ctx.Logger().Panicf("pipeline is already initialized, control channel are already configured")
-	}
-
-	var (
-		routeOut chan *target.Target
-		routeIn  chan *target.Target
-	)
-
-	pipelineCtx, pause := xcontext.WithNotify(ctx, xcontext.Paused)
-	pipelineCtx, cancel := xcontext.WithCancel(pipelineCtx)
-
-	// result channels used to communicate result information from the routing blocks
-	// and step executors
-	routingResultCh := make(chan routeResult)
-	stepResultCh := make(chan stepResult)
-	targetErrCh := make(chan cerrors.TargetError)
-
-	routeIn = make(chan *target.Target)
-	for position, testStepBundle := range p.bundles {
-
-		// Input and output channels for the TestStep
-		stepInCh := make(chan *target.Target)
-		stepOutCh := make(chan *target.Target)
-		stepErrCh := make(chan cerrors.TargetError)
-
-		routeOut = make(chan *target.Target)
-
-		// First step of the pipeline, keep track of the routeIn channel as this is
-		// going to be used to injects targets into the pipeline from outside. Also
-		// add an intermediate goroutine which keeps track of how many targets have
-		// been injected into the pipeline
-		if position == 0 {
-			routeInFirst = make(chan *target.Target)
-			routeInStep := routeIn
-			go func() {
-				defer close(routeInStep)
-				numIngress := uint64(0)
-				for t := range routeInFirst {
-					routeInStep <- t
-					numIngress++
-				}
-				atomic.StoreUint64(&p.numIngress, numIngress)
-			}()
-		}
-
-		stepChannels := stepCh{stepIn: stepInCh, stepErr: stepErrCh, stepOut: stepOutCh}
-		routingChannels := routingCh{
-			routeIn:   routeIn,
-			routeOut:  routeOut,
-			stepIn:    stepInCh,
-			stepErr:   stepErrCh,
-			stepOut:   stepOutCh,
-			targetErr: targetErrCh,
-		}
-
-		// Build the Header that the the TestStep will be using for emitting events
-		Header := testevent.Header{
-			JobID:         p.jobID,
-			RunID:         p.runID,
-			TestName:      p.test.Name,
-			TestStepLabel: testStepBundle.TestStepLabel,
-		}
-		ev := storage.NewTestEventEmitterFetcherWithAllowedEvents(Header, &testStepBundle.AllowedEvents)
-
-		stepCtx := pipelineCtx.WithFields(xcontext.Fields{
-			"run_id": p.runID,
-			"step":   testStepBundle.TestStepLabel,
-		})
-		router := newStepRouter(testStepBundle, routingChannels, ev, p.timeouts)
-		go router.route(stepCtx, routingResultCh)
-		go p.runStep(stepCtx, p.jobID, p.runID, testStepBundle, stepChannels, stepResultCh, ev)
-		// The input of the next routing block is the output of the current routing block
-		routeIn = routeOut
-	}
-
-	p.ctrlChannels = &pipelineCtrlCh{
-		routingResultCh: routingResultCh,
-		stepResultCh:    stepResultCh,
-		targetErr:       targetErrCh,
-		targetOut:       routeOut,
-
-		ctx:    pipelineCtx,
-		cancel: cancel,
-		pause:  pause,
-	}
-
-	return
-}
-
-// run is a blocking method which executes the pipeline until successful or failed termination
-func (p *pipeline) run(pipelineCtx xcontext.Context, completedTargetsCh chan<- *target.Target) error {
-	log := pipelineCtx.Logger()
-	log.Debugf("run")
-	if p.ctrlChannels == nil {
-		log.Panicf("pipeline is not initialized, control channels are not available")
-	}
-	channelsCtx := p.ctrlChannels.ctx
-
-	// Wait for the pipeline to complete. If an error occurs, cancel all TestSteps
-	// and routing blocks and wait again for completion until shutdown timeout occurs.
-	log.Infof("waiting for pipeline to complete")
-	completionError := p.waitTargets(channelsCtx, completedTargetsCh)
-
-	pauseAsserted := channelsCtx.IsSignaledWith(xcontext.Paused)
-	cancellationAsserted := channelsCtx.Err() != nil
-
-	if completionError != nil {
-		log.Warnf("test failed to complete: %v. Forcing cancellation.", completionError)
-		p.ctrlChannels.cancel() // terminate routing and and propagate cancellation to the steps
-	} else if cancellationAsserted {
-		log.Infof("cancellation was asserted")
-	} else if pauseAsserted {
-		log.Warnf("received pause request")
-	}
-
-	// If either cancellation or pause have been asserted, we need to wait for the
-	// pipeline to terminate
-	if cancellationAsserted || pauseAsserted || completionError != nil {
-		signal := "cancellation"
-		if pauseAsserted {
-			signal = "pause"
-		}
-		terminationError := p.waitTermination(channelsCtx)
-		if terminationError != nil {
-			log.Infof("test did not terminate correctly after %s signal: %v", signal, terminationError)
-		} else {
-			log.Infof("test terminated correctly after %s signal", signal)
-		}
-		if completionError != nil {
-			return completionError
-		}
-		return terminationError
-	}
-	log.Infof("completed")
-	return nil
-}
-
-func newPipeline(bundles []test.TestStepBundle, test *test.Test, jobID types.JobID, runID types.RunID, timeouts TestRunnerTimeouts) *pipeline {
-	p := pipeline{bundles: bundles, jobID: jobID, runID: runID, test: test, timeouts: timeouts}
-	p.state = NewState()
-	return &p
-}
diff --git a/pkg/runner/test_runner_route.go b/pkg/runner/test_runner_route.go
deleted file mode 100644
index 6f4492a6..00000000
--- a/pkg/runner/test_runner_route.go
+++ /dev/null
@@ -1,292 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-//
-// This source code is licensed under the MIT license found in the
-// LICENSE file in the root directory of this source tree.
-
-package runner
-
-import (
-	"container/list"
-	"encoding/json"
-	"fmt"
-	"log"
-	"sync"
-	"time"
-
-	"github.com/facebookincubator/contest/pkg/event/testevent"
-	"github.com/facebookincubator/contest/pkg/target"
-	"github.com/facebookincubator/contest/pkg/test"
-	"github.com/facebookincubator/contest/pkg/xcontext"
-)
-
-// router implements the routing logic that injects targets into a test step and consumes
-// targets in output from another test step
-type stepRouter struct {
-	routingChannels routingCh
-	bundle          test.TestStepBundle
-	ev              testevent.EmitterFetcher
-
-	timeouts TestRunnerTimeouts
-}
-
-// routeIn is responsible for accepting a target from the previous routing block
-// and injecting it into the associated test step. Returns the number of targets
-// injected into the test step or an error upon failure
-func (r *stepRouter) routeIn(ctx xcontext.Context) (int, error) {
-	stepLabel := r.bundle.TestStepLabel
-	ctx = ctx.WithTag("phase", "routeIn").WithField("step", stepLabel)
-
-	var (
-		err             error
-		injectionWg     sync.WaitGroup
-		routeInProgress bool
-	)
-
-	// terminateTargetWriter is a control channel used to signal termination to
-	// the writer object which injects a target into the test step
-	terminateTargetWriterCtx, terminateTargetWriter := xcontext.WithCancel(xcontext.ResetSignalers(ctx))
-	defer terminateTargetWriter() // avoids possible goroutine deadlock in context.WithCancel implementation
-
-	// `targets` is used to buffer targets coming from the previous routing blocks,
-	// queueing them for injection into the TestStep. The list is accessed
-	// synchronously by a single goroutine.
-	targets := list.New()
-
-	// `ingressTarget` is used to keep track of ingress times of a target into a test step
-	ingressTarget := make(map[string]time.Time)
-
-	// Channel that the injection goroutine uses to communicate back to `routeIn` the results
-	// of asynchronous injection
-	injectResultCh := make(chan injectionResult)
-
-	// injectionChannels are used to inject targets into test step and return results to `routeIn`
-	injectionChannels := injectionCh{stepIn: r.routingChannels.stepIn, resultCh: injectResultCh}
-
-	ctx.Logger().Debugf("initializing routeIn for %s", stepLabel)
-	targetWriter := newTargetWriter(r.timeouts)
-
-	for {
-		select {
-		case <-ctx.WaitFor():
-			err = fmt.Errorf("termination requested for routing into %s", stepLabel)
-		case injectionResult := <-injectResultCh:
-			ctx.Logger().Debugf("received injection result for %v", injectionResult.target)
-			routeInProgress = false
-			if injectionResult.err != nil {
-				err = fmt.Errorf("routing failed while injecting target %+v into %s", injectionResult.target, stepLabel)
-				targetInErrEv := testevent.Data{EventName: target.EventTargetInErr, Target: injectionResult.target}
-				if err := r.ev.Emit(ctx, targetInErrEv); err != nil {
-					ctx.Logger().Warnf("could not emit %v event for target %+v: %v", targetInErrEv, *injectionResult.target, err)
-				}
-			} else {
-				targetInEv := testevent.Data{EventName: target.EventTargetIn, Target: injectionResult.target}
-				if err := r.ev.Emit(ctx, targetInEv); err != nil {
-					ctx.Logger().Warnf("could not emit %v event for Target: %+v", targetInEv, *injectionResult.target)
-				}
-			}
-		case t, chanIsOpen := <-r.routingChannels.routeIn:
-			if !chanIsOpen {
-				ctx.Logger().Debugf("routing input channel closed")
-				r.routingChannels.routeIn = nil
-			} else {
-				ctx.Logger().Debugf("received target %v in input", t)
-				targets.PushFront(t)
-			}
-		}
-
-		if err != nil {
-			break
-		}
-
-		if routeInProgress {
-			continue
-		}
-
-		// no targets currently being injected in the test step
-		if targets.Len() == 0 {
-			if r.routingChannels.routeIn == nil {
-				ctx.Logger().Debugf("input channel is closed and no more targets are available, closing step input channel")
-				close(r.routingChannels.stepIn)
-				break
-			}
-			continue
-		}
-
-		t := targets.Back().Value.(*target.Target)
-		ingressTarget[t.ID] = time.Now()
-		targets.Remove(targets.Back())
-		ctx.Logger().Debugf("writing target %v into test step", t)
-		routeInProgress = true
-		injectionWg.Add(1)
-		go func() {
-			defer injectionWg.Done()
-			targetWriter.writeTargetWithResult(terminateTargetWriterCtx, t, injectionChannels)
-		}()
-	}
-	// Signal termination to the injection routines regardless of the result of the
-	// routing. If the routing completed successfully, this is a no-op. If there is an
-	// injection goroutine running, wait for it to terminate, as we might have gotten
-	// here after a cancellation signal.
-	terminateTargetWriter()
-	injectionWg.Wait()
-
-	if err != nil {
-		ctx.Logger().Debugf("routeIn failed: %v", err)
-		return 0, err
-	}
-	return len(ingressTarget), nil
-}
-
-func (r *stepRouter) emitOutEvent(ctx xcontext.Context, t *target.Target, err error) error {
-	ctx = ctx.WithTag("phase", "emitOutEvent").WithField("step", r.bundle.TestStepLabel)
-
-	if err != nil {
-		targetErrPayload := target.ErrPayload{Error: err.Error()}
-		payloadEncoded, err := json.Marshal(targetErrPayload)
-		if err != nil {
-			ctx.Logger().Warnf("could not encode target error ('%s'): %v", targetErrPayload, err)
-		}
-		rawPayload := json.RawMessage(payloadEncoded)
-		targetErrEv := testevent.Data{EventName: target.EventTargetErr, Target: t, Payload: &rawPayload}
-		if err := r.ev.Emit(ctx, targetErrEv); err != nil {
-			return err
-		}
-	} else {
-		targetOutEv := testevent.Data{EventName: target.EventTargetOut, Target: t}
-		if err := r.ev.Emit(ctx, targetOutEv); err != nil {
-			ctx.Logger().Warnf("could not emit %v event for target: %v", targetOutEv, *t)
-		}
-	}
-	return nil
-}
-
-// routeOut is responsible for accepting a target from the associated test step
-// and forward it to the next routing block. Returns the number of targets
-// received from the test step or an error upon failure
-func (r *stepRouter) routeOut(ctx xcontext.Context) (int, error) {
-
-	stepLabel := r.bundle.TestStepLabel
-	ctx = ctx.WithTag("phase", "routeOut").WithField("step", stepLabel)
-
-	targetWriter := newTargetWriter(r.timeouts)
-
-	var err error
-
-	ctx.Logger().Debugf("initializing routeOut for %s", stepLabel)
-	// `egressTarget` is used to keep track of egress times of a target from a test step
-	egressTarget := make(map[string]time.Time)
-
-	for {
-		select {
-		case <-ctx.WaitFor():
-			err = fmt.Errorf("termination requested for routing into %s", r.bundle.TestStepLabel)
-		case t, chanIsOpen := <-r.routingChannels.stepOut:
-			if !chanIsOpen {
-				ctx.Logger().Debugf("step output closed")
-				r.routingChannels.stepOut = nil
-				break
-			}
-
-			if _, targetPresent := egressTarget[t.ID]; targetPresent {
-				err = fmt.Errorf("step %s returned target %+v multiple times", r.bundle.TestStepLabel, t)
-				break
-			}
-			// Emit an event signaling that the target has left the TestStep
-			if err := r.emitOutEvent(ctx, t, nil); err != nil {
-				ctx.Logger().Warnf("could not emit out event for target %v: %v", *t, err)
-			}
-			// Register egress time and forward target to the next routing block
-			egressTarget[t.ID] = time.Now()
-			if err := targetWriter.writeTimeout(ctx, r.routingChannels.routeOut, t, r.timeouts.MessageTimeout); err != nil {
-				ctx.Logger().Panicf("could not forward target to the test runner: %+v", err)
-			}
-		case targetError, chanIsOpen := <-r.routingChannels.stepErr:
-			if !chanIsOpen {
-				ctx.Logger().Debugf("step error closed")
-				r.routingChannels.stepErr = nil
-				break
-			}
-
-			if _, targetPresent := egressTarget[targetError.Target.ID]; targetPresent {
-				err = fmt.Errorf("step %s returned target %+v multiple times", r.bundle.TestStepLabel, targetError.Target)
-			} else {
-				if err := r.emitOutEvent(ctx, targetError.Target, targetError.Err); err != nil {
-					ctx.Logger().Warnf("could not emit err event for target: %v", *targetError.Target)
-				}
-				egressTarget[targetError.Target.ID] = time.Now()
-				if err := targetWriter.writeTargetError(ctx, r.routingChannels.targetErr, targetError, r.timeouts.MessageTimeout); err != nil {
-					log.Panicf("could not forward target (%+v) to the test runner: %v", targetError.Target, err)
-				}
-			}
-		}
-		if err != nil {
-			break
-		}
-		if r.routingChannels.stepErr == nil && r.routingChannels.stepOut == nil {
-			ctx.Logger().Debugf("output and error channel from step are closed, routeOut should terminate")
-			close(r.routingChannels.routeOut)
-			break
-		}
-	}
-
-	if err != nil {
-		ctx.Logger().Debugf("routeOut failed: %v", err)
-		return 0, err
-	}
-	return len(egressTarget), nil
-
-}
-
-// route implements the routing logic from the previous routing block to the test step
-// and from the test step to the next routing block
-func (r *stepRouter) route(ctx xcontext.Context, resultCh chan<- routeResult) {
-
-	var (
-		inTargets, outTargets   int
-		errRouteIn, errRouteOut error
-	)
-
-	terminateInternalCtx, terminateInternal := xcontext.WithCancel(ctx)
-	defer terminateInternal() // avoids possible goroutine deadlock in context.WithCancel implementation
-
-	wg := sync.WaitGroup{}
-	wg.Add(2)
-	go func() {
-		defer wg.Done()
-		inTargets, errRouteIn = r.routeIn(terminateInternalCtx)
-		if errRouteIn != nil {
-			terminateInternal()
-		}
-	}()
-
-	go func() {
-		defer wg.Done()
-		outTargets, errRouteOut = r.routeOut(terminateInternalCtx)
-		if errRouteOut != nil {
-			terminateInternal()
-		}
-	}()
-	wg.Wait()
-
-	routingErr := errRouteIn
-	if routingErr == nil {
-		routingErr = errRouteOut
-	}
-	if routingErr == nil && inTargets != outTargets {
-		routingErr = fmt.Errorf("step %s completed but did not return all injected Targets (%d!=%d)", r.bundle.TestStepLabel, inTargets, outTargets)
-	}
-
-	// Send the result to the test runner, which is expected to be listening
-	// within `MessageTimeout`. If that's not the case, we hit an unrecovrable
-	// condition.
-	select {
-	case resultCh <- routeResult{bundle: r.bundle, err: routingErr}:
-	case <-time.After(r.timeouts.MessageTimeout):
-		log.Panicf("could not send routing block result")
-	}
-}
-
-func newStepRouter(bundle test.TestStepBundle, routingChannels routingCh, ev testevent.EmitterFetcher, timeouts TestRunnerTimeouts) *stepRouter {
-	r := stepRouter{bundle: bundle, routingChannels: routingChannels, ev: ev, timeouts: timeouts}
-	return &r
-}
diff --git a/pkg/runner/test_runner_route_test.go b/pkg/runner/test_runner_route_test.go
deleted file mode 100644
index 47bebe3d..00000000
--- a/pkg/runner/test_runner_route_test.go
+++ /dev/null
@@ -1,400 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-//
-// This source code is licensed under the MIT license found in the
-// LICENSE file in the root directory of this source tree.
-
-package runner
-
-import (
-	"fmt"
-	"testing"
-	"time"
-
-	"github.com/facebookincubator/contest/pkg/cerrors"
-	"github.com/facebookincubator/contest/pkg/event"
-	"github.com/facebookincubator/contest/pkg/event/testevent"
-	"github.com/facebookincubator/contest/pkg/pluginregistry"
-	"github.com/facebookincubator/contest/pkg/storage"
-	"github.com/facebookincubator/contest/pkg/target"
-	"github.com/facebookincubator/contest/pkg/test"
-	"github.com/facebookincubator/contest/pkg/xcontext/bundles/logrusctx"
-	"github.com/facebookincubator/contest/pkg/xcontext/logger"
-	"github.com/facebookincubator/contest/plugins/storage/memory"
-	"github.com/facebookincubator/contest/plugins/teststeps/example"
-
-	"github.com/stretchr/testify/require"
-	"github.com/stretchr/testify/suite"
-)
-
-var ctx = logrusctx.NewContext(logger.LevelDebug)
-
-var testSteps = map[string]test.TestStepFactory{
-	example.Name: example.New,
-}
-
-var testStepsEvents = map[string][]event.Name{
-	example.Name: example.Events,
-}
-
-type TestRunnerSuite struct {
-	suite.Suite
-	routingChannels routingCh
-	router          *stepRouter
-
-	// the following channels are the same channels wrapped within `routingChannels`,
-	// but used in the opposite direction compared to the routing block. When the
-	// routing block reads results, the corresponding channel of the test suite has
-	// write direction, when the routing block writes results, the corresponding channel
-	// of the test suite has read direction
-	routeInCh  chan<- *target.Target
-	routeOutCh <-chan *target.Target
-
-	stepInCh  <-chan *target.Target
-	stepOutCh chan<- *target.Target
-	stepErrCh chan<- cerrors.TargetError
-
-	targetErrCh <-chan cerrors.TargetError
-}
-
-func (suite *TestRunnerSuite) SetupTest() {
-
-	pluginRegistry := pluginregistry.NewPluginRegistry(ctx)
-	// Setup the PluginRegistry by registering TestSteps
-	for name, tsfactory := range testSteps {
-		if _, ok := testStepsEvents[name]; !ok {
-			suite.T().Errorf("test step %s does not define any associated event", name)
-		}
-		err := pluginRegistry.RegisterTestStep(name, tsfactory, testStepsEvents[name])
-		require.NoError(suite.T(), err)
-	}
-	ts1, err := pluginRegistry.NewTestStep("Example")
-	require.NoError(suite.T(), err)
-
-	params := make(test.TestStepParameters)
-	bundle := test.TestStepBundle{TestStep: ts1, TestStepLabel: "FirstStage", Parameters: params}
-
-	routeInCh := make(chan *target.Target)
-	routeOutCh := make(chan *target.Target)
-	stepInCh := make(chan *target.Target)
-	stepOutCh := make(chan *target.Target)
-	stepErrCh := make(chan cerrors.TargetError)
-	targetErrCh := make(chan cerrors.TargetError)
-
-	suite.routeInCh = routeInCh
-	suite.routeOutCh = routeOutCh
-	suite.stepOutCh = stepOutCh
-	suite.stepInCh = stepInCh
-	suite.stepErrCh = stepErrCh
-	suite.targetErrCh = targetErrCh
-
-	suite.routingChannels = routingCh{
-		routeIn:   routeInCh,
-		routeOut:  routeOutCh,
-		stepIn:    stepInCh,
-		stepErr:   stepErrCh,
-		stepOut:   stepOutCh,
-		targetErr: targetErrCh,
-	}
-
-	s, err := memory.New()
-	require.NoError(suite.T(), err)
-	err = storage.SetStorage(s)
-	require.NoError(suite.T(), err)
-
-	header := testevent.Header{
-		JobID:         1,
-		RunID:         1,
-		TestName:      "TestRunnerSuite",
-		TestStepLabel: "TestRunnerSuite",
-	}
-	ev := storage.NewTestEventEmitterFetcher(header)
-
-	timeouts := TestRunnerTimeouts{
-		StepInjectTimeout:   30 * time.Second,
-		MessageTimeout:      25 * time.Second,
-		ShutdownTimeout:     5 * time.Second,
-		StepShutdownTimeout: 5 * time.Second,
-	}
-
-	suite.router = newStepRouter(bundle, suite.routingChannels, ev, timeouts)
-}
-
-func (suite *TestRunnerSuite) TestRouteInRoutesAllTargets() {
-
-	// test that all targets are routed to step input channel without delay, in order
-	targets := []*target.Target{
-		{ID: "001", FQDN: "host001.facebook.com"},
-		{ID: "002", FQDN: "host002.facebook.com"},
-		{ID: "003", FQDN: "host003.facebook.com"},
-	}
-
-	routeInResult := make(chan error)
-	stepInResult := make(chan error)
-
-	go func() {
-		// start routing
-		_, _ = suite.router.routeIn(ctx)
-	}()
-
-	// inject targets
-	go func() {
-		defer close(suite.routeInCh)
-		for _, target := range targets {
-			select {
-			case <-time.After(2 * time.Second):
-				routeInResult <- fmt.Errorf("target should be accepted by routeIn block within timeout")
-				return
-			case suite.routeInCh <- target:
-			}
-		}
-		routeInResult <- nil
-	}()
-
-	// mimic the test step consuming targets in input
-	go func() {
-		numTargets := 0
-		for {
-			select {
-			case t, ok := <-suite.stepInCh:
-				if !ok {
-					if numTargets != len(targets) {
-						stepInResult <- fmt.Errorf("not all targets received by teste step")
-					} else {
-						stepInResult <- nil
-					}
-					return
-				}
-				if numTargets+1 > len(targets) {
-					stepInResult <- fmt.Errorf("more targets returned than injected")
-					return
-				}
-				if t != targets[numTargets] {
-					stepInResult <- fmt.Errorf("targets returned in wrong order")
-					return
-				}
-				numTargets++
-			case <-time.After(2 * time.Second):
-				stepInResult <- fmt.Errorf("expected target on channel within timeout")
-			}
-		}
-	}()
-
-	for {
-		select {
-		case <-time.After(5 * time.Second):
-			suite.T().Errorf("test should return within timeout")
-		case err := <-stepInResult:
-			stepInResult = nil
-			if err != nil {
-				suite.T().Errorf("step in returned error: %v", err)
-			}
-		case err := <-routeInResult:
-			routeInResult = nil
-			if err != nil {
-				suite.T().Errorf("route in returned error: %v", err)
-			}
-		}
-		if stepInResult == nil && routeInResult == nil {
-			return
-		}
-	}
-}
-
-func (suite *TestRunnerSuite) TestRouteOutRoutesAllSuccessfulTargets() {
-
-	// test that all targets are routed in output from a test step are received by
-	// the routing logic and forwarded to the next routing block
-	targets := []*target.Target{
-		{ID: "001", FQDN: "host001.facebook.com"},
-		{ID: "002", FQDN: "host002.facebook.com"},
-		{ID: "003", FQDN: "host003.facebook.com"},
-	}
-
-	go func() {
-		// start routing
-		_, _ = suite.router.routeOut(ctx)
-	}()
-
-	stepResult := make(chan error)
-	routeResult := make(chan error)
-
-	// mimic the test step returning targets on the output channel. This test exercise
-	// the path where all targets complete successfully
-	go func() {
-		// it's expected that the step closes both channels
-		defer close(suite.stepOutCh)
-		defer close(suite.stepErrCh)
-		for _, target := range targets {
-			select {
-			case <-time.After(2 * time.Second):
-				stepResult <- fmt.Errorf("target should be accepted by routing block within timeout")
-				return
-			case suite.stepOutCh <- target:
-			}
-		}
-		stepResult <- nil
-	}()
-
-	// mimic the next routing block and the pipeline reading from the routeOut channel and the targetErr
-	// channel. We don't expect any target coming through targetErr, we expect all targets to be successful
-	go func() {
-		numTargets := 0
-		for {
-			select {
-			case _, ok := <-suite.targetErrCh:
-				if !ok {
-					suite.targetErrCh = nil
-				} else {
-					routeResult <- fmt.Errorf("no targets expected on the error channel")
-					return
-				}
-			case t, ok := <-suite.routeOutCh:
-				if !ok {
-					if numTargets != len(targets) {
-						routeResult <- fmt.Errorf("not all targets have been returned")
-					} else {
-						routeResult <- nil
-					}
-					return
-				}
-				if numTargets+1 > len(targets) {
-					routeResult <- fmt.Errorf("more targets returned than injected")
-					return
-				}
-				if t.ID != targets[numTargets].ID {
-					routeResult <- fmt.Errorf("targets returned in wrong order")
-					return
-				}
-				numTargets++
-			case <-time.After(2 * time.Second):
-				routeResult <- fmt.Errorf("expected target on channel within timeout")
-				return
-			}
-		}
-	}()
-
-	for {
-		select {
-		case <-time.After(5 * time.Second):
-			suite.T().Errorf("test should return within timeout")
-		case err := <-stepResult:
-			stepResult = nil
-			if err != nil {
-				suite.T().Errorf("step output returned error: %v", err)
-			}
-		case err := <-routeResult:
-			routeResult = nil
-			if err != nil {
-				suite.T().Errorf("router returned error: %v", err)
-			}
-		}
-		if stepResult == nil && routeResult == nil {
-			return
-		}
-	}
-}
-
-func (suite *TestRunnerSuite) TestRouteOutRoutesAllFailedTargets() {
-
-	// test that all targets are routed in output from a test step are received by
-	// the routing logic and forwarded to the next routing block
-	targets := []*target.Target{
-		{ID: "001", FQDN: "host001.facebook.com"},
-		{ID: "002", FQDN: "host002.facebook.com"},
-		{ID: "003", FQDN: "host003.facebook.com"},
-	}
-
-	go func() {
-		// start routing
-		_, _ = suite.router.routeOut(ctx)
-	}()
-
-	stepResult := make(chan error)
-	routeResult := make(chan error)
-
-	// mimic the test step returning targets on the output channel. This test exercise
-	// the path where all targets complete successfully
-	go func() {
-		// it's expected that the step closes both channels
-		defer close(suite.stepOutCh)
-		defer close(suite.stepErrCh)
-		for _, target := range targets {
-			targetErr := cerrors.TargetError{Target: target, Err: fmt.Errorf("test error")}
-			select {
-			case <-time.After(2 * time.Second):
-				stepResult <- fmt.Errorf("target should be accepted by routing block within timeout")
-				return
-			case suite.stepErrCh <- targetErr:
-			}
-		}
-		stepResult <- nil
-	}()
-
-	// mimic the next routing block and the pipeline reading from the routeOut channel and the targetErr
-	// channel. We don't expect any target coming through targetErr, we expect all targets to be successful
-	go func() {
-		numTargets := 0
-		for {
-			select {
-			case targetErr, ok := <-suite.targetErrCh:
-				if !ok {
-					routeResult <- fmt.Errorf("target error channel should not be closed by routing block")
-					return
-				}
-				if targetErr.Err == nil {
-					routeResult <- fmt.Errorf("expected error associated to the target")
-					return
-				}
-				if numTargets+1 > len(targets) {
-					routeResult <- fmt.Errorf("more targets returned than injected")
-					return
-				}
-				if targetErr.Target.ID != targets[numTargets].ID {
-					routeResult <- fmt.Errorf("targets returned in wrong order")
-					return
-				}
-
-				numTargets++
-				if numTargets == len(targets) {
-					routeResult <- nil
-					return
-				}
-			case _, ok := <-suite.routeOutCh:
-				if !ok {
-					suite.routeOutCh = nil
-				} else {
-					routeResult <- fmt.Errorf("no targets expected on the output channel")
-					return
-				}
-			case <-time.After(2 * time.Second):
-				routeResult <- fmt.Errorf("expected target on channel within timeout")
-				return
-			}
-		}
-	}()
-
-	for {
-		select {
-		case <-time.After(5 * time.Second):
-			suite.T().Errorf("test should return within timeout")
-		case err := <-stepResult:
-			stepResult = nil
-			if err != nil {
-				suite.T().Errorf("step output routine returned error: %v", err)
-			}
-		case err := <-routeResult:
-			routeResult = nil
-			if err != nil {
-				suite.T().Errorf("router returned error: %v", err)
-			}
-		}
-		if stepResult == nil && routeResult == nil {
-			return
-		}
-	}
-}
-
-func TestTestRunnerSuite(t *testing.T) {
-	TestRunnerSuite := &TestRunnerSuite{}
-	suite.Run(t, TestRunnerSuite)
-}
diff --git a/pkg/runner/test_runner_test.go b/pkg/runner/test_runner_test.go
new file mode 100644
index 00000000..6a920cb5
--- /dev/null
+++ b/pkg/runner/test_runner_test.go
@@ -0,0 +1,597 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+package runner
+
+import (
+	"flag"
+	"fmt"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/facebookincubator/contest/pkg/xcontext"
+	"github.com/facebookincubator/contest/pkg/xcontext/bundles/logrusctx"
+	"github.com/facebookincubator/contest/pkg/xcontext/logger"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/facebookincubator/contest/pkg/cerrors"
+	"github.com/facebookincubator/contest/pkg/event"
+	"github.com/facebookincubator/contest/pkg/pluginregistry"
+	"github.com/facebookincubator/contest/pkg/storage"
+	"github.com/facebookincubator/contest/pkg/target"
+	"github.com/facebookincubator/contest/pkg/test"
+	"github.com/facebookincubator/contest/pkg/types"
+	"github.com/facebookincubator/contest/plugins/storage/memory"
+	"github.com/facebookincubator/contest/tests/common"
+	"github.com/facebookincubator/contest/tests/common/goroutine_leak_check"
+	"github.com/facebookincubator/contest/tests/plugins/teststeps/badtargets"
+	"github.com/facebookincubator/contest/tests/plugins/teststeps/channels"
+	"github.com/facebookincubator/contest/tests/plugins/teststeps/hanging"
+	"github.com/facebookincubator/contest/tests/plugins/teststeps/noreturn"
+	"github.com/facebookincubator/contest/tests/plugins/teststeps/panicstep"
+	"github.com/facebookincubator/contest/tests/plugins/teststeps/teststep"
+)
+
+const (
+	testName          = "SimpleTest"
+	stepInjectTimeout = 3 * time.Second
+	shutdownTimeout   = 3 * time.Second
+)
+
+var (
+	evs            storage.ResettableStorage
+	pluginRegistry *pluginregistry.PluginRegistry
+)
+
+var (
+	ctx = logrusctx.NewContext(logger.LevelDebug)
+)
+
+func TestMain(m *testing.M) {
+	flag.Parse()
+	if ms, err := memory.New(); err == nil {
+		evs = ms
+		if err := storage.SetStorage(ms); err != nil {
+			panic(err.Error())
+		}
+	} else {
+		panic(fmt.Sprintf("could not initialize in-memory storage layer: %v", err))
+	}
+	pluginRegistry = pluginregistry.NewPluginRegistry(ctx)
+	for _, e := range []struct {
+		name    string
+		factory test.TestStepFactory
+		events  []event.Name
+	}{
+		{badtargets.Name, badtargets.New, badtargets.Events},
+		{channels.Name, channels.New, channels.Events},
+		{hanging.Name, hanging.New, hanging.Events},
+		{noreturn.Name, noreturn.New, noreturn.Events},
+		{panicstep.Name, panicstep.New, panicstep.Events},
+		{teststep.Name, teststep.New, teststep.Events},
+	} {
+		if err := pluginRegistry.RegisterTestStep(e.name, e.factory, e.events); err != nil {
+			panic(fmt.Sprintf("could not register TestStep: %v", err))
+		}
+	}
+	flag.Parse()
+	goroutine_leak_check.LeakCheckingTestMain(m,
+		// We expect these to leak.
+		"github.com/facebookincubator/contest/tests/plugins/teststeps/hanging.(*hanging).Run",
+		"github.com/facebookincubator/contest/tests/plugins/teststeps/noreturn.(*noreturnStep).Run",
+
+		// No leak in contexts checked with itsown unit-tests
+		"github.com/facebookincubator/contest/pkg/xcontext.(*ctxValue).cloneWithStdContext.func2",
+	)
+}
+
+func newTestRunner() *TestRunner {
+	return NewTestRunnerWithTimeouts(stepInjectTimeout, shutdownTimeout)
+}
+
+func resetEventStorage() {
+	if err := evs.Reset(); err != nil {
+		panic(err.Error())
+	}
+}
+
+func tgt(id string) *target.Target {
+	return &target.Target{ID: id}
+}
+
+func getStepEvents(stepLabel string) string {
+	return common.GetTestEventsAsString(ctx, evs, testName, nil, &stepLabel)
+}
+
+func getTargetEvents(targetID string) string {
+	return common.GetTestEventsAsString(ctx, evs, testName, &targetID, nil)
+}
+
+func newStep(label, name string, params *test.TestStepParameters) test.TestStepBundle {
+	td := test.TestStepDescriptor{
+		Name:  name,
+		Label: label,
+	}
+	if params != nil {
+		td.Parameters = *params
+	}
+	sb, err := pluginRegistry.NewTestStepBundle(ctx, td, nil)
+	if err != nil {
+		panic(fmt.Sprintf("failed to create test step bundle: %v", err))
+	}
+	return *sb
+}
+
+func newTestStep(label string, failPct int, failTargets string, delayTargets string) test.TestStepBundle {
+	return newStep(label, teststep.Name, &test.TestStepParameters{
+		teststep.FailPctParam:      []test.Param{*test.NewParam(fmt.Sprintf("%d", failPct))},
+		teststep.FailTargetsParam:  []test.Param{*test.NewParam(failTargets)},
+		teststep.DelayTargetsParam: []test.Param{*test.NewParam(delayTargets)},
+	})
+}
+
+type runRes struct {
+	res []byte
+	err error
+}
+
+func runWithTimeout(t *testing.T, tr *TestRunner, ctx xcontext.Context, resumeState []byte, runID types.RunID, timeout time.Duration, targets []*target.Target, bundles []test.TestStepBundle) ([]byte, error) {
+	newCtx, cancel := xcontext.WithCancel(ctx)
+	test := &test.Test{
+		Name:             testName,
+		TestStepsBundles: bundles,
+	}
+	resCh := make(chan runRes)
+	go func() {
+		res, err := tr.Run(newCtx, test, targets, 1, runID, resumeState)
+		resCh <- runRes{res: res, err: err}
+	}()
+	var res runRes
+	select {
+	case res = <-resCh:
+	case <-time.After(timeout):
+		cancel()
+		assert.FailNow(t, "TestRunner should not time out")
+	}
+	return res.res, res.err
+}
+
+// Simple case: one target, one step, success.
+func Test1Step1Success(t *testing.T) {
+	resetEventStorage()
+	tr := newTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1")},
+		[]test.TestStepBundle{
+			newTestStep("Step 1", 0, "", ""),
+		},
+	)
+	require.NoError(t, err)
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepRunningEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepFinishedEvent]}
+`, getStepEvents(""))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestFinishedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetOut]}
+`, getTargetEvents("T1"))
+}
+
+// Simple case: one target, one step that blocks for a bit, success.
+// We block for longer than the shutdown timeout of the test runner.
+func Test1StepLongerThanShutdown1Success(t *testing.T) {
+	resetEventStorage()
+	tr := NewTestRunnerWithTimeouts(stepInjectTimeout, 100*time.Millisecond)
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1")},
+		[]test.TestStepBundle{
+			newTestStep("Step 1", 0, "", "T1=500"),
+		},
+	)
+	require.NoError(t, err)
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepRunningEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepFinishedEvent]}
+`, getStepEvents(""))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestFinishedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetOut]}
+`, getTargetEvents("T1"))
+}
+
+// Simple case: one target, one step, failure.
+func Test1Step1Fail(t *testing.T) {
+	resetEventStorage()
+	tr := newTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1")},
+		[]test.TestStepBundle{
+			newTestStep("Step 1", 100, "", ""),
+		},
+	)
+	require.NoError(t, err)
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepRunningEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepFinishedEvent]}
+`, getStepEvents("Step 1"))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestFailedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetErr &"{\"Error\":\"target failed\"}"]}
+`, getTargetEvents("T1"))
+}
+
+// One step pipeline with two targets - one fails, one succeeds.
+func Test1Step1Success1Fail(t *testing.T) {
+	resetEventStorage()
+	tr := newTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1"), tgt("T2")},
+		[]test.TestStepBundle{
+			newTestStep("Step 1", 0, "T1", "T2=100"),
+		},
+	)
+	require.NoError(t, err)
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepRunningEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepFinishedEvent]}
+`, getStepEvents(""))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestFailedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetErr &"{\"Error\":\"target failed\"}"]}
+`, getTargetEvents("T1"))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TestFinishedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TargetOut]}
+`, getTargetEvents("T2"))
+}
+
+// Three-step pipeline, two targets: T1 fails at step 1, T2 fails at step 2,
+// step 3 is not reached and not even run.
+func Test3StepsNotReachedStepNotRun(t *testing.T) {
+	resetEventStorage()
+	tr := newTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1"), tgt("T2")},
+		[]test.TestStepBundle{
+			newTestStep("Step 1", 0, "T1", ""),
+			newTestStep("Step 2", 0, "T2", ""),
+			newTestStep("Step 3", 0, "", ""),
+		},
+	)
+	require.NoError(t, err)
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepRunningEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepFinishedEvent]}
+`, getStepEvents("Step 1"))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 2][(*Target)(nil) TestStepRunningEvent]}
+{[1 1 SimpleTest Step 2][(*Target)(nil) TestStepFinishedEvent]}
+`, getStepEvents("Step 2"))
+	require.Equal(t, "\n\n", getStepEvents("Step 3"))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestFailedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetErr &"{\"Error\":\"target failed\"}"]}
+`, getTargetEvents("T1"))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TestFinishedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TargetOut]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} TargetIn]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} TestStartedEvent]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} TestFailedEvent]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} TargetErr &"{\"Error\":\"target failed\"}"]}
+`, getTargetEvents("T2"))
+}
+
+// A misbehaving step that fails to shut down properly after processing targets
+// and does not return.
+func TestNoReturnStepWithCorrectTargetForwarding(t *testing.T) {
+	resetEventStorage()
+	tr := NewTestRunnerWithTimeouts(100*time.Millisecond, 200*time.Millisecond)
+	ctx, cancel := xcontext.WithCancel(ctx)
+	defer cancel()
+	_, err := runWithTimeout(t, tr, ctx, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1")},
+		[]test.TestStepBundle{
+			newStep("Step 1", noreturn.Name, nil),
+		},
+	)
+	require.Error(t, err)
+	require.IsType(t, &cerrors.ErrTestStepsNeverReturned{}, err)
+}
+
+// A misbehaving step that does not process any targets.
+func TestNoReturnStepWithoutTargetForwarding(t *testing.T) {
+	resetEventStorage()
+	tr := NewTestRunnerWithTimeouts(100*time.Millisecond, 200*time.Millisecond)
+	ctx, cancel := xcontext.WithCancel(ctx)
+	defer cancel()
+	_, err := runWithTimeout(t, tr, ctx, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1")},
+		[]test.TestStepBundle{
+			newStep("Step 1", hanging.Name, nil),
+		},
+	)
+	require.Error(t, err)
+	require.IsType(t, &cerrors.ErrTestTargetInjectionTimedOut{}, err)
+}
+
+// A misbehaving step that panics.
+func TestStepPanics(t *testing.T) {
+	resetEventStorage()
+	tr := newTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1")},
+		[]test.TestStepBundle{
+			newStep("Step 1", panicstep.Name, nil),
+		},
+	)
+	require.Error(t, err)
+	require.IsType(t, &cerrors.ErrTestStepPaniced{}, err)
+	require.Equal(t, "\n\n", getTargetEvents("T1"))
+	require.Contains(t, getStepEvents("Step 1"), "step Step 1 paniced")
+}
+
+// A misbehaving step that closes its output channel.
+func TestStepClosesChannels(t *testing.T) {
+	resetEventStorage()
+	tr := newTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("T1")},
+		[]test.TestStepBundle{
+			newStep("Step 1", channels.Name, nil),
+		},
+	)
+	require.Error(t, err)
+	require.IsType(t, &cerrors.ErrTestStepClosedChannels{}, err)
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetOut]}
+`, getTargetEvents("T1"))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestError &"\"test step Step 1 closed output channels (api violation)\""]}
+`, getStepEvents("Step 1"))
+}
+
+// A misbehaving step that yields a result for a target that does not exist.
+func TestStepYieldsResultForNonexistentTarget(t *testing.T) {
+	resetEventStorage()
+	tr := newTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("TExtra")},
+		[]test.TestStepBundle{
+			newStep("Step 1", badtargets.Name, nil),
+		},
+	)
+	require.Error(t, err)
+	require.IsType(t, &cerrors.ErrTestStepReturnedUnexpectedResult{}, err)
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "TExtra"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "TExtra"} TargetOut]}
+`, getTargetEvents("TExtra"))
+	require.Equal(t, "\n\n", getTargetEvents("TExtra2"))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestError &"\"test step Step 1 returned unexpected result for TExtra2\""]}
+`, getStepEvents("Step 1"))
+}
+
+// A misbehaving step that yields a duplicate result for a target.
+func TestStepYieldsDuplicateResult(t *testing.T) {
+	resetEventStorage()
+	tr := newTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("TGood"), tgt("TDup")},
+		[]test.TestStepBundle{
+			// TGood makes it past here unscathed and gets delayed in Step 2,
+			// TDup also emerges fine at first but is then returned again, and that's bad.
+			newStep("Step 1", badtargets.Name, nil),
+			newTestStep("Step 2", 0, "", "TGood=100"),
+		},
+	)
+	require.Error(t, err)
+	require.IsType(t, &cerrors.ErrTestStepReturnedDuplicateResult{}, err)
+}
+
+// A misbehaving step that loses targets.
+func TestStepLosesTargets(t *testing.T) {
+	resetEventStorage()
+	tr := newTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("TGood"), tgt("TDrop")},
+		[]test.TestStepBundle{
+			newStep("Step 1", badtargets.Name, nil),
+		},
+	)
+	require.Error(t, err)
+	require.IsType(t, &cerrors.ErrTestStepLostTargets{}, err)
+	require.Contains(t, err.Error(), "TDrop")
+}
+
+// A misbehaving step that yields a result for a target that does exist
+// but is not currently waiting for it.
+func TestStepYieldsResultForUnexpectedTarget(t *testing.T) {
+	resetEventStorage()
+	tr := newTestRunner()
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		[]*target.Target{tgt("TExtra"), tgt("TExtra2")},
+		[]test.TestStepBundle{
+			// TExtra2 fails here.
+			newTestStep("Step 1", 0, "TExtra2", ""),
+			// Yet, a result for it is returned here, which we did not expect.
+			newStep("Step 2", badtargets.Name, nil),
+		},
+	)
+	require.Error(t, err)
+	require.IsType(t, &cerrors.ErrTestStepReturnedUnexpectedResult{}, err)
+}
+
+// Larger, randomized test - a number of steps, some targets failing, some succeeding.
+func TestRandomizedMultiStep(t *testing.T) {
+	resetEventStorage()
+	tr := newTestRunner()
+	var targets []*target.Target
+	for i := 1; i <= 100; i++ {
+		targets = append(targets, tgt(fmt.Sprintf("T%d", i)))
+	}
+	_, err := runWithTimeout(t, tr, nil, nil, 1, 2*time.Second,
+		targets,
+		[]test.TestStepBundle{
+			newTestStep("Step 1", 0, "", "*=10"),  // All targets pass the first step, with a slight delay
+			newTestStep("Step 2", 25, "", ""),     // 25% don't make it past the second step
+			newTestStep("Step 3", 25, "", "*=10"), // Another 25% fail at the third step
+		},
+	)
+	require.NoError(t, err)
+	// Every target mush have started and finished the first step.
+	numFinished := 0
+	for _, tgt := range targets {
+		s1n := "Step 1"
+		require.Equal(t, fmt.Sprintf(`
+{[1 1 SimpleTest Step 1][Target{ID: "%s"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "%s"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "%s"} TestFinishedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "%s"} TargetOut]}
+`, tgt.ID, tgt.ID, tgt.ID, tgt.ID),
+			common.GetTestEventsAsString(ctx, evs, testName, &tgt.ID, &s1n))
+		s3n := "Step 3"
+		if strings.Contains(common.GetTestEventsAsString(ctx, evs, testName, &tgt.ID, &s3n), "TestFinishedEvent") {
+			numFinished++
+		}
+	}
+	// At least some must have finished.
+	require.Greater(t, numFinished, 0)
+}
+
+// Test pausing/resuming a naive step that does not cooperate.
+// In this case we drain input, wait for all targets to emerge and exit gracefully.
+func TestPauseResumeSimple(t *testing.T) {
+	resetEventStorage()
+	var err error
+	var resumeState []byte
+	targets := []*target.Target{tgt("T1"), tgt("T2"), tgt("T3")}
+	steps := []test.TestStepBundle{
+		newTestStep("Step 1", 0, "T1", ""),
+		// T2 and T3 will be paused here, the step will be given time to finish.
+		newTestStep("Step 2", 0, "", "T2=200,T3=200"),
+		newTestStep("Step 3", 0, "", ""),
+	}
+	{
+		tr1 := newTestRunner()
+		ctx1, pause := xcontext.WithNotify(ctx, xcontext.Paused)
+		ctx1, cancel := xcontext.WithCancel(ctx1)
+		defer cancel()
+		go func() {
+			time.Sleep(100 * time.Millisecond)
+			ctx.Logger().Infof("TestPauseResumeNaive: pausing")
+			pause()
+		}()
+		resumeState, err = runWithTimeout(t, tr1, ctx1, nil, 1, 2*time.Second, targets, steps)
+		require.Error(t, err)
+		require.IsType(t, xcontext.Paused, err)
+		require.NotNil(t, resumeState)
+	}
+	ctx.Logger().Debugf("Resume state: %s", string(resumeState))
+	// Make sure that resume state is validated.
+	{
+		tr := newTestRunner()
+		ctx, cancel := xcontext.WithCancel(ctx)
+		defer cancel()
+		resumeState2, err := runWithTimeout(
+			t, tr, ctx, []byte("FOO"), 2, 2*time.Second, targets, steps)
+		require.Error(t, err)
+		require.Contains(t, err.Error(), "invalid resume state")
+		require.Nil(t, resumeState2)
+	}
+	{
+		tr := newTestRunner()
+		ctx, cancel := xcontext.WithCancel(ctx)
+		defer cancel()
+		resumeState2 := strings.Replace(string(resumeState), `"v"`, `"Xv"`, 1)
+		_, err := runWithTimeout(
+			t, tr, ctx, []byte(resumeState2), 3, 2*time.Second, targets, steps)
+		require.Error(t, err)
+		require.Contains(t, err.Error(), "incompatible resume state")
+	}
+	{
+		tr := newTestRunner()
+		ctx, cancel := xcontext.WithCancel(ctx)
+		defer cancel()
+		resumeState2 := strings.Replace(string(resumeState), `"job_id":1`, `"job_id":2`, 1)
+		_, err := runWithTimeout(
+			t, tr, ctx, []byte(resumeState2), 4, 2*time.Second, targets, steps)
+		require.Error(t, err)
+		require.Contains(t, err.Error(), "wrong resume state")
+	}
+	// Finally, resume and finish the job.
+	{
+		tr2 := newTestRunner()
+		ctx2, cancel := xcontext.WithCancel(ctx)
+		defer cancel()
+		_, err := runWithTimeout(t, tr2, ctx2, resumeState, 5, 2*time.Second,
+			// Pass exactly the same targets and pipeline to resume properly.
+			// Don't use the same pointers ot make sure there is no reliance on that.
+			[]*target.Target{tgt("T1"), tgt("T2"), tgt("T3")},
+			[]test.TestStepBundle{
+				newTestStep("Step 1", 0, "T1", ""),
+				newTestStep("Step 2", 0, "", "T2=200,T3=200"),
+				newTestStep("Step 3", 0, "", ""),
+			},
+		)
+		require.NoError(t, err)
+	}
+	// Verify step events.
+	// Steps 1 and 2 are executed entirely within the first runner instance
+	// and never started in the second.
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepRunningEvent]}
+{[1 1 SimpleTest Step 1][(*Target)(nil) TestStepFinishedEvent]}
+`, getStepEvents("Step 1"))
+	require.Equal(t, `
+{[1 1 SimpleTest Step 2][(*Target)(nil) TestStepRunningEvent]}
+{[1 1 SimpleTest Step 2][(*Target)(nil) TestStepFinishedEvent]}
+`, getStepEvents("Step 2"))
+	// Step 3 did not get to start in the first instance and ran in the second.
+	require.Equal(t, `
+{[1 5 SimpleTest Step 3][(*Target)(nil) TestStepRunningEvent]}
+{[1 5 SimpleTest Step 3][(*Target)(nil) TestStepFinishedEvent]}
+`, getStepEvents("Step 3"))
+	// T1 failed entirely within the first run.
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TestFailedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T1"} TargetErr &"{\"Error\":\"target failed\"}"]}
+`, getTargetEvents("T1"))
+	// T2 and T3 ran in both.
+	require.Equal(t, `
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TargetIn]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TestStartedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TestFinishedEvent]}
+{[1 1 SimpleTest Step 1][Target{ID: "T2"} TargetOut]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} TargetIn]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} TestStartedEvent]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} TestFinishedEvent]}
+{[1 1 SimpleTest Step 2][Target{ID: "T2"} TargetOut]}
+{[1 5 SimpleTest Step 3][Target{ID: "T2"} TargetIn]}
+{[1 5 SimpleTest Step 3][Target{ID: "T2"} TestStartedEvent]}
+{[1 5 SimpleTest Step 3][Target{ID: "T2"} TestFinishedEvent]}
+{[1 5 SimpleTest Step 3][Target{ID: "T2"} TargetOut]}
+`, getTargetEvents("T2"))
+}
diff --git a/pkg/runner/wait_for_first_target.go b/pkg/runner/wait_for_first_target.go
deleted file mode 100644
index 355953f0..00000000
--- a/pkg/runner/wait_for_first_target.go
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-//
-// This source code is licensed under the MIT license found in the
-// LICENSE file in the root directory of this source tree.
-
-package runner
-
-import (
-	"github.com/facebookincubator/contest/pkg/target"
-)
-
-// waitForFirstTarget copies targets from `in` to `out` and also
-// signals on first target or if `in` is closed and there was no targets
-// at all.
-//
-// To solve the initial problem there're two approaches:
-// * Just wait for a target in `stepCh.stepIn` and then insert
-// it back to the channel.
-// * Create substitute `stepCh.stepIn` with a new channel and just
-// copy all targets from original `stepCh.stepIn` to this new channel.
-// And we may detect when we receive the first target in the original
-// channel.
-//
-// The first approach is much simpler, but the second preserves
-// the order of targets (which is handy and not misleading
-// while reading logs). Here we implement the second one:
-func waitForFirstTarget(
-	cancelCh <-chan struct{},
-	in <-chan *target.Target,
-) (out chan *target.Target, onFirstTarget, onNoTargets <-chan struct{}) {
-	onFirstTargetCh := make(chan struct{})
-	onNoTargetsCh := make(chan struct{})
-	onFirstTarget, onNoTargets = onFirstTargetCh, onNoTargetsCh
-
-	out = make(chan *target.Target)
-	go func() {
-		select {
-		case t, ok := <-in:
-			if !ok {
-				close(out)
-				close(onNoTargetsCh)
-				return
-			}
-			close(onFirstTargetCh)
-			out <- t
-		case <-cancelCh:
-			return
-		}
-
-		for t := range in {
-			out <- t
-		}
-		close(out)
-	}()
-
-	return
-}
diff --git a/pkg/runner/wait_for_first_target_test.go b/pkg/runner/wait_for_first_target_test.go
deleted file mode 100644
index 494a335a..00000000
--- a/pkg/runner/wait_for_first_target_test.go
+++ /dev/null
@@ -1,142 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-//
-// This source code is licensed under the MIT license found in the
-// LICENSE file in the root directory of this source tree.
-
-package runner
-
-import (
-	"fmt"
-	"runtime"
-	"sync"
-	"testing"
-
-	"github.com/stretchr/testify/require"
-
-	"github.com/facebookincubator/contest/pkg/target"
-)
-
-func TestWaitForFirstTarget(t *testing.T) {
-	t.Run("100targets", func(t *testing.T) {
-		ch0 := make(chan *target.Target)
-		ch1, onFirstTargetChan, onNoTargetsChan := waitForFirstTarget(nil, ch0)
-
-		var wgBeforeFirstTarget, wgAfterSecondTarget sync.WaitGroup
-		wgBeforeFirstTarget.Add(1)
-		wgAfterSecondTarget.Add(1)
-
-		go func() {
-			wgBeforeFirstTarget.Wait()
-			for i := 0; i < 100; i++ {
-				ch0 <- &target.Target{
-					ID: fmt.Sprintf("%d", i),
-				}
-				if i == 1 {
-					// to avoid race conditions in this test we verify
-					// if there's a "onFirstTarget" signal after the second
-					// target
-					wgAfterSecondTarget.Done()
-				}
-			}
-		}()
-
-		runtime.Gosched()
-		select {
-		case <-onFirstTargetChan:
-			t.Fatal("should not happen")
-		case <-onNoTargetsChan:
-			t.Fatal("should not happen")
-		default:
-		}
-
-		wgBeforeFirstTarget.Done()
-
-		var wgReader sync.WaitGroup
-		wgReader.Add(1)
-		go func() {
-			defer wgReader.Done()
-			// Checking quantity and order
-			for i := 0; i < 100; i++ {
-				_target := <-ch1
-				require.Equal(t, &target.Target{
-					ID: fmt.Sprintf("%d", i),
-				}, _target)
-			}
-		}()
-
-		wgAfterSecondTarget.Wait()
-
-		select {
-		case <-onFirstTargetChan:
-		case <-onNoTargetsChan:
-			t.Fatal("should not happen")
-		default:
-			t.Fatal("should not happen") // see wgAfterSecondTarget.Wait() above
-		}
-
-		wgReader.Wait()
-
-		runtime.Gosched()
-		require.Len(t, ch1, 0)
-	})
-
-	t.Run("no_target", func(t *testing.T) {
-		ch0 := make(chan *target.Target)
-		ch1, onFirstTargetChan, onNoTargetsChan := waitForFirstTarget(nil, ch0)
-
-		runtime.Gosched()
-		select {
-		case <-onFirstTargetChan:
-			t.Fatal("should not happen")
-		case <-onNoTargetsChan:
-			t.Fatal("should not happen")
-		default:
-		}
-
-		close(ch0)
-
-		select {
-		case <-onFirstTargetChan:
-			t.Fatal("should not happen")
-		case <-onNoTargetsChan:
-		}
-
-		runtime.Gosched()
-		require.Len(t, ch1, 0)
-		_, isOpened := <-ch1
-		require.False(t, isOpened)
-	})
-
-	t.Run("cancel", func(t *testing.T) {
-		cancelCh := make(chan struct{})
-		ch0 := make(chan *target.Target)
-		ch1, onFirstTargetChan, onNoTargetsChan := waitForFirstTarget(nil, ch0)
-
-		runtime.Gosched()
-		select {
-		case <-onFirstTargetChan:
-			t.Fatal("should not happen") // see wgBeforeFirstTarget.Wait() above
-		case <-onNoTargetsChan:
-			t.Fatal("should not happen")
-		default:
-		}
-
-		close(cancelCh)
-
-		runtime.Gosched()
-		select {
-		case <-onFirstTargetChan:
-			t.Fatal("should not happen")
-		case <-onNoTargetsChan:
-			t.Fatal("should not happen")
-		default:
-		}
-
-		require.Len(t, ch1, 0)
-		select {
-		case <-ch1:
-			t.Fatal("should not happen")
-		default:
-		}
-	})
-}
diff --git a/pkg/storage/limits/limits_test.go b/pkg/storage/limits/limits_test.go
index 50af64f6..b95be939 100644
--- a/pkg/storage/limits/limits_test.go
+++ b/pkg/storage/limits/limits_test.go
@@ -39,7 +39,7 @@ var (
 )
 
 func TestServerID(t *testing.T) {
-	_, err := api.New(ctx, api.OptionServerID(strings.Repeat("A", limits.MaxServerIDLen+1)))
+	_, err := api.New(api.OptionServerID(strings.Repeat("A", limits.MaxServerIDLen+1)))
 	assertLenError(t, "Server ID", err)
 }
 
diff --git a/pkg/storage/storage.go b/pkg/storage/storage.go
index 1cafdf4f..642a39aa 100644
--- a/pkg/storage/storage.go
+++ b/pkg/storage/storage.go
@@ -36,6 +36,7 @@ type TransactionalStorage interface {
 
 // ResettableStorage is implemented by storage engines that support reset operation
 type ResettableStorage interface {
+	Storage
 	Reset() error
 }
 
diff --git a/plugins/listeners/httplistener/httplistener.go b/plugins/listeners/httplistener/httplistener.go
index fab33b9f..b19157d3 100644
--- a/plugins/listeners/httplistener/httplistener.go
+++ b/plugins/listeners/httplistener/httplistener.go
@@ -221,7 +221,7 @@ func listenWithCancellation(ctx xcontext.Context, s *http.Server) error {
 	select {
 	case err := <-errCh:
 		return err
-	case <-ctx.Done():
+	case <-ctx.WaitFor():
 		ctx.Logger().Debugf("Received server shut down request")
 		return s.Close()
 	}
@@ -239,6 +239,7 @@ func (h *HTTPListener) Serve(ctx xcontext.Context, a *api.API) error {
 		ReadTimeout:  10 * time.Second,
 		WriteTimeout: 10 * time.Second,
 	}
+	ctx.Logger().Debugf("Serving a listener")
 	if err := listenWithCancellation(ctx, &s); err != nil {
 		return fmt.Errorf("HTTP listener failed: %v", err)
 	}
diff --git a/plugins/storage/memory/memory.go b/plugins/storage/memory/memory.go
index fd6caabc..d71cab5b 100644
--- a/plugins/storage/memory/memory.go
+++ b/plugins/storage/memory/memory.go
@@ -296,7 +296,7 @@ func (m *Memory) Version() (uint64, error) {
 }
 
 // New create a new Memory events storage backend
-func New() (storage.Storage, error) {
+func New() (storage.ResettableStorage, error) {
 	m := Memory{lock: &sync.Mutex{}}
 	m.jobInfo = make(map[types.JobID]*jobInfo)
 	m.jobIDCounter = 1
diff --git a/plugins/teststeps/echo/echo.go b/plugins/teststeps/echo/echo.go
index 6df62322..35ea00ac 100644
--- a/plugins/teststeps/echo/echo.go
+++ b/plugins/teststeps/echo/echo.go
@@ -60,7 +60,7 @@ func (e Step) Run(ctx xcontext.Context, ch test.TestStepChannels, params test.Te
 			}
 			ctx.Logger().Infof("Running on target %s with text '%s'", target, params.GetOne("text"))
 			ch.Out <- target
-		case <-ctx.WaitFor():
+		case <-ctx.Done():
 			return nil
 		}
 	}
diff --git a/plugins/teststeps/example/example.go b/plugins/teststeps/example/example.go
index f37db84d..673f82e4 100644
--- a/plugins/teststeps/example/example.go
+++ b/plugins/teststeps/example/example.go
@@ -12,13 +12,21 @@ import (
 	"github.com/facebookincubator/contest/pkg/cerrors"
 	"github.com/facebookincubator/contest/pkg/event"
 	"github.com/facebookincubator/contest/pkg/event/testevent"
+	"github.com/facebookincubator/contest/pkg/target"
 	"github.com/facebookincubator/contest/pkg/test"
 	"github.com/facebookincubator/contest/pkg/xcontext"
+	"github.com/facebookincubator/contest/plugins/teststeps"
 )
 
 // Name is the name used to look this plugin up.
 var Name = "Example"
 
+// Params this step accepts.
+const (
+	// Fail this percentage of targets at random.
+	FailPctParam = "FailPct"
+)
+
 // events that we may emit during the plugin's lifecycle. This is used in Events below.
 // Note that you don't normally need to emit start/finish/cancellation events as
 // these are emitted automatically by the framework.
@@ -33,10 +41,10 @@ const (
 var Events = []event.Name{StartedEvent, FinishedEvent, FailedEvent}
 
 // Step is an example implementation of a TestStep which simply
-// consumes Targets in input and pipes them to the output channel with intermediate
-// buffering. It randomly decides if a Target has failed and forwards it on
-// the err channel.
+// consumes Targets in input and pipes them to the output or error channel
+// with intermediate buffering.
 type Step struct {
+	failPct int64
 }
 
 // Name returns the name of the Step
@@ -44,50 +52,48 @@ func (ts Step) Name() string {
 	return Name
 }
 
+func (ts *Step) shouldFail(t *target.Target) bool {
+	if ts.failPct > 0 {
+		roll := rand.Int63n(101)
+		return (roll <= ts.failPct)
+	}
+	return false
+}
+
 // Run executes the example step.
-func (ts *Step) Run(ctx xcontext.Context, ch test.TestStepChannels, _ test.TestStepParameters, ev testevent.Emitter) error {
-	for {
-
-		r := rand.Intn(3)
-		select {
-		case target := <-ch.In:
-			if target == nil {
-				return nil
-			}
-			ctx.Logger().Infof("Executing on target %s", target)
-			// NOTE: you may want more robust error handling here, possibly just
-			//       logging the error, or a retry mechanism. Returning an error
-			//       here means failing the entire job.
-			if err := ev.Emit(ctx, testevent.Data{EventName: StartedEvent, Target: target, Payload: nil}); err != nil {
-				return fmt.Errorf("failed to emit start event: %v", err)
+func (ts *Step) Run(ctx xcontext.Context, ch test.TestStepChannels, params test.TestStepParameters, ev testevent.Emitter) error {
+	f := func(ctx xcontext.Context, target *target.Target) error {
+		ctx.Logger().Infof("Executing on target %s", target)
+		// NOTE: you may want more robust error handling here, possibly just
+		//       logging the error, or a retry mechanism. Returning an error
+		//       here means failing the entire job.
+		if err := ev.Emit(ctx, testevent.Data{EventName: StartedEvent, Target: target, Payload: nil}); err != nil {
+			return fmt.Errorf("failed to emit start event: %v", err)
+		}
+		if ts.shouldFail(target) {
+			if err := ev.Emit(ctx, testevent.Data{EventName: FailedEvent, Target: target, Payload: nil}); err != nil {
+				return fmt.Errorf("failed to emit finished event: %v", err)
 			}
-			if r == 1 {
-				select {
-				case <-ctx.WaitFor():
-					return nil
-				case ch.Err <- cerrors.TargetError{Target: target, Err: fmt.Errorf("target failed")}:
-					if err := ev.Emit(ctx, testevent.Data{EventName: FinishedEvent, Target: target, Payload: nil}); err != nil {
-						return fmt.Errorf("failed to emit finished event: %v", err)
-					}
-				}
-			} else {
-				select {
-				case <-ctx.WaitFor():
-					return nil
-				case ch.Out <- target:
-					if err := ev.Emit(ctx, testevent.Data{EventName: FailedEvent, Target: target, Payload: nil}); err != nil {
-						return fmt.Errorf("failed to emit failed event: %v", err)
-					}
-				}
+			return fmt.Errorf("target failed")
+		} else {
+			if err := ev.Emit(ctx, testevent.Data{EventName: FinishedEvent, Target: target, Payload: nil}); err != nil {
+				return fmt.Errorf("failed to emit failed event: %v", err)
 			}
-		case <-ctx.WaitFor():
-			return nil
 		}
+		return nil
 	}
+	return teststeps.ForEachTarget(Name, ctx, ch, f)
 }
 
 // ValidateParameters validates the parameters associated to the TestStep
-func (ts *Step) ValidateParameters(_ xcontext.Context, _ test.TestStepParameters) error {
+func (ts *Step) ValidateParameters(_ xcontext.Context, params test.TestStepParameters) error {
+	if params.GetOne(FailPctParam).String() != "" {
+		if pct, err := params.GetInt(FailPctParam); err == nil {
+			ts.failPct = pct
+		} else {
+			return fmt.Errorf("invalid FailPct: %w", err)
+		}
+	}
 	return nil
 }
 
diff --git a/plugins/teststeps/randecho/randecho.go b/plugins/teststeps/randecho/randecho.go
index 5235e6e1..5e57dc20 100644
--- a/plugins/teststeps/randecho/randecho.go
+++ b/plugins/teststeps/randecho/randecho.go
@@ -80,7 +80,7 @@ func (e Step) Run(ctx xcontext.Context, ch test.TestStepChannels, params test.Te
 				ctx.Logger().Infof("Run: target %s failed: %s", target, params.GetOne("text"))
 				ch.Err <- cerrors.TargetError{Target: target, Err: fmt.Errorf("target randomly failed")}
 			}
-		case <-ctx.WaitFor():
+		case <-ctx.Done():
 			return nil
 		}
 	}
diff --git a/plugins/teststeps/sshcmd/sshcmd.go b/plugins/teststeps/sshcmd/sshcmd.go
index beb47919..ef95ca82 100644
--- a/plugins/teststeps/sshcmd/sshcmd.go
+++ b/plugins/teststeps/sshcmd/sshcmd.go
@@ -247,7 +247,7 @@ func (ts *SSHCmd) Run(ctx xcontext.Context, ch test.TestStepChannels, params tes
 					ctx.Logger().Warnf("Stderr of command '%s' is '%s'", cmd, stderr.Bytes())
 				}
 				return err
-			case <-ctx.WaitFor():
+			case <-ctx.Done():
 				return session.Signal(ssh.SIGKILL)
 			case <-time.After(250 * time.Millisecond):
 				keepAliveCnt++
diff --git a/plugins/teststeps/terminalexpect/terminalexpect.go b/plugins/teststeps/terminalexpect/terminalexpect.go
index f00040de..1ca08d80 100644
--- a/plugins/teststeps/terminalexpect/terminalexpect.go
+++ b/plugins/teststeps/terminalexpect/terminalexpect.go
@@ -79,7 +79,7 @@ func (ts *TerminalExpect) Run(ctx xcontext.Context, ch test.TestStepChannels, pa
 			return err
 		case <-time.After(ts.Timeout):
 			return fmt.Errorf("timed out after %s", ts.Timeout)
-		case <-ctx.WaitFor():
+		case <-ctx.Done():
 			return nil
 		}
 	}
diff --git a/plugins/teststeps/teststeps.go b/plugins/teststeps/teststeps.go
index c8de4f9d..83171963 100644
--- a/plugins/teststeps/teststeps.go
+++ b/plugins/teststeps/teststeps.go
@@ -32,15 +32,15 @@ func ForEachTarget(pluginName string, ctx xcontext.Context, ch test.TestStepChan
 			ctx.Logger().Errorf("%s: ForEachTarget: failed to apply test step function on target %s: %v", pluginName, t, err)
 			select {
 			case ch.Err <- cerrors.TargetError{Target: t, Err: err}:
-			case <-ctx.WaitFor():
-				ctx.Logger().Debugf("%s: ForEachTarget: received cancellation/pause signal while reporting error", pluginName)
+			case <-ctx.Done():
+				ctx.Logger().Debugf("%s: ForEachTarget: received cancellation signal while reporting error", pluginName)
 			}
 		} else {
 			ctx.Logger().Debugf("%s: ForEachTarget: target %s completed successfully", pluginName, t)
 			select {
 			case ch.Out <- t:
-			case <-ctx.WaitFor():
-				ctx.Logger().Debugf("%s: ForEachTarget: received cancellation/pause signal while reporting success", pluginName)
+			case <-ctx.Done():
+				ctx.Logger().Debugf("%s: ForEachTarget: received pause signal while reporting success", pluginName)
 			}
 		}
 	}
@@ -50,12 +50,11 @@ func ForEachTarget(pluginName string, ctx xcontext.Context, ch test.TestStepChan
 		for {
 			select {
 			case tgt := <-ch.In:
-				ctx.Logger().Debugf("%s: ForEachTarget: received target %s", pluginName, tgt)
 				if tgt == nil {
 					ctx.Logger().Debugf("%s: ForEachTarget: all targets have been received", pluginName)
 					return
 				}
-
+				ctx.Logger().Debugf("%s: ForEachTarget: received target %s", pluginName, tgt)
 				wg.Add(1)
 				go func() {
 					defer wg.Done()
@@ -66,9 +65,6 @@ func ForEachTarget(pluginName string, ctx xcontext.Context, ch test.TestStepChan
 			case <-ctx.Done():
 				ctx.Logger().Debugf("%s: ForEachTarget: incoming loop canceled", pluginName)
 				return
-			case <-ctx.WaitFor(xcontext.Paused):
-				ctx.Logger().Debugf("%s: ForEachTarget: incoming loop paused", pluginName)
-				return
 			}
 		}
 	}()
diff --git a/tests/common/get_events.go b/tests/common/get_events.go
new file mode 100644
index 00000000..5c22813a
--- /dev/null
+++ b/tests/common/get_events.go
@@ -0,0 +1,48 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+package common
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/facebookincubator/contest/pkg/event/testevent"
+	"github.com/facebookincubator/contest/pkg/storage"
+	"github.com/facebookincubator/contest/pkg/xcontext"
+)
+
+func eventToStringNoTime(ev testevent.Event) string {
+	// Omit the timestamp to make output stable.
+	return fmt.Sprintf("{%s%s}", ev.Header, ev.Data)
+}
+
+// GetTestEventsAsString queries storage for particular test's events,
+// further filtering by target ID and/or step label.
+func GetTestEventsAsString(ctx xcontext.Context, st storage.EventStorage, testName string, targetID, stepLabel *string) string {
+	q, _ := testevent.BuildQuery(testevent.QueryTestName(testName))
+	results, _ := st.GetTestEvents(ctx, q)
+	var resultsForTarget []string
+	for _, r := range results {
+		if targetID != nil {
+			if r.Data.Target == nil {
+				continue
+			}
+			if *targetID != "" && r.Data.Target.ID != *targetID {
+				continue
+			}
+		}
+		if stepLabel != nil {
+			if *stepLabel != "" && r.Header.TestStepLabel != *stepLabel {
+				continue
+			}
+			if targetID == nil && r.Data.Target != nil {
+				continue
+			}
+		}
+		resultsForTarget = append(resultsForTarget, eventToStringNoTime(r))
+	}
+	return "\n" + strings.Join(resultsForTarget, "\n") + "\n"
+}
diff --git a/tests/common/goroutine_leak_check.go b/tests/common/goroutine_leak_check/goroutine_leak_check.go
similarity index 97%
rename from tests/common/goroutine_leak_check.go
rename to tests/common/goroutine_leak_check/goroutine_leak_check.go
index 3bbbc955..9da703d4 100644
--- a/tests/common/goroutine_leak_check.go
+++ b/tests/common/goroutine_leak_check/goroutine_leak_check.go
@@ -3,7 +3,7 @@
 // This source code is licensed under the MIT license found in the
 // LICENSE file in the root directory of this source tree.
 
-package common
+package goroutine_leak_check
 
 import (
 	"fmt"
@@ -144,6 +144,10 @@ func LeakCheckingTestMain(m *testing.M, funcWhitelist ...string) {
 	ret := m.Run()
 	if ret == 0 {
 		time.Sleep(20 * time.Millisecond) // Give stragglers some time to exit.
+		runtime.Gosched()
+		runtime.GC()
+		runtime.Gosched()
+		runtime.GC()
 		if err := CheckLeakedGoRoutines(funcWhitelist...); err != nil {
 			fmt.Fprintf(os.Stderr, "%s", err)
 			ret = 1
diff --git a/tests/common/goroutine_leak_check_test.go b/tests/common/goroutine_leak_check/goroutine_leak_check_test.go
similarity index 86%
rename from tests/common/goroutine_leak_check_test.go
rename to tests/common/goroutine_leak_check/goroutine_leak_check_test.go
index 7cb1d0a5..e361a951 100644
--- a/tests/common/goroutine_leak_check_test.go
+++ b/tests/common/goroutine_leak_check/goroutine_leak_check_test.go
@@ -3,7 +3,7 @@
 // This source code is licensed under the MIT license found in the
 // LICENSE file in the root directory of this source tree.
 
-package common
+package goroutine_leak_check
 
 import (
 	"sync"
@@ -33,7 +33,7 @@ func TestLeaks(t *testing.T) {
 	dump, err := checkLeakedGoRoutines()
 	require.Errorf(t, err, "===\n%s\n===", dump)
 	require.Contains(t, err.Error(), "goroutine_leak_check_test.go")
-	require.Contains(t, err.Error(), "common.Func1")
+	require.Contains(t, err.Error(), "goroutine_leak_check.Func1")
 	c.Signal()
 	// Give some time for the goroutine to exit
 	// otherwise it causes flakes in case of multiple test runs (-count=N).
@@ -47,7 +47,7 @@ func TestLeaksWhitelisted(t *testing.T) {
 	c.Wait()
 	c.L.Unlock()
 	require.NoError(t, CheckLeakedGoRoutines(
-		"github.com/facebookincubator/contest/tests/common.Func1"))
+		"github.com/facebookincubator/contest/tests/common/goroutine_leak_check.Func1"))
 	c.Signal()
 	time.Sleep(10 * time.Millisecond)
 }
diff --git a/tests/integ/events/frameworkevents/common.go b/tests/integ/events/frameworkevents/common.go
index 0428f0dc..eef35e07 100644
--- a/tests/integ/events/frameworkevents/common.go
+++ b/tests/integ/events/frameworkevents/common.go
@@ -16,6 +16,8 @@ import (
 	"github.com/facebookincubator/contest/pkg/event/frameworkevent"
 	"github.com/facebookincubator/contest/pkg/storage"
 	"github.com/facebookincubator/contest/pkg/types"
+	"github.com/facebookincubator/contest/pkg/xcontext/bundles/logrusctx"
+	"github.com/facebookincubator/contest/pkg/xcontext/logger"
 	"github.com/facebookincubator/contest/tests/integ/common"
 
 	"github.com/stretchr/testify/assert"
@@ -23,6 +25,10 @@ import (
 	"github.com/stretchr/testify/suite"
 )
 
+var (
+	ctx = logrusctx.NewContext(logger.LevelDebug)
+)
+
 func mustBuildQuery(t require.TestingT, queryFields ...frameworkevent.QueryField) *frameworkevent.Query {
 	eventQuery, err := frameworkevent.BuildQuery(queryFields...)
 	require.NoError(t, err)
@@ -37,11 +43,11 @@ func populateFrameworkEvents(backend storage.Storage, emitTime time.Time) error
 	eventFirst := frameworkevent.Event{JobID: 1, EventName: "AFrameworkEvent", Payload: payload, EmitTime: emitTime}
 	eventSecond := frameworkevent.Event{JobID: 1, EventName: "BFrameworkEvent", Payload: payload, EmitTime: emitTime}
 
-	err := backend.StoreFrameworkEvent(eventFirst)
+	err := backend.StoreFrameworkEvent(ctx, eventFirst)
 	if err != nil {
 		return err
 	}
-	return backend.StoreFrameworkEvent(eventSecond)
+	return backend.StoreFrameworkEvent(ctx, eventSecond)
 }
 
 func assertFrameworkEvents(t *testing.T, ev []frameworkevent.Event, emitTime time.Time) {
@@ -98,7 +104,7 @@ func (suite *FrameworkEventsSuite) TestRetrieveSingleFrameworkEvent() {
 	require.NoError(suite.T(), err)
 
 	eventQuery := mustBuildQuery(suite.T(), frameworkevent.QueryEventName("AFrameworkEvent"))
-	results, err := suite.txStorage.GetFrameworkEvent(eventQuery)
+	results, err := suite.txStorage.GetFrameworkEvent(ctx, eventQuery)
 
 	require.NoError(suite.T(), err)
 	assert.Equal(suite.T(), 1, len(results))
@@ -108,7 +114,7 @@ func (suite *FrameworkEventsSuite) TestRetrieveSingleFrameworkEvent() {
 func (suite *FrameworkEventsSuite) TestRetrieveMultipleFrameworkEvents() {
 
 	eventQuery := mustBuildQuery(suite.T(), frameworkevent.QueryJobID(1))
-	results, err := suite.txStorage.GetFrameworkEvent(eventQuery)
+	results, err := suite.txStorage.GetFrameworkEvent(ctx, eventQuery)
 	require.NoError(suite.T(), err)
 	assert.Equal(suite.T(), 0, len(results))
 
@@ -117,7 +123,7 @@ func (suite *FrameworkEventsSuite) TestRetrieveMultipleFrameworkEvents() {
 	require.NoError(suite.T(), err)
 
 	eventQuery = mustBuildQuery(suite.T(), frameworkevent.QueryJobID(1))
-	results, err = suite.txStorage.GetFrameworkEvent(eventQuery)
+	results, err = suite.txStorage.GetFrameworkEvent(ctx, eventQuery)
 
 	require.NoError(suite.T(), err)
 	assert.Equal(suite.T(), 2, len(results))
@@ -136,14 +142,14 @@ func (suite *FrameworkEventsSuite) TestRetrieveSingleFrameworkEventByEmitTime()
 	require.NoError(suite.T(), err)
 
 	eventQuery := mustBuildQuery(suite.T(), frameworkevent.QueryEmittedStartTime(emitTime))
-	results, err := suite.txStorage.GetFrameworkEvent(eventQuery)
+	results, err := suite.txStorage.GetFrameworkEvent(ctx, eventQuery)
 
 	require.NoError(suite.T(), err)
 	assert.Equal(suite.T(), 2, len(results))
 	assertFrameworkEvents(suite.T(), results, emitTime)
 
 	eventQuery = mustBuildQuery(suite.T(), frameworkevent.QueryEmittedStartTime(emitTime.Add(-delta)))
-	results, err = suite.txStorage.GetFrameworkEvent(eventQuery)
+	results, err = suite.txStorage.GetFrameworkEvent(ctx, eventQuery)
 
 	require.NoError(suite.T(), err)
 	assert.Equal(suite.T(), 4, len(results))
diff --git a/tests/integ/events/testevents/common.go b/tests/integ/events/testevents/common.go
index faa86b94..7a555004 100644
--- a/tests/integ/events/testevents/common.go
+++ b/tests/integ/events/testevents/common.go
@@ -17,12 +17,18 @@ import (
 	"github.com/facebookincubator/contest/pkg/storage"
 	"github.com/facebookincubator/contest/pkg/target"
 	"github.com/facebookincubator/contest/pkg/types"
+	"github.com/facebookincubator/contest/pkg/xcontext/bundles/logrusctx"
+	"github.com/facebookincubator/contest/pkg/xcontext/logger"
 	"github.com/facebookincubator/contest/tests/integ/common"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	"github.com/stretchr/testify/suite"
 )
 
+var (
+	ctx = logrusctx.NewContext(logger.LevelDebug)
+)
+
 func mustBuildQuery(t require.TestingT, queryFields ...testevent.QueryField) *testevent.Query {
 	eventQuery, err := testevent.BuildQuery(queryFields...)
 	require.NoError(t, err)
@@ -45,11 +51,11 @@ func populateTestEvents(backend storage.Storage, emitTime time.Time) error {
 	eventFirst := testevent.Event{Header: &hdrFirst, Data: &dataFirst, EmitTime: emitTime}
 	eventSecond := testevent.Event{Header: &hdrSecond, Data: &dataSecond, EmitTime: emitTime}
 
-	err := backend.StoreTestEvent(eventFirst)
+	err := backend.StoreTestEvent(ctx, eventFirst)
 	if err != nil {
 		return err
 	}
-	return backend.StoreTestEvent(eventSecond)
+	return backend.StoreTestEvent(ctx, eventSecond)
 }
 
 func assertTestEvents(t *testing.T, ev []testevent.Event, emitTime time.Time) {
@@ -104,7 +110,7 @@ func (suite *TestEventsSuite) TestRetrieveSingleTestEvent() {
 	require.NoError(suite.T(), err)
 
 	testEventQuery := mustBuildQuery(suite.T(), testevent.QueryTestName("ATestName"))
-	results, err := suite.txStorage.GetTestEvents(testEventQuery)
+	results, err := suite.txStorage.GetTestEvents(ctx, testEventQuery)
 
 	require.NoError(suite.T(), err)
 	assert.Equal(suite.T(), 1, len(results))
@@ -119,7 +125,7 @@ func (suite *TestEventsSuite) TestRetrieveMultipleTestEvents() {
 	require.NoError(suite.T(), err)
 
 	testEventQuery := mustBuildQuery(suite.T(), testevent.QueryTestStepLabel("TestStepLabel"))
-	results, err := suite.txStorage.GetTestEvents(testEventQuery)
+	results, err := suite.txStorage.GetTestEvents(ctx, testEventQuery)
 
 	require.NoError(suite.T(), err)
 	assert.Equal(suite.T(), 2, len(results))
@@ -138,14 +144,14 @@ func (suite *TestEventsSuite) TestRetrievesSingleTestEventByEmitTime() {
 	require.NoError(suite.T(), err)
 
 	testEventQuery := mustBuildQuery(suite.T(), testevent.QueryEmittedStartTime(emitTime))
-	results, err := suite.txStorage.GetTestEvents(testEventQuery)
+	results, err := suite.txStorage.GetTestEvents(ctx, testEventQuery)
 
 	require.NoError(suite.T(), err)
 	assert.Equal(suite.T(), 2, len(results))
 	assertTestEvents(suite.T(), results, emitTime)
 
 	testEventQuery = mustBuildQuery(suite.T(), testevent.QueryEmittedStartTime(emitTime.Add(-delta)))
-	results, err = suite.txStorage.GetTestEvents(testEventQuery)
+	results, err = suite.txStorage.GetTestEvents(ctx, testEventQuery)
 
 	require.NoError(suite.T(), err)
 	assert.Equal(suite.T(), 4, len(results))
@@ -162,7 +168,7 @@ func (suite *TestEventsSuite) TestRetrievesMultipleTestEventsByName() {
 
 	eventNames := []event.Name{event.Name("AEventName"), event.Name("BEventName")}
 	testEventQuery := mustBuildQuery(suite.T(), testevent.QueryEventNames(eventNames))
-	results, err := suite.txStorage.GetTestEvents(testEventQuery)
+	results, err := suite.txStorage.GetTestEvents(ctx, testEventQuery)
 
 	require.NoError(suite.T(), err)
 	assert.Equal(suite.T(), 2, len(results))
@@ -176,7 +182,7 @@ func (suite *TestEventsSuite) TestRetrieveSingleTestEventsByName() {
 	require.NoError(suite.T(), err)
 
 	testEventQuery := mustBuildQuery(suite.T(), testevent.QueryEventName(event.Name("AEventName")))
-	results, err := suite.txStorage.GetTestEvents(testEventQuery)
+	results, err := suite.txStorage.GetTestEvents(ctx, testEventQuery)
 
 	require.NoError(suite.T(), err)
 	assert.Equal(suite.T(), 1, len(results))
@@ -193,7 +199,7 @@ func (suite *TestEventsSuite) TestRetrieveSingleTestEventsByNameAndJobID() {
 		testevent.QueryEventName(event.Name("AEventName")),
 		testevent.QueryJobID(1),
 	)
-	results, err := suite.txStorage.GetTestEvents(testEventQuery)
+	results, err := suite.txStorage.GetTestEvents(ctx, testEventQuery)
 
 	require.NoError(suite.T(), err)
 	assert.Equal(suite.T(), 1, len(results))
diff --git a/tests/integ/job/common.go b/tests/integ/job/common.go
index 0b903da6..3f0b13a2 100644
--- a/tests/integ/job/common.go
+++ b/tests/integ/job/common.go
@@ -10,6 +10,8 @@ package test
 import (
 	"time"
 
+	"github.com/facebookincubator/contest/pkg/xcontext/bundles/logrusctx"
+	"github.com/facebookincubator/contest/pkg/xcontext/logger"
 	"github.com/stretchr/testify/require"
 	"github.com/stretchr/testify/suite"
 
@@ -49,6 +51,10 @@ var testDescs = `
 ]
 `
 
+var (
+	ctx = logrusctx.NewContext(logger.LevelDebug)
+)
+
 type JobSuite struct {
 	suite.Suite
 
@@ -78,7 +84,7 @@ func (suite *JobSuite) TestGetJobRequest() {
 		RequestTime:   time.Now(),
 		JobDescriptor: jobDescriptorFirst,
 	}
-	jobIDa, err := suite.txStorage.StoreJobRequest(&jobRequestFirst)
+	jobIDa, err := suite.txStorage.StoreJobRequest(ctx, &jobRequestFirst)
 	require.NoError(t, err)
 
 	jobRequestSecond := job.Request{
@@ -87,10 +93,10 @@ func (suite *JobSuite) TestGetJobRequest() {
 		RequestTime:   time.Now(),
 		JobDescriptor: jobDescriptorSecond,
 	}
-	jobIDb, err := suite.txStorage.StoreJobRequest(&jobRequestSecond)
+	jobIDb, err := suite.txStorage.StoreJobRequest(ctx, &jobRequestSecond)
 	require.NoError(t, err)
 
-	request, err := suite.txStorage.GetJobRequest(jobIDa)
+	request, err := suite.txStorage.GetJobRequest(ctx, jobIDa)
 
 	require.NoError(suite.T(), err)
 	require.Equal(suite.T(), types.JobID(jobIDa), request.JobID)
@@ -103,7 +109,7 @@ func (suite *JobSuite) TestGetJobRequest() {
 	require.True(t, request.RequestTime.After(time.Now().Add(-2*time.Second)))
 	require.True(t, request.RequestTime.Before(time.Now().Add(2*time.Second)))
 
-	request, err = suite.txStorage.GetJobRequest(jobIDb)
+	request, err = suite.txStorage.GetJobRequest(ctx, jobIDb)
 
 	// Creation timestamp corresponds to the timestamp of the insertion into the
 	// database. Assert that the timestamp retrieved from the database is within
@@ -133,7 +139,7 @@ func (suite *JobSuite) TestListJobs() {
 		RequestTime:   time.Now(),
 		JobDescriptor: jobDescriptorFirst,
 	}
-	jobIDa, err := suite.txStorage.StoreJobRequest(&jobRequestFirst)
+	jobIDa, err := suite.txStorage.StoreJobRequest(ctx, &jobRequestFirst)
 	require.NoError(t, err)
 
 	jobRequestSecond := job.Request{
@@ -142,75 +148,75 @@ func (suite *JobSuite) TestListJobs() {
 		RequestTime:   time.Now(),
 		JobDescriptor: jobDescriptorSecond,
 	}
-	jobIDb, err := suite.txStorage.StoreJobRequest(&jobRequestSecond)
+	jobIDb, err := suite.txStorage.StoreJobRequest(ctx, &jobRequestSecond)
 	require.NoError(t, err)
 
 	var res []types.JobID
 
 	// No match criteria - returns all jobs.
-	res, err = suite.txStorage.ListJobs(mustBuildQuery(t))
+	res, err = suite.txStorage.ListJobs(ctx, mustBuildQuery(t))
 	require.NoError(t, err)
 	require.Equal(t, []types.JobID{jobIDa, jobIDb}, res)
 
 	// List by states - matching any state is enough.
 	// Both jobs are currently in Unknown state.
-	res, err = suite.txStorage.ListJobs(mustBuildQuery(t,
+	res, err = suite.txStorage.ListJobs(ctx, mustBuildQuery(t,
 		storage.QueryJobStates(job.JobStateCompleted, job.JobStateFailed),
 	))
 	require.NoError(t, err)
 	require.Empty(t, res)
 	require.NotNil(t, res)
-	res, err = suite.txStorage.ListJobs(mustBuildQuery(t,
+	res, err = suite.txStorage.ListJobs(ctx, mustBuildQuery(t,
 		storage.QueryJobStates(job.JobStateCompleted, job.JobStateFailed, job.JobStateUnknown),
 	))
 	require.NoError(t, err)
 	require.Equal(t, []types.JobID{jobIDa, jobIDb}, res)
 
 	// Tag matching - single tag.
-	res, err = suite.txStorage.ListJobs(mustBuildQuery(t,
+	res, err = suite.txStorage.ListJobs(ctx, mustBuildQuery(t,
 		storage.QueryJobTags("tests")))
 	require.NoError(t, err)
 	require.Equal(t, []types.JobID{jobIDa}, res)
 
 	// Tag matching - must match all tags.
-	res, err = suite.txStorage.ListJobs(mustBuildQuery(t,
+	res, err = suite.txStorage.ListJobs(ctx, mustBuildQuery(t,
 		storage.QueryJobTags("integ", "tests")))
 	require.NoError(t, err)
 	require.Equal(t, []types.JobID{jobIDa}, res)
-	res, err = suite.txStorage.ListJobs(mustBuildQuery(t,
+	res, err = suite.txStorage.ListJobs(ctx, mustBuildQuery(t,
 		storage.QueryJobTags("integ", "tests", "no_such_tag")))
 	require.NoError(t, err)
 	require.Empty(t, res)
 	require.NotNil(t, res)
 
 	// Match by state - without any events both jobs are in unknown state.
-	res, err = suite.txStorage.ListJobs(mustBuildQuery(t,
+	res, err = suite.txStorage.ListJobs(ctx, mustBuildQuery(t,
 		storage.QueryJobStates(job.JobStateUnknown)))
 	require.NoError(t, err)
 	require.Equal(t, []types.JobID{jobIDa, jobIDb}, res)
 
 	// Inject state events
-	require.NoError(t, suite.txStorage.StoreFrameworkEvent(frameworkevent.Event{
+	require.NoError(t, suite.txStorage.StoreFrameworkEvent(ctx, frameworkevent.Event{
 		JobID: jobIDa, EventName: job.EventJobStarted, EmitTime: time.Unix(1, 0),
 	}))
-	require.NoError(t, suite.txStorage.StoreFrameworkEvent(frameworkevent.Event{
+	require.NoError(t, suite.txStorage.StoreFrameworkEvent(ctx, frameworkevent.Event{
 		JobID: jobIDb, EventName: job.EventJobStarted, EmitTime: time.Unix(2, 0),
 	}))
-	require.NoError(t, suite.txStorage.StoreFrameworkEvent(frameworkevent.Event{
+	require.NoError(t, suite.txStorage.StoreFrameworkEvent(ctx, frameworkevent.Event{
 		JobID: jobIDb, EventName: job.EventJobFailed, EmitTime: time.Unix(3, 0),
 	}))
-	require.NoError(t, suite.txStorage.StoreFrameworkEvent(frameworkevent.Event{
+	require.NoError(t, suite.txStorage.StoreFrameworkEvent(ctx, frameworkevent.Event{
 		JobID: jobIDa, EventName: job.EventJobCompleted, EmitTime: time.Unix(4, 0),
 	}))
 
 	// Multiple states - match any of them
-	res, err = suite.txStorage.ListJobs(mustBuildQuery(t,
+	res, err = suite.txStorage.ListJobs(ctx, mustBuildQuery(t,
 		storage.QueryJobStates(job.JobStateCompleted, job.JobStateFailed, job.JobStatePaused)))
 	require.NoError(t, err)
 	require.Equal(t, []types.JobID{jobIDa, jobIDb}, res)
 
 	// State and tag match - must match both
-	res, err = suite.txStorage.ListJobs(mustBuildQuery(t,
+	res, err = suite.txStorage.ListJobs(ctx, mustBuildQuery(t,
 		storage.QueryJobStates(job.JobStateCompleted, job.JobStateFailed, job.JobStatePaused),
 		storage.QueryJobTags("tests", "foo"),
 	))
diff --git a/tests/integ/jobmanager/common.go b/tests/integ/jobmanager/common.go
index 05f7488f..0d548037 100644
--- a/tests/integ/jobmanager/common.go
+++ b/tests/integ/jobmanager/common.go
@@ -83,6 +83,7 @@ type TestListener struct {
 // Serve implements the main logic of a dummy listener which talks to the API
 // layer to trigger actions in the JobManager
 func (tl *TestListener) Serve(ctx xcontext.Context, contestApi *api.API) error {
+	ctx.Logger().Debugf("Serving mock listener")
 	for {
 		select {
 		case command := <-tl.commandCh:
@@ -114,7 +115,7 @@ func (tl *TestListener) Serve(ctx xcontext.Context, contestApi *api.API) error {
 			default:
 				return nil
 			}
-		case <-ctx.Done():
+		case <-ctx.WaitFor():
 			return nil
 		}
 	}
@@ -130,7 +131,7 @@ func pollForEvent(eventManager frameworkevent.EmitterFetcher, ev event.Name, job
 				frameworkevent.QueryJobID(jobID),
 				frameworkevent.QueryEventName(ev),
 			}
-			ev, err := eventManager.Fetch(queryFields...)
+			ev, err := eventManager.Fetch(ctx, queryFields...)
 			if err != nil {
 				return nil, err
 			}
@@ -346,7 +347,7 @@ func (suite *TestJobManagerSuite) testExit(
 	}
 
 	// JobManager will emit a paused or cancelled event when the job completes
-	ev, err = suite.eventManager.Fetch(
+	ev, err = suite.eventManager.Fetch(ctx,
 		frameworkevent.QueryJobID(jobID),
 		frameworkevent.QueryEventName(expectedEvent),
 	)
@@ -368,10 +369,10 @@ func (suite *TestJobManagerSuite) TestJobManagerJobStartSingle() {
 	jobID, err := suite.startJob(jobDescriptorNoop)
 	require.NoError(suite.T(), err)
 
-	_, err = suite.jsm.GetJobRequest(types.JobID(jobID))
+	_, err = suite.jsm.GetJobRequest(ctx, types.JobID(jobID))
 	require.NoError(suite.T(), err)
 
-	r, err := suite.jsm.GetJobRequest(types.JobID(jobID + 1))
+	r, err := suite.jsm.GetJobRequest(ctx, types.JobID(jobID + 1))
 	require.Error(suite.T(), err)
 	require.NotEqual(suite.T(), nil, r)
 
@@ -398,14 +399,14 @@ func (suite *TestJobManagerSuite) TestJobManagerJobReport() {
 	require.Equal(suite.T(), 1, len(ev))
 
 	// A Report must be persisted for the Job
-	jobReport, err := suite.jsm.GetJobReport(types.JobID(jobID))
+	jobReport, err := suite.jsm.GetJobReport(ctx, types.JobID(jobID))
 	require.NoError(suite.T(), err)
 	require.Equal(suite.T(), 1, len(jobReport.RunReports))
 	require.Equal(suite.T(), 0, len(jobReport.FinalReports))
 
 	// Any other Job should not have a Job report, but fetching the
 	// report should not error out
-	jobReport, err = suite.jsm.GetJobReport(types.JobID(2))
+	jobReport, err = suite.jsm.GetJobReport(ctx, types.JobID(2))
 	require.NoError(suite.T(), err)
 	require.Equal(suite.T(), &job.JobReport{JobID: 2}, jobReport)
 }
@@ -453,7 +454,7 @@ func (suite *TestJobManagerSuite) TestJobManagerJobNotSuccessful() {
 	require.NoError(suite.T(), err)
 	require.Equal(suite.T(), 1, len(ev))
 
-	jobReport, err := suite.jsm.GetJobReport(types.JobID(jobID))
+	jobReport, err := suite.jsm.GetJobReport(ctx, types.JobID(jobID))
 	require.NoError(suite.T(), err)
 	require.Equal(suite.T(), 1, len(jobReport.RunReports))
 	require.Equal(suite.T(), 0, len(jobReport.FinalReports))
@@ -471,7 +472,7 @@ func (suite *TestJobManagerSuite) TestJobManagerJobFailure() {
 	require.NoError(suite.T(), err)
 	require.Equal(suite.T(), 1, len(ev))
 
-	jobReport, err := suite.jsm.GetJobReport(types.JobID(jobID))
+	jobReport, err := suite.jsm.GetJobReport(ctx, types.JobID(jobID))
 	require.NoError(suite.T(), err)
 	require.Equal(suite.T(), 1, len(jobReport.RunReports))
 	require.Equal(suite.T(), 0, len(jobReport.FinalReports))
@@ -489,7 +490,7 @@ func (suite *TestJobManagerSuite) TestJobManagerJobCrash() {
 	require.NoError(suite.T(), err)
 	require.Equal(suite.T(), 1, len(ev))
 	require.Contains(suite.T(), string(*ev[0].Payload), "TestStep crashed")
-	jobReport, err := suite.jsm.GetJobReport(types.JobID(jobID))
+	jobReport, err := suite.jsm.GetJobReport(ctx, types.JobID(jobID))
 
 	require.NoError(suite.T(), err)
 	// no reports are expected if the job crashes
@@ -605,7 +606,7 @@ func (suite *TestJobManagerSuite) TestJobManagerDifferentInstances() {
 		jobID, err = suite.startJob(jobDescriptorNoop)
 		require.NoError(suite.T(), err)
 
-		_, err = suite.jsm.GetJobRequest(types.JobID(jobID))
+		_, err = suite.jsm.GetJobRequest(ctx, types.JobID(jobID))
 		require.NoError(suite.T(), err)
 
 		ev, err := pollForEvent(suite.eventManager, job.EventJobCompleted, jobID, 1*time.Second)
diff --git a/tests/integ/test/testrunner_test.go b/tests/integ/test/testrunner_test.go
index 36153341..78dde750 100644
--- a/tests/integ/test/testrunner_test.go
+++ b/tests/integ/test/testrunner_test.go
@@ -13,10 +13,11 @@ import (
 	"testing"
 	"time"
 
-	"github.com/stretchr/testify/assert"
+	"github.com/facebookincubator/contest/pkg/xcontext"
+	"github.com/facebookincubator/contest/pkg/xcontext/bundles/logrusctx"
+	"github.com/facebookincubator/contest/pkg/xcontext/logger"
 	"github.com/stretchr/testify/require"
 
-	"github.com/facebookincubator/contest/pkg/cerrors"
 	"github.com/facebookincubator/contest/pkg/event"
 	"github.com/facebookincubator/contest/pkg/pluginregistry"
 	"github.com/facebookincubator/contest/pkg/runner"
@@ -24,7 +25,6 @@ import (
 	"github.com/facebookincubator/contest/pkg/target"
 	"github.com/facebookincubator/contest/pkg/test"
 	"github.com/facebookincubator/contest/pkg/types"
-	"github.com/facebookincubator/contest/pkg/xcontext"
 	"github.com/facebookincubator/contest/plugins/storage/memory"
 	"github.com/facebookincubator/contest/plugins/teststeps/cmd"
 	"github.com/facebookincubator/contest/plugins/teststeps/echo"
@@ -43,6 +43,10 @@ var (
 	successTimeout = 5 * time.Second
 )
 
+var (
+	ctx = logrusctx.NewContext(logger.LevelDebug)
+)
+
 var testSteps = map[string]test.TestStepFactory{
 	echo.Name:      echo.New,
 	example.Name:   example.New,
@@ -69,7 +73,7 @@ var testStepsEvents = map[string][]event.Name{
 
 func TestMain(m *testing.M) {
 
-	pluginRegistry = pluginregistry.NewPluginRegistry()
+	pluginRegistry = pluginregistry.NewPluginRegistry(ctx)
 	// Setup the PluginRegistry by registering TestSteps
 	for name, tsfactory := range testSteps {
 		if _, ok := testStepsEvents[name]; !ok {
@@ -127,7 +131,7 @@ func TestSuccessfulCompletion(t *testing.T) {
 
 	go func() {
 		tr := runner.NewTestRunner()
-		err := tr.Run(stateCtx, &test.Test{TestStepsBundles: testSteps}, targets, jobID, runID)
+		_, err := tr.Run(stateCtx, &test.Test{TestStepsBundles: testSteps}, targets, jobID, runID, nil)
 		errCh <- err
 	}()
 	select {
@@ -138,180 +142,6 @@ func TestSuccessfulCompletion(t *testing.T) {
 	}
 }
 
-func TestPanicStep(t *testing.T) {
-
-	jobID := types.JobID(1)
-	runID := types.RunID(1)
-
-	ts1, err := pluginRegistry.NewTestStep("Panic")
-	require.NoError(t, err)
-	ts2, err := pluginRegistry.NewTestStep("Example")
-	require.NoError(t, err)
-
-	params := make(test.TestStepParameters)
-	testSteps := []test.TestStepBundle{
-		test.TestStepBundle{TestStep: ts1, TestStepLabel: "StageOne", Parameters: params},
-		test.TestStepBundle{TestStep: ts2, TestStepLabel: "StageTwo", Parameters: params},
-	}
-
-	errCh := make(chan error, 1)
-	stateCtx := xcontext.Background()
-
-	go func() {
-		tr := runner.NewTestRunner()
-		err := tr.Run(stateCtx, &test.Test{TestStepsBundles: testSteps}, targets, jobID, runID)
-		errCh <- err
-	}()
-	select {
-	case err = <-errCh:
-		require.Error(t, err)
-	case <-time.After(successTimeout):
-		t.Errorf("test should return within timeout: %+v", successTimeout)
-	}
-}
-
-func TestNoReturnStepWithCorrectTargetForwarding(t *testing.T) {
-
-	jobID := types.JobID(1)
-	runID := types.RunID(1)
-
-	ts1, err := pluginRegistry.NewTestStep("NoReturn")
-	require.NoError(t, err)
-	ts2, err := pluginRegistry.NewTestStep("Example")
-	require.NoError(t, err)
-
-	params := make(test.TestStepParameters)
-	testSteps := []test.TestStepBundle{
-		test.TestStepBundle{TestStep: ts1, Parameters: params, TestStepLabel: "NoReturn"},
-		test.TestStepBundle{TestStep: ts2, Parameters: params, TestStepLabel: "Example"},
-	}
-
-	stateCtx := xcontext.Background()
-	errCh := make(chan error, 1)
-
-	timeouts := runner.TestRunnerTimeouts{
-		StepInjectTimeout:   30 * time.Second,
-		MessageTimeout:      5 * time.Second,
-		ShutdownTimeout:     1 * time.Second,
-		StepShutdownTimeout: 1 * time.Second,
-	}
-	go func() {
-		tr := runner.NewTestRunnerWithTimeouts(timeouts)
-		err := tr.Run(stateCtx, &test.Test{TestStepsBundles: testSteps}, targets, jobID, runID)
-		errCh <- err
-	}()
-	select {
-	case err = <-errCh:
-		require.Error(t, err)
-		if _, ok := err.(*cerrors.ErrTestStepsNeverReturned); !ok {
-			errString := fmt.Sprintf("Error returned by TestRunner should be of type ErrTestStepsNeverReturned: %v", err)
-			assert.FailNow(t, errString)
-		}
-		assert.NotNil(t, err.(*cerrors.ErrTestStepsNeverReturned))
-	case <-time.After(successTimeout):
-		t.Errorf("test should return within timeout: %+v", successTimeout)
-	}
-}
-
-func TestNoReturnStepWithoutTargetForwarding(t *testing.T) {
-
-	jobID := types.JobID(1)
-	runID := types.RunID(1)
-
-	ts1, err := pluginRegistry.NewTestStep("Hanging")
-	require.NoError(t, err)
-	ts2, err := pluginRegistry.NewTestStep("Example")
-	require.NoError(t, err)
-
-	params := make(test.TestStepParameters)
-	testSteps := []test.TestStepBundle{
-		test.TestStepBundle{TestStep: ts1, TestStepLabel: "StageOne", Parameters: params},
-		test.TestStepBundle{TestStep: ts2, TestStepLabel: "StageTwo", Parameters: params},
-	}
-
-	stateCtx, cancel := xcontext.WithCancel(nil)
-	errCh := make(chan error, 1)
-
-	var (
-		StepInjectTimeout   = 30 * time.Second
-		MessageTimeout      = 5 * time.Second
-		ShutdownTimeout     = 1 * time.Second
-		StepShutdownTimeout = 1 * time.Second
-	)
-	timeouts := runner.TestRunnerTimeouts{
-		StepInjectTimeout:   StepInjectTimeout,
-		MessageTimeout:      MessageTimeout,
-		ShutdownTimeout:     ShutdownTimeout,
-		StepShutdownTimeout: StepShutdownTimeout,
-	}
-
-	go func() {
-		tr := runner.NewTestRunnerWithTimeouts(timeouts)
-		err := tr.Run(stateCtx, &test.Test{TestStepsBundles: testSteps}, targets, jobID, runID)
-		errCh <- err
-	}()
-
-	testTimeout := 2 * time.Second
-	testShutdownTimeout := ShutdownTimeout + StepShutdownTimeout + 2
-	select {
-	case err = <-errCh:
-		// The TestRunner should never return and should instead time out
-		assert.FailNow(t, "TestRunner should not return, received an error instead: %v", err)
-	case <-time.After(testTimeout):
-		// Assert cancellation signal and expect the TestRunner to return within
-		// testShutdownTimeout
-		cancel()
-		select {
-		case err = <-errCh:
-			// The test timed out, which is an error from the perspective of the JobManager
-			require.Error(t, err)
-			if _, ok := err.(*cerrors.ErrTestStepsNeverReturned); !ok {
-				errString := fmt.Sprintf("Error returned by TestRunner should be of type ErrTestStepsNeverReturned: %v", err)
-				assert.FailNow(t, errString)
-			}
-		case <-time.After(testShutdownTimeout):
-			assert.FailNow(t, "TestRunner should return after cancellation before timeout")
-		}
-	}
-}
-
-func TestStepClosesChannels(t *testing.T) {
-
-	jobID := types.JobID(1)
-	runID := types.RunID(1)
-
-	ts1, err := pluginRegistry.NewTestStep("Channels")
-	require.NoError(t, err)
-	ts2, err := pluginRegistry.NewTestStep("Example")
-	require.NoError(t, err)
-
-	params := make(test.TestStepParameters)
-	testSteps := []test.TestStepBundle{
-		test.TestStepBundle{TestStep: ts1, TestStepLabel: "StageOne", Parameters: params},
-		test.TestStepBundle{TestStep: ts2, TestStepLabel: "StageTwo", Parameters: params},
-	}
-
-	stateCtx := xcontext.Background()
-	errCh := make(chan error, 1)
-
-	go func() {
-		tr := runner.NewTestRunner()
-		err := tr.Run(stateCtx, &test.Test{TestStepsBundles: testSteps}, targets, jobID, runID)
-		errCh <- err
-	}()
-
-	testTimeout := 2 * time.Second
-	select {
-	case err = <-errCh:
-		if _, ok := err.(*cerrors.ErrTestStepClosedChannels); !ok {
-			errString := fmt.Sprintf("Error returned by TestRunner should be of type ErrTestStepClosedChannels, got %T(%v)", err, err)
-			assert.FailNow(t, errString)
-		}
-	case <-time.After(testTimeout):
-		assert.FailNow(t, "TestRunner should not time out")
-	}
-}
-
 func TestCmdPlugin(t *testing.T) {
 
 	jobID := types.JobID(1)
@@ -337,7 +167,7 @@ func TestCmdPlugin(t *testing.T) {
 
 	go func() {
 		tr := runner.NewTestRunner()
-		err := tr.Run(stateCtx, &test.Test{TestStepsBundles: testSteps}, targets, jobID, runID)
+		_, err := tr.Run(stateCtx, &test.Test{TestStepsBundles: testSteps}, targets, jobID, runID, nil)
 		errCh <- err
 	}()
 
@@ -352,37 +182,3 @@ func TestCmdPlugin(t *testing.T) {
 		t.Errorf("test should return within timeout: %+v", successTimeout)
 	}
 }
-
-func TestNoRunTestStepIfNoTargets(t *testing.T) {
-
-	jobID := types.JobID(1)
-	runID := types.RunID(1)
-
-	ts1, err := pluginRegistry.NewTestStep("Fail")
-	require.NoError(t, err)
-	ts2, err := pluginRegistry.NewTestStep("Crash")
-	require.NoError(t, err)
-
-	params := make(test.TestStepParameters)
-	testSteps := []test.TestStepBundle{
-		{TestStep: ts1, TestStepLabel: "StageOne", Parameters: params},
-		{TestStep: ts2, TestStepLabel: "StageTwo", Parameters: params},
-	}
-
-	stateCtx := xcontext.Background()
-	errCh := make(chan error, 1)
-
-	go func() {
-		tr := runner.NewTestRunner()
-		err := tr.Run(stateCtx, &test.Test{TestStepsBundles: testSteps}, targets, jobID, runID)
-		errCh <- err
-	}()
-
-	testTimeout := 2 * time.Second
-	select {
-	case err = <-errCh:
-		assert.Nil(t, err, "the Crash TestStep shouldn't be ran")
-	case <-time.After(testTimeout):
-		assert.FailNow(t, "TestRunner should not time out")
-	}
-}
diff --git a/tests/plugins/teststeps/badtargets/badtargets.go b/tests/plugins/teststeps/badtargets/badtargets.go
new file mode 100644
index 00000000..e7fa2e93
--- /dev/null
+++ b/tests/plugins/teststeps/badtargets/badtargets.go
@@ -0,0 +1,111 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+package badtargets
+
+import (
+	"fmt"
+
+	"github.com/facebookincubator/contest/pkg/cerrors"
+	"github.com/facebookincubator/contest/pkg/event"
+	"github.com/facebookincubator/contest/pkg/event/testevent"
+	"github.com/facebookincubator/contest/pkg/target"
+	"github.com/facebookincubator/contest/pkg/test"
+	"github.com/facebookincubator/contest/pkg/xcontext"
+)
+
+// Name is the name used to look this plugin up.
+const Name = "BadTargets"
+
+// Events defines the events that a TestStep is allow to emit
+var Events = []event.Name{}
+
+type badTargets struct {
+}
+
+// Name returns the name of the Step
+func (ts *badTargets) Name() string {
+	return Name
+}
+
+// Run executes a step that messes up the flow of targets.
+func (ts *badTargets) Run(ctx xcontext.Context, ch test.TestStepChannels, params test.TestStepParameters, ev testevent.Emitter) error {
+	for {
+		select {
+		case tgt, ok := <-ch.In:
+			if !ok {
+				return nil
+			}
+			switch tgt.ID {
+			case "TDrop":
+				// ... crickets ...
+			case "TGood":
+				// We should not depend on pointer matching, so emit a copy.
+				tgt2 := *tgt
+				select {
+				case ch.Out <- &tgt2:
+				case <-ctx.Done():
+					return xcontext.Canceled
+				}
+			case "TDup":
+				select {
+				case ch.Out <- tgt:
+				case <-ctx.Done():
+					return xcontext.Canceled
+				}
+				select {
+				case ch.Out <- tgt:
+				case <-ctx.Done():
+					return xcontext.Canceled
+				}
+			case "TExtra":
+				tgt2 := &target.Target{ID: "TExtra2"}
+				select {
+				case ch.Out <- tgt:
+				case <-ctx.Done():
+					return xcontext.Canceled
+				}
+				select {
+				case ch.Out <- tgt2:
+				case <-ctx.Done():
+					return xcontext.Canceled
+				}
+			case "T1":
+				// Mangle the returned target name.
+				tgt2 := &target.Target{ID: tgt.ID + "XXX"}
+				select {
+				case ch.Out <- tgt2:
+				case <-ctx.Done():
+					return xcontext.Canceled
+				}
+			default:
+				return fmt.Errorf("Unexpected target name: %q", tgt.ID)
+			}
+		case <-ctx.Done():
+			return xcontext.Canceled
+		}
+	}
+}
+
+// ValidateParameters validates the parameters associated to the TestStep
+func (ts *badTargets) ValidateParameters(ctx xcontext.Context, params test.TestStepParameters) error {
+	return nil
+}
+
+// Resume tries to resume a previously interrupted test step. ExampleTestStep
+// cannot resume.
+func (ts *badTargets) Resume(ctx xcontext.Context, ch test.TestStepChannels, params test.TestStepParameters, ev testevent.EmitterFetcher) error {
+	return &cerrors.ErrResumeNotSupported{StepName: Name}
+}
+
+// CanResume tells whether this step is able to resume.
+func (ts *badTargets) CanResume() bool {
+	return false
+}
+
+// New creates a new badTargets step
+func New() test.TestStep {
+	return &badTargets{}
+}
diff --git a/tests/plugins/teststeps/channels/channels.go b/tests/plugins/teststeps/channels/channels.go
index 706e5bee..04879cc0 100644
--- a/tests/plugins/teststeps/channels/channels.go
+++ b/tests/plugins/teststeps/channels/channels.go
@@ -16,7 +16,7 @@ import (
 // Name is the name used to look this plugin up.
 var Name = "Channels"
 
-// Events defines the events that a TestStep is allow to emit
+// Events defines the events that a TestStep is allowed to emit
 var Events = []event.Name{}
 
 type channels struct {
@@ -27,12 +27,14 @@ func (ts *channels) Name() string {
 	return Name
 }
 
-// Run executes a step which does never return.s
+// Run executes a step that runs fine but closes its output channels on exit.
 func (ts *channels) Run(ctx xcontext.Context, ch test.TestStepChannels, params test.TestStepParameters, ev testevent.Emitter) error {
 	for target := range ch.In {
 		ch.Out <- target
 	}
+	// This is bad, do not do this.
 	close(ch.Out)
+	close(ch.Err)
 	return nil
 }
 
diff --git a/tests/plugins/teststeps/fail/fail.go b/tests/plugins/teststeps/fail/fail.go
index dcccefcc..544ed5f9 100644
--- a/tests/plugins/teststeps/fail/fail.go
+++ b/tests/plugins/teststeps/fail/fail.go
@@ -38,7 +38,7 @@ func (ts *fail) Run(ctx xcontext.Context, ch test.TestStepChannels, params test.
 				return nil
 			}
 			ch.Err <- cerrors.TargetError{Target: target, Err: fmt.Errorf("Integration test failure for %v", target)}
-		case <-ctx.WaitFor():
+		case <-ctx.Done():
 			return nil
 		}
 	}
diff --git a/tests/plugins/teststeps/hanging/hanging.go b/tests/plugins/teststeps/hanging/hanging.go
index 0c42e218..1b8ec769 100644
--- a/tests/plugins/teststeps/hanging/hanging.go
+++ b/tests/plugins/teststeps/hanging/hanging.go
@@ -27,7 +27,7 @@ func (ts *hanging) Name() string {
 	return Name
 }
 
-// Run executes a step which does never return.
+// Run executes a step that does not process any targets and never returns.
 func (ts *hanging) Run(ctx xcontext.Context, ch test.TestStepChannels, params test.TestStepParameters, ev testevent.Emitter) error {
 	channel := make(chan struct{})
 	<-channel
diff --git a/tests/plugins/teststeps/noreturn/noreturn.go b/tests/plugins/teststeps/noreturn/noreturn.go
index fb4b7bd8..af8d432c 100644
--- a/tests/plugins/teststeps/noreturn/noreturn.go
+++ b/tests/plugins/teststeps/noreturn/noreturn.go
@@ -27,7 +27,7 @@ func (ts *noreturnStep) Name() string {
 	return Name
 }
 
-// Run executes a step which does never return.
+// Run executes a step that never returns.
 func (ts *noreturnStep) Run(ctx xcontext.Context, ch test.TestStepChannels, params test.TestStepParameters, ev testevent.Emitter) error {
 	for target := range ch.In {
 		ch.Out <- target
diff --git a/tests/plugins/teststeps/teststep/teststep.go b/tests/plugins/teststeps/teststep/teststep.go
new file mode 100644
index 00000000..fece1df4
--- /dev/null
+++ b/tests/plugins/teststeps/teststep/teststep.go
@@ -0,0 +1,160 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+package teststep
+
+import (
+	"fmt"
+	"math/rand"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/facebookincubator/contest/pkg/cerrors"
+	"github.com/facebookincubator/contest/pkg/event"
+	"github.com/facebookincubator/contest/pkg/event/testevent"
+	"github.com/facebookincubator/contest/pkg/target"
+	"github.com/facebookincubator/contest/pkg/test"
+	"github.com/facebookincubator/contest/pkg/xcontext"
+	"github.com/facebookincubator/contest/plugins/teststeps"
+)
+
+var Name = "Test"
+
+const (
+	// A comma-delimited list of target IDs to fail on.
+	FailTargetsParam = "FailTargets"
+	// Alternatively, fail this percentage of targets at random.
+	FailPctParam = "FailPct"
+	// A comma-delimited list of target IDs to delay and by how much, ID=delay_ms.
+	DelayTargetsParam = "DelayTargets"
+)
+
+const (
+	StartedEvent      = event.Name("TestStartedEvent")
+	FinishedEvent     = event.Name("TestFinishedEvent")
+	FailedEvent       = event.Name("TestFailedEvent")
+	StepRunningEvent  = event.Name("TestStepRunningEvent")
+	StepFinishedEvent = event.Name("TestStepFinishedEvent")
+)
+
+var Events = []event.Name{StartedEvent, FinishedEvent, FailedEvent, StepRunningEvent, StepFinishedEvent}
+
+type Step struct {
+	failPct      int64
+	failTargets  map[string]bool
+	delayTargets map[string]time.Duration
+}
+
+// Name returns the name of the Step
+func (ts Step) Name() string {
+	return Name
+}
+
+func (ts *Step) shouldFail(t *target.Target, params test.TestStepParameters) bool {
+	if ts.failTargets[t.ID] {
+		return true
+	}
+	if ts.failPct > 0 {
+		roll := rand.Int63n(101)
+		return (roll <= ts.failPct)
+	}
+	return false
+}
+
+// Run executes the example step.
+func (ts *Step) Run(ctx xcontext.Context, ch test.TestStepChannels, params test.TestStepParameters, ev testevent.Emitter) error {
+	f := func(ctx xcontext.Context, target *target.Target) error {
+		// Sleep to ensure TargetIn fires first. This simplifies test assertions.
+		time.Sleep(20 * time.Millisecond)
+		if err := ev.Emit(ctx, testevent.Data{EventName: StartedEvent, Target: target, Payload: nil}); err != nil {
+			return fmt.Errorf("failed to emit start event: %v", err)
+		}
+		delay := ts.delayTargets[target.ID]
+		if delay == 0 {
+			delay = ts.delayTargets["*"]
+		}
+		select {
+		case <-time.After(delay):
+		case <-ctx.Done():
+			return xcontext.Canceled
+		}
+		if ts.shouldFail(target, params) {
+			if err := ev.Emit(ctx, testevent.Data{EventName: FailedEvent, Target: target, Payload: nil}); err != nil {
+				return fmt.Errorf("failed to emit finished event: %v", err)
+			}
+			return fmt.Errorf("target failed")
+		} else {
+			if err := ev.Emit(ctx, testevent.Data{EventName: FinishedEvent, Target: target, Payload: nil}); err != nil {
+				return fmt.Errorf("failed to emit failed event: %v", err)
+			}
+		}
+		return nil
+	}
+	if err := ev.Emit(ctx, testevent.Data{EventName: StepRunningEvent}); err != nil {
+		return fmt.Errorf("failed to emit failed event: %v", err)
+	}
+	res := teststeps.ForEachTarget(Name, ctx, ch, f)
+	if err := ev.Emit(ctx, testevent.Data{EventName: StepFinishedEvent}); err != nil {
+		return fmt.Errorf("failed to emit failed event: %v", err)
+	}
+	return res
+}
+
+// ValidateParameters validates the parameters associated to the TestStep
+func (ts *Step) ValidateParameters(_ xcontext.Context, params test.TestStepParameters) error {
+	targetsToFail := params.GetOne(FailTargetsParam).String()
+	if len(targetsToFail) > 0 {
+		for _, t := range strings.Split(targetsToFail, ",") {
+			ts.failTargets[t] = true
+		}
+	}
+	targetsToDelay := params.GetOne(DelayTargetsParam).String()
+	if len(targetsToDelay) > 0 {
+		for _, e := range strings.Split(targetsToDelay, ",") {
+			kv := strings.Split(e, "=")
+			if len(kv) != 2 {
+				continue
+			}
+			v, err := strconv.Atoi(kv[1])
+			if err != nil {
+				return fmt.Errorf("invalid FailTargets: %w", err)
+			}
+			ts.delayTargets[kv[0]] = time.Duration(v) * time.Millisecond
+		}
+	}
+	if params.GetOne(FailPctParam).String() != "" {
+		if pct, err := params.GetInt(FailPctParam); err == nil {
+			ts.failPct = pct
+		} else {
+			return fmt.Errorf("invalid FailPct: %w", err)
+		}
+	}
+	return nil
+}
+
+// Resume tries to resume a previously interrupted test step. TestTestStep
+// cannot resume.
+func (ts *Step) Resume(ctx xcontext.Context, ch test.TestStepChannels, _ test.TestStepParameters, ev testevent.EmitterFetcher) error {
+	return &cerrors.ErrResumeNotSupported{StepName: Name}
+}
+
+// CanResume tells whether this step is able to resume.
+func (ts *Step) CanResume() bool {
+	return false
+}
+
+// New initializes and returns a new TestStep.
+func New() test.TestStep {
+	return &Step{
+		failTargets:  make(map[string]bool),
+		delayTargets: make(map[string]time.Duration),
+	}
+}
+
+// Load returns the name, factory and events which are needed to register the step.
+func Load() (string, test.TestStepFactory, []event.Name) {
+	return Name, New, Events
+}