Skip to content

Commit

Permalink
feat(retries) Design and implement retries
Browse files Browse the repository at this point in the history
closes #221
  • Loading branch information
joseblas authored and tekton-robot committed May 6, 2019
1 parent cdda5d4 commit 295bf9c
Show file tree
Hide file tree
Showing 14 changed files with 861 additions and 212 deletions.
20 changes: 20 additions & 0 deletions docs/pipelines.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ This document defines `Pipelines` and their capabilities.
- [Pipeline Tasks](#pipeline-tasks)
- [From](#from)
- [RunAfter](#runafter)
- [Retries](#retries)
- [Ordering](#ordering)
- [Examples](#examples)

Expand Down Expand Up @@ -41,6 +42,7 @@ following fields:
- [`runAfter`](#runAfter) - Used when the [Pipeline Task](#pipeline-task)
should be executed after another Pipeline Task, but there is no
[output linking](#from) required
- [`retries`](#retries) - Used when the task is wanted to be executed if it fails. Could a network error or a missing dependency. It does not apply to cancellations.

[kubernetes-overview]:
https://kubernetes.io/docs/concepts/overview/working-with-objects/kubernetes-objects/#required-fields
Expand Down Expand Up @@ -255,6 +257,24 @@ is no output from `test-app`, so `build-app` uses `runAfter` to indicate that
`test-app` should run before it, regardless of the order they appear in the
spec.

#### retries

Sometimes is needed some policy for retrying tasks for various reasons such as network errors, missing dependencies or upload problems.
Any of those issue must be reflected as False (corev1.ConditionFalse) within the TaskRun Status Succeeded Condition.
For that reason there is an optional attribute called `retries` which declares how many times that task should be retried in case of failure,

By default and in its absence there are no retries; its value is 0.

```yaml
tasks:
- name: build-the-image
retries: 1
taskRef:
name: build-push
```

In this example, the task "build-the-image" will be executed and if the first run fails a second one would triggered. But, if that fails no more would triggered: a max of two executions.

## Ordering

The [Pipeline Tasks](#pipeline-tasks) in a `Pipeline` can be connected and run
Expand Down
4 changes: 4 additions & 0 deletions pkg/apis/pipeline/v1alpha1/pipeline_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ type PipelineTask struct {
Name string `json:"name,omitempty"`
TaskRef TaskRef `json:"taskRef"`

// Retries represents how many times this task should be retried in case of task failure: ConditionSucceeded set to False
// +optional
Retries int `json:"retries",omitempty`

// RunAfter is the list of PipelineTask names that should be executed before
// this Task executes. (Used to force a specific ordering in graph execution.)
// +optional
Expand Down
4 changes: 4 additions & 0 deletions pkg/apis/pipeline/v1alpha1/taskrun_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,10 @@ type TaskRunStatus struct {
// Steps describes the state of each build step container.
// +optional
Steps []StepState `json:"steps,omitempty"`
// RetriesStatus contains the history of TaskRunStatus in case of a retry in order to keep record of failures.
// All TaskRunStatus stored in RetriesStatus will have no date within the RetriesStatus as is redundant.
// +optional
RetriesStatus []TaskRunStatus `json:"retriesStatus,omitempty"`
}

// GetCondition returns the Condition matching the given type.
Expand Down
7 changes: 7 additions & 0 deletions pkg/apis/pipeline/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

45 changes: 39 additions & 6 deletions pkg/reconciler/v1alpha1/pipelinerun/pipelinerun.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,9 @@ func NewController(
// converge the two. It then updates the Status block of the Pipeline Run
// resource with the current status of the resource.
func (c *Reconciler) Reconcile(ctx context.Context, key string) error {

c.Logger.Infof("Reconciling %v", time.Now())

// Convert the namespace/name string into a distinct namespace and name
namespace, name, err := cache.SplitMetaNamespaceKey(key)
if err != nil {
Expand Down Expand Up @@ -275,6 +278,9 @@ func (c *Reconciler) reconcile(ctx context.Context, pr *v1alpha1.PipelineRun) er
func(name string) (v1alpha1.TaskInterface, error) {
return c.taskLister.Tasks(pr.Namespace).Get(name)
},
func(name string) (*v1alpha1.TaskRun, error) {
return c.taskRunLister.TaskRuns(pr.Namespace).Get(name)
},
func(name string) (v1alpha1.TaskInterface, error) {
return c.clusterTaskLister.Get(name)
},
Expand Down Expand Up @@ -311,6 +317,13 @@ func (c *Reconciler) reconcile(ctx context.Context, pr *v1alpha1.PipelineRun) er
}
return nil
}

if pipelineState.IsDone() && pr.IsDone() {
c.timeoutHandler.Release(pr)
c.Recorder.Event(pr, corev1.EventTypeNormal, eventReasonSucceeded, "PipelineRun completed successfully.")
return nil
}

if err := resources.ValidateFrom(pipelineState); err != nil {
// This Run has failed, so we need to mark it as failed and stop reconciling it
pr.Status.SetCondition(&apis.Condition{
Expand Down Expand Up @@ -338,11 +351,6 @@ func (c *Reconciler) reconcile(ctx context.Context, pr *v1alpha1.PipelineRun) er
}
}

err = resources.ResolveTaskRuns(c.taskRunLister.TaskRuns(pr.Namespace).Get, pipelineState)
if err != nil {
return fmt.Errorf("Error getting TaskRuns for Pipeline %s: %s", p.Name, err)
}

// If the pipelinerun is cancelled, cancel tasks and update status
if pr.IsCancelled() {
return cancelPipelineRun(pr, pipelineState, c.PipelineClientSet)
Expand Down Expand Up @@ -438,7 +446,19 @@ func (c *Reconciler) createTaskRun(logger *zap.SugaredLogger, rprt *resources.Re
}
labels[pipeline.GroupName+pipeline.PipelineRunLabelKey] = pr.Name

tr := &v1alpha1.TaskRun{
tr, _ := c.taskRunLister.TaskRuns(pr.Namespace).Get(rprt.TaskRunName)

if tr != nil {
//is a retry
addRetryHistory(tr)
clearStatus(tr)
tr.Status.SetCondition(&apis.Condition{
Type: apis.ConditionSucceeded,
Status: corev1.ConditionUnknown,
})
return c.PipelineClientSet.TektonV1alpha1().TaskRuns(pr.Namespace).UpdateStatus(tr)
}
tr = &v1alpha1.TaskRun{
ObjectMeta: metav1.ObjectMeta{
Name: rprt.TaskRunName,
Namespace: pr.Namespace,
Expand All @@ -464,6 +484,19 @@ func (c *Reconciler) createTaskRun(logger *zap.SugaredLogger, rprt *resources.Re
return c.PipelineClientSet.TektonV1alpha1().TaskRuns(pr.Namespace).Create(tr)
}

func addRetryHistory(tr *v1alpha1.TaskRun) {
newStatus := *tr.Status.DeepCopy()
newStatus.RetriesStatus = nil
tr.Status.RetriesStatus = append(tr.Status.RetriesStatus, newStatus)
}

func clearStatus(tr *v1alpha1.TaskRun) {
tr.Status.StartTime = nil
tr.Status.CompletionTime = nil
tr.Status.Results = nil
tr.Status.PodName = ""
}

func (c *Reconciler) updateStatus(pr *v1alpha1.PipelineRun) (*v1alpha1.PipelineRun, error) {
newPr, err := c.pipelineRunLister.PipelineRuns(pr.Namespace).Get(pr.Name)
if err != nil {
Expand Down
158 changes: 158 additions & 0 deletions pkg/reconciler/v1alpha1/pipelinerun/pipelinerun_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ package pipelinerun

import (
"context"
"fmt"
"strings"
"testing"
"time"
Expand Down Expand Up @@ -512,6 +513,15 @@ func TestReconcileOnCompletedPipelineRun(t *testing.T) {
if actual == nil {
t.Errorf("Expected a PipelineRun to be updated, but it wasn't.")
}
actions := clients.Pipeline.Actions()
for _, action := range actions {
if action != nil {
resource := action.GetResource().Resource
if resource != "pipelineruns" {
t.Fatalf("Expected client to not have created a TaskRun for the completed PipelineRun, but it did")
}
}
}

// Check that the PipelineRun was reconciled correctly
reconciledRun, err := clients.Pipeline.Tekton().PipelineRuns("foo").Get("test-pipeline-run-completed", metav1.GetOptions{})
Expand Down Expand Up @@ -679,6 +689,55 @@ func TestReconcileWithTimeout(t *testing.T) {
t.Errorf("TaskRun timeout %s should be less than or equal to PipelineRun timeout %s", actual.Spec.Timeout.Duration.String(), prs[0].Spec.Timeout.Duration.String())
}
}
func TestReconcileCancelledPipelineRun(t *testing.T) {
ps := []*v1alpha1.Pipeline{tb.Pipeline("test-pipeline", "foo", tb.PipelineSpec(
tb.PipelineTask("hello-world-1", "hello-world", tb.Retries(1)),
))}
prs := []*v1alpha1.PipelineRun{tb.PipelineRun("test-pipeline-run-with-timeout", "foo",
tb.PipelineRunSpec("test-pipeline",
tb.PipelineRunCancelled,
),
)}
ts := []*v1alpha1.Task{tb.Task("hello-world", "foo")}

d := test.Data{
PipelineRuns: prs,
Pipelines: ps,
Tasks: ts,
}

// create fake recorder for testing
fr := record.NewFakeRecorder(2)

testAssets := getPipelineRunController(d, fr)
c := testAssets.Controller
clients := testAssets.Clients

err := c.Reconciler.Reconcile(context.Background(), "foo/test-pipeline-run-with-timeout")
if err != nil {
t.Errorf("Did not expect to see error when reconciling completed PipelineRun but saw %s", err)
}

// Check that the PipelineRun was reconciled correctly
reconciledRun, err := clients.Pipeline.Tekton().PipelineRuns("foo").Get("test-pipeline-run-with-timeout", metav1.GetOptions{})
if err != nil {
t.Fatalf("Somehow had error getting completed reconciled run out of fake client: %s", err)
}

// The PipelineRun should be still cancelled.
if reconciledRun.Status.GetCondition(apis.ConditionSucceeded).Reason != "PipelineRunCancelled" {
t.Errorf("Expected PipelineRun to be cancelled, but condition reason is %s", reconciledRun.Status.GetCondition(apis.ConditionSucceeded))
}

// Check that no TaskRun is created or run
actions := clients.Pipeline.Actions()
for _, action := range actions {
actionType := fmt.Sprintf("%T", action)
if !(actionType == "testing.UpdateActionImpl" || actionType == "testing.GetActionImpl") {
t.Errorf("Expected a TaskRun to be get/updated, but it was %s", actionType)
}
}
}

func TestReconcilePropagateLabels(t *testing.T) {
names.TestingSeed()
Expand Down Expand Up @@ -742,3 +801,102 @@ func TestReconcilePropagateLabels(t *testing.T) {
t.Errorf("expected to see TaskRun %v created. Diff %s", expectedTaskRun, d)
}
}

func TestReconcileWithTimeoutAndRetry(t *testing.T) {

tcs := []struct {
name string
retries int
conditionSucceeded corev1.ConditionStatus
}{
{
name: "One try has to be done",
retries: 1,
conditionSucceeded: corev1.ConditionFalse,
},
{
name: "No more retries are needed",
retries: 2,
conditionSucceeded: corev1.ConditionUnknown,
},
}

for _, tc := range tcs {

t.Run(tc.name, func(t *testing.T) {
ps := []*v1alpha1.Pipeline{tb.Pipeline("test-pipeline-retry", "foo", tb.PipelineSpec(
tb.PipelineTask("hello-world-1", "hello-world", tb.Retries(tc.retries)),
))}
prs := []*v1alpha1.PipelineRun{tb.PipelineRun("test-pipeline-retry-run-with-timeout", "foo",
tb.PipelineRunSpec("test-pipeline-retry",
tb.PipelineRunServiceAccount("test-sa"),
tb.PipelineRunTimeout(&metav1.Duration{Duration: 12 * time.Hour}),
),
tb.PipelineRunStatus(
tb.PipelineRunStartTime(time.Now().AddDate(0, 0, -1))),
)}

ts := []*v1alpha1.Task{
tb.Task("hello-world", "foo"),
}
trs := []*v1alpha1.TaskRun{
tb.TaskRun("hello-world-1", "foo",
tb.TaskRunStatus(
tb.PodName("my-pod-name"),
tb.Condition(apis.Condition{
Type: apis.ConditionSucceeded,
Status: corev1.ConditionFalse,
}),
tb.Retry(v1alpha1.TaskRunStatus{
Status: duckv1beta1.Status{
Conditions: []apis.Condition{{
Type: apis.ConditionSucceeded,
Status: corev1.ConditionFalse,
}},
},
}),
)),
}

prtrs := &v1alpha1.PipelineRunTaskRunStatus{
PipelineTaskName: "hello-world-1",
Status: &trs[0].Status,
}
prs[0].Status.TaskRuns = make(map[string]*v1alpha1.PipelineRunTaskRunStatus)
prs[0].Status.TaskRuns["hello-world-1"] = prtrs

d := test.Data{
PipelineRuns: prs,
Pipelines: ps,
Tasks: ts,
TaskRuns: trs,
}

fr := record.NewFakeRecorder(2)

testAssets := getPipelineRunController(d, fr)
c := testAssets.Controller
clients := testAssets.Clients

err := c.Reconciler.Reconcile(context.Background(), "foo/test-pipeline-retry-run-with-timeout")
if err != nil {
t.Errorf("Did not expect to see error when reconciling completed PipelineRun but saw %s", err)
}

// Check that the PipelineRun was reconciled correctly
reconciledRun, err := clients.Pipeline.TektonV1alpha1().PipelineRuns("foo").Get("test-pipeline-retry-run-with-timeout", metav1.GetOptions{})
if err != nil {
t.Fatalf("Somehow had error getting completed reconciled run out of fake client: %s", err)
}

if len(reconciledRun.Status.TaskRuns["hello-world-1"].Status.RetriesStatus) != tc.retries {
t.Fatalf(" %d retry expected but %d ", tc.retries, len(reconciledRun.Status.TaskRuns["hello-world-1"].Status.RetriesStatus))
}

if status := reconciledRun.Status.TaskRuns["hello-world-1"].Status.GetCondition(apis.ConditionSucceeded).Status; status != tc.conditionSucceeded {
t.Fatalf("Succedded expected to be %s but is %s", tc.conditionSucceeded, status)
}

})
}
}
Loading

0 comments on commit 295bf9c

Please sign in to comment.