Skip to content

Commit

Permalink
orchestrator: Flag tasks that shouldn't be restarted
Browse files Browse the repository at this point in the history
Previously, restart conditions other than "OnAny" were honored on a
best-effort basis. A service-level reconciliation, for example after a
leader election, would see that not enough tasks were running, and start
replacement tasks regardless of the restart policy. This limited the
usefulness of the other restart conditions.

This change adds a DontRestart flag to Task. It can be set by the
restart supervisor when it shuts down a task and decides not to start a
replacement task. The orchestrators look for the presence of this flag
and honor it when doing service-level reconciliation. If the flag is
set, the dead task is passed to the updater along with the running
tasks, so the updater can start a replacement if and only if the service
definition has changed relative to the dead task.

The task reaper has been modified so it will never delete the last task
in a slot, if that task has the DontRestart flag set.

Signed-off-by: Aaron Lehmann <[email protected]>
  • Loading branch information
aaronlehmann committed Jul 21, 2017
1 parent 2cb8399 commit 2d279a2
Show file tree
Hide file tree
Showing 10 changed files with 380 additions and 204 deletions.
232 changes: 137 additions & 95 deletions api/objects.pb.go

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions api/objects.proto
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,12 @@ message Task {
// is only written by the manager.
TaskState desired_state = 10;

// DontRestart indicates that the restart supervisor decided not to
// start a replacement task for this task. This flag records the
// decision so that orchestrators can honor it when they do
// service-level reconciliation.
bool dont_restart = 16;

// List of network attachments by the task.
repeated NetworkAttachment networks = 11;

Expand Down
9 changes: 6 additions & 3 deletions manager/orchestrator/global/global.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,8 +265,11 @@ func (g *Orchestrator) reconcileServices(ctx context.Context, serviceIDs []strin
nodeTasks[serviceID] = make(map[string][]*api.Task)

for _, t := range tasks {
if t.DesiredState <= api.TaskStateRunning {
// Collect all running instances of this service
if t.DesiredState <= api.TaskStateRunning || t.DontRestart {
// Collect all runnable instances of this service,
// and instances that were not be restarted due
// to restart policy but may be updated if the
// service spec changed.
nodeTasks[serviceID][t.NodeID] = append(nodeTasks[serviceID][t.NodeID], t)
}
}
Expand Down Expand Up @@ -405,7 +408,7 @@ func (g *Orchestrator) reconcileOneNode(ctx context.Context, node *api.Node) {
if t.ServiceID != serviceID {
continue
}
if t.DesiredState <= api.TaskStateRunning {
if t.DesiredState <= api.TaskStateRunning || t.DontRestart {
tasks[serviceID] = append(tasks[serviceID], t)
}
}
Expand Down
28 changes: 21 additions & 7 deletions manager/orchestrator/global/global_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ func TestNodeAvailability(t *testing.T) {
testutils.Expect(t, watch, state.EventCommit{})

// updating the service shouldn't restart the task
updateService(t, store, service1)
updateService(t, store, service1, true)
testutils.Expect(t, watch, api.EventUpdateService{})
testutils.Expect(t, watch, state.EventCommit{})
select {
Expand Down Expand Up @@ -241,7 +241,7 @@ func TestNodeAvailability(t *testing.T) {
testutils.Expect(t, watch, state.EventCommit{})

// updating the service shouldn't restart the task
updateService(t, store, service1)
updateService(t, store, service1, true)
testutils.Expect(t, watch, api.EventUpdateService{})
testutils.Expect(t, watch, state.EventCommit{})
select {
Expand Down Expand Up @@ -277,7 +277,7 @@ func TestNodeState(t *testing.T) {
testutils.Expect(t, watch, state.EventCommit{})

// updating the service shouldn't restart the task
updateService(t, store, service1)
updateService(t, store, service1, true)
testutils.Expect(t, watch, api.EventUpdateService{})
testutils.Expect(t, watch, state.EventCommit{})
select {
Expand Down Expand Up @@ -425,8 +425,20 @@ func TestTaskFailure(t *testing.T) {
case <-time.After(100 * time.Millisecond):
}

// update the service. now the task should be recreated.
updateService(t, store, serviceNoRestart)
// update the service with no spec changes, to trigger a
// reconciliation. the task should still not be updated.
updateService(t, store, serviceNoRestart, false)
testutils.Expect(t, watch, api.EventUpdateService{})
testutils.Expect(t, watch, state.EventCommit{})

select {
case event := <-watch:
t.Fatalf("got unexpected event %T: %+v", event, event)
case <-time.After(100 * time.Millisecond):
}

// update the service with spec changes. now the task should be recreated.
updateService(t, store, serviceNoRestart, true)
testutils.Expect(t, watch, api.EventUpdateService{})
testutils.Expect(t, watch, state.EventCommit{})

Expand All @@ -444,11 +456,13 @@ func addService(t *testing.T, s *store.MemoryStore, service *api.Service) {
})
}

func updateService(t *testing.T, s *store.MemoryStore, service *api.Service) {
func updateService(t *testing.T, s *store.MemoryStore, service *api.Service, force bool) {
s.Update(func(tx store.Tx) error {
service := store.GetService(tx, service.ID)
require.NotNil(t, service)
service.Spec.Task.ForceUpdate++
if force {
service.Spec.Task.ForceUpdate++
}
assert.NoError(t, store.UpdateService(tx, service))
return nil
})
Expand Down
Loading

0 comments on commit 2d279a2

Please sign in to comment.