Skip to content

Commit

Permalink
feat: support USER_CANCELED exited reason (#637)
Browse files Browse the repository at this point in the history
  • Loading branch information
brainhart authored Jun 3, 2020
1 parent d1146d3 commit 4bad652
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 15 deletions.
20 changes: 11 additions & 9 deletions master/internal/trial.go
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@ func (t *trial) processAssigned(ctx *actor.Context, msg scheduler.Assigned) erro
}

func (t *trial) processCompletedWorkload(ctx *actor.Context, msg searcher.CompletedMessage) error {
if !t.replaying && msg.ExitedReason == nil {
if !t.replaying && (msg.ExitedReason == nil || *msg.ExitedReason == searcher.UserCanceled) {
if err := markWorkloadCompleted(t.db, msg); err != nil {
ctx.Log().Error(err)
}
Expand All @@ -481,20 +481,22 @@ func (t *trial) processCompletedWorkload(ctx *actor.Context, msg searcher.Comple
// metrics, the Experiment will respond with whether or not we should take a checkpoint.
experimentFuture := ctx.Ask(ctx.Self().Parent(), msg)

if msg.ExitedReason == nil {
if err := t.sequencer.WorkloadCompleted(msg, experimentFuture); err != nil {
return errors.Wrap(err, "Error passing CompletedMessage to sequencer")
}
if err := t.sequencer.WorkloadCompleted(msg, experimentFuture); err != nil {
return errors.Wrap(err, "Error passing CompletedMessage to sequencer")
}

// Decide what to do next.
terminateNow := false
if msg.ExitedReason != nil {
ctx.Log().Info("exiting trial early")
t.earlyExit = true
if *msg.ExitedReason == searcher.Errored {
return nil
}
}
var w searcher.Workload
var err error
switch {
case msg.ExitedReason != nil:
ctx.Log().Info("exiting trial early")
t.earlyExit = true
// We have another workload to run.
case !t.pendingGracefulTermination && !t.sequencer.UpToDate():
w, err = t.sequencer.Workload()
Expand All @@ -520,7 +522,7 @@ func (t *trial) processCompletedWorkload(ctx *actor.Context, msg searcher.Comple
}

// Command the trial runner to do the thing we decided on (if this is not a replay).
if !t.replaying && !t.earlyExit {
if !t.replaying {
var msg interface{}
if terminateNow {
w = *t.sequencer.TerminateWorkload()
Expand Down
17 changes: 12 additions & 5 deletions master/internal/trial_workload_sequencer.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,11 +124,18 @@ func (s *trialWorkloadSequencer) WorkloadCompleted(
case searcher.RunStep:
s.curStep++
s.curStepDone = stepInfo{}
if s.minValidationNeeded() {
s.steps[msg.Workload.StepID].hasValidation = true
}
if s.minCheckpointNeeded() {
s.steps[msg.Workload.StepID].hasCheckpoint = true
if msg.ExitedReason != nil {
s.steps = s.steps[:msg.Workload.StepID+1]
if *msg.ExitedReason == searcher.UserCanceled {
s.steps[msg.Workload.StepID].hasCheckpoint = true
}
} else {
if s.minValidationNeeded() {
s.steps[msg.Workload.StepID].hasValidation = true
}
if s.minCheckpointNeeded() {
s.steps[msg.Workload.StepID].hasCheckpoint = true
}
}
case searcher.CheckpointModel:
// During replay, a checkpoint can show up for earlier than the current step ID if the
Expand Down
4 changes: 3 additions & 1 deletion master/pkg/searcher/message.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ type CompletedMessage struct {
Workload Workload `json:"workload"`
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
ExitedReason *ExitedReason `json:"exited_reason"`
RawMetrics json.RawMessage `json:"metrics,omitempty"`
CheckpointMetrics *CheckpointMetrics
ValidationMetrics *ValidationMetrics
RunMetrics map[string]interface{}
ExitedReason *ExitedReason
}

// UnmarshalJSON unmarshals the provided bytes into a workload.CompletedMessage. An error is
Expand Down Expand Up @@ -82,4 +82,6 @@ type ExitedReason string
// ExitedReason values. A CompletedMessage carries a non-nil ExitedReason when
// the trial is exiting early.
const (
// Errored signals the searcher that the workload errored out.
Errored ExitedReason = "ERRORED"
// UserCanceled signals the searcher that the user requested a cancellation.
UserCanceled ExitedReason = "USER_CANCELED"
)

0 comments on commit 4bad652

Please sign in to comment.