Skip to content

Commit

Permalink
fix: Ensure exit sent on lost dispatch [FE-3] (#857)
Browse files Browse the repository at this point in the history
If a dispatch suddenly disappeared from the launcher (404) without any action,
monitoring was dropped without any notification of job completion.
On 404, notify that the job was lost and terminate.
  • Loading branch information
jerryharrow authored and determined-ci committed Feb 2, 2024
1 parent 22e756c commit f92e520
Showing 1 changed file with 11 additions and 7 deletions.
18 changes: 11 additions & 7 deletions master/internal/rm/dispatcherrm/dispatcher_monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -676,16 +676,20 @@ func (m *launcherMonitor) updateJobStatus(ctx *actor.Context, job *launcherJob)

// Dispatch was not found.
if !ok {
missingDispatchMsg := "The job was canceled"
if job.jobWasTerminated {
ctx.Log().WithField("dispatch-id", dispatchID).Infof("The job was canceled")

ctx.Tell(ctx.Self(), DispatchExited{
DispatchID: dispatchID,
ExitCode: -1,
Message: "Job was canceled",
})
ctx.Log().WithField("dispatch-id", dispatchID).Info(missingDispatchMsg)
} else {
missingDispatchMsg = "The job was lost"
ctx.Log().WithField("dispatch-id", dispatchID).Error(missingDispatchMsg)
}

ctx.Tell(ctx.Self(), DispatchExited{
DispatchID: dispatchID,
ExitCode: -1,
Message: missingDispatchMsg,
})

return true
}

Expand Down

0 comments on commit f92e520

Please sign in to comment.