Skip to content

Commit

Permalink
fix: Revert back to LaunchAsync for Slurm jobs [FOUNDENG-465] (#696)
Browse files Browse the repository at this point in the history
With synchronous launch the jobs is fully created (and potentially running)
before we get back the dispatchID and associate it with the allocation id.
If that happens we cannot associate the NotifyContainerRunning event with
the dispatch ID and we leave the container in PULLING state.
  • Loading branch information
jerryharrow authored and determined-ci committed Apr 18, 2024
1 parent 2fa7243 commit 050951e
Showing 1 changed file with 12 additions and 6 deletions.
18 changes: 12 additions & 6 deletions master/internal/rm/dispatcherrm/dispatcher_resource_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -660,12 +660,18 @@ func (m *dispatcherResourceManager) receiveRequestMsg(ctx *actor.Context) error
return nil
}

foundMonitoredDispatch := false
for _, dispatch := range dispatches {
dispatchID := dispatch.DispatchID
if m.jobWatcher.isJobBeingMonitored(dispatchID) {
foundMonitoredDispatch = true
m.jobWatcher.notifyContainerRunning(ctx, dispatchID, msg.Rank, msg.NumPeers, msg.NodeName)
}
}
if !foundMonitoredDispatch {
ctx.Log().WithField("allocation-id", msg.AllocationID).Warnf(
"NotifyContainerRunning did not find an active, monitored dispatch")
}

case KillDispatcherResources:
ctx.Log().Debugf("Received request to terminate jobs associated with AllocationID %s",
Expand Down Expand Up @@ -1635,10 +1641,10 @@ func (m *dispatcherResourceManager) sendManifestToDispatcher(
impersonatedUser string,
) (string, error) {
/*
* Ask the launcher to run the job. We switched from using "LaunchAsync()"
* to using "Launch()" to deal with an issue where the dispatcher monitor
* asks for job status before the launcher sets up the job status record
* keeping, which causes the job status request to get a 404 Not Found.
* "LaunchAsync()" does not wait for the "launcher" to move the job to the "RUNNING"
* state and returns right away while the job is still in the "PENDING" state. If it
* becomes necessary to wait for the job to be in the "RUNNING" state, we can switch
* to using "Launch()".
*
* The "manifest" describes the job to be launched and includes any environment
* variables, mount points, etc., that are needed by the job.
Expand All @@ -1648,7 +1654,7 @@ func (m *dispatcherResourceManager) sendManifestToDispatcher(
* (e.g. "/etc/passwd"), LDAP, or some other authentication mechanism.
*/
dispatchInfo, response, err := m.apiClient.LaunchApi.
Launch(m.authContext(ctx)).
LaunchAsync(m.authContext(ctx)).
Manifest(*manifest).
Impersonate(impersonatedUser).
Execute() //nolint:bodyclose
Expand All @@ -1658,7 +1664,7 @@ func (m *dispatcherResourceManager) sendManifestToDispatcher(
// So we can show the HTTP status code, if available.
httpStatus = fmt.Sprintf("(HTTP status %d)", response.StatusCode)
}
return "", errors.Wrapf(err, "LaunchApi.Launch() returned an error %s, response: {%v}. "+
return "", errors.Wrapf(err, "LaunchApi.LaunchAsync() returned an error %s, response: {%v}. "+
"Verify that the launcher service is up and reachable. Try a restart the "+
"launcher service followed by a restart of the determined-master service.", httpStatus, response)
}
Expand Down

0 comments on commit 050951e

Please sign in to comment.