From 11b7ee484b44779a593350b896add8d430998250 Mon Sep 17 00:00:00 2001
From: "Jerry J. Harrow" <84593277+jerryharrow@users.noreply.github.com>
Date: Tue, 16 May 2023 14:25:39 -0400
Subject: [PATCH] fix: Tighten up status check on delayed launch [DET-9474]
 (#854)

If the synchronous launch takes too long, we may run into a job
monitoring poll that detects the dispatchID as 404/NOT_FOUND and stops
monitoring. This would cause status updates to stop and the job to show
only QUEUED even though it is executing.

Add a launchInProgress flag to the job monitor so that 404/NOT_FOUND is
ignored until the launch has succeeded, avoiding this case.
---
 .../rm/dispatcherrm/dispatcher_monitor.go     | 45 ++++++++++++++++---
 .../dispatcherrm/dispatcher_monitor_test.go   |  2 +-
 .../dispatcher_resource_manager.go            |  7 ++-
 3 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/master/internal/rm/dispatcherrm/dispatcher_monitor.go b/master/internal/rm/dispatcherrm/dispatcher_monitor.go
index d3a4d763fc40..ad59f9d2c3b5 100644
--- a/master/internal/rm/dispatcherrm/dispatcher_monitor.go
+++ b/master/internal/rm/dispatcherrm/dispatcher_monitor.go
@@ -67,6 +67,7 @@ type launcherJob struct {
 	totalContainers   int
 	runningContainers map[int]containerInfo
 	jobWasTerminated  bool
+	launchInProgress  bool // Launch proceeding concurrently with monitoring.
 }
 
 // launcherMonitor describes the monitoring of jobs created by the launcher.
@@ -98,7 +99,10 @@ func newDispatchWatcher(apiClient *launcherAPIClient) *launcherMonitor {
 
 // monitorJob adds the specified job to the collection of jobs whose status is monitored.
 // payload name may be empty (when reconnecting), monitor will retrieve if necessary.
-func (m *launcherMonitor) monitorJob(user string, dispatchID string, payloadName string) {
+// When launchPending is true, a later call to notifyJobLaunched is required once the
+// launcher has initiated the launch and the dispatchID will be valid for status checks.
+func (m *launcherMonitor) monitorJob(
+	user string, dispatchID string, payloadName string, launchPending bool) {
 	m.newLauncherJob <- launcherJob{
 		user:              user,
 		dispatcherID:      dispatchID,
@@ -107,9 +111,28 @@
 		totalContainers:   0,
 		runningContainers: make(map[int]containerInfo),
 		jobWasTerminated:  false,
+		launchInProgress:  launchPending,
 	}
 }
 
+// notifyJobLaunched clears the launch-in-progress state for a job. When monitoring
+// commences with launchPending:true, 404/NOT_FOUND status returns are ignored for
+// the job until notifyJobLaunched is called to enable normal monitoring.
+func (m *launcherMonitor) notifyJobLaunched(
+	ctx *actor.Context,
+	dispatchID string,
+) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	if job, ok := m.monitoredJobs[dispatchID]; ok {
+		job.launchInProgress = false
+		return
+	}
+
+	ctx.Log().Errorf("Could not find dispatchID %s to mark as launched", dispatchID)
+}
+
 // removeJob removes the specified job from the collection of jobs whose status is monitored.
 func (m *launcherMonitor) removeJob(dispatchID string) {
 	m.removeLauncherJob <- launcherJob{
@@ -244,6 +267,9 @@ func filterOutSuperfluousMessages(allMessages []string) []string {
 	return messagesMatchingPattern
 }
 
+// notifyContainerRunning receives notification that one of the ranks of the job
+// has started execution. It generates experiment log messages as the ranks start,
+// and updates job info so we know when all have started.
 func (m *launcherMonitor) notifyContainerRunning(
 	ctx *actor.Context,
 	dispatchID string,
@@ -261,6 +287,7 @@ func (m *launcherMonitor) notifyContainerRunning(
 		return
 	}
 
+	job.launchInProgress = false
 	job.totalContainers = int(numPeers)
 
 	switch existingEntry, ok := job.runningContainers[int(rank)]; {
@@ -645,7 +672,7 @@ func (m *launcherMonitor) updateJobStatus(ctx *actor.Context, job *launcherJob)
 
 	ctx.Log().WithField("dispatch-id", dispatchID).Debug("Checking status of launcher job")
 
-	resp, ok := m.getDispatchStatus(ctx, owner, dispatchID)
+	resp, ok := m.getDispatchStatus(ctx, owner, dispatchID, job.launchInProgress)
 
 	// Dispatch was not found.
 	if !ok {
@@ -761,7 +788,7 @@ dispatch exists, or false if it no longer exists (i.e. 404).
 */
 func (m *launcherMonitor) getDispatchStatus(
 	ctx *actor.Context, owner string,
-	dispatchID string,
+	dispatchID string, ignoreNotFound bool,
 ) (dispatchInfo launcher.DispatchInfo, dispatchFound bool) {
 	resp, r, err := m.apiClient.MonitoringApi.
 		GetEnvironmentStatus(m.apiClient.withAuth(context.TODO()), owner, dispatchID).
@@ -777,8 +804,8 @@
 		ctx.Log().WithField("dispatch-id", dispatchID).
 			Infof("The job status could not be obtained because the launcher returned HTTP code 404")
 
-		// No details, but we know dispatch does not exist
-		return launcher.DispatchInfo{}, false
+		// No details, but we know the dispatch does not exist (return false unless ignoreNotFound)
+		return launcher.DispatchInfo{}, ignoreNotFound
 	}
 
 	if r != nil && (r.StatusCode == http.StatusUnauthorized ||
@@ -933,13 +960,19 @@ func (m *launcherMonitor) getTaskLogsFromDispatcher(
 
 /*
 Return true if the specified dispatch is in a non-terminal (still running) state,
+or the launch is still in progress.
 */
 func (m *launcherMonitor) isDispatchInProgress(
 	ctx *actor.Context,
 	owner string,
 	dispatchID string,
 ) bool {
-	resp, ok := m.getDispatchStatus(ctx, owner, dispatchID)
+	ignoreNotFound := false
+	if job, ok := m.getJobByDispatchID(dispatchID); ok {
+		ignoreNotFound = job.launchInProgress
+	}
+
+	resp, ok := m.getDispatchStatus(ctx, owner, dispatchID, ignoreNotFound)
 
 	// Dispatch was not found.
 	if !ok {
diff --git a/master/internal/rm/dispatcherrm/dispatcher_monitor_test.go b/master/internal/rm/dispatcherrm/dispatcher_monitor_test.go
index e208378783cb..01ebe1e22002 100644
--- a/master/internal/rm/dispatcherrm/dispatcher_monitor_test.go
+++ b/master/internal/rm/dispatcherrm/dispatcher_monitor_test.go
@@ -239,7 +239,7 @@ func TestMonitorJobOperations(t *testing.T) {
 	job := getJob()
 
 	// Add the job to the monitored jobs.
-	jobWatcher.monitorJob(job.user, job.dispatcherID, job.payloadName)
+	jobWatcher.monitorJob(job.user, job.dispatcherID, job.payloadName, false)
 	// Wait for the job to be added to the monitored jobs with a timeout of 30 seconds.
 	timeout := time.Now().Add(30 * time.Second)
 	for !(jobWatcher.isJobBeingMonitored(job.dispatcherID)) {
diff --git a/master/internal/rm/dispatcherrm/dispatcher_resource_manager.go b/master/internal/rm/dispatcherrm/dispatcher_resource_manager.go
index bf829b10bf04..cd7613079481 100644
--- a/master/internal/rm/dispatcherrm/dispatcher_resource_manager.go
+++ b/master/internal/rm/dispatcherrm/dispatcher_resource_manager.go
@@ -1020,7 +1020,7 @@ func (m *dispatcherResourceManager) startLauncherJob(
 	// Pre-register dispatchID (which is now the AllocationID) with the job
 	// monitor such that notifyContainerRunning calls that might be delivered prior
 	// to the synchronous launch returning will be handled properly.
-	m.jobWatcher.monitorJob(impersonatedUser, dispatchID, payloadName)
+	m.jobWatcher.monitorJob(impersonatedUser, dispatchID, payloadName, true)
 
 	tempDispatchID, err := m.sendManifestToDispatcher(
 		ctx, manifest, impersonatedUser, string(msg.AllocationID))
@@ -1040,6 +1040,9 @@ func (m *dispatcherResourceManager) startLauncherJob(
 		sendResourceStateChangedErrorResponse(
 			ctx, err, msg,
 			"unable to create the launcher job")
+	} else {
+		// Successful launch; clear the launchInProgress status.
+		m.jobWatcher.notifyJobLaunched(ctx, dispatchID)
 	}
 
 	if tempDispatchID != dispatchID {
@@ -1674,7 +1677,7 @@ func (m *dispatcherResourceManager) assignResources(
 			ctx.Log().Infof("Reconnecting ResourceID %s, DispatchID %s, ImpersontatedUser: %s",
 				rID, dispatchID, impersonatedUser)
 
-			m.jobWatcher.monitorJob(impersonatedUser, dispatchID, "")
+			m.jobWatcher.monitorJob(impersonatedUser, dispatchID, "", false)
 		}
 	} else {
 		ctx.Log().
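
Note on the mechanism (illustrative, not part of the patch): below is a minimal,
self-contained Go sketch of the 404-suppression guard this change introduces.
Every name in it (monitor, job, poll, errNotFound, "dispatch-1") is a
hypothetical stand-in rather than a real dispatcherrm identifier; the actual
implementation keys launcherJob entries by dispatchID inside launcherMonitor
and consults launchInProgress in getDispatchStatus and isDispatchInProgress.

package main

import (
	"errors"
	"fmt"
	"sync"
)

// errNotFound stands in for the launcher's 404/NOT_FOUND status return.
var errNotFound = errors.New("404 NOT_FOUND")

type job struct {
	launchInProgress bool // true until the synchronous launch call returns
}

type monitor struct {
	mu   sync.Mutex
	jobs map[string]*job
}

// monitorJob pre-registers a dispatch before the launch call returns, so
// early status polls and container notifications have somewhere to land.
func (m *monitor) monitorJob(dispatchID string, launchPending bool) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.jobs[dispatchID] = &job{launchInProgress: launchPending}
}

// notifyJobLaunched clears the guard once the launcher has accepted the job
// and the dispatchID is valid for status checks.
func (m *monitor) notifyJobLaunched(dispatchID string) {
	m.mu.Lock()
	defer m.mu.Unlock()
	if j, ok := m.jobs[dispatchID]; ok {
		j.launchInProgress = false
	}
}

// poll models one status check: a 404 normally means the dispatch is gone
// and monitoring should stop, but while the launch is still in flight the
// 404 is expected and must be ignored.
func (m *monitor) poll(dispatchID string, status error) (keepMonitoring bool) {
	m.mu.Lock()
	defer m.mu.Unlock()
	j, ok := m.jobs[dispatchID]
	if !ok {
		return false
	}
	if errors.Is(status, errNotFound) {
		return j.launchInProgress // ignore 404 only while launching
	}
	return true
}

func main() {
	m := &monitor{jobs: make(map[string]*job)}

	// Pre-register before the (possibly slow) synchronous launch call.
	m.monitorJob("dispatch-1", true)

	// A poll that races the launch sees 404 but keeps monitoring.
	fmt.Println(m.poll("dispatch-1", errNotFound)) // true

	// The launch returned successfully; a 404 is meaningful again.
	m.notifyJobLaunched("dispatch-1")
	fmt.Println(m.poll("dispatch-1", errNotFound)) // false
}

The design point the patch relies on is visible here: only the pre-registered
launchInProgress flag lets a poll distinguish an expected 404 (launch still in
flight) from a real one (dispatch gone). That is why startLauncherJob registers
the dispatch before calling sendManifestToDispatcher and clears the flag via
notifyJobLaunched only after a successful return.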