broadinstitute · jgainerdewar · Sep 23, 2022 · Sep 20, 2022 · Sep 20, 2022 · Sep 21, 2022
@@ -244,6 +244,8 @@ trait StandardAsyncExecutionActor
     }
   }
 
+  lazy val memoryRetryRequested: Boolean = memoryRetryFactor.nonEmpty // true iff memoryRetryFactor is non-empty; the retry-with-more-memory match cases in this trait additionally require this flag
+
   /**
     * Returns the shell scripting for finding all files listed within a directory.
     *
@@ -1311,13 +1313,15 @@ trait StandardAsyncExecutionActor
             case Success(returnCodeAsInt) if failOnStdErr && stderrSize.intValue > 0 =>
               val executionHandle = Future.successful(FailedNonRetryableExecutionHandle(StderrNonEmpty(jobDescriptor.key.tag, stderrSize, stderrAsOption), Option(returnCodeAsInt), None))
               retryElseFail(executionHandle)
-            case Success(returnCodeAsInt) if isAbort(returnCodeAsInt) =>
-              Future.successful(AbortedExecutionHandle)
             case Success(returnCodeAsInt) if continueOnReturnCode.continueFor(returnCodeAsInt) =>
               handleExecutionSuccess(status, oldHandle, returnCodeAsInt)
-            case Success(returnCodeAsInt) if retryWithMoreMemory  =>
+            // It's important that we check the out-of-memory retry case before isAbort. The return code can be 137
+            // in either case; if it was caused by the OOM killer, we want to handle it as an OOM failure and not as a job abort.
+            case Success(returnCodeAsInt) if retryWithMoreMemory && memoryRetryRequested  =>
-            case Success(returnCodeAsInt) if retryWithMoreMemory && memoryRetryRequested  =>
+            case Success(returnCodeAsInt) if outOfMemoryDetected && memoryRetryRequested  =>
-            case Success(returnCodeAsInt) if retryWithMoreMemory && memoryRetryRequested  =>
+            case Success(returnCodeAsInt) if outOfMemoryDetected && memoryRetryRequested  =>
               val executionHandle = Future.successful(FailedNonRetryableExecutionHandle(RetryWithMoreMemory(jobDescriptor.key.tag, stderrAsOption, memoryRetryErrorKeys, log), Option(returnCodeAsInt), None))
               retryElseFail(executionHandle, retryWithMoreMemory)
+            case Success(returnCodeAsInt) if isAbort(returnCodeAsInt) =>
+              Future.successful(AbortedExecutionHandle)
             case Success(returnCodeAsInt) =>
               val executionHandle = Future.successful(FailedNonRetryableExecutionHandle(WrongReturnCode(jobDescriptor.key.tag, returnCodeAsInt, stderrAsOption), Option(returnCodeAsInt), None))
               retryElseFail(executionHandle)
@@ -1326,7 +1330,7 @@ trait StandardAsyncExecutionActor
           }
         } else {
           tryReturnCodeAsInt match {
-            case Success(returnCodeAsInt) if retryWithMoreMemory && !continueOnReturnCode.continueFor(returnCodeAsInt) =>
+            case Success(returnCodeAsInt) if retryWithMoreMemory && memoryRetryRequested && !continueOnReturnCode.continueFor(returnCodeAsInt) =>
               val executionHandle = Future.successful(FailedNonRetryableExecutionHandle(RetryWithMoreMemory(jobDescriptor.key.tag, stderrAsOption, memoryRetryErrorKeys, log), Option(returnCodeAsInt), None))
               retryElseFail(executionHandle, retryWithMoreMemory)
             case _ =>

@@ -0,0 +1,22 @@
+version 1.0
+
+# Imitates an out-of-memory failure: writes a Java OutOfMemoryError stack trace to stderr,
+# creates the declared output file `foo`, then exits with return code 137.
+# The printf format string is single-quoted so that the double quotes around "main" reach
+# stderr intact (printf itself still expands the \n and \t escapes).
+task imitate_oom_error {
+  command {
+    printf 'Exception in thread "main" java.lang.OutOfMemoryError: testing\n\tat Test.main(Test.java:1)\n' >&2
+    touch foo
+    exit 137
+  }
+  output {
+    File foo = "foo"
+  }
+  runtime {
+    docker: "python:latest"
+    memory: "1 GB"
+    maxRetries: 2
+    backend: "Papiv2"
+  }
+}
+
+workflow retry_with_more_memory_after_137 {
+  call imitate_oom_error  # single call: the task prints an OutOfMemoryError to stderr and exits with code 137
+}
@@ -0,0 +1,21 @@
+name: retry_with_more_memory_after_137
+testFormat: workflowfailure  # the workflow is expected to end in failure
+backends: [Papiv2]
+
+files {
+  workflow: retry_with_more_memory/retry_with_more_memory_after_137.wdl
+  options: retry_with_more_memory/retry_with_more_memory.options  # NOTE(review): presumably supplies the memory retry multiplier (values below grow by 1.1x) - confirm
+}
+
+metadata {
+  workflowName: retry_with_more_memory_after_137
+  status: Failed
+  "failures.0.message": "Workflow failed"
+  "failures.0.causedBy.0.message": "stderr for job `retry_with_more_memory_after_137.imitate_oom_error:NA:3` contained one of the `memory-retry-error-keys: [OutOfMemory,Killed]` specified in the Cromwell config. Job might have run out of memory."
+  "retry_with_more_memory_after_137.imitate_oom_error.-1.1.executionStatus": "RetryableFailure"  # first entry: retryable failure at the original 1 GB
+  "retry_with_more_memory_after_137.imitate_oom_error.-1.1.runtimeAttributes.memory": "1 GB"
+  "retry_with_more_memory_after_137.imitate_oom_error.-1.2.executionStatus": "RetryableFailure"  # second entry: retryable failure again, memory now 1.1 GB
+  "retry_with_more_memory_after_137.imitate_oom_error.-1.2.runtimeAttributes.memory": "1.1 GB"
+  "retry_with_more_memory_after_137.imitate_oom_error.-1.3.executionStatus": "Failed"  # third entry: final failure (the task declares maxRetries: 2)
+  "retry_with_more_memory_after_137.imitate_oom_error.-1.3.runtimeAttributes.memory": "1.2100000000000002 GB"  # 1.1 * 1.1 in floating point; the exact string is what the test expects
+}