diff --git a/backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala b/backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala index 0f1e2c931a8..c98e429c63d 100644 --- a/backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala +++ b/backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala @@ -244,6 +244,8 @@ trait StandardAsyncExecutionActor } } + lazy val memoryRetryRequested: Boolean = memoryRetryFactor.nonEmpty + /** * Returns the shell scripting for finding all files listed within a directory. * @@ -1259,6 +1261,7 @@ trait StandardAsyncExecutionActor def handleExecutionResult(status: StandardAsyncRunState, oldHandle: StandardAsyncPendingExecutionHandle): Future[ExecutionHandle] = { + // Returns true if the task has written an RC file that indicates OOM, false otherwise def memoryRetryRC: Future[Boolean] = { def returnCodeAsBoolean(codeAsOption: Option[String]): Boolean = { codeAsOption match { @@ -1299,11 +1302,11 @@ trait StandardAsyncExecutionActor // Only check stderr size if we need to, otherwise this results in a lot of unnecessary I/O that // may fail due to race conditions on quickly-executing jobs. 
stderrSize <- if (failOnStdErr) asyncIo.sizeAsync(stderr) else Future.successful(0L) - retryWithMoreMemory <- memoryRetryRC - } yield (stderrSize, returnCodeAsString, retryWithMoreMemory) + outOfMemoryDetected <- memoryRetryRC + } yield (stderrSize, returnCodeAsString, outOfMemoryDetected) stderrSizeAndReturnCodeAndMemoryRetry flatMap { - case (stderrSize, returnCodeAsString, retryWithMoreMemory) => + case (stderrSize, returnCodeAsString, outOfMemoryDetected) => val tryReturnCodeAsInt = Try(returnCodeAsString.trim.toInt) if (isDone(status)) { @@ -1311,13 +1314,15 @@ trait StandardAsyncExecutionActor case Success(returnCodeAsInt) if failOnStdErr && stderrSize.intValue > 0 => val executionHandle = Future.successful(FailedNonRetryableExecutionHandle(StderrNonEmpty(jobDescriptor.key.tag, stderrSize, stderrAsOption), Option(returnCodeAsInt), None)) retryElseFail(executionHandle) - case Success(returnCodeAsInt) if isAbort(returnCodeAsInt) => - Future.successful(AbortedExecutionHandle) case Success(returnCodeAsInt) if continueOnReturnCode.continueFor(returnCodeAsInt) => handleExecutionSuccess(status, oldHandle, returnCodeAsInt) - case Success(returnCodeAsInt) if retryWithMoreMemory => + // It's important that we check the outOfMemoryDetected case before isAbort. RC could be 137 in either case; + // if it was caused by the OOM killer, we want to handle it as OOM and not as a job abort. 
+ case Success(returnCodeAsInt) if outOfMemoryDetected && memoryRetryRequested => val executionHandle = Future.successful(FailedNonRetryableExecutionHandle(RetryWithMoreMemory(jobDescriptor.key.tag, stderrAsOption, memoryRetryErrorKeys, log), Option(returnCodeAsInt), None)) - retryElseFail(executionHandle, retryWithMoreMemory) + retryElseFail(executionHandle, outOfMemoryDetected) + case Success(returnCodeAsInt) if isAbort(returnCodeAsInt) => + Future.successful(AbortedExecutionHandle) case Success(returnCodeAsInt) => val executionHandle = Future.successful(FailedNonRetryableExecutionHandle(WrongReturnCode(jobDescriptor.key.tag, returnCodeAsInt, stderrAsOption), Option(returnCodeAsInt), None)) retryElseFail(executionHandle) @@ -1326,9 +1331,9 @@ trait StandardAsyncExecutionActor } } else { tryReturnCodeAsInt match { - case Success(returnCodeAsInt) if retryWithMoreMemory && !continueOnReturnCode.continueFor(returnCodeAsInt) => + case Success(returnCodeAsInt) if outOfMemoryDetected && memoryRetryRequested && !continueOnReturnCode.continueFor(returnCodeAsInt) => val executionHandle = Future.successful(FailedNonRetryableExecutionHandle(RetryWithMoreMemory(jobDescriptor.key.tag, stderrAsOption, memoryRetryErrorKeys, log), Option(returnCodeAsInt), None)) - retryElseFail(executionHandle, retryWithMoreMemory) + retryElseFail(executionHandle, outOfMemoryDetected) case _ => val failureStatus = handleExecutionFailure(status, tryReturnCodeAsInt.toOption) retryElseFail(failureStatus) diff --git a/centaur/src/main/resources/standardTestCases/retry_with_more_memory/retry_with_more_memory_after_137.wdl b/centaur/src/main/resources/standardTestCases/retry_with_more_memory/retry_with_more_memory_after_137.wdl new file mode 100644 index 00000000000..2fe434475c6 --- /dev/null +++ b/centaur/src/main/resources/standardTestCases/retry_with_more_memory/retry_with_more_memory_after_137.wdl @@ -0,0 +1,22 @@ +version 1.0 + +task imitate_oom_error { + command { + printf "Exception in thread 
"main" java.lang.OutOfMemoryError: testing\n\tat Test.main(Test.java:1)\n" >&2 + touch foo + exit 137 + } + output { + File foo = "foo" + } + runtime { + docker: "python:latest" + memory: "1 GB" + maxRetries: 2 + backend: "Papiv2" + } +} + +workflow retry_with_more_memory_after_137 { + call imitate_oom_error +} diff --git a/centaur/src/main/resources/standardTestCases/retry_with_more_memory_after_137.test b/centaur/src/main/resources/standardTestCases/retry_with_more_memory_after_137.test new file mode 100644 index 00000000000..a69290ca511 --- /dev/null +++ b/centaur/src/main/resources/standardTestCases/retry_with_more_memory_after_137.test @@ -0,0 +1,21 @@ +name: retry_with_more_memory_after_137 +testFormat: workflowfailure +backends: [Papiv2] + +files { + workflow: retry_with_more_memory/retry_with_more_memory_after_137.wdl + options: retry_with_more_memory/retry_with_more_memory.options +} + +metadata { + workflowName: retry_with_more_memory_after_137 + status: Failed + "failures.0.message": "Workflow failed" + "failures.0.causedBy.0.message": "stderr for job `retry_with_more_memory_after_137.imitate_oom_error:NA:3` contained one of the `memory-retry-error-keys: [OutOfMemory,Killed]` specified in the Cromwell config. Job might have run out of memory." + "retry_with_more_memory_after_137.imitate_oom_error.-1.1.executionStatus": "RetryableFailure" + "retry_with_more_memory_after_137.imitate_oom_error.-1.1.runtimeAttributes.memory": "1 GB" + "retry_with_more_memory_after_137.imitate_oom_error.-1.2.executionStatus": "RetryableFailure" + "retry_with_more_memory_after_137.imitate_oom_error.-1.2.runtimeAttributes.memory": "1.1 GB" + "retry_with_more_memory_after_137.imitate_oom_error.-1.3.executionStatus": "Failed" + "retry_with_more_memory_after_137.imitate_oom_error.-1.3.runtimeAttributes.memory": "1.2100000000000002 GB" +}