-
Notifications
You must be signed in to change notification settings - Fork 361
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
SUP-692 Retry with more memory after RC 137 #6912
Changes from 5 commits
3d61e8a
0dfa39f
f5f412a
b6be4f7
8ad86dc
88694cc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -244,6 +244,8 @@ trait StandardAsyncExecutionActor | |||||
} | ||||||
} | ||||||
|
||||||
lazy val memoryRetryRequested: Boolean = memoryRetryFactor.nonEmpty | ||||||
|
||||||
/** | ||||||
* Returns the shell scripting for finding all files listed within a directory. | ||||||
* | ||||||
|
@@ -1311,13 +1313,15 @@ trait StandardAsyncExecutionActor | |||||
case Success(returnCodeAsInt) if failOnStdErr && stderrSize.intValue > 0 => | ||||||
val executionHandle = Future.successful(FailedNonRetryableExecutionHandle(StderrNonEmpty(jobDescriptor.key.tag, stderrSize, stderrAsOption), Option(returnCodeAsInt), None)) | ||||||
retryElseFail(executionHandle) | ||||||
case Success(returnCodeAsInt) if isAbort(returnCodeAsInt) => | ||||||
Future.successful(AbortedExecutionHandle) | ||||||
case Success(returnCodeAsInt) if continueOnReturnCode.continueFor(returnCodeAsInt) => | ||||||
handleExecutionSuccess(status, oldHandle, returnCodeAsInt) | ||||||
case Success(returnCodeAsInt) if retryWithMoreMemory => | ||||||
// It's important that we check retryWithMoreMemory case before isAbort. RC could be 137 in either case; | ||||||
// if it was caused by OOM killer, want to handle as OOM and not job abort. | ||||||
case Success(returnCodeAsInt) if retryWithMoreMemory && memoryRetryRequested => | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this a more accurate name for this variable? As written now,
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, good call. I did find that name a little confusing when reading the code. I was trying not to change more than I had do, but I think this is worth doing. |
||||||
val executionHandle = Future.successful(FailedNonRetryableExecutionHandle(RetryWithMoreMemory(jobDescriptor.key.tag, stderrAsOption, memoryRetryErrorKeys, log), Option(returnCodeAsInt), None)) | ||||||
retryElseFail(executionHandle, retryWithMoreMemory) | ||||||
case Success(returnCodeAsInt) if isAbort(returnCodeAsInt) => | ||||||
Future.successful(AbortedExecutionHandle) | ||||||
case Success(returnCodeAsInt) => | ||||||
val executionHandle = Future.successful(FailedNonRetryableExecutionHandle(WrongReturnCode(jobDescriptor.key.tag, returnCodeAsInt, stderrAsOption), Option(returnCodeAsInt), None)) | ||||||
retryElseFail(executionHandle) | ||||||
|
@@ -1326,7 +1330,7 @@ trait StandardAsyncExecutionActor | |||||
} | ||||||
} else { | ||||||
tryReturnCodeAsInt match { | ||||||
case Success(returnCodeAsInt) if retryWithMoreMemory && !continueOnReturnCode.continueFor(returnCodeAsInt) => | ||||||
case Success(returnCodeAsInt) if retryWithMoreMemory && memoryRetryRequested && !continueOnReturnCode.continueFor(returnCodeAsInt) => | ||||||
val executionHandle = Future.successful(FailedNonRetryableExecutionHandle(RetryWithMoreMemory(jobDescriptor.key.tag, stderrAsOption, memoryRetryErrorKeys, log), Option(returnCodeAsInt), None)) | ||||||
retryElseFail(executionHandle, retryWithMoreMemory) | ||||||
case _ => | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
version 1.0 | ||
|
||
task imitate_oom_error { | ||
command { | ||
printf "Exception in thread "main" java.lang.OutOfMemoryError: testing\n\tat Test.main(Test.java:1)\n" >&2 | ||
touch foo | ||
exit 137 | ||
} | ||
output { | ||
File foo = "foo" | ||
} | ||
runtime { | ||
docker: "python:latest" | ||
memory: "1 GB" | ||
maxRetries: 2 | ||
backend: "Papiv2" | ||
} | ||
} | ||
|
||
workflow retry_with_more_memory_after_137 { | ||
call imitate_oom_error | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
name: retry_with_more_memory_after_137 | ||
testFormat: workflowfailure | ||
backends: [Papiv2] | ||
|
||
files { | ||
workflow: retry_with_more_memory/retry_with_more_memory_after_137.wdl | ||
options: retry_with_more_memory/retry_with_more_memory.options | ||
} | ||
|
||
metadata { | ||
workflowName: retry_with_more_memory_after_137 | ||
status: Failed | ||
"failures.0.message": "Workflow failed" | ||
"failures.0.causedBy.0.message": "stderr for job `retry_with_more_memory_after_137.imitate_oom_error:NA:3` contained one of the `memory-retry-error-keys: [OutOfMemory,Killed]` specified in the Cromwell config. Job might have run out of memory." | ||
"retry_with_more_memory_after_137.imitate_oom_error.-1.1.executionStatus": "RetryableFailure" | ||
"retry_with_more_memory_after_137.imitate_oom_error.-1.1.runtimeAttributes.memory": "1 GB" | ||
"retry_with_more_memory_after_137.imitate_oom_error.-1.2.executionStatus": "RetryableFailure" | ||
"retry_with_more_memory_after_137.imitate_oom_error.-1.2.runtimeAttributes.memory": "1.1 GB" | ||
"retry_with_more_memory_after_137.imitate_oom_error.-1.3.executionStatus": "Failed" | ||
"retry_with_more_memory_after_137.imitate_oom_error.-1.3.runtimeAttributes.memory": "1.2100000000000002 GB" | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This check for
memoryRetryRequested
is currently redundant because the memory retry RC file is only written when a memory retry multiplier is provided. However, that behavior is specific to the GCP backend, so it didn't feel appropriate to rely on it here.