Skip to content

Commit

Permalink
Add proper support for AM failure validity interval
Browse files Browse the repository at this point in the history
  • Loading branch information
Kimahriman committed Aug 1, 2023
1 parent 518a97f commit 0943fac
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ private[spark] class ApplicationMaster(
sparkConf.get(MAX_EXECUTOR_FAILURES).getOrElse(defaultMaxNumExecutorFailures)
}

// Optional AM failure validity interval read from the Spark conf. If an AM failure validity
// interval was given, we need to look at information on the previous attempts so we can
// determine whether this should be the last attempt or not.
private val amFailureValidity = sparkConf.get(AM_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS)

// Mutable lifecycle state of this ApplicationMaster.
// NOTE(review): presumably @volatile because these flags are touched from more than one thread
// (e.g. `finished` is checked inside a shutdown hook) -- confirm against the writers.
@volatile private var exitCode = 0
@volatile private var unregistered = false
@volatile private var finished = false
Expand Down Expand Up @@ -210,6 +214,46 @@ private[spark] class ApplicationMaster(
resources.toMap
}

/**
 * Determines if this should be the last attempt or not. If no (positive) validity interval was
 * defined, this simply compares the current attempt ID to the max number of attempts. If a
 * validity interval is defined, we do our best to replicate Yarn's logic for determining which
 * previous attempts should count toward the max attempt limit.
 *
 * @return true if this attempt reaches the max attempt limit
 */
private def isLastAttempt: Boolean = {
  // Yarn only applies the validity interval when it is greater than zero. A non-positive value
  // would otherwise push the cutoff to "now" or later, so no previous attempt would ever be
  // counted; treat it the same as an unset interval instead.
  val validityInterval = amFailureValidity.filter(_ > 0)
  val previousAttempts = if (validityInterval.isDefined) {
    logInfo("Loading previous ApplicationMaster attempts")
    client.getPreviousAttempts(yarnConf, appAttemptId)
  } else {
    Seq.empty
  }
  val maxAppAttempts = client.getMaxRegAttempts(sparkConf, yarnConf)
  validityInterval.map { interval =>
    // AM container exit statuses that are not counted as attempt failures.
    val ignoredExitStatuses = Set(
      ContainerExitStatus.PREEMPTED,
      ContainerExitStatus.ABORTED,
      ContainerExitStatus.DISKS_FAILED,
      ContainerExitStatus.KILLED_BY_RESOURCEMANAGER
    )
    // Only attempts that finished at or after this timestamp fall inside the interval.
    val validAfter = System.currentTimeMillis() - interval
    val countedAttempts = previousAttempts.count { case (attempt, exitStatus) =>
      val validFinishTime = attempt.getFinishTime() >= validAfter

      // If we don't have the exit status, or it's not one of the statuses yarn ignores, count it
      // toward our total failures.
      val validStatusCode = !exitStatus.exists(ignoredExitStatuses.contains)

      validFinishTime && validStatusCode
    }

    // Include this attempt
    countedAttempts + 1 >= maxAppAttempts
  }.getOrElse {
    appAttemptId.getAttemptId() >= maxAppAttempts
  }
}

final def run(): Int = {
try {
val attemptID = if (isClusterMode) {
Expand Down Expand Up @@ -247,9 +291,6 @@ private[spark] class ApplicationMaster(
val priority = ShutdownHookManager.SPARK_CONTEXT_SHUTDOWN_PRIORITY - 1
ShutdownHookManager.addShutdownHook(priority) { () =>
try {
val maxAppAttempts = client.getMaxRegAttempts(sparkConf, yarnConf)
val isLastAttempt = appAttemptId.getAttemptId() >= maxAppAttempts

if (!finished) {
// The default state of ApplicationMaster is failed if it is invoked by shut down hook.
// This behavior is different compared to 1.x version.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@
package org.apache.spark.deploy.yarn

import scala.collection.JavaConverters._
import scala.util.control.NonFatal

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.yarn.api.records._
import org.apache.hadoop.yarn.client.api.AMRMClient
import org.apache.hadoop.yarn.client.api.{AMRMClient, YarnClient}
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.apache.hadoop.yarn.webapp.util.WebAppUtils
Expand Down Expand Up @@ -133,6 +134,37 @@ private[spark] class YarnRMClient extends Logging {
}
}

/**
 * Looks up every other attempt of the application that `appAttemptId` belongs to and pairs each
 * one with the exit status of its AM container. The exit status is None when the container
 * report could not be fetched. This should only be called once, so a dedicated YarnClient is
 * created for the lookup and stopped before returning.
 *
 * @param yarnConf configuration used to initialize the temporary YarnClient
 * @param appAttemptId the current attempt, which is excluded from the result
 * @return the other attempts, each with its AM container exit status if available
 */
def getPreviousAttempts(
    yarnConf: YarnConfiguration,
    appAttemptId: ApplicationAttemptId): Seq[(ApplicationAttemptReport, Option[Int])] = {
  val rmClient = YarnClient.createYarnClient()
  rmClient.init(yarnConf)
  rmClient.start()
  try {
    val applicationId = appAttemptId.getApplicationId
    val otherAttempts = rmClient.getApplicationAttempts(applicationId).asScala
      .filterNot(_.getApplicationAttemptId == appAttemptId)
      .toSeq
    for (attempt <- otherAttempts) yield {
      // Best effort: a failed container report lookup is logged and recorded as None rather
      // than failing the whole listing.
      val exitStatus: Option[Int] = try {
        val amContainerReport = rmClient.getContainerReport(attempt.getAMContainerId)
        Some(amContainerReport.getContainerExitStatus)
      } catch {
        case NonFatal(e) =>
          logWarning("Failed to get previous attempt AM container exit status", e)
          None
      }
      (attempt, exitStatus)
    }
  } finally {
    rmClient.stop()
  }
}

private def getUrlByRmId(conf: Configuration, rmId: String): String = {
val addressPropertyPrefix = if (YarnConfiguration.useHttps(conf)) {
YarnConfiguration.RM_WEBAPP_HTTPS_ADDRESS
Expand Down

0 comments on commit 0943fac

Please sign in to comment.