
[SPARK-18278] [Scheduler] Spark on Kubernetes - Basic Scheduler Backend #19468

Closed · wants to merge 29 commits

Changes from 1 commit

Commits (29):
f6fdd6a  Spark on Kubernetes - basic scheduler backend (foxish, Sep 15, 2017)
75e31a9  Adding to modules.py and SparkBuild.scala (foxish, Oct 17, 2017)
cf82b21  Exclude from unidoc, update travis (foxish, Oct 17, 2017)
488c535  Address a bunch of style and other comments (foxish, Oct 17, 2017)
82b79a7  Fix some style concerns (foxish, Oct 18, 2017)
c052212  Clean up YARN constants, unit test updates (foxish, Oct 20, 2017)
c565c9f  Couple of more style comments (foxish, Oct 20, 2017)
2fb596d  Address CR comments. (mccheah, Oct 25, 2017)
992acbe  Extract initial executor count to utils class (mccheah, Oct 25, 2017)
b0a5839  Fix scalastyle (mccheah, Oct 25, 2017)
a4f9797  Fix more scalastyle (mccheah, Oct 25, 2017)
2b5dcac  Pin down app ID in tests. Fix test style. (mccheah, Oct 26, 2017)
018f4d8  Address comments. (mccheah, Nov 1, 2017)
4b32134  Various fixes to the scheduler (mccheah, Nov 1, 2017)
6cf4ed7  Address comments (mccheah, Nov 4, 2017)
1f271be  Update fabric8 client version to 3.0.0 (foxish, Nov 13, 2017)
71a971f  Addressed more comments (liyinan926, Nov 13, 2017)
0ab9ca7  One more round of comments (liyinan926, Nov 14, 2017)
7f14b71  Added a comment regarding how failed executor pods are handled (liyinan926, Nov 15, 2017)
7afce3f  Addressed more comments (liyinan926, Nov 21, 2017)
b75b413  Fixed Scala style error (liyinan926, Nov 21, 2017)
3b587b4  Removed unused parameter in parsePrefixedKeyValuePairs (liyinan926, Nov 22, 2017)
cb12fec  Another round of comments (liyinan926, Nov 22, 2017)
ae396cf  Addressed latest comments (liyinan926, Nov 27, 2017)
f8e3249  Addressed comments around licensing on new dependencies (liyinan926, Nov 27, 2017)
a44c29e  Fixed unit tests and made maximum executor lost reason checks configu… (liyinan926, Nov 27, 2017)
4bed817  Removed default value for executor Docker image (liyinan926, Nov 27, 2017)
c386186  Close the executor pod watcher before deleting the executor pods (liyinan926, Nov 27, 2017)
b85cfc4  Addressed more comments (liyinan926, Nov 28, 2017)
@@ -111,5 +111,14 @@ private[spark] object Config extends Logging {
       .stringConf
       .createOptional
 
+  val KUBERNETES_EXECUTOR_LOST_REASON_CHECK_MAX_ATTEMPTS =
+    ConfigBuilder("spark.kubernetes.executor.lostCheck.maxAttempts")
+      .doc("Maximum number of attempts allowed for checking the reason of an executor loss " +
+        "before it is assumed that the executor failed.")
+      .intConf
+      .checkValue(value => value > 0, "Maximum attempts of checks of executor lost reason " +
+        "must be a positive integer")
+      .createWithDefault(5)
Review thread on the new default (.createWithDefault(5)):

Contributor: Wasn't this 10 earlier?

Contributor: Yes, but I think 5 is a more sensible default than 10. @mccheah @foxish WDYT?

Contributor: We should probably keep it the same as before, as it's what we've been running with already.

Contributor: Sure, changed back to 10.

+
   val KUBERNETES_NODE_SELECTOR_PREFIX = "spark.kubernetes.node.selector."
 }
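For context, the new key is set like any other Spark configuration property. A minimal usage sketch, where the master URL and app name are placeholders rather than values from this PR:

    import org.apache.spark.SparkConf

    // Hypothetical usage sketch: override the loss-reason check limit.
    // The master URL and app name below are illustrative placeholders.
    val conf = new SparkConf()
      .setMaster("k8s://https://kubernetes.example.com:6443")
      .setAppName("lost-reason-check-demo")
      // Key added in this diff; values <= 0 are rejected by the checkValue guard.
      .set("spark.kubernetes.executor.lostCheck.maxAttempts", "10")

Equivalently, the same key can be passed on the command line via spark-submit's --conf flag.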
@@ -90,6 +90,9 @@ private[spark] class KubernetesClusterSchedulerBackend(
 
   private val podAllocationSize = conf.get(KUBERNETES_ALLOCATION_BATCH_SIZE)
 
+  private val executorLostReasonCheckMaxAttempts = conf.get(
+    KUBERNETES_EXECUTOR_LOST_REASON_CHECK_MAX_ATTEMPTS)
+
   private val allocatorRunnable = new Runnable {
 
     // Maintains a map of executor id to count of checks performed to learn the loss reason
@@ -174,7 +177,7 @@ private[spark] class KubernetesClusterSchedulerBackend(
 
     def removeExecutorOrIncrementLossReasonCheckCount(executorId: String): Unit = {
       val reasonCheckCount = executorReasonCheckAttemptCounts.getOrElse(executorId, 0)
-      if (reasonCheckCount >= MAX_EXECUTOR_LOST_REASON_CHECKS) {
+      if (reasonCheckCount >= executorLostReasonCheckMaxAttempts) {
         removeExecutor(executorId, SlaveLost("Executor lost for unknown reasons."))
         deleteExecutorFromClusterAndDataStructures(executorId)
       } else {
@@ -427,7 +430,4 @@ private[spark] class KubernetesClusterSchedulerBackend(
 
 private object KubernetesClusterSchedulerBackend {
   private val UNKNOWN_EXIT_CODE = -1
-  // Number of times we are allowed check for the loss reason for an executor before we give up
-  // and assume the executor failed for good, and attribute it to a framework fault.
-  val MAX_EXECUTOR_LOST_REASON_CHECKS = 10
 }
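To make the control flow above concrete, here is a minimal, self-contained sketch of the bounded check-count pattern; all names are illustrative stand-ins, not the PR's actual code:

    import scala.collection.mutable

    // Illustrative sketch only: mirrors the shape of
    // removeExecutorOrIncrementLossReasonCheckCount, not its real implementation.
    object LossReasonCheckSketch {
      private val checkCounts = mutable.Map.empty[String, Int]
      private val maxAttempts = 10 // in the PR this comes from the new config key

      // Returns true once we give up waiting for a loss reason and the caller
      // should remove the executor; false while more checks are still allowed.
      def recordCheck(executorId: String): Boolean = {
        val count = checkCounts.getOrElse(executorId, 0)
        if (count >= maxAttempts) {
          checkCounts.remove(executorId) // clean up bookkeeping on removal
          true
        } else {
          checkCounts(executorId) = count + 1
          false
        }
      }
    }

In the PR itself, giving up calls removeExecutor(...) with SlaveLost and then deleteExecutorFromClusterAndDataStructures(...); the sketch shows only the counting decision.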
@@ -123,8 +123,7 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef
       ENV_EXECUTOR_CORES -> "1",
       ENV_EXECUTOR_MEMORY -> "1g",
       ENV_APPLICATION_ID -> "dummy",
-      ENV_EXECUTOR_POD_IP -> null,
-      ENV_EXECUTOR_PORT -> "10000") ++ additionalEnvVars
+      ENV_EXECUTOR_POD_IP -> null) ++ additionalEnvVars
 
     assert(executor.getSpec.getContainers.size() === 1)
     assert(executor.getSpec.getContainers.get(0).getEnv.size() === defaultEnvs.size)
@@ -330,6 +330,8 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn
     sparkConf
       .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 1)
       .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 1)
+    val executorLostReasonCheckMaxAttempts = sparkConf.get(
+      KUBERNETES_EXECUTOR_LOST_REASON_CHECK_MAX_ATTEMPTS)
 
     val scheduler = newSchedulerBackend()
     scheduler.start()
@@ -346,7 +348,7 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn
       .apply(registerFirstExecutorMessage)
 
     driverEndpoint.getValue.onDisconnected(executorEndpointRef.address)
-    1 to KubernetesClusterSchedulerBackend.MAX_EXECUTOR_LOST_REASON_CHECKS foreach { _ =>
+    1 to executorLostReasonCheckMaxAttempts foreach { _ =>
       allocatorRunnable.getValue.run()
       verify(podOperations, never()).delete(FIRST_EXECUTOR_POD)
     }
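Note the design choice in this test change: reading the loop bound from KUBERNETES_EXECUTOR_LOST_REASON_CHECK_MAX_ATTEMPTS rather than the removed MAX_EXECUTOR_LOST_REASON_CHECKS constant keeps the test in sync with whatever default the config ships with (5 in this commit, 10 after the thread above), and the loop asserts that the executor pod is never deleted while loss-reason checks are still permitted.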