This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 57d27e9 [SPARK-31125][K8S] Terminating pods have a deletion timestamp but they are not yet dead
57d27e9 is described below
commit 57d27e900f79e6c5699b9a23db236aae98e761ad
Author: Holden Karau <[email protected]>
AuthorDate: Tue Mar 17 12:04:06 2020 -0700
[SPARK-31125][K8S] Terminating pods have a deletion timestamp but they are not yet dead
### What changes were proposed in this pull request?
Change what we consider a deleted pod to not include "Terminating"
### Why are the changes needed?
If we get a new snapshot while a pod is in the process of being cleaned up
we shouldn't delete the executor until it is fully terminated.
### Does this PR introduce any user-facing change?
No
### How was this patch tested?
This should be covered by the decommissioning tests in that they currently
are flaky because we sometimes delete the executor instead of allowing it to
decommission all the way.
I also ran this in a loop locally ~80 times with the only failures being
the PV suite because of unrelated minikube mount issues.
Closes #27905 from holdenk/SPARK-31125-Processing-state-snapshots-incorrect.
Authored-by: Holden Karau <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../spark/scheduler/cluster/k8s/ExecutorPodStates.scala | 2 ++
.../spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala | 11 ++++++++++-
2 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodStates.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodStates.scala
index 83daddf..34fca29 100644
--- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodStates.scala
+++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodStates.scala
@@ -34,4 +34,6 @@ case class PodFailed(pod: Pod) extends FinalPodState
case class PodDeleted(pod: Pod) extends FinalPodState
+case class PodTerminating(pod: Pod) extends FinalPodState
+
case class PodUnknown(pod: Pod) extends ExecutorPodState
diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala
index 435a5f1..30030ab 100644
--- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala
+++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsSnapshot.scala
@@ -64,6 +64,8 @@ object ExecutorPodsSnapshot extends Logging {
PodFailed(pod)
case "succeeded" =>
PodSucceeded(pod)
+ case "terminating" =>
+ PodTerminating(pod)
case _ =>
        logWarning(s"Received unknown phase $phase for executor pod with name" +
          s" ${pod.getMetadata.getName} in namespace ${pod.getMetadata.getNamespace}")
@@ -72,5 +74,12 @@ object ExecutorPodsSnapshot extends Logging {
}
}
- private def isDeleted(pod: Pod): Boolean = pod.getMetadata.getDeletionTimestamp != null
+ private def isDeleted(pod: Pod): Boolean = {
+ (pod.getMetadata.getDeletionTimestamp != null &&
+ (
+ pod.getStatus == null ||
+ pod.getStatus.getPhase == null ||
+ pod.getStatus.getPhase.toLowerCase(Locale.ROOT) != "terminating"
+ ))
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]