Repository: spark Updated Branches: refs/heads/master 556b5d215 -> 96798d14f
[SPARK-22172][CORE] Worker hangs when the external shuffle service port is already in use ## What changes were proposed in this pull request? Handle exceptions thrown while starting the external shuffle service: if startup fails (for example, with a BindException because the port is already in use), the Worker logs the error and exits with status 1 instead of hanging. ## How was this patch tested? I verified it manually: when a BindException occurs, the Worker logs the exception and no longer hangs. Author: Devaraj K <[email protected]> Closes #19396 from devaraj-kavali/SPARK-22172. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/96798d14 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/96798d14 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/96798d14 Branch: refs/heads/master Commit: 96798d14f07208796fa0a90af0ab369879bacd6c Parents: 556b5d2 Author: Devaraj K <[email protected]> Authored: Wed Nov 1 18:07:39 2017 +0800 Committer: jerryshao <[email protected]> Committed: Wed Nov 1 18:07:39 2017 +0800 ---------------------------------------------------------------------- .../scala/org/apache/spark/deploy/worker/Worker.scala | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/96798d14/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index ed5fa4b..3962d42 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -199,7 +199,7 @@ private[deploy] class Worker( logInfo(s"Running Spark version ${org.apache.spark.SPARK_VERSION}") logInfo("Spark home: " + sparkHome) 
createWorkDir() - shuffleService.startIfEnabled() + startExternalShuffleService() webUi = new WorkerWebUI(this, workDir, webUiPort) webUi.bind() @@ -367,6 +367,16 @@ private[deploy] class Worker( } } + private def startExternalShuffleService() { + try { + shuffleService.startIfEnabled() + } catch { + case e: Exception => + logError("Failed to start external shuffle service", e) + System.exit(1) + } + } + private def sendRegisterMessageToMaster(masterEndpoint: RpcEndpointRef): Unit = { masterEndpoint.send(RegisterWorker( workerId, --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
