This is an automated email from the ASF dual-hosted git repository.

zjffdu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/zeppelin.git
The following commit(s) were added to refs/heads/master by this push:
     new 1f00bb9  [ZEPPELIN-4668]. Use yarn proxy link for spark webui in yarn mode
1f00bb9 is described below

commit 1f00bb942a6af7521d061c2ccd574d753a2238b5
Author: Jeff Zhang <zjf...@apache.org>
AuthorDate: Thu Mar 5 17:46:03 2020 +0800

    [ZEPPELIN-4668]. Use yarn proxy link for spark webui in yarn mode

    ### What is this PR for?
    For security reasons, users in some companies cannot access the Spark web UI via its direct URL; they have to go through the YARN proxy URL instead. This PR introduces the property `spark.webui.yarn.useProxy`, which lets users use the YARN proxy URL as the Spark job URL.

    ### What type of PR is it?
    [Improvement]

    ### Todos
    * [ ] - Task

    ### What is the Jira issue?
    * https://issues.apache.org/jira/browse/ZEPPELIN-4668

    ### How should this be tested?
    * CI pass & manually tested

    ### Screenshots (if appropriate)

    ### Questions:
    * Do the license files need to be updated? No
    * Are there breaking changes for older versions? No
    * Does this need documentation? No

    Author: Jeff Zhang <zjf...@apache.org>

    Closes #3677 from zjffdu/ZEPPELIN-4668 and squashes the following commits:

    28cdae1eb [Jeff Zhang] [ZEPPELIN-4668]. Use yarn proxy link for spark webui in yarn mode
---
 docs/interpreter/spark.md                           |  4 ++++
 .../src/main/resources/interpreter-setting.json     |  7 +++++++
 .../zeppelin/spark/BaseSparkScalaInterpreter.scala  | 24 ++++++++++++++++++++
 3 files changed, 35 insertions(+)

diff --git a/docs/interpreter/spark.md b/docs/interpreter/spark.md
index ef79959..277efb3 100644
--- a/docs/interpreter/spark.md
+++ b/docs/interpreter/spark.md
@@ -186,6 +186,10 @@ You can also set other Spark properties which are not listed in the table. For a
     <td></td>
     <td>Overrides Spark UI default URL. Value should be a full URL (ex: http://{hostName}/{uniquePath}</td>
   </tr>
+  <td>spark.webui.yarn.useProxy</td>
+  <td>false</td>
+  <td>whether use yarn proxy url as spark weburl, e.g. http://localhost:8088/proxy/application_1583396598068_0004</td>
+  </tr>
 </table>

 Without any configuration, Spark interpreter works out of box in local mode. But if you want to connect to your Spark cluster, you'll need to follow below two simple steps.
diff --git a/spark/interpreter/src/main/resources/interpreter-setting.json b/spark/interpreter/src/main/resources/interpreter-setting.json
index 33d8e8e..29a3644 100644
--- a/spark/interpreter/src/main/resources/interpreter-setting.json
+++ b/spark/interpreter/src/main/resources/interpreter-setting.json
@@ -119,6 +119,13 @@
       "description": "Whether to hide spark ui in zeppelin ui",
       "type": "checkbox"
     },
+    "spark.webui.yarn.useProxy": {
+      "envName": null,
+      "propertyName": "",
+      "defaultValue": false,
+      "description": "whether use yarn proxy url as spark weburl, e.g. http://localhost:8088/proxy/application_1583396598068_0004",
+      "type": "checkbox"
+    },
     "zeppelin.spark.scala.color": {
       "envName": null,
       "propertyName": "zeppelin.spark.scala.color",
diff --git a/spark/spark-scala-parent/src/main/scala/org/apache/zeppelin/spark/BaseSparkScalaInterpreter.scala b/spark/spark-scala-parent/src/main/scala/org/apache/zeppelin/spark/BaseSparkScalaInterpreter.scala
index 104e543..2d6a1da 100644
--- a/spark/spark-scala-parent/src/main/scala/org/apache/zeppelin/spark/BaseSparkScalaInterpreter.scala
+++ b/spark/spark-scala-parent/src/main/scala/org/apache/zeppelin/spark/BaseSparkScalaInterpreter.scala
@@ -24,6 +24,9 @@ import java.nio.file.Paths
 import java.util.concurrent.atomic.AtomicInteger

 import org.apache.commons.lang3.StringUtils
+import org.apache.hadoop.yarn.client.api.YarnClient
+import org.apache.hadoop.yarn.conf.YarnConfiguration
+import org.apache.hadoop.yarn.util.ConverterUtils
 import org.apache.spark.sql.SQLContext
 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.zeppelin.interpreter.util.InterpreterOutputStream
@@ -277,6 +280,7 @@ abstract class BaseSparkScalaInterpreter(val conf: SparkConf,
       case Some(url) => sparkUrl = url
       case None =>
     }
+    useYarnProxyURLIfNeeded()

     bind("spark", sparkSession.getClass.getCanonicalName, sparkSession, List("""@transient"""))
     bind("sc", "org.apache.spark.SparkContext", sc, List("""@transient"""))
@@ -303,6 +307,8 @@ abstract class BaseSparkScalaInterpreter(val conf: SparkConf,
     if (StringUtils.isBlank(webUiUrl)) {
       webUiUrl = sparkUrl;
     }
+    useYarnProxyURLIfNeeded()
+
     sparkShims.setupSparkListener(sc.master, webUiUrl, InterpreterContext.get)

     z = new SparkZeppelinContext(sc, sparkShims,
@@ -311,6 +317,24 @@ abstract class BaseSparkScalaInterpreter(val conf: SparkConf,
     bind("z", z.getClass.getCanonicalName, z, List("""@transient"""))
   }

+  private def useYarnProxyURLIfNeeded() {
+    if (properties.getProperty("spark.webui.yarn.useProxy", "false").toBoolean) {
+      if (sc.getConf.get("spark.master").startsWith("yarn")) {
+        val appId = sc.applicationId
+        val yarnClient = YarnClient.createYarnClient
+        val yarnConf = new YarnConfiguration()
+        // disable timeline service as we only query yarn app here.
+        // Otherwise we may hit this kind of ERROR:
+        // java.lang.ClassNotFoundException: com.sun.jersey.api.client.config.ClientConfig
+        yarnConf.set("yarn.timeline-service.enabled", "false")
+        yarnClient.init(yarnConf)
+        yarnClient.start()
+        val appReport = yarnClient.getApplicationReport(ConverterUtils.toApplicationId(appId))
+        this.sparkUrl = appReport.getTrackingUrl
+      }
+    }
+  }
+
   private def isSparkSessionPresent(): Boolean = {
     try {
       Class.forName("org.apache.spark.sql.SparkSession")
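For reference, the lookup that `useYarnProxyURLIfNeeded()` performs can be tried outside Zeppelin. The sketch below is a minimal standalone version of the same YARN tracking-URL query, assuming the Hadoop YARN client libraries are on the classpath; the object name `YarnProxyUrlLookup` and the hard-coded application id are illustrative placeholders (in Zeppelin the id comes from `sc.applicationId`).

```scala
import org.apache.hadoop.yarn.client.api.YarnClient
import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.apache.hadoop.yarn.util.ConverterUtils

// Minimal sketch: resolve the YARN ResourceManager proxy URL for an application.
object YarnProxyUrlLookup {
  def main(args: Array[String]): Unit = {
    // Placeholder app id; Zeppelin obtains this from sc.applicationId.
    val appIdStr = "application_1583396598068_0004"

    val yarnConf = new YarnConfiguration()
    // Disable the timeline service, as the commit does: we only need the
    // application report, and the timeline client can fail with a jersey
    // ClassNotFoundException when those classes are not on the classpath.
    yarnConf.set("yarn.timeline-service.enabled", "false")

    val yarnClient = YarnClient.createYarnClient
    yarnClient.init(yarnConf)
    yarnClient.start()
    try {
      val report = yarnClient.getApplicationReport(ConverterUtils.toApplicationId(appIdStr))
      // getTrackingUrl returns the RM proxy link, e.g.
      // http://localhost:8088/proxy/application_1583396598068_0004
      println(report.getTrackingUrl)
    } finally {
      yarnClient.stop()
    }
  }
}
```

The proxy link returned by `getTrackingUrl` is routed through the ResourceManager, which is why it stays reachable even when direct access to the Spark driver's web UI port is blocked.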