This is an automated email from the ASF dual-hosted git repository. zjffdu pushed a commit to branch branch-0.9 in repository https://gitbox.apache.org/repos/asf/zeppelin.git
The following commit(s) were added to refs/heads/branch-0.9 by this push: new d8879df [ZEPPELIN-4934] add jars to sys.path as done in pyspark shell d8879df is described below commit d8879df0b8158a95d411c79bccd06dc231aa290a Author: Dalitso Banda <daba...@microsoft.com> AuthorDate: Sun Jul 5 19:50:47 2020 -0700 [ZEPPELIN-4934] add jars to sys.path as done in pyspark shell ### What is this PR for? A few sentences describing the overall goals of the pull request's commits. First time? Check out the contributing guide - https://zeppelin.apache.org/contribution/contributions.html Some jars include python code. These must be added to sys.path to support importing those packages in python. pyspark includes these jars in the path at context initialization. From the spark repo see: * core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala * python/pyspark/context.py * python/pyspark/shell.py In pyspark, the jars passed with "--packages" are passed onto "spark.submit.pyFiles" (in the prepareSubmitEnvironment function) and then added to sys.path by the context initialization. ### What type of PR is it? Bug Fix ### What is the Jira issue? https://issues.apache.org/jira/browse/ZEPPELIN-4934 ### How should this be tested? * First time? Set up Travis CI as described on https://zeppelin.apache.org/contribution/contributions.html#continuous-integration * Strongly recommended: add automated unit tests for any new or changed behavior * Outline any manual steps to test the PR here. ### Screenshots (if appropriate) ### Questions: * Do the license files need updating? * Are there breaking changes for older versions? * Does this need documentation? 
Author: Dalitso Banda <daba...@microsoft.com> Closes #3838 from dbanda/master and squashes the following commits: ba88964b3 [Dalitso Banda] check path instead of explicit import for compatibility db04a2cc0 [Dalitso Banda] fixing spark < 2.2 does not work with spark.jars.packages ac7df0d4c [Dalitso Banda] compatibility with spark 1 fix f98125145 [Dalitso Banda] unit test for pyspark jar imports f0cf1e140 [Dalitso Banda] add jars to sys.path as done in pyspark shell (cherry picked from commit f0669e5fb1199f98b1b94f00ab02f9488ba463ca) Signed-off-by: Jeff Zhang <zjf...@apache.org> --- .../src/main/resources/python/zeppelin_ipyspark.py | 5 +++++ .../src/main/resources/python/zeppelin_pyspark.py | 5 +++++ .../zeppelin/integration/ZeppelinSparkClusterTest.java | 14 ++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/spark/interpreter/src/main/resources/python/zeppelin_ipyspark.py b/spark/interpreter/src/main/resources/python/zeppelin_ipyspark.py index 4b9f67f..d8ec931 100644 --- a/spark/interpreter/src/main/resources/python/zeppelin_ipyspark.py +++ b/spark/interpreter/src/main/resources/python/zeppelin_ipyspark.py @@ -73,3 +73,8 @@ class IPySparkZeppelinContext(PyZeppelinContext): super(IPySparkZeppelinContext, self).show(obj, **kwargs) z = __zeppelin__ = IPySparkZeppelinContext(intp.getZeppelinContext(), gateway) + +# add jars to path +import sys +jarlist = map(lambda url: url.replace("file:/", "/"), (conf.get("spark.jars") or "").split(",")) +sys.path.extend(filter(lambda jar: jar not in sys.path, jarlist)) diff --git a/spark/interpreter/src/main/resources/python/zeppelin_pyspark.py b/spark/interpreter/src/main/resources/python/zeppelin_pyspark.py index 9a02cd2..b710721 100644 --- a/spark/interpreter/src/main/resources/python/zeppelin_pyspark.py +++ b/spark/interpreter/src/main/resources/python/zeppelin_pyspark.py @@ -71,3 +71,8 @@ class PySparkZeppelinContext(PyZeppelinContext): z = __zeppelin__ = PySparkZeppelinContext(intp.getZeppelinContext(), gateway) 
__zeppelin__._setup_matplotlib() + +# add jars to path +import sys +jarlist = map(lambda url: url.replace("file:/", "/"), (conf.get("spark.jars") or "").split(",")) +sys.path.extend(filter(lambda jar: jar not in sys.path, jarlist)) diff --git a/zeppelin-interpreter-integration/src/test/java/org/apache/zeppelin/integration/ZeppelinSparkClusterTest.java b/zeppelin-interpreter-integration/src/test/java/org/apache/zeppelin/integration/ZeppelinSparkClusterTest.java index 838d6f8..2205b21 100644 --- a/zeppelin-interpreter-integration/src/test/java/org/apache/zeppelin/integration/ZeppelinSparkClusterTest.java +++ b/zeppelin-interpreter-integration/src/test/java/org/apache/zeppelin/integration/ZeppelinSparkClusterTest.java @@ -1038,6 +1038,20 @@ public abstract class ZeppelinSparkClusterTest extends AbstractTestRestApi { p1.setText("%spark\nimport com.databricks.spark.csv._"); note.run(p1.getId(), true); assertEquals(Status.FINISHED, p1.getStatus()); + + // test pyspark imports path + Paragraph p2 = note.addNewParagraph(anonymous); + p2.setText("%spark.pyspark\nimport sys\nsys.path"); + note.run(p2.getId(), true); + assertEquals(Status.FINISHED, p2.getStatus()); + assertTrue(p2.getReturn().toString().contains("databricks_spark")); + + Paragraph p3 = note.addNewParagraph(anonymous); + p3.setText("%spark.ipyspark\nimport sys\nsys.path"); + note.run(p3.getId(), true); + assertEquals(Status.FINISHED, p3.getStatus()); + assertTrue(p3.getReturn().toString().contains("databricks_spark")); + } finally { if (null != note) { TestUtils.getInstance(Notebook.class).removeNote(note, anonymous);