Repository: spark Updated Branches: refs/heads/master 5f74148bb -> 330c3e33b
[SPARK-13330][PYSPARK] PYTHONHASHSEED is not propagated to python worker ## What changes were proposed in this pull request? self.environment will be propagated to executor. Should set PYTHONHASHSEED as long as the python version is greater than 3.3 ## How was this patch tested? Manually tested it. Author: Jeff Zhang <[email protected]> Closes #11211 from zjffdu/SPARK-13330. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/330c3e33 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/330c3e33 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/330c3e33 Branch: refs/heads/master Commit: 330c3e33bd10f035f49cf3d13357eb2d6d90dabc Parents: 5f74148 Author: Jeff Zhang <[email protected]> Authored: Fri Feb 24 15:04:42 2017 -0800 Committer: Holden Karau <[email protected]> Committed: Fri Feb 24 15:04:42 2017 -0800 ---------------------------------------------------------------------- core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala | 1 + python/pyspark/context.py | 6 ++---- python/pyspark/rdd.py | 3 ++- .../src/main/scala/org/apache/spark/deploy/yarn/Client.scala | 1 + 4 files changed, 6 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/330c3e33/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala index 0b1cec2..a8f732b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala @@ -85,6 +85,7 @@ object PythonRunner { // pass conf spark.pyspark.python to python process, the only way to pass info to // python process is through environment variable. 
sparkConf.get(PYSPARK_PYTHON).foreach(env.put("PYSPARK_PYTHON", _)) + sys.env.get("PYTHONHASHSEED").foreach(env.put("PYTHONHASHSEED", _)) builder.redirectErrorStream(true) // Ugly but needed for stdout and stderr to synchronize try { val process = builder.start() http://git-wip-us.apache.org/repos/asf/spark/blob/330c3e33/python/pyspark/context.py ---------------------------------------------------------------------- diff --git a/python/pyspark/context.py b/python/pyspark/context.py index ac4b2b0..2961cda 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -173,10 +173,8 @@ class SparkContext(object): if k.startswith("spark.executorEnv."): varName = k[len("spark.executorEnv."):] self.environment[varName] = v - if sys.version >= '3.3' and 'PYTHONHASHSEED' not in os.environ: - # disable randomness of hash of string in worker, if this is not - # launched by spark-submit - self.environment["PYTHONHASHSEED"] = "0" + + self.environment["PYTHONHASHSEED"] = os.environ.get("PYTHONHASHSEED", "0") # Create the Java SparkContext through Py4J self._jsc = jsc or self._initialize_context(self._conf._jconf) http://git-wip-us.apache.org/repos/asf/spark/blob/330c3e33/python/pyspark/rdd.py ---------------------------------------------------------------------- diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index b384b2b..a5e6e2b 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -68,7 +68,8 @@ def portable_hash(x): >>> portable_hash((None, 1)) & 0xffffffff 219750521 """ - if sys.version >= '3.3' and 'PYTHONHASHSEED' not in os.environ: + + if sys.version_info >= (3, 2, 3) and 'PYTHONHASHSEED' not in os.environ: raise Exception("Randomness of hash of string should be disabled via PYTHONHASHSEED") if x is None: http://git-wip-us.apache.org/repos/asf/spark/blob/330c3e33/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala ---------------------------------------------------------------------- diff --git 
a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index fa99cd3..e86bd54 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -817,6 +817,7 @@ private[spark] class Client( sys.env.get(envname).foreach(env(envname) = _) } } + sys.env.get("PYTHONHASHSEED").foreach(env.put("PYTHONHASHSEED", _)) } sys.env.get(ENV_DIST_CLASSPATH).foreach { dcp => --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
