Repository: spark
Updated Branches:
  refs/heads/master 5f74148bb -> 330c3e33b


[SPARK-13330][PYSPARK] PYTHONHASHSEED is not propagated to python worker

## What changes were proposed in this pull request?
self.environment is propagated to the executors, so PYTHONHASHSEED should be
set in it whenever the Python version is 3.3 or later.

## How was this patch tested?
Manually tested it.

Author: Jeff Zhang <[email protected]>

Closes #11211 from zjffdu/SPARK-13330.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/330c3e33
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/330c3e33
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/330c3e33

Branch: refs/heads/master
Commit: 330c3e33bd10f035f49cf3d13357eb2d6d90dabc
Parents: 5f74148
Author: Jeff Zhang <[email protected]>
Authored: Fri Feb 24 15:04:42 2017 -0800
Committer: Holden Karau <[email protected]>
Committed: Fri Feb 24 15:04:42 2017 -0800

----------------------------------------------------------------------
 core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala | 1 +
 python/pyspark/context.py                                      | 6 ++----
 python/pyspark/rdd.py                                          | 3 ++-
 .../src/main/scala/org/apache/spark/deploy/yarn/Client.scala   | 1 +
 4 files changed, 6 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/330c3e33/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala
index 0b1cec2..a8f732b 100644
--- a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala
@@ -85,6 +85,7 @@ object PythonRunner {
     // pass conf spark.pyspark.python to python process, the only way to pass info to
     // python process is through environment variable.
     sparkConf.get(PYSPARK_PYTHON).foreach(env.put("PYSPARK_PYTHON", _))
+    sys.env.get("PYTHONHASHSEED").foreach(env.put("PYTHONHASHSEED", _))
     builder.redirectErrorStream(true) // Ugly but needed for stdout and stderr to synchronize
     try {
       val process = builder.start()

http://git-wip-us.apache.org/repos/asf/spark/blob/330c3e33/python/pyspark/context.py
----------------------------------------------------------------------
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index ac4b2b0..2961cda 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -173,10 +173,8 @@ class SparkContext(object):
             if k.startswith("spark.executorEnv."):
                 varName = k[len("spark.executorEnv."):]
                 self.environment[varName] = v
-        if sys.version >= '3.3' and 'PYTHONHASHSEED' not in os.environ:
-            # disable randomness of hash of string in worker, if this is not
-            # launched by spark-submit
-            self.environment["PYTHONHASHSEED"] = "0"
+
+        self.environment["PYTHONHASHSEED"] = os.environ.get("PYTHONHASHSEED", "0")
 
         # Create the Java SparkContext through Py4J
         self._jsc = jsc or self._initialize_context(self._conf._jconf)

http://git-wip-us.apache.org/repos/asf/spark/blob/330c3e33/python/pyspark/rdd.py
----------------------------------------------------------------------
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index b384b2b..a5e6e2b 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -68,7 +68,8 @@ def portable_hash(x):
     >>> portable_hash((None, 1)) & 0xffffffff
     219750521
     """
-    if sys.version >= '3.3' and 'PYTHONHASHSEED' not in os.environ:
+
+    if sys.version_info >= (3, 2, 3) and 'PYTHONHASHSEED' not in os.environ:
         raise Exception("Randomness of hash of string should be disabled via PYTHONHASHSEED")
 
     if x is None:

http://git-wip-us.apache.org/repos/asf/spark/blob/330c3e33/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
----------------------------------------------------------------------
diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
index fa99cd3..e86bd54 100644
--- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
+++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala
@@ -817,6 +817,7 @@ private[spark] class Client(
           sys.env.get(envname).foreach(env(envname) = _)
         }
       }
+      sys.env.get("PYTHONHASHSEED").foreach(env.put("PYTHONHASHSEED", _))
     }
 
     sys.env.get(ENV_DIST_CLASSPATH).foreach { dcp =>


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to