Repository: spark
Updated Branches:
  refs/heads/master 2585d2b32 -> 37c617e4f


[MINOR][SQL][DOCS] Add notes of the deterministic assumption on UDF functions

## What changes were proposed in this pull request?

Spark assumes that user-defined functions (UDFs) are deterministic. This PR adds
explicit notes about that assumption to the UDF-related API documentation.

## How was this patch tested?

This patch only changes documentation, so it needs no new tests.

Author: Dongjoon Hyun <[email protected]>

Closes #13087 from dongjoon-hyun/SPARK-15282.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/37c617e4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/37c617e4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/37c617e4

Branch: refs/heads/master
Commit: 37c617e4f580482b59e1abbe3c0c27c7125cf605
Parents: 2585d2b
Author: Dongjoon Hyun <[email protected]>
Authored: Mon May 23 14:19:25 2016 -0700
Committer: Michael Armbrust <[email protected]>
Committed: Mon May 23 14:19:25 2016 -0700

----------------------------------------------------------------------
 python/pyspark/sql/functions.py                                   | 3 +++
 .../org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala      | 1 +
 sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala     | 3 +++
 sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala   | 3 +++
 .../src/main/scala/org/apache/spark/sql/UDFRegistration.scala     | 1 +
 .../org/apache/spark/sql/expressions/UserDefinedFunction.scala    | 3 +++
 .../main/scala/org/apache/spark/sql/internal/SessionState.scala   | 1 +
 7 files changed, 15 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/37c617e4/python/pyspark/sql/functions.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index dac842c..716b16f 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1756,6 +1756,9 @@ class UserDefinedFunction(object):
 @since(1.3)
 def udf(f, returnType=StringType()):
     """Creates a :class:`Column` expression representing a user defined function (UDF).
+    Note that the user-defined functions must be deterministic. Due to optimization,
+    duplicate invocations may be eliminated or the function may even be invoked more times than
+    it is present in the query.
 
     >>> from pyspark.sql.types import IntegerType
     >>> slen = udf(lambda s: len(s), IntegerType())

http://git-wip-us.apache.org/repos/asf/spark/blob/37c617e4/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
index 0038cf6..2139064 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala
@@ -23,6 +23,7 @@ import org.apache.spark.sql.types.DataType
 
 /**
  * User-defined function.
+ * Note that the user-defined functions must be deterministic.
  * @param function  The user defined scala function to run.
  *                  Note that if you use primitive parameters, you are not able to check if it is
  *                  null or not, and the UDF will return null for you if the primitive input is

http://git-wip-us.apache.org/repos/asf/spark/blob/37c617e4/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
index 14d12d3..7013e31 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -199,6 +199,9 @@ class SQLContext private[sql](
 
   /**
    * A collection of methods for registering user-defined functions (UDF).
+   * Note that the user-defined functions must be deterministic. Due to optimization,
+   * duplicate invocations may be eliminated or the function may even be invoked more times than
+   * it is present in the query.
    *
    * The following example registers a Scala closure as UDF:
    * {{{

http://git-wip-us.apache.org/repos/asf/spark/blob/37c617e4/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
index f697769..5c87c84 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
@@ -145,6 +145,9 @@ class SparkSession private(
 
   /**
    * A collection of methods for registering user-defined functions (UDF).
+   * Note that the user-defined functions must be deterministic. Due to optimization,
+   * duplicate invocations may be eliminated or the function may even be invoked more times than
+   * it is present in the query.
    *
    * The following example registers a Scala closure as UDF:
    * {{{

http://git-wip-us.apache.org/repos/asf/spark/blob/37c617e4/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala
index 3a043dc..b006236 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala
@@ -32,6 +32,7 @@ import org.apache.spark.sql.types.DataType
 
 /**
  * Functions for registering user-defined functions. Use [[SQLContext.udf]] to access this.
+ * Note that the user-defined functions must be deterministic.
  *
  * @since 1.3.0
  */

http://git-wip-us.apache.org/repos/asf/spark/blob/37c617e4/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala
index bd35d19..49fdec5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala
@@ -25,6 +25,9 @@ import org.apache.spark.sql.types.DataType
 
 /**
  * A user-defined function. To create one, use the `udf` functions in [[functions]].
+ * Note that the user-defined functions must be deterministic. Due to optimization,
+ * duplicate invocations may be eliminated or the function may even be invoked more times than
+ * it is present in the query.
  * As an example:
  * {{{
  *   // Defined a UDF that returns true or false based on some numeric score.

http://git-wip-us.apache.org/repos/asf/spark/blob/37c617e4/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
index 939b919..c9cc2ba 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
@@ -100,6 +100,7 @@ private[sql] class SessionState(sparkSession: SparkSession) {
 
   /**
    * Interface exposed to the user for registering user-defined functions.
+   * Note that the user-defined functions must be deterministic.
    */
   lazy val udf: UDFRegistration = new UDFRegistration(functionRegistry)
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to