Repository: spark Updated Branches: refs/heads/master 2585d2b32 -> 37c617e4f
[MINOR][SQL][DOCS] Add notes of the deterministic assumption on UDF functions ## What changes were proposed in this pull request? Spark assumes that UDF functions are deterministic. This PR adds explicit notes about that. ## How was this patch tested? It's only about docs. Author: Dongjoon Hyun <[email protected]> Closes #13087 from dongjoon-hyun/SPARK-15282. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/37c617e4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/37c617e4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/37c617e4 Branch: refs/heads/master Commit: 37c617e4f580482b59e1abbe3c0c27c7125cf605 Parents: 2585d2b Author: Dongjoon Hyun <[email protected]> Authored: Mon May 23 14:19:25 2016 -0700 Committer: Michael Armbrust <[email protected]> Committed: Mon May 23 14:19:25 2016 -0700 ---------------------------------------------------------------------- python/pyspark/sql/functions.py | 3 +++ .../org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala | 1 + sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala | 3 +++ sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala | 3 +++ .../src/main/scala/org/apache/spark/sql/UDFRegistration.scala | 1 + .../org/apache/spark/sql/expressions/UserDefinedFunction.scala | 3 +++ .../main/scala/org/apache/spark/sql/internal/SessionState.scala | 1 + 7 files changed, 15 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/37c617e4/python/pyspark/sql/functions.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index dac842c..716b16f 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1756,6 +1756,9 @@ class UserDefinedFunction(object): @since(1.3) def udf(f, returnType=StringType()): """Creates a :class:`Column` expression representing a user defined function (UDF). + Note that the user-defined functions must be deterministic. Due to optimization, + duplicate invocations may be eliminated or the function may even be invoked more times than + it is present in the query. >>> from pyspark.sql.types import IntegerType >>> slen = udf(lambda s: len(s), IntegerType()) http://git-wip-us.apache.org/repos/asf/spark/blob/37c617e4/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala index 0038cf6..2139064 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.types.DataType /** * User-defined function. + * Note that the user-defined functions must be deterministic. * @param function The user defined scala function to run. * Note that if you use primitive parameters, you are not able to check if it is * null or not, and the UDF will return null for you if the primitive input is http://git-wip-us.apache.org/repos/asf/spark/blob/37c617e4/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 14d12d3..7013e31 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -199,6 +199,9 @@ class SQLContext private[sql]( /** * A collection of methods for registering user-defined functions (UDF). + * Note that the user-defined functions must be deterministic. Due to optimization, + * duplicate invocations may be eliminated or the function may even be invoked more times than + * it is present in the query. * * The following example registers a Scala closure as UDF: * {{{ http://git-wip-us.apache.org/repos/asf/spark/blob/37c617e4/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index f697769..5c87c84 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -145,6 +145,9 @@ class SparkSession private( /** * A collection of methods for registering user-defined functions (UDF). + * Note that the user-defined functions must be deterministic. Due to optimization, + * duplicate invocations may be eliminated or the function may even be invoked more times than + * it is present in the query. * * The following example registers a Scala closure as UDF: * {{{ http://git-wip-us.apache.org/repos/asf/spark/blob/37c617e4/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index 3a043dc..b006236 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.types.DataType /** * Functions for registering user-defined functions. Use [[SQLContext.udf]] to access this. + * Note that the user-defined functions must be deterministic. * * @since 1.3.0 */ http://git-wip-us.apache.org/repos/asf/spark/blob/37c617e4/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala index bd35d19..49fdec5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala @@ -25,6 +25,9 @@ import org.apache.spark.sql.types.DataType /** * A user-defined function. To create one, use the `udf` functions in [[functions]]. + * Note that the user-defined functions must be deterministic. Due to optimization, + * duplicate invocations may be eliminated or the function may even be invoked more times than + * it is present in the query. * As an example: * {{{ * // Defined a UDF that returns true or false based on some numeric score. http://git-wip-us.apache.org/repos/asf/spark/blob/37c617e4/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala index 939b919..c9cc2ba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala @@ -100,6 +100,7 @@ private[sql] class SessionState(sparkSession: SparkSession) { /** * Interface exposed to the user for registering user-defined functions. + * Note that the user-defined functions must be deterministic. */ lazy val udf: UDFRegistration = new UDFRegistration(functionRegistry) --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
