This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 593d3d86e50 [SPARK-43209][CONNECT][PYTHON] Migrate Expression errors into error class
593d3d86e50 is described below
commit 593d3d86e5071dde4fe2071b4d5f0a4de0f4bbcb
Author: itholic <[email protected]>
AuthorDate: Mon Apr 24 13:27:40 2023 +0800
[SPARK-43209][CONNECT][PYTHON] Migrate Expression errors into error class
### What changes were proposed in this pull request?
This PR proposes to migrate Expression errors into the PySpark error framework.
### Why are the changes needed?
To leverage the PySpark error framework so that we can provide more
actionable and consistent errors for users.
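As a rough illustration (not part of this patch), the pattern being adopted looks like the sketch below. It reuses the `VALUE_OUT_OF_BOUND` error class added in this diff; the `getErrorClass`/`getMessageParameters` accessors are assumed to come from the PySpark error framework's base exception.
```python
from pyspark.errors import PySparkValueError

# Hypothetical example: instead of `raise ValueError(f"integer {value} out of bounds")`,
# the raised error now carries a stable error class plus structured parameters.
try:
    raise PySparkValueError(
        error_class="VALUE_OUT_OF_BOUND",
        message_parameters={
            "arg_name": "value",
            "min": "-9223372036854775808",
            "max": "9223372036854775807",
        },
    )
except PySparkValueError as e:
    # Callers and tests can assert on the class and parameters
    # rather than matching a free-form message string.
    assert e.getErrorClass() == "VALUE_OUT_OF_BOUND"
    assert e.getMessageParameters()["arg_name"] == "value"
```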
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
The existing CI should pass.
Closes #40869 from itholic/error_connect_expr.
Authored-by: itholic <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/errors/error_classes.py | 25 +++++++++
python/pyspark/sql/connect/expressions.py | 63 ++++++++++++++++++----
.../sql/tests/connect/test_connect_column.py | 23 +++++---
.../sql/tests/connect/test_connect_function.py | 13 +++--
4 files changed, 100 insertions(+), 24 deletions(-)
diff --git a/python/pyspark/errors/error_classes.py b/python/pyspark/errors/error_classes.py
index d7c1b00d115..4425ed79928 100644
--- a/python/pyspark/errors/error_classes.py
+++ b/python/pyspark/errors/error_classes.py
@@ -54,11 +54,21 @@ ERROR_CLASSES_JSON = """
"Cannot convert column into bool: please use '&' for 'and', '|' for
'or', '~' for 'not' when building DataFrame boolean expressions."
]
},
+ "CANNOT_INFER_ARRAY_TYPE": {
+ "message": [
+ "Can not infer Array Type from an list with None as the first element."
+ ]
+ },
"CANNOT_PARSE_DATATYPE": {
"message": [
"Unable to parse datatype from schema. <error>."
]
},
+ "CANNOT_PROVIDE_METADATA": {
+ "message": [
+ "metadata can only be provided for a single column."
+ ]
+ },
"CANNOT_SET_TOGETHER": {
"message": [
"<arg_list> should not be set together."
@@ -319,6 +329,16 @@ ERROR_CLASSES_JSON = """
"State is either not defined or has already been removed."
]
},
+ "UNSUPPORTED_DATA_TYPE" : {
+ "message" : [
+ "Unsupported DataType `<data_type>`."
+ ]
+ },
+ "UNSUPPORTED_LITERAL" : {
+ "message" : [
+ "Unsupported Literal '<literal>'."
+ ]
+ },
"UNSUPPORTED_NUMPY_ARRAY_SCALAR" : {
"message" : [
"The type of array scalar '<dtype>' is not supported."
@@ -354,6 +374,11 @@ ERROR_CLASSES_JSON = """
"Value for `<arg_name>` must be True, got '<arg_value>'."
]
},
+ "VALUE_OUT_OF_BOUND" : {
+ "message" : [
+ "Value for `<arg_name>` must be between <min> and <max>."
+ ]
+ },
"WRONG_NUM_ARGS_FOR_HIGHER_ORDER_FUNCTION" : {
"message" : [
"Function `<func_name>` should take between 1 and 3 arguments, but
provided function takes <num_args>."
diff --git a/python/pyspark/sql/connect/expressions.py b/python/pyspark/sql/connect/expressions.py
index 8ed365091fc..4fc0147d29b 100644
--- a/python/pyspark/sql/connect/expressions.py
+++ b/python/pyspark/sql/connect/expressions.py
@@ -73,6 +73,7 @@ from pyspark.sql.connect.types import (
pyspark_types_to_proto_types,
proto_schema_to_pyspark_data_type,
)
+from pyspark.errors import PySparkTypeError, PySparkValueError
if TYPE_CHECKING:
from pyspark.sql.connect.client import SparkConnectClient
@@ -160,7 +161,10 @@ class ColumnAlias(Expression):
return exp
else:
if self._metadata:
- raise ValueError("metadata can only be provided for a single
column")
+ raise PySparkValueError(
+ error_class="CANNOT_PROVIDE_METADATA",
+ message_parameters={},
+ )
exp = proto.Expression()
exp.alias.name.extend(self._alias)
exp.alias.expr.CopyFrom(self._parent.to_plan(session))
@@ -255,7 +259,10 @@ class LiteralExpression(Expression):
elif isinstance(dataType, ArrayType):
assert isinstance(value, list)
else:
- raise TypeError(f"Unsupported Data Type {dataType}")
+ raise PySparkTypeError(
+ error_class="UNSUPPORTED_DATA_TYPE",
+ message_parameters={"data_type": str(dataType)},
+ )
self._value = value
self._dataType = dataType
@@ -274,7 +281,14 @@ class LiteralExpression(Expression):
elif JVM_LONG_MIN <= value <= JVM_LONG_MAX:
return LongType()
else:
- raise ValueError(f"integer {value} out of bounds")
+ raise PySparkValueError(
+ error_class="VALUE_OUT_OF_BOUND",
+ message_parameters={
+ "arg_name": "value",
+ "min": str(JVM_LONG_MIN),
+ "max": str(JVM_SHORT_MAX),
+ },
+ )
elif isinstance(value, float):
return DoubleType()
elif isinstance(value, str):
@@ -297,15 +311,22 @@ class LiteralExpression(Expression):
# follow the 'infer_array_from_first_element' strategy in 'sql.types._infer_type'
# right now, it's dedicated for pyspark.ml params like array<...>, array<array<...>>
if len(value) == 0:
- raise TypeError("Can not infer Array Type from an empty list")
+ raise PySparkValueError(
+ error_class="CANNOT_BE_EMPTY",
+ message_parameters={"item": "value"},
+ )
first = value[0]
if first is None:
- raise TypeError(
- "Can not infer Array Type from an list with None as the
first element"
+ raise PySparkTypeError(
+ error_class="CANNOT_INFER_ARRAY_TYPE",
+ message_parameters={},
)
return ArrayType(LiteralExpression._infer_type(first), True)
- raise TypeError(f"Unsupported Data Type {type(value).__name__}")
+ raise PySparkTypeError(
+ error_class="UNSUPPORTED_DATA_TYPE",
+ message_parameters={"data_type": type(value).__name__},
+ )
@classmethod
def _from_value(cls, value: Any) -> "LiteralExpression":
@@ -366,7 +387,10 @@ class LiteralExpression(Expression):
assert elementType == dataType.elementType
return [LiteralExpression._to_value(v, elementType) for v in literal.array.elements]
- raise TypeError(f"Unsupported Literal Value {literal}")
+ raise PySparkTypeError(
+ error_class="UNSUPPORTED_LITERAL",
+ message_parameters={"literal": str(literal)},
+ )
def to_plan(self, session: "SparkConnectClient") -> "proto.Expression":
"""Converts the literal expression to the literal in proto."""
@@ -413,7 +437,10 @@ class LiteralExpression(Expression):
LiteralExpression(v, element_type).to_plan(session).literal
)
else:
- raise ValueError(f"Unsupported Data Type {self._dataType}")
+ raise PySparkTypeError(
+ error_class="UNSUPPORTED_DATA_TYPE",
+ message_parameters={"data_type": str(self._dataType)},
+ )
return expr
@@ -940,7 +967,14 @@ class WindowExpression(Expression):
elif JVM_INT_MIN <= start <= JVM_INT_MAX:
expr.window.frame_spec.lower.value.literal.integer = start
else:
- raise ValueError(f"start is out of bound: {start}")
+ raise PySparkValueError(
+ error_class="VALUE_OUT_OF_BOUND",
+ message_parameters={
+ "arg_name": "start",
+ "min": str(JVM_INT_MIN),
+ "max": str(JVM_INT_MAX),
+ },
+ )
end = self._windowSpec._frame._end
if end == 0:
@@ -950,7 +984,14 @@ class WindowExpression(Expression):
elif JVM_INT_MIN <= end <= JVM_INT_MAX:
expr.window.frame_spec.upper.value.literal.integer = end
else:
- raise ValueError(f"end is out of bound: {end}")
+ raise PySparkValueError(
+ error_class="VALUE_OUT_OF_BOUND",
+ message_parameters={
+ "arg_name": "end",
+ "min": str(JVM_INT_MIN),
+ "max": str(JVM_INT_MAX),
+ },
+ )
else:
expr.window.frame_spec.frame_type = (
diff --git a/python/pyspark/sql/tests/connect/test_connect_column.py b/python/pyspark/sql/tests/connect/test_connect_column.py
index 5703f8d2a3c..a62f4dcfebf 100644
--- a/python/pyspark/sql/tests/connect/test_connect_column.py
+++ b/python/pyspark/sql/tests/connect/test_connect_column.py
@@ -513,18 +513,25 @@ class SparkConnectColumnTests(SparkConnectSQLTestCase):
self.assertEqual(cdf1.schema, sdf1.schema)
self.assert_eq(cdf1.toPandas(), sdf1.toPandas())
- with self.assertRaisesRegex(
- ValueError,
- "integer 9223372036854775808 out of bounds",
- ):
+ # negative tests for out-of-bound values
+ with self.assertRaises(PySparkValueError) as pe:
cdf.select(CF.lit(JVM_LONG_MAX + 1)).show()
- with self.assertRaisesRegex(
- ValueError,
- "integer -9223372036854775809 out of bounds",
- ):
+ self.check_error(
+ exception=pe.exception,
+ error_class="VALUE_OUT_OF_BOUND",
+ message_parameters={"arg_name": "value", "min":
"-9223372036854775808", "max": "32767"},
+ )
+
+ with self.assertRaises(PySparkValueError) as pe:
cdf.select(CF.lit(JVM_LONG_MIN - 1)).show()
+ self.check_error(
+ exception=pe.exception,
+ error_class="VALUE_OUT_OF_BOUND",
+ message_parameters={"arg_name": "value", "min":
"-9223372036854775808", "max": "32767"},
+ )
+
def test_cast(self):
# SPARK-41412: test basic Column.cast
df = self.connect.read.table(self.tbl_name)
diff --git a/python/pyspark/sql/tests/connect/test_connect_function.py b/python/pyspark/sql/tests/connect/test_connect_function.py
index 57b39310fe8..38a7ed4df62 100644
--- a/python/pyspark/sql/tests/connect/test_connect_function.py
+++ b/python/pyspark/sql/tests/connect/test_connect_function.py
@@ -17,7 +17,7 @@
import os
import unittest
-from pyspark.errors import PySparkTypeError
+from pyspark.errors import PySparkTypeError, PySparkValueError
from pyspark.sql import SparkSession as PySparkSession
from pyspark.sql.types import StringType, StructType, StructField, ArrayType, IntegerType
from pyspark.testing.pandasutils import PandasOnSparkTestUtils
@@ -862,12 +862,15 @@ class SparkConnectFunctionTests(ReusedConnectTestCase, PandasOnSparkTestUtils, S
)
# check error
- with self.assertRaisesRegex(
- ValueError,
- "end is out of bound",
- ):
+ with self.assertRaises(PySparkValueError) as pe:
cdf.select(CF.sum("a").over(CW.orderBy("b").rowsBetween(0, (1 <<
33)))).show()
+ self.check_error(
+ exception=pe.exception,
+ error_class="VALUE_OUT_OF_BOUND",
+ message_parameters={"arg_name": "end", "min": "-2147483648",
"max": "2147483647"},
+ )
+
with self.assertRaises(PySparkTypeError) as pe:
cdf.select(CF.rank().over(cdf.a))