This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 593d3d86e50 [SPARK-43209][CONNECT][PYTHON] Migrate Expression errors into error class
593d3d86e50 is described below
commit 593d3d86e5071dde4fe2071b4d5f0a4de0f4bbcb
Author: itholic <[email protected]>
AuthorDate: Mon Apr 24 13:27:40 2023 +0800
[SPARK-43209][CONNECT][PYTHON] Migrate Expression errors into error class
### What changes were proposed in this pull request?
This PR proposes to migrate Expression errors into the PySpark error framework.
### Why are the changes needed?
To leverage the PySpark error framework so that we can provide more
actionable and consistent errors for users.
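As a rough illustration (not part of this patch), the pattern being adopted looks like the sketch below. It reuses the `VALUE_OUT_OF_BOUND` error class added in this diff; the `getErrorClass`/`getMessageParameters` accessors are assumed to come from the PySpark error framework's base exception.
```python
from pyspark.errors import PySparkValueError

# Hypothetical example: instead of `raise ValueError(f"integer {value} out of bounds")`,
# the raised error now carries a stable error class plus structured parameters.
try:
    raise PySparkValueError(
        error_class="VALUE_OUT_OF_BOUND",
        message_parameters={
            "arg_name": "value",
            "min": "-9223372036854775808",
            "max": "9223372036854775807",
        },
    )
except PySparkValueError as e:
    # Callers and tests can assert on the class and parameters
    # rather than matching a free-form message string.
    assert e.getErrorClass() == "VALUE_OUT_OF_BOUND"
    assert e.getMessageParameters()["arg_name"] == "value"
```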
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
The existing CI should pass.
Closes #40869 from itholic/error_connect_expr.
Authored-by: itholic <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/errors/error_classes.py | 25 +++++++++
python/pyspark/sql/connect/expressions.py | 63 ++++++++++++++++++----
.../sql/tests/connect/test_connect_column.py | 23 +++++---
.../sql/tests/connect/test_connect_function.py | 13 +++--
4 files changed, 100 insertions(+), 24 deletions(-)
diff --git a/python/pyspark/errors/error_classes.py b/python/pyspark/errors/error_classes.py
index d7c1b00d115..4425ed79928 100644
--- a/python/pyspark/errors/error_classes.py
+++ b/python/pyspark/errors/error_classes.py
@@ -54,11 +54,21 @@ ERROR_CLASSES_JSON = """
"Cannot convert column into bool: please use '&' for 'and', '|' for
'or', '~' for 'not' when building DataFrame boolean expressions."
]
},
+ "CANNOT_INFER_ARRAY_TYPE": {
+ "message": [
+ "Can not infer Array Type from an list with None as the first element."
+ ]
+ },
"CANNOT_PARSE_DATATYPE": {
"message": [
"Unable to parse datatype from schema. <error>."
]
},
+ "CANNOT_PROVIDE_METADATA": {
+ "message": [
+ "metadata can only be provided for a single column."
+ ]
+ },
"CANNOT_SET_TOGETHER": {
"message": [
"<arg_list> should not be set together."
@@ -319,6 +329,16 @@ ERROR_CLASSES_JSON = """
"State is either not defined or has already been removed."
]
},
+ "UNSUPPORTED_DATA_TYPE" : {
+ "message" : [
+ "Unsupported DataType `<data_type>`."
+ ]
+ },
+ "UNSUPPORTED_LITERAL" : {
+ "message" : [
+ "Unsupported Literal '<literal>'."
+ ]
+ },
"UNSUPPORTED_NUMPY_ARRAY_SCALAR" : {
"message" : [
"The type of array scalar '<dtype>' is not supported."
@@ -354,6 +374,11 @@ ERROR_CLASSES_JSON = """
"Value for `<arg_name>` must be True, got '<arg_value>'."
]
},
+ "VALUE_OUT_OF_BOUND" : {
+ "message" : [
+ "Value for `<arg_name>` must be between <min> and <max>."
+ ]
+ },
"WRONG_NUM_ARGS_FOR_HIGHER_ORDER_FUNCTION" : {
"message" : [
"Function `<func_name>` should take between 1 and 3 arguments, but
provided function takes <num_args>."
diff --git a/python/pyspark/sql/connect/expressions.py b/python/pyspark/sql/connect/expressions.py
index 8ed365091fc..4fc0147d29b 100644
--- a/python/pyspark/sql/connect/expressions.py
+++ b/python/pyspark/sql/connect/expressions.py
@@ -73,6 +73,7 @@ from pyspark.sql.connect.types import (
pyspark_types_to_proto_types,
proto_schema_to_pyspark_data_type,
)
+from pyspark.errors import PySparkTypeError, PySparkValueError
if TYPE_CHECKING:
from pyspark.sql.connect.client import SparkConnectClient
@@ -160,7 +161,10 @@ class ColumnAlias(Expression):
return exp
else:
if self._metadata:
- raise ValueError("metadata can only be provided for a single
column")
+ raise PySparkValueError(
+ error_class="CANNOT_PROVIDE_METADATA",
+ message_parameters={},
+ )
exp = proto.Expression()
exp.alias.name.extend(self._alias)
exp.alias.expr.CopyFrom(self._parent.to_plan(session))
@@ -255,7 +259,10 @@ class LiteralExpression(Expression):
elif isinstance(dataType, ArrayType):
assert isinstance(value, list)
else:
- raise TypeError(f"Unsupported Data Type {dataType}")
+ raise PySparkTypeError(
+ error_class="UNSUPPORTED_DATA_TYPE",
+ message_parameters={"data_type": str(dataType)},
+ )
self._value = value
self._dataType = dataType
@@ -274,7 +281,14 @@ class LiteralExpression(Expression):
elif JVM_LONG_MIN <= value <= JVM_LONG_MAX:
return LongType()
else:
- raise ValueError(f"integer {value} out of bounds")
+ raise PySparkValueError(
+ error_class="VALUE_OUT_OF_BOUND",
+ message_parameters={
+ "arg_name": "value",
+ "min": str(JVM_LONG_MIN),
+ "max": str(JVM_SHORT_MAX),
+ },
+ )
elif isinstance(value, float):
return DoubleType()
elif isinstance(value, str):
@@ -297,15 +311,22 @@ class LiteralExpression(Expression):
# follow the 'infer_array_from_first_element' strategy in 'sql.types._infer_type'
# right now, it's dedicated for pyspark.ml params like array<...>, array<array<...>>
if len(value) == 0:
- raise TypeError("Can not infer Array Type from an empty list")
+ raise PySparkValueError(
+ error_class="CANNOT_BE_EMPTY",
+ message_parameters={"item": "value"},
+ )
first = value[0]
if first is None:
- raise TypeError(
- "Can not infer Array Type from an list with None as the
first element"
+ raise PySparkTypeError(
+ error_class="CANNOT_INFER_ARRAY_TYPE",
+ message_parameters={},
)
return ArrayType(LiteralExpression._infer_type(first), True)
- raise TypeError(f"Unsupported Data Type {type(value).__name__}")
+ raise PySparkTypeError(
+ error_class="UNSUPPORTED_DATA_TYPE",
+ message_parameters={"data_type": type(value).__name__},
+ )
@classmethod
def _from_value(cls, value: Any) -> "LiteralExpression":
@@ -366,7 +387,10 @@ class LiteralExpression(Expression):
assert elementType == dataType.elementType
return [LiteralExpression._to_value(v, elementType) for v in literal.array.elements]
- raise TypeError(f"Unsupported Literal Value {literal}")
+ raise PySparkTypeError(
+ error_class="UNSUPPORTED_LITERAL",
+ message_parameters={"literal": str(literal)},
+ )
def to_plan(self, session: "SparkConnectClient") -> "proto.Expression":
"""Converts the literal expression to the literal in proto."""
@@ -413,7 +437,10 @@ class LiteralExpression(Expression):
LiteralExpression(v, element_type).to_plan(session).literal
)
else:
- raise ValueError(f"Unsupported Data Type {self._dataType}")
+ raise PySparkTypeError(
+ error_class="UNSUPPORTED_DATA_TYPE",
+ message_parameters={"data_type": str(self._dataType)},
+ )
return expr
@@ -940,7 +967,14 @@ class WindowExpression(Expression):
elif JVM_INT_MIN <= start <= JVM_INT_MAX:
expr.window.frame_spec.lower.value.literal.integer = start
else:
- raise ValueError(f"start is out of bound: {start}")
+ raise PySparkValueError(
+ error_class="VALUE_OUT_OF_BOUND",
+ message_parameters={
+ "arg_name": "start",
+ "min": str(JVM_INT_MIN),
+ "max": str(JVM_INT_MAX),
+ },
+ )
end = self._windowSpec._frame._end
if end == 0:
@@ -950,7 +984,14 @@ class WindowExpression(Expression):
elif JVM_INT_MIN <= end <= JVM_INT_MAX:
expr.window.frame_spec.upper.value.literal.integer = end
else:
- raise ValueError(f"end is out of bound: {end}")
+ raise PySparkValueError(
+ error_class="VALUE_OUT_OF_BOUND",
+ message_parameters={
+ "arg_name": "end",
+ "min": str(JVM_INT_MIN),
+ "max": str(JVM_INT_MAX),
+ },
+ )
else:
expr.window.frame_spec.frame_type = (
diff --git a/python/pyspark/sql/tests/connect/test_connect_column.py b/python/pyspark/sql/tests/connect/test_connect_column.py
index 5703f8d2a3c..a62f4dcfebf 100644
--- a/python/pyspark/sql/tests/connect/test_connect_column.py
+++ b/python/pyspark/sql/tests/connect/test_connect_column.py
@@ -513,18 +513,25 @@ class SparkConnectColumnTests(SparkConnectSQLTestCase):
self.assertEqual(cdf1.schema, sdf1.schema)
self.assert_eq(cdf1.toPandas(), sdf1.toPandas())
- with self.assertRaisesRegex(
- ValueError,
- "integer 9223372036854775808 out of bounds",
- ):
+ # negative tests for out-of-bound values
+ with self.assertRaises(PySparkValueError) as pe:
cdf.select(CF.lit(JVM_LONG_MAX + 1)).show()
- with self.assertRaisesRegex(
- ValueError,
- "integer -9223372036854775809 out of bounds",
- ):
+ self.check_error(
+ exception=pe.exception,
+ error_class="VALUE_OUT_OF_BOUND",
+ message_parameters={"arg_name": "value", "min":
"-9223372036854775808", "max": "32767"},
+ )
+
+ with self.assertRaises(PySparkValueError) as pe:
cdf.select(CF.lit(JVM_LONG_MIN - 1)).show()
+ self.check_error(
+ exception=pe.exception,
+ error_class="VALUE_OUT_OF_BOUND",
+ message_parameters={"arg_name": "value", "min":
"-9223372036854775808", "max": "32767"},
+ )
+
def test_cast(self):
# SPARK-41412: test basic Column.cast
df = self.connect.read.table(self.tbl_name)
diff --git a/python/pyspark/sql/tests/connect/test_connect_function.py b/python/pyspark/sql/tests/connect/test_connect_function.py
index 57b39310fe8..38a7ed4df62 100644
--- a/python/pyspark/sql/tests/connect/test_connect_function.py
+++ b/python/pyspark/sql/tests/connect/test_connect_function.py
@@ -17,7 +17,7 @@
import os
import unittest
-from pyspark.errors import PySparkTypeError
+from pyspark.errors import PySparkTypeError, PySparkValueError
from pyspark.sql import SparkSession as PySparkSession
from pyspark.sql.types import StringType, StructType, StructField, ArrayType, IntegerType
from pyspark.testing.pandasutils import PandasOnSparkTestUtils
@@ -862,12 +862,15 @@ class SparkConnectFunctionTests(ReusedConnectTestCase, PandasOnSparkTestUtils, S
)
# check error
- with self.assertRaisesRegex(
- ValueError,
- "end is out of bound",
- ):
+ with self.assertRaises(PySparkValueError) as pe:
cdf.select(CF.sum("a").over(CW.orderBy("b").rowsBetween(0, (1 <<
33)))).show()
+ self.check_error(
+ exception=pe.exception,
+ error_class="VALUE_OUT_OF_BOUND",
+ message_parameters={"arg_name": "end", "min": "-2147483648",
"max": "2147483647"},
+ )
+
with self.assertRaises(PySparkTypeError) as pe:
cdf.select(CF.rank().over(cdf.a))