This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 73a2ebb901e5 [SPARK-50696][PYTHON] Optimize Py4J call for DDL parse method
73a2ebb901e5 is described below
commit 73a2ebb901e5eabb590876eb54b8549d4c5ab2a3
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Mon Dec 30 09:46:34 2024 +0800
[SPARK-50696][PYTHON] Optimize Py4J call for DDL parse method
### What changes were proposed in this pull request?
Optimize the DDL parse method in Python
### Why are the changes needed?
to reduce the Py4J calls
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Existing tests
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #49320 from zhengruifeng/py_opt_ddl.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/sql/types.py | 31 +++-------------------
.../spark/sql/api/python/PythonSQLUtils.scala | 23 ++++++++++++++++
2 files changed, 26 insertions(+), 28 deletions(-)
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 303c0c3c8bd0..93ac6655b886 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -1916,34 +1916,9 @@ def _parse_datatype_string(s: str) -> DataType:
from py4j.java_gateway import JVMView
sc = get_active_spark_context()
-
- def from_ddl_schema(type_str: str) -> DataType:
- return _parse_datatype_json_string(
- cast(JVMView, sc._jvm)
- .org.apache.spark.sql.types.StructType.fromDDL(type_str)
- .json()
- )
-
- def from_ddl_datatype(type_str: str) -> DataType:
- return _parse_datatype_json_string(
- cast(JVMView, sc._jvm)
- .org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType(type_str)
- .json()
- )
-
- try:
- # DDL format, "fieldname datatype, fieldname datatype".
- return from_ddl_schema(s)
- except Exception as e:
- try:
- # For backwards compatibility, "integer", "struct<fieldname: datatype>" and etc.
- return from_ddl_datatype(s)
- except BaseException:
- try:
- # For backwards compatibility, "fieldname: datatype, fieldname: datatype" case.
- return from_ddl_datatype("struct<%s>" % s.strip())
- except BaseException:
- raise e
+ return _parse_datatype_json_string(
+ cast(JVMView, sc._jvm).org.apache.spark.sql.api.python.PythonSQLUtils.ddlToJson(s)
+ )
def _parse_datatype_json_string(json_string: str) -> DataType:
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala
index e33fe38b160a..49fe494903cd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala
@@ -148,6 +148,29 @@ private[sql] object PythonSQLUtils extends Logging {
DataType.fromJson(json).asInstanceOf[StructType].toDDL
}
+ def ddlToJson(ddl: String): String = {
+ val dataType = try {
+ // DDL format, "fieldname datatype, fieldname datatype".
+ StructType.fromDDL(ddl)
+ } catch {
+ case e: Throwable =>
+ try {
+ // For backwards compatibility, "integer", "struct<fieldname: datatype>" and etc.
+ parseDataType(ddl)
+ } catch {
+ case _: Throwable =>
+ try {
+ // For backwards compatibility, "fieldname: datatype, fieldname: datatype" case.
+ parseDataType(s"struct<${ddl.trim}>")
+ } catch {
+ case _: Throwable =>
+ throw e
+ }
+ }
+ }
+ dataType.json
+ }
+
def unresolvedNamedLambdaVariable(name: String): Column =
Column(internal.UnresolvedNamedLambdaVariable.apply(name))
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]