This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 6a44b627f40f [SPARK-45811][PYTHON][DOCS] Refine docstring of `from_xml`
6a44b627f40f is described below
commit 6a44b627f40f501a171794416b6a6a9cae8893b5
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Tue Nov 7 10:01:50 2023 -0800
[SPARK-45811][PYTHON][DOCS] Refine docstring of `from_xml`
### What changes were proposed in this pull request?
This PR proposes to improve the docstring of `from_xml`.
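For reference, a minimal sketch of the usage the refined docstring documents (mirroring Example 2 from the new docstring, and assuming a running `SparkSession` bound to `spark`):

```python
import pyspark.sql.functions as sf

# Sample data with an XML string column.
df = spark.createDataFrame([(1, "<p><a>1</a></p>")], ("key", "value"))

# Parse the XML column with a DDL-formatted string schema.
df.select(sf.from_xml(df.value, "STRUCT<a: BIGINT>").alias("xml")).collect()
# [Row(xml=Row(a=1))]
```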
### Why are the changes needed?
For end users, and for better usability of PySpark.
### Does this PR introduce _any_ user-facing change?
Yes, it fixes the user-facing documentation.
### How was this patch tested?
Manually tested.
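For reference, one way to exercise the refined examples manually is a small doctest driver (a sketch, not the project's test harness; it assumes a local PySpark installation and injects a `spark` session into the doctest globals):

```python
import doctest

import pyspark.sql.functions as sf
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
# Run only the `from_xml` docstring examples; failures are printed to stdout.
doctest.run_docstring_examples(
    sf.from_xml, {"spark": spark}, verbose=True, name="from_xml"
)
spark.stop()
```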
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #43680 from HyukjinKwon/SPARK-45186.
Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/functions.py | 51 ++++++++++++++++++++++++++---------------
1 file changed, 32 insertions(+), 19 deletions(-)
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index dd6be89ab853..ef5c0ea073ab 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -13635,6 +13635,8 @@ def json_object_keys(col: "ColumnOrName") -> Column:
return _invoke_function_over_columns("json_object_keys", col)
+# TODO: Fix and add an example for StructType with Spark Connect
+# e.g., StructType([StructField("a", IntegerType())])
@_try_remote_functions
def from_xml(
col: "ColumnOrName",
@@ -13668,40 +13670,51 @@ def from_xml(
Examples
--------
- >>> from pyspark.sql.types import *
- >>> from pyspark.sql.functions import from_xml, schema_of_xml, lit
-
- StructType input with simple IntegerType.
+ Example 1: Parsing XML with a :class:`StructType` schema
+ >>> import pyspark.sql.functions as sf
+ >>> from pyspark.sql.types import StructType, StructField, LongType
+ ... # Sample data with an XML column
>>> data = [(1, '''<p><a>1</a></p>''')]
>>> df = spark.createDataFrame(data, ("key", "value"))
+ ... # Define the schema using a StructType
+ >>> schema = StructType([StructField("a", LongType())])
+ ... # Parse the XML column using the specified schema
+ >>> df.select(sf.from_xml(df.value, schema).alias("xml")).collect()
+ [Row(xml=Row(a=1))]
- TODO: Fix StructType for spark connect
- schema = StructType([StructField("a", IntegerType())])
+ Example 2: Parsing XML with a DDL-formatted string schema
+ >>> import pyspark.sql.functions as sf
+ >>> data = [(1, '''<p><a>1</a></p>''')]
+ >>> df = spark.createDataFrame(data, ("key", "value"))
+ ... # Define the schema using a DDL-formatted string
>>> schema = "STRUCT<a: BIGINT>"
- >>> df.select(from_xml(df.value, schema).alias("xml")).collect()
+ ... # Parse the XML column using the DDL-formatted schema
+ >>> df.select(sf.from_xml(df.value, schema).alias("xml")).collect()
[Row(xml=Row(a=1))]
- String input.
-
- >>> df.select(from_xml(df.value, "a INT").alias("xml")).collect()
- [Row(xml=Row(a=1))]
+ Example 3: Parsing XML with :class:`ArrayType` in schema
+ >>> import pyspark.sql.functions as sf
>>> data = [(1, '<p><a>1</a><a>2</a></p>')]
>>> df = spark.createDataFrame(data, ("key", "value"))
-
- TODO: Fix StructType for spark connect
- schema = StructType([StructField("a", ArrayType(IntegerType()))])
-
+ ... # Define the schema with an Array type
>>> schema = "STRUCT<a: ARRAY<BIGINT>>"
- >>> df.select(from_xml(df.value, schema).alias("xml")).collect()
+ ... # Parse the XML column using the schema with an Array
+ >>> df.select(sf.from_xml(df.value, schema).alias("xml")).collect()
[Row(xml=Row(a=[1, 2]))]
- Column input generated by schema_of_xml.
+ Example 4: Parsing XML using :meth:`pyspark.sql.functions.schema_of_xml`
- >>> schema = schema_of_xml(lit(data[0][1]))
- >>> df.select(from_xml(df.value, schema).alias("xml")).collect()
+ >>> import pyspark.sql.functions as sf
+ >>> # Sample data with an XML column
+ ... data = [(1, '<p><a>1</a><a>2</a></p>')]
+ >>> df = spark.createDataFrame(data, ("key", "value"))
+ ... # Generate the schema from an example XML value
+ >>> schema = sf.schema_of_xml(sf.lit(data[0][1]))
+ ... # Parse the XML column using the generated schema
+ >>> df.select(sf.from_xml(df.value, schema).alias("xml")).collect()
[Row(xml=Row(a=[1, 2]))]
"""
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]