This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new d61a7b93ae02 [SPARK-53029][FOLLOWUP] Update PyArrow StructType usage 
to avoid .names attribute for compatibility with PyArrow
d61a7b93ae02 is described below

commit d61a7b93ae02d04f580ae2ce911484a83018dacb
Author: Allison Wang <[email protected]>
AuthorDate: Wed Sep 10 16:00:41 2025 +0900

    [SPARK-53029][FOLLOWUP] Update PyArrow StructType usage to avoid .names 
attribute for compatibility with PyArrow
    
    ### What changes were proposed in this pull request?
    
    This is a follow-up for SPARK-53029. The `.names` attribute on PyArrow's 
StructType was only added in PyArrow 18.0.0, but the minimum required PyArrow 
version is now 15.0.0, so the attribute cannot be relied upon.
    
    ### Why are the changes needed?
    
    To fix a compatibility issue with older supported PyArrow versions.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No
    
    ### How was this patch tested?
    
    UT
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No
    
    Closes #52289 from allisonwang-db/SPARK-53426-fix.
    
    Lead-authored-by: Allison Wang <[email protected]>
    Co-authored-by: Allison Wang <[email protected]>
    Signed-off-by: Hyukjin Kwon <[email protected]>
---
 python/pyspark/sql/pandas/serializers.py          |  4 +--
 python/pyspark/sql/tests/arrow/test_arrow_udtf.py | 31 +++++++++++++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/pandas/serializers.py 
b/python/pyspark/sql/pandas/serializers.py
index 58c962896df6..860335d1ff0f 100644
--- a/python/pyspark/sql/pandas/serializers.py
+++ b/python/pyspark/sql/pandas/serializers.py
@@ -267,7 +267,7 @@ class 
ArrowStreamArrowUDTFSerializer(ArrowStreamUDTFSerializer):
                 if batch.num_columns == 0:
                     coerced_batch = batch  # skip type coercion
                 else:
-                    expected_field_names = arrow_return_type.names
+                    expected_field_names = [field.name for field in 
arrow_return_type]
                     actual_field_names = batch.schema.names
 
                     if expected_field_names != actual_field_names:
@@ -283,7 +283,7 @@ class 
ArrowStreamArrowUDTFSerializer(ArrowStreamUDTFSerializer):
                         coerced_array = self._create_array(original_array, 
field.type)
                         coerced_arrays.append(coerced_array)
                     coerced_batch = pa.RecordBatch.from_arrays(
-                        coerced_arrays, names=arrow_return_type.names
+                        coerced_arrays, names=expected_field_names
                     )
                 yield coerced_batch, arrow_return_type
 
diff --git a/python/pyspark/sql/tests/arrow/test_arrow_udtf.py 
b/python/pyspark/sql/tests/arrow/test_arrow_udtf.py
index 7c8f85a3790f..741673341dde 100644
--- a/python/pyspark/sql/tests/arrow/test_arrow_udtf.py
+++ b/python/pyspark/sql/tests/arrow/test_arrow_udtf.py
@@ -699,6 +699,37 @@ class ArrowUDTFTestsMixin:
         expected_df = self.spark.createDataFrame([(60, 180)], "computed_value 
int, multiplied int")
         assertDataFrameEqual(result_df, expected_df)
 
+    def test_arrow_udtf_with_named_arguments(self):
+        @arrow_udtf(returnType="x int, y int, sum int")
+        class NamedArgsUDTF:
+            def eval(self, x: "pa.Array", y: "pa.Array") -> 
Iterator["pa.Table"]:
+                assert isinstance(x, pa.Array), f"Expected pa.Array, got 
{type(x)}"
+                assert isinstance(y, pa.Array), f"Expected pa.Array, got 
{type(y)}"
+
+                x_val = x[0].as_py()
+                y_val = y[0].as_py()
+                result_table = pa.table(
+                    {
+                        "x": pa.array([x_val], type=pa.int32()),
+                        "y": pa.array([y_val], type=pa.int32()),
+                        "sum": pa.array([x_val + y_val], type=pa.int32()),
+                    }
+                )
+                yield result_table
+
+        # Test SQL registration and usage with named arguments
+        self.spark.udtf.register("named_args_udtf", NamedArgsUDTF)
+
+        # Test with named arguments in SQL
+        sql_result_df = self.spark.sql("SELECT * FROM named_args_udtf(y => 10, 
x => 5)")
+        expected_df = self.spark.createDataFrame([(5, 10, 15)], "x int, y int, 
sum int")
+        assertDataFrameEqual(sql_result_df, expected_df)
+
+        # Test with mixed positional and named arguments
+        sql_result_df2 = self.spark.sql("SELECT * FROM named_args_udtf(7, y => 
3)")
+        expected_df2 = self.spark.createDataFrame([(7, 3, 10)], "x int, y int, 
sum int")
+        assertDataFrameEqual(sql_result_df2, expected_df2)
+
 
 class ArrowUDTFTests(ArrowUDTFTestsMixin, ReusedSQLTestCase):
     pass


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to