This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 56faeb4962d2 [SPARK-55464][PYTHON] Support GeographyType in convert_numpy
56faeb4962d2 is described below

commit 56faeb4962d2ff99016539146a3d4c2f3beef79c
Author: Fangchen Li <[email protected]>
AuthorDate: Mon Mar 2 14:22:01 2026 +0800

    [SPARK-55464][PYTHON] Support GeographyType in convert_numpy
    
    ### What changes were proposed in this pull request?
    
    Support GeographyType in convert_numpy.
    
    ### Why are the changes needed?
    
    Part of the new arrow to pandas converter
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Unittests
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    Generated-by: Claude Opus 4.6
    
    Closes #54444 from fangchenli/SPARK-55462-convert-numpy-geography.
    
    Authored-by: Fangchen Li <[email protected]>
    Signed-off-by: Ruifeng Zheng <[email protected]>
---
 python/pyspark/sql/conversion.py            |  7 ++++-
 python/pyspark/sql/tests/test_conversion.py | 40 +++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/sql/conversion.py b/python/pyspark/sql/conversion.py
index c10e7173cb57..97db4563ad2c 100644
--- a/python/pyspark/sql/conversion.py
+++ b/python/pyspark/sql/conversion.py
@@ -1598,6 +1598,7 @@ class ArrowArrayToPandasConversion:
             TimestampNTZType,
             UserDefinedType,
             VariantType,
+            GeographyType,
         )
         if df_for_struct and isinstance(spark_type, StructType):
             return all(isinstance(f.dataType, supported_types) for f in spark_type.fields)
@@ -1709,13 +1710,17 @@ class ArrowArrayToPandasConversion:
             series = series.map(
                 lambda v: VariantVal(v["value"], v["metadata"]) if v is not None else None
             )
+        elif isinstance(spark_type, GeographyType):
+            series = arr.to_pandas()
+            series = series.map(
+                lambda v: Geography.fromWKB(v["wkb"], v["srid"]) if v is not None else None
+            )
         # elif isinstance(
         #     spark_type,
         #     (
         #         ArrayType,
         #         MapType,
         #         StructType,
-        #         GeographyType,
         #         GeometryType,
         #     ),
         # ):
diff --git a/python/pyspark/sql/tests/test_conversion.py b/python/pyspark/sql/tests/test_conversion.py
index c3ac461ca1d4..04c22fb31ae9 100644
--- a/python/pyspark/sql/tests/test_conversion.py
+++ b/python/pyspark/sql/tests/test_conversion.py
@@ -32,6 +32,7 @@ from pyspark.sql.types import (
     BinaryType,
     DecimalType,
     DoubleType,
+    Geography,
     GeographyType,
     GeometryType,
     IntegerType,
@@ -656,6 +657,45 @@ class ArrowArrayToPandasConversionTests(unittest.TestCase):
         )
         self.assertEqual(len(result), 0)
 
+    def test_geography_convert_numpy(self):
+        import pyarrow as pa
+
+        geography_type = pa.struct(
+            [
+                pa.field("srid", pa.int32(), nullable=False),
+                pa.field(
+                    "wkb",
+                    pa.binary(),
+                    nullable=False,
+                    metadata={b"geography": b"true", b"srid": b"4326"},
+                ),
+            ]
+        )
+
+        # basic conversion with nulls
+        # POINT(1.0, 2.0) and POINT(17.0, 7.0) in WKB format
+        wkb1 = bytes.fromhex("0101000000000000000000F03F0000000000000040")
+        wkb2 = bytes.fromhex("010100000000000000000031400000000000001c40")
+        arr = pa.array(
+            [
+                {"srid": 4326, "wkb": wkb1},
+                None,
+                {"srid": 4326, "wkb": wkb2},
+            ],
+            type=geography_type,
+        )
+        result = ArrowArrayToPandasConversion.convert_numpy(arr, GeographyType(4326), ser_name="g")
+        self.assertEqual(result.iloc[0], Geography(wkb1, 4326))
+        self.assertIsNone(result.iloc[1])
+        self.assertEqual(result.iloc[2], Geography(wkb2, 4326))
+        self.assertEqual(result.name, "g")
+
+        # empty
+        result = ArrowArrayToPandasConversion.convert_numpy(
+            pa.array([], type=geography_type), GeographyType(4326)
+        )
+        self.assertEqual(len(result), 0)
+
 
 if __name__ == "__main__":
     from pyspark.testing import main


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to