This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 56faeb4962d2 [SPARK-55464][PYTHON] Support GeographyType in
convert_numpy
56faeb4962d2 is described below
commit 56faeb4962d2ff99016539146a3d4c2f3beef79c
Author: Fangchen Li <[email protected]>
AuthorDate: Mon Mar 2 14:22:01 2026 +0800
[SPARK-55464][PYTHON] Support GeographyType in convert_numpy
### What changes were proposed in this pull request?
Support GeographyType in convert_numpy.
### Why are the changes needed?
Part of the new Arrow-to-pandas converter.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Unit tests.
### Was this patch authored or co-authored using generative AI tooling?
Generated-by: Claude Opus 4.6
Closes #54444 from fangchenli/SPARK-55462-convert-numpy-geography.
Authored-by: Fangchen Li <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/sql/conversion.py | 7 ++++-
python/pyspark/sql/tests/test_conversion.py | 40 +++++++++++++++++++++++++++++
2 files changed, 46 insertions(+), 1 deletion(-)
diff --git a/python/pyspark/sql/conversion.py b/python/pyspark/sql/conversion.py
index c10e7173cb57..97db4563ad2c 100644
--- a/python/pyspark/sql/conversion.py
+++ b/python/pyspark/sql/conversion.py
@@ -1598,6 +1598,7 @@ class ArrowArrayToPandasConversion:
TimestampNTZType,
UserDefinedType,
VariantType,
+ GeographyType,
)
if df_for_struct and isinstance(spark_type, StructType):
return all(isinstance(f.dataType, supported_types) for f in
spark_type.fields)
@@ -1709,13 +1710,17 @@ class ArrowArrayToPandasConversion:
series = series.map(
lambda v: VariantVal(v["value"], v["metadata"]) if v is not
None else None
)
+ elif isinstance(spark_type, GeographyType):
+ series = arr.to_pandas()
+ series = series.map(
+ lambda v: Geography.fromWKB(v["wkb"], v["srid"]) if v is not
None else None
+ )
# elif isinstance(
# spark_type,
# (
# ArrayType,
# MapType,
# StructType,
- # GeographyType,
# GeometryType,
# ),
# ):
diff --git a/python/pyspark/sql/tests/test_conversion.py
b/python/pyspark/sql/tests/test_conversion.py
index c3ac461ca1d4..04c22fb31ae9 100644
--- a/python/pyspark/sql/tests/test_conversion.py
+++ b/python/pyspark/sql/tests/test_conversion.py
@@ -32,6 +32,7 @@ from pyspark.sql.types import (
BinaryType,
DecimalType,
DoubleType,
+ Geography,
GeographyType,
GeometryType,
IntegerType,
@@ -656,6 +657,45 @@ class ArrowArrayToPandasConversionTests(unittest.TestCase):
)
self.assertEqual(len(result), 0)
+ def test_geography_convert_numpy(self):
+ import pyarrow as pa
+
+ geography_type = pa.struct(
+ [
+ pa.field("srid", pa.int32(), nullable=False),
+ pa.field(
+ "wkb",
+ pa.binary(),
+ nullable=False,
+ metadata={b"geography": b"true", b"srid": b"4326"},
+ ),
+ ]
+ )
+
+ # basic conversion with nulls
+ # POINT(1.0, 2.0) and POINT(17.0, 7.0) in WKB format
+ wkb1 = bytes.fromhex("0101000000000000000000F03F0000000000000040")
+ wkb2 = bytes.fromhex("010100000000000000000031400000000000001c40")
+ arr = pa.array(
+ [
+ {"srid": 4326, "wkb": wkb1},
+ None,
+ {"srid": 4326, "wkb": wkb2},
+ ],
+ type=geography_type,
+ )
+ result = ArrowArrayToPandasConversion.convert_numpy(arr,
GeographyType(4326), ser_name="g")
+ self.assertEqual(result.iloc[0], Geography(wkb1, 4326))
+ self.assertIsNone(result.iloc[1])
+ self.assertEqual(result.iloc[2], Geography(wkb2, 4326))
+ self.assertEqual(result.name, "g")
+
+ # empty
+ result = ArrowArrayToPandasConversion.convert_numpy(
+ pa.array([], type=geography_type), GeographyType(4326)
+ )
+ self.assertEqual(len(result), 0)
+
if __name__ == "__main__":
from pyspark.testing import main
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]