Copilot commented on code in PR #805:
URL: https://github.com/apache/sedona-db/pull/805#discussion_r3192472434


##########
rust/sedona/src/context.rs:
##########
@@ -188,7 +188,13 @@ impl SedonaContext {
         state_builder = state_builder.with_query_planner(Arc::new(planner));
 
         let mut state = state_builder.build();
+
+        // Register GeoParquet and try to initialize our statistics 
accumulator. It is OK if this fails
+        // (is likely because we already registered it).
         state.register_file_format(Arc::new(GeoParquetFormatFactory::new()), 
true)?;
+        let _ =
+            
sedona_geoparquet::statistics_accumulator::SedonaGeoStatsAccumulatorFactory::try_init();

Review Comment:
   The result of SedonaGeoStatsAccumulatorFactory::try_init() is discarded 
unconditionally. This can hide real initialization failures (e.g., if the 
Parquet global factory cannot be installed) and leave GeoParquet stats silently 
disabled. Consider ignoring only the expected "already initialized" case and 
surfacing/logging other errors.
   



##########
rust/sedona-geoparquet/src/writer.rs:
##########
@@ -532,8 +574,183 @@ fn append_float_bbox(
     Ok(())
 }
 
+#[derive(Debug, PartialEq)]
+struct NormalizeForGeoParquet {
+    crs_provider: CrsProviderOption,
+    version: GeoParquetVersion,
+    signature: Signature,
+}
+
+impl NormalizeForGeoParquet {
+    fn new(crs_provider: CrsProviderOption, version: GeoParquetVersion) -> 
Self {
+        Self {
+            crs_provider,
+            version,
+            signature: Signature::any(1, Volatility::Stable),
+        }
+    }
+}
+
+impl std::hash::Hash for NormalizeForGeoParquet {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.version.hash(state);
+        self.signature.hash(state);
+    }
+}
+
+impl Eq for NormalizeForGeoParquet {}
+
+impl ScalarUDFImpl for NormalizeForGeoParquet {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "normalize_for_geoparquet"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        sedona_internal_err!("return_type() should not be called")
+    }
+
+    fn return_field_from_args(&self, args: datafusion_expr::ReturnFieldArgs) 
-> Result<FieldRef> {
+        normalize_field_for_geoparquet(&args.arg_fields[0], self.version, 
&self.crs_provider)
+    }
+
+    fn invoke_with_args(&self, args: datafusion_expr::ScalarFunctionArgs) -> 
Result<ColumnarValue> {
+        Ok(args.args[0].clone())
+    }
+}
+
+fn normalize_field_for_geoparquet(
+    field: &FieldRef,
+    version: GeoParquetVersion,
+    crs_provider: &CrsProviderOption,
+) -> Result<FieldRef> {
+    if field.metadata().is_empty() {
+        return Ok(field.clone());
+    }
+
+    let sedona_type = SedonaType::from_storage_field(field)?;
+    match sedona_type {
+        SedonaType::Arrow(DataType::Struct(children)) => {
+            let new_type = DataType::Struct(
+                children
+                    .iter()
+                    .map(|f| normalize_field_for_geoparquet(f, version, 
crs_provider))
+                    .collect::<Result<_>>()?,
+            );
+            Ok(Arc::new(field.as_ref().clone().with_data_type(new_type)))
+        }
+        SedonaType::Arrow(DataType::List(child)) => {
+            let new_type = DataType::List(normalize_field_for_geoparquet(
+                &child,
+                version,
+                crs_provider,
+            )?);
+            Ok(Arc::new(field.as_ref().clone().with_data_type(new_type)))
+        }
+        SedonaType::Arrow(_) => Ok(field.clone()),
+        SedonaType::Wkb(edges, crs) | SedonaType::WkbView(edges, crs) => match 
version {
+            // For GeoParquet 1.0 and 1.1, strip the metadata (we write Binary 
storage)
+            GeoParquetVersion::V1_0 | GeoParquetVersion::V1_1 => Ok(Arc::new(
+                field.as_ref().clone().with_metadata(HashMap::new()),
+            )),
+            // For GeoParquet 2.0 and None, ensure we have projjson CRS output
+            GeoParquetVersion::V2_0 | GeoParquetVersion::Omitted => {
+                let normalized_crs_value =
+                    normalize_crs_for_geoparquet(field.name(), &crs, 
crs_provider)?;
+                let normalized_crs =
+                    
deserialize_crs_from_obj(&normalized_crs_value.unwrap_or(Value::Null))?;
+                Ok(serialize_edges_and_crs_with_parquet_bug(
+                    field,
+                    &normalized_crs,
+                    edges,
+                ))
+            }
+        },
+        _ => exec_err!("Unsupported geometry output to Parquet: 
{sedona_type}"),
+    }
+}
+
+// Due to a bug in the parquet type conversion, we need to serialize invalid 
metadata for gegraphy

Review Comment:
   Typo in comment: "gegraphy" should be "geography".
   



##########
rust/sedona-geoparquet/src/writer.rs:
##########
@@ -123,93 +152,69 @@ pub fn create_geoparquet_writer_physical_plan(
         metadata.primary_column = field_names[output_geometry_primary].clone();
     }
 
-    // Apply all columns
-    for i in output_geometry_column_indices {
-        let f = conf.output_schema().field(i);
-        let sedona_type = SedonaType::from_storage_field(f)?;
-        let mut column_metadata = GeoParquetColumnMetadata::default();
-
-        let (edge_type, crs) = match sedona_type {
-            SedonaType::Wkb(edge_type, crs) | SedonaType::WkbView(edge_type, 
crs) => {
-                (edge_type, crs)
+    // We skip writing the arrow metadata because we have to serialize invalid 
GeoArrow
+    // metadata in some cases to work around a bug in the parquet GeoArrow -> 
Parquet
+    // logical type conversion.
+    let mut parquet_options = 
options.inner.clone().with_skip_arrow_metadata(true);
+
+    // Create the column metadata and finalize the Parquet meatdata if we're 
omitting the GeoParquet metadata

Review Comment:
   This comment says "if we're omitting the GeoParquet metadata", but the 
guarded block runs only when metadata.version is non-empty (i.e., when 
GeoParquet metadata is being written). Please update the comment to reflect the 
actual condition to avoid misleading future changes.
   



##########
python/sedonadb/tests/io/test_parquet.py:
##########
@@ -483,6 +483,104 @@ def test_write_geoparquet_geography(con, geoarrow_data):
         assert table_roundtrip == table
 
 
+def test_write_geoparquet_2_0(con, geoarrow_data):
+    # Checks a read and write of geography (roundtrip, since nobody else can 
read/write)
+    path = (
+        geoarrow_data
+        / "natural-earth"
+        / "files"
+        / "natural-earth_countries_geo.parquet"
+    )
+    skip_if_not_exists(path)
+
+    table = con.read_parquet(path).to_arrow_table()
+
+    with tempfile.TemporaryDirectory() as td:
+        tmp_parquet = Path(td) / "tmp.parquet"
+        con.create_data_frame(table).to_parquet(tmp_parquet, 
geoparquet_version="2.0")
+
+        table_roundtrip = con.read_parquet(tmp_parquet).to_arrow_table()
+        assert table_roundtrip == table
+
+        # Check for metadata and logical type
+        file = parquet.ParquetFile(tmp_parquet)
+        file_kv_metadata = file.metadata.metadata
+        assert b"geo" in file_kv_metadata
+        geo_metadata = json.loads(file_kv_metadata[b"geo"])
+        assert geo_metadata["version"] == "2.0.0"
+
+        file.metadata.schema.column(2).logical_type.to_json() == '{"Type": 
"Geometry"}'
+

Review Comment:
   This line performs a comparison but doesn't assert it, so the test won't 
fail if the logical type is wrong. Use an assertion (e.g., assert that the JSON 
matches) to actually validate the Parquet logical type.



##########
rust/sedona-geoparquet/src/writer.rs:
##########
@@ -123,93 +152,69 @@ pub fn create_geoparquet_writer_physical_plan(
         metadata.primary_column = field_names[output_geometry_primary].clone();
     }
 
-    // Apply all columns
-    for i in output_geometry_column_indices {
-        let f = conf.output_schema().field(i);
-        let sedona_type = SedonaType::from_storage_field(f)?;
-        let mut column_metadata = GeoParquetColumnMetadata::default();
-
-        let (edge_type, crs) = match sedona_type {
-            SedonaType::Wkb(edge_type, crs) | SedonaType::WkbView(edge_type, 
crs) => {
-                (edge_type, crs)
+    // We skip writing the arrow metadata because we have to serialize invalid 
GeoArrow
+    // metadata in some cases to work around a bug in the parquet GeoArrow -> 
Parquet
+    // logical type conversion.
+    let mut parquet_options = 
options.inner.clone().with_skip_arrow_metadata(true);
+
+    // Create the column metadata and finalize the Parquet meatdata if we're 
omitting the GeoParquet metadata

Review Comment:
   Typo in comment: "meatdata" should be "metadata".
   



##########
python/sedonadb/tests/io/test_parquet.py:
##########
@@ -483,6 +483,104 @@ def test_write_geoparquet_geography(con, geoarrow_data):
         assert table_roundtrip == table
 
 
+def test_write_geoparquet_2_0(con, geoarrow_data):
+    # Checks a read and write of geography (roundtrip, since nobody else can 
read/write)
+    path = (
+        geoarrow_data
+        / "natural-earth"
+        / "files"
+        / "natural-earth_countries_geo.parquet"
+    )
+    skip_if_not_exists(path)
+
+    table = con.read_parquet(path).to_arrow_table()
+
+    with tempfile.TemporaryDirectory() as td:
+        tmp_parquet = Path(td) / "tmp.parquet"
+        con.create_data_frame(table).to_parquet(tmp_parquet, 
geoparquet_version="2.0")
+
+        table_roundtrip = con.read_parquet(tmp_parquet).to_arrow_table()
+        assert table_roundtrip == table
+
+        # Check for metadata and logical type
+        file = parquet.ParquetFile(tmp_parquet)
+        file_kv_metadata = file.metadata.metadata
+        assert b"geo" in file_kv_metadata
+        geo_metadata = json.loads(file_kv_metadata[b"geo"])
+        assert geo_metadata["version"] == "2.0.0"
+
+        file.metadata.schema.column(2).logical_type.to_json() == '{"Type": 
"Geometry"}'
+
+
+def test_write_geoparquet_no_metadata(con, geoarrow_data):
+    # Checks a read and write of geography (roundtrip, since nobody else can 
read/write)
+    path = (
+        geoarrow_data
+        / "natural-earth"
+        / "files"
+        / "natural-earth_countries_geo.parquet"
+    )
+    skip_if_not_exists(path)
+
+    table = con.read_parquet(path).to_arrow_table()
+
+    with tempfile.TemporaryDirectory() as td:
+        tmp_parquet = Path(td) / "tmp.parquet"
+        con.create_data_frame(table).to_parquet(tmp_parquet, 
geoparquet_version="none")
+
+        table_roundtrip = con.read_parquet(tmp_parquet).to_arrow_table()
+        assert table_roundtrip == table
+
+        # Check for absent metadata and but correct logical type
+        file = parquet.ParquetFile(tmp_parquet)
+        file_kv_metadata = file.metadata.metadata
+        assert file_kv_metadata is None or b"geo" not in file_kv_metadata
+
+        file.metadata.schema.column(2).logical_type.to_json() == '{"Type": 
"Geometry"}'
+        geo_stats = file.metadata.row_group(0).column(2).geo_statistics
+        assert geo_stats is not None
+        assert geo_stats.geospatial_types == [3, 6]
+        assert geo_stats.xmin <= -180
+        assert geo_stats.xmax >= 180
+
+
+def test_write_geoparquet_geography_no_metadata(con, geoarrow_data):
+    # Checks a read and write of geography (roundtrip, since nobody else can 
read/write)
+    path = (
+        geoarrow_data
+        / "natural-earth"
+        / "files"
+        / "natural-earth_countries-geography_geo.parquet"
+    )
+    skip_if_not_exists(path)
+
+    table = con.read_parquet(path).to_arrow_table()
+
+    with tempfile.TemporaryDirectory() as td:
+        tmp_parquet = Path(td) / "tmp.parquet"
+        con.create_data_frame(table).to_parquet(tmp_parquet, 
geoparquet_version="none")
+
+        table_roundtrip = con.read_parquet(tmp_parquet).to_arrow_table()
+        assert table_roundtrip == table
+
+        # Check for absent metadata and but correct logical type
+        file = parquet.ParquetFile(tmp_parquet)
+        file_kv_metadata = file.metadata.metadata
+        assert file_kv_metadata is None or b"geo" not in file_kv_metadata
+
+        file.metadata.schema.column(2).logical_type.to_json() == '{"Type": 
"Geography"}'

Review Comment:
   This line performs a comparison but doesn't assert it, so the test won't 
fail if the logical type is wrong. Use an assertion to validate the Parquet 
logical type.
   



##########
c/sedona-s2geography/src/geography.rs:
##########
@@ -95,10 +95,15 @@ unsafe impl<'a> Send for Geography<'a> {}
 unsafe impl<'a> Sync for Geography<'a> {}
 
 /// Factory for creating Geography objects from various formats
+#[derive(Debug)]
 pub struct GeographyFactory {
     ptr: *mut S2GeogFactory,
 }
 
+// Safety: const methods are thread safe (although there aren't any const 
methods)

Review Comment:
   The safety comment justifying Send/Sync says "const methods" are 
thread-safe, but GeographyFactory has no const/&self methods (all public APIs 
require &mut self). Please clarify the actual safety rationale (e.g., safe to 
move across threads / use from a single thread at a time) so the Send/Sync impl 
is correctly documented.
   



##########
c/sedona-s2geography/src/rect_bounder.rs:
##########
@@ -31,6 +31,15 @@ pub struct RectBounder {
     ptr: *mut S2GeogRectBounder,
 }
 
+impl std::fmt::Debug for RectBounder {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("RectBounder")
+            .field("ptr", &self.ptr)
+            .field("finish()", &self.finish())

Review Comment:
   The Debug implementation calls finish(), which can be relatively expensive 
and may have side effects depending on the C++ implementation. Debug formatting 
should generally avoid executing potentially mutating or costly computations; 
consider only printing ptr/is_empty or cached state.
   



##########
python/sedonadb/tests/io/test_parquet.py:
##########
@@ -483,6 +483,104 @@ def test_write_geoparquet_geography(con, geoarrow_data):
         assert table_roundtrip == table
 
 
+def test_write_geoparquet_2_0(con, geoarrow_data):
+    # Checks a read and write of geography (roundtrip, since nobody else can 
read/write)
+    path = (
+        geoarrow_data
+        / "natural-earth"
+        / "files"
+        / "natural-earth_countries_geo.parquet"
+    )
+    skip_if_not_exists(path)
+
+    table = con.read_parquet(path).to_arrow_table()
+
+    with tempfile.TemporaryDirectory() as td:
+        tmp_parquet = Path(td) / "tmp.parquet"
+        con.create_data_frame(table).to_parquet(tmp_parquet, 
geoparquet_version="2.0")
+
+        table_roundtrip = con.read_parquet(tmp_parquet).to_arrow_table()
+        assert table_roundtrip == table
+
+        # Check for metadata and logical type
+        file = parquet.ParquetFile(tmp_parquet)
+        file_kv_metadata = file.metadata.metadata
+        assert b"geo" in file_kv_metadata
+        geo_metadata = json.loads(file_kv_metadata[b"geo"])
+        assert geo_metadata["version"] == "2.0.0"
+
+        file.metadata.schema.column(2).logical_type.to_json() == '{"Type": 
"Geometry"}'
+
+
+def test_write_geoparquet_no_metadata(con, geoarrow_data):
+    # Checks a read and write of geography (roundtrip, since nobody else can 
read/write)
+    path = (
+        geoarrow_data
+        / "natural-earth"
+        / "files"
+        / "natural-earth_countries_geo.parquet"
+    )
+    skip_if_not_exists(path)
+
+    table = con.read_parquet(path).to_arrow_table()
+
+    with tempfile.TemporaryDirectory() as td:
+        tmp_parquet = Path(td) / "tmp.parquet"
+        con.create_data_frame(table).to_parquet(tmp_parquet, 
geoparquet_version="none")
+
+        table_roundtrip = con.read_parquet(tmp_parquet).to_arrow_table()
+        assert table_roundtrip == table
+
+        # Check for absent metadata and but correct logical type
+        file = parquet.ParquetFile(tmp_parquet)
+        file_kv_metadata = file.metadata.metadata
+        assert file_kv_metadata is None or b"geo" not in file_kv_metadata
+
+        file.metadata.schema.column(2).logical_type.to_json() == '{"Type": 
"Geometry"}'
+        geo_stats = file.metadata.row_group(0).column(2).geo_statistics

Review Comment:
   This line performs a comparison but doesn't assert it, so the test won't 
fail if the logical type is wrong. Use an assertion to validate the Parquet 
logical type.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to