Copilot commented on code in PR #805:
URL: https://github.com/apache/sedona-db/pull/805#discussion_r3192472434
##########
rust/sedona/src/context.rs:
##########
@@ -188,7 +188,13 @@ impl SedonaContext {
state_builder = state_builder.with_query_planner(Arc::new(planner));
let mut state = state_builder.build();
+
+ // Register GeoParquet and try to initialize our statistics
accumulator. It is OK if this fails
+ // (is likely because we already registered it).
state.register_file_format(Arc::new(GeoParquetFormatFactory::new()),
true)?;
+ let _ =
+
sedona_geoparquet::statistics_accumulator::SedonaGeoStatsAccumulatorFactory::try_init();
Review Comment:
The result of SedonaGeoStatsAccumulatorFactory::try_init() is discarded
unconditionally. This can hide real initialization failures (e.g., if the
Parquet global factory cannot be installed) and leave GeoParquet stats silently
disabled. Consider ignoring only the expected "already initialized" case and
surfacing/logging other errors.
##########
rust/sedona-geoparquet/src/writer.rs:
##########
@@ -532,8 +574,183 @@ fn append_float_bbox(
Ok(())
}
+#[derive(Debug, PartialEq)]
+struct NormalizeForGeoParquet {
+ crs_provider: CrsProviderOption,
+ version: GeoParquetVersion,
+ signature: Signature,
+}
+
+impl NormalizeForGeoParquet {
+ fn new(crs_provider: CrsProviderOption, version: GeoParquetVersion) ->
Self {
+ Self {
+ crs_provider,
+ version,
+ signature: Signature::any(1, Volatility::Stable),
+ }
+ }
+}
+
+impl std::hash::Hash for NormalizeForGeoParquet {
+ fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+ self.version.hash(state);
+ self.signature.hash(state);
+ }
+}
+
+impl Eq for NormalizeForGeoParquet {}
+
+impl ScalarUDFImpl for NormalizeForGeoParquet {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ "normalize_for_geoparquet"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+ sedona_internal_err!("return_type() should not be called")
+ }
+
+ fn return_field_from_args(&self, args: datafusion_expr::ReturnFieldArgs)
-> Result<FieldRef> {
+ normalize_field_for_geoparquet(&args.arg_fields[0], self.version,
&self.crs_provider)
+ }
+
+ fn invoke_with_args(&self, args: datafusion_expr::ScalarFunctionArgs) ->
Result<ColumnarValue> {
+ Ok(args.args[0].clone())
+ }
+}
+
+fn normalize_field_for_geoparquet(
+ field: &FieldRef,
+ version: GeoParquetVersion,
+ crs_provider: &CrsProviderOption,
+) -> Result<FieldRef> {
+ if field.metadata().is_empty() {
+ return Ok(field.clone());
+ }
+
+ let sedona_type = SedonaType::from_storage_field(field)?;
+ match sedona_type {
+ SedonaType::Arrow(DataType::Struct(children)) => {
+ let new_type = DataType::Struct(
+ children
+ .iter()
+ .map(|f| normalize_field_for_geoparquet(f, version,
crs_provider))
+ .collect::<Result<_>>()?,
+ );
+ Ok(Arc::new(field.as_ref().clone().with_data_type(new_type)))
+ }
+ SedonaType::Arrow(DataType::List(child)) => {
+ let new_type = DataType::List(normalize_field_for_geoparquet(
+ &child,
+ version,
+ crs_provider,
+ )?);
+ Ok(Arc::new(field.as_ref().clone().with_data_type(new_type)))
+ }
+ SedonaType::Arrow(_) => Ok(field.clone()),
+ SedonaType::Wkb(edges, crs) | SedonaType::WkbView(edges, crs) => match
version {
+ // For GeoParquet 1.0 and 1.1, strip the metadata (we write Binary
storage)
+ GeoParquetVersion::V1_0 | GeoParquetVersion::V1_1 => Ok(Arc::new(
+ field.as_ref().clone().with_metadata(HashMap::new()),
+ )),
+ // For GeoParquet 2.0 and None, ensure we have projjson CRS output
+ GeoParquetVersion::V2_0 | GeoParquetVersion::Omitted => {
+ let normalized_crs_value =
+ normalize_crs_for_geoparquet(field.name(), &crs,
crs_provider)?;
+ let normalized_crs =
+
deserialize_crs_from_obj(&normalized_crs_value.unwrap_or(Value::Null))?;
+ Ok(serialize_edges_and_crs_with_parquet_bug(
+ field,
+ &normalized_crs,
+ edges,
+ ))
+ }
+ },
+ _ => exec_err!("Unsupported geometry output to Parquet:
{sedona_type}"),
+ }
+}
+
+// Due to a bug in the parquet type conversion, we need to serialize invalid
metadata for gegraphy
Review Comment:
Typo in comment: "gegraphy" should be "geography".
##########
rust/sedona-geoparquet/src/writer.rs:
##########
@@ -123,93 +152,69 @@ pub fn create_geoparquet_writer_physical_plan(
metadata.primary_column = field_names[output_geometry_primary].clone();
}
- // Apply all columns
- for i in output_geometry_column_indices {
- let f = conf.output_schema().field(i);
- let sedona_type = SedonaType::from_storage_field(f)?;
- let mut column_metadata = GeoParquetColumnMetadata::default();
-
- let (edge_type, crs) = match sedona_type {
- SedonaType::Wkb(edge_type, crs) | SedonaType::WkbView(edge_type,
crs) => {
- (edge_type, crs)
+ // We skip writing the arrow metadata because we have to serialize invalid
GeoArrow
+ // metadata in some cases to work around a bug in the parquet GeoArrow ->
Parquet
+ // logical type conversion.
+ let mut parquet_options =
options.inner.clone().with_skip_arrow_metadata(true);
+
+ // Create the column metadata and finalize the Parquet meatdata if we're
omitting the GeoParquet metadata
Review Comment:
This comment says "if we're omitting the GeoParquet metadata", but the
guarded block runs only when metadata.version is non-empty (i.e., when
GeoParquet metadata is being written). Please update the comment to reflect the
actual condition to avoid misleading future changes.
##########
python/sedonadb/tests/io/test_parquet.py:
##########
@@ -483,6 +483,104 @@ def test_write_geoparquet_geography(con, geoarrow_data):
assert table_roundtrip == table
+def test_write_geoparquet_2_0(con, geoarrow_data):
+ # Checks a read and write of geography (roundtrip, since nobody else can
read/write)
+ path = (
+ geoarrow_data
+ / "natural-earth"
+ / "files"
+ / "natural-earth_countries_geo.parquet"
+ )
+ skip_if_not_exists(path)
+
+ table = con.read_parquet(path).to_arrow_table()
+
+ with tempfile.TemporaryDirectory() as td:
+ tmp_parquet = Path(td) / "tmp.parquet"
+ con.create_data_frame(table).to_parquet(tmp_parquet,
geoparquet_version="2.0")
+
+ table_roundtrip = con.read_parquet(tmp_parquet).to_arrow_table()
+ assert table_roundtrip == table
+
+ # Check for metadata and logical type
+ file = parquet.ParquetFile(tmp_parquet)
+ file_kv_metadata = file.metadata.metadata
+ assert b"geo" in file_kv_metadata
+ geo_metadata = json.loads(file_kv_metadata[b"geo"])
+ assert geo_metadata["version"] == "2.0.0"
+
+ file.metadata.schema.column(2).logical_type.to_json() == '{"Type":
"Geometry"}'
+
Review Comment:
This line performs a comparison but doesn't assert it, so the test won't
fail if the logical type is wrong. Use an assertion (e.g., assert that the JSON
matches) to actually validate the Parquet logical type.
##########
rust/sedona-geoparquet/src/writer.rs:
##########
@@ -123,93 +152,69 @@ pub fn create_geoparquet_writer_physical_plan(
metadata.primary_column = field_names[output_geometry_primary].clone();
}
- // Apply all columns
- for i in output_geometry_column_indices {
- let f = conf.output_schema().field(i);
- let sedona_type = SedonaType::from_storage_field(f)?;
- let mut column_metadata = GeoParquetColumnMetadata::default();
-
- let (edge_type, crs) = match sedona_type {
- SedonaType::Wkb(edge_type, crs) | SedonaType::WkbView(edge_type,
crs) => {
- (edge_type, crs)
+ // We skip writing the arrow metadata because we have to serialize invalid
GeoArrow
+ // metadata in some cases to work around a bug in the parquet GeoArrow ->
Parquet
+ // logical type conversion.
+ let mut parquet_options =
options.inner.clone().with_skip_arrow_metadata(true);
+
+ // Create the column metadata and finalize the Parquet meatdata if we're
omitting the GeoParquet metadata
Review Comment:
Typo in comment: "meatdata" should be "metadata".
##########
python/sedonadb/tests/io/test_parquet.py:
##########
@@ -483,6 +483,104 @@ def test_write_geoparquet_geography(con, geoarrow_data):
assert table_roundtrip == table
+def test_write_geoparquet_2_0(con, geoarrow_data):
+ # Checks a read and write of geography (roundtrip, since nobody else can
read/write)
+ path = (
+ geoarrow_data
+ / "natural-earth"
+ / "files"
+ / "natural-earth_countries_geo.parquet"
+ )
+ skip_if_not_exists(path)
+
+ table = con.read_parquet(path).to_arrow_table()
+
+ with tempfile.TemporaryDirectory() as td:
+ tmp_parquet = Path(td) / "tmp.parquet"
+ con.create_data_frame(table).to_parquet(tmp_parquet,
geoparquet_version="2.0")
+
+ table_roundtrip = con.read_parquet(tmp_parquet).to_arrow_table()
+ assert table_roundtrip == table
+
+ # Check for metadata and logical type
+ file = parquet.ParquetFile(tmp_parquet)
+ file_kv_metadata = file.metadata.metadata
+ assert b"geo" in file_kv_metadata
+ geo_metadata = json.loads(file_kv_metadata[b"geo"])
+ assert geo_metadata["version"] == "2.0.0"
+
+ file.metadata.schema.column(2).logical_type.to_json() == '{"Type":
"Geometry"}'
+
+
+def test_write_geoparquet_no_metadata(con, geoarrow_data):
+ # Checks a read and write of geography (roundtrip, since nobody else can
read/write)
+ path = (
+ geoarrow_data
+ / "natural-earth"
+ / "files"
+ / "natural-earth_countries_geo.parquet"
+ )
+ skip_if_not_exists(path)
+
+ table = con.read_parquet(path).to_arrow_table()
+
+ with tempfile.TemporaryDirectory() as td:
+ tmp_parquet = Path(td) / "tmp.parquet"
+ con.create_data_frame(table).to_parquet(tmp_parquet,
geoparquet_version="none")
+
+ table_roundtrip = con.read_parquet(tmp_parquet).to_arrow_table()
+ assert table_roundtrip == table
+
+ # Check for absent metadata and but correct logical type
+ file = parquet.ParquetFile(tmp_parquet)
+ file_kv_metadata = file.metadata.metadata
+ assert file_kv_metadata is None or b"geo" not in file_kv_metadata
+
+ file.metadata.schema.column(2).logical_type.to_json() == '{"Type":
"Geometry"}'
+ geo_stats = file.metadata.row_group(0).column(2).geo_statistics
+ assert geo_stats is not None
+ assert geo_stats.geospatial_types == [3, 6]
+ assert geo_stats.xmin <= -180
+ assert geo_stats.xmax >= 180
+
+
+def test_write_geoparquet_geography_no_metadata(con, geoarrow_data):
+ # Checks a read and write of geography (roundtrip, since nobody else can
read/write)
+ path = (
+ geoarrow_data
+ / "natural-earth"
+ / "files"
+ / "natural-earth_countries-geography_geo.parquet"
+ )
+ skip_if_not_exists(path)
+
+ table = con.read_parquet(path).to_arrow_table()
+
+ with tempfile.TemporaryDirectory() as td:
+ tmp_parquet = Path(td) / "tmp.parquet"
+ con.create_data_frame(table).to_parquet(tmp_parquet,
geoparquet_version="none")
+
+ table_roundtrip = con.read_parquet(tmp_parquet).to_arrow_table()
+ assert table_roundtrip == table
+
+ # Check for absent metadata and but correct logical type
+ file = parquet.ParquetFile(tmp_parquet)
+ file_kv_metadata = file.metadata.metadata
+ assert file_kv_metadata is None or b"geo" not in file_kv_metadata
+
+ file.metadata.schema.column(2).logical_type.to_json() == '{"Type":
"Geography"}'
Review Comment:
This line performs a comparison but doesn't assert it, so the test won't
fail if the logical type is wrong. Use an assertion to validate the Parquet
logical type.
##########
c/sedona-s2geography/src/geography.rs:
##########
@@ -95,10 +95,15 @@ unsafe impl<'a> Send for Geography<'a> {}
unsafe impl<'a> Sync for Geography<'a> {}
/// Factory for creating Geography objects from various formats
+#[derive(Debug)]
pub struct GeographyFactory {
ptr: *mut S2GeogFactory,
}
+// Safety: const methods are thread safe (although there aren't any const
methods)
Review Comment:
The safety comment justifying Send/Sync says "const methods" are
thread-safe, but GeographyFactory has no const/&self methods (all public APIs
require &mut self). Please clarify the actual safety rationale (e.g., safe to
move across threads / use from a single thread at a time) so the Send/Sync impl
is correctly documented.
##########
c/sedona-s2geography/src/rect_bounder.rs:
##########
@@ -31,6 +31,15 @@ pub struct RectBounder {
ptr: *mut S2GeogRectBounder,
}
+impl std::fmt::Debug for RectBounder {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ f.debug_struct("RectBounder")
+ .field("ptr", &self.ptr)
+ .field("finish()", &self.finish())
Review Comment:
The Debug implementation calls finish(), which can be relatively expensive
and may have side effects depending on the C++ implementation. Debug formatting
should generally avoid executing potentially mutating or costly computations;
consider only printing ptr/is_empty or cached state.
##########
python/sedonadb/tests/io/test_parquet.py:
##########
@@ -483,6 +483,104 @@ def test_write_geoparquet_geography(con, geoarrow_data):
assert table_roundtrip == table
+def test_write_geoparquet_2_0(con, geoarrow_data):
+ # Checks a read and write of geography (roundtrip, since nobody else can
read/write)
+ path = (
+ geoarrow_data
+ / "natural-earth"
+ / "files"
+ / "natural-earth_countries_geo.parquet"
+ )
+ skip_if_not_exists(path)
+
+ table = con.read_parquet(path).to_arrow_table()
+
+ with tempfile.TemporaryDirectory() as td:
+ tmp_parquet = Path(td) / "tmp.parquet"
+ con.create_data_frame(table).to_parquet(tmp_parquet,
geoparquet_version="2.0")
+
+ table_roundtrip = con.read_parquet(tmp_parquet).to_arrow_table()
+ assert table_roundtrip == table
+
+ # Check for metadata and logical type
+ file = parquet.ParquetFile(tmp_parquet)
+ file_kv_metadata = file.metadata.metadata
+ assert b"geo" in file_kv_metadata
+ geo_metadata = json.loads(file_kv_metadata[b"geo"])
+ assert geo_metadata["version"] == "2.0.0"
+
+ file.metadata.schema.column(2).logical_type.to_json() == '{"Type":
"Geometry"}'
+
+
+def test_write_geoparquet_no_metadata(con, geoarrow_data):
+ # Checks a read and write of geography (roundtrip, since nobody else can
read/write)
+ path = (
+ geoarrow_data
+ / "natural-earth"
+ / "files"
+ / "natural-earth_countries_geo.parquet"
+ )
+ skip_if_not_exists(path)
+
+ table = con.read_parquet(path).to_arrow_table()
+
+ with tempfile.TemporaryDirectory() as td:
+ tmp_parquet = Path(td) / "tmp.parquet"
+ con.create_data_frame(table).to_parquet(tmp_parquet,
geoparquet_version="none")
+
+ table_roundtrip = con.read_parquet(tmp_parquet).to_arrow_table()
+ assert table_roundtrip == table
+
+ # Check for absent metadata and but correct logical type
+ file = parquet.ParquetFile(tmp_parquet)
+ file_kv_metadata = file.metadata.metadata
+ assert file_kv_metadata is None or b"geo" not in file_kv_metadata
+
+ file.metadata.schema.column(2).logical_type.to_json() == '{"Type":
"Geometry"}'
+ geo_stats = file.metadata.row_group(0).column(2).geo_statistics
Review Comment:
This line performs a comparison but doesn't assert it, so the test won't
fail if the logical type is wrong. Use an assertion to validate the Parquet
logical type.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]