Copilot commented on code in PR #806:
URL: https://github.com/apache/sedona-db/pull/806#discussion_r3212170645


##########
python/sedonadb/tests/io/test_parquet.py:
##########
@@ -664,3 +665,88 @@ def 
test_read_parquet_validate_wkb_partial_invalid_rows(con, tmp_path):
         con.read_parquet(
             path, geometry_columns=geometry_columns, validate=True
         ).to_arrow_table()
+
+
+def test_prune_geography_parquet():
+    """Test that geography parquet files can be pruned with statistics 
crossing the antimeridian."""
+    if "s2geography" not in sedonadb.__features__:
+        pytest.skip("Python package built without s2geography")
+
+    con = sedonadb.connect()
+
+    # Create random points crossing the antimeridian (bounds=[170, 10, 190, 30]
+    # which wraps to [170, 10, -170, 30])
+    con.funcs.table.sd_random_geometry(
+        "Point", 100_000, seed=42, bounds=[170, 10, 190, 30]

Review Comment:
   This test generates 100,000 random points, which is 10× larger than the 
similar `test_write_sort_by_geometry` dataset in this file (10,000). This can 
noticeably increase CI runtime and memory for a unit test. Consider reducing 
the row count to the minimum needed to create >1 row group (e.g., 20,000 with 
`max_row_group_size=10_000`) while keeping the wraparound/pruning assertions 
deterministic via the fixed seed.
   



##########
python/sedonadb/tests/io/test_parquet.py:
##########
@@ -664,3 +665,88 @@ def 
test_read_parquet_validate_wkb_partial_invalid_rows(con, tmp_path):
         con.read_parquet(
             path, geometry_columns=geometry_columns, validate=True
         ).to_arrow_table()
+
+
+def test_prune_geography_parquet():
+    """Test that geography parquet files can be pruned with statistics 
crossing the antimeridian."""
+    if "s2geography" not in sedonadb.__features__:
+        pytest.skip("Python package built without s2geography")
+
+    con = sedonadb.connect()
+
+    # Create random points crossing the antimeridian (bounds=[170, 10, 190, 30]
+    # which wraps to [170, 10, -170, 30])
+    con.funcs.table.sd_random_geometry(
+        "Point", 100_000, seed=42, bounds=[170, 10, 190, 30]
+    ).to_view("pts", overwrite=True)
+
+    with tempfile.TemporaryDirectory() as td:
+        tmp_parquet = Path(td) / "geog.parquet"
+
+        # Write as geography with small row groups to enable pruning
+        con.sql(
+            "SELECT ST_SetSRID(ST_GeogFromWKB(ST_AsBinary(geometry)), 4326) as 
geog FROM pts"
+        ).to_parquet(
+            tmp_parquet,
+            geoparquet_version="2.0",
+            max_row_group_size=10_000,
+            sort_by=["geog"],
+        )
+
+        # Verify the parquet file has geography statistics with antimeridian 
wraparound
+        f = parquet.ParquetFile(tmp_parquet)
+        assert f.metadata.num_row_groups > 1
+
+        # Check that at least one row group has wraparound statistics (xmin > 
xmax)
+        has_wraparound = False
+        for i in range(f.metadata.num_row_groups):
+            stats = f.metadata.row_group(i).column(0).geo_statistics
+            if stats is not None and stats.xmin > stats.xmax:
+                has_wraparound = True
+                break
+        assert has_wraparound, (
+            "Expected at least one row group with antimeridian wraparound"
+        )
+
+        # Query 1: Query crossing the antimeridian - should match results
+        query_antimeridian = f"""
+            SELECT * FROM "{tmp_parquet}" WHERE
+            ST_Intersects(geog, ST_SetSRID(ST_GeogFromText(
+                'POLYGON ((179 20, -179 20, -179 22, 179 22, 179 20))'
+            ), 4326))
+        """
+        result_count = con.sql(query_antimeridian).count()
+        assert result_count > 0, "Expected results from antimeridian-crossing 
query"
+
+        # Query 2: Query a small region that should only match some row groups
+        # This queries the far west side (around 170-172 longitude)
+        query_partial = f"""
+            SELECT * FROM "{tmp_parquet}" WHERE
+            ST_Intersects(geog, ST_SetSRID(ST_GeogFromText(
+                'POLYGON ((170 10, 172 10, 172 12, 170 12, 170 10))'
+            ), 4326))
+        """
+
+        # Verify the query returns results
+        result_count = con.sql(query_partial).count()
+        assert result_count > 0, "Expected some results from partial region 
query"
+
+        # Verify pruning occurred via EXPLAIN ANALYZE
+        explained = con.sql(query_partial).explain("analyze").to_pandas()
+        plan_text = explained.iloc[0, 1]
+
+        # Check that spatial pruning metrics are reported
+        assert "row_groups_spatial_pruned" in plan_text, (
+            "Expected row_groups_spatial_pruned in explain output"
+        )
+
+        # Verify that pruning actually reduced the number of row groups scanned
+        match = re.search(
+            r"row_groups_spatial_pruned=(\d+) total .* (\d+) matched", 
plan_text
+        )
+        if match:
+            total = int(match.group(1))
+            matched = int(match.group(2))
+            assert matched < total, (
+                f"Expected pruning to reduce row groups: {matched} matched out 
of {total}"
+            )

Review Comment:
   The pruning assertion is currently conditional: if the regex doesn’t match, 
the test still passes and won’t actually verify that fewer row groups were 
scanned. Also, `.*` won’t match newlines unless you pass `re.S`/`re.DOTALL`, 
which is likely with multi-line EXPLAIN output. Make this an unconditional 
assertion by requiring a match (and using DOTALL or a newline-safe pattern) so 
the test fails when the expected metrics format or pruning behavior isn’t 
present.
   



##########
rust/sedona-expr/src/spatial_filter.rs:
##########
@@ -178,45 +181,70 @@ impl SpatialFilter {
         let maybe_rhs = rhs.evaluate_internal(table_stats)?;
         Ok(maybe_lhs || maybe_rhs)
     }
+}
+
+/// Factory for creating [SpatialFilter] objects from expressions
+#[derive(Debug, Clone)]
+pub struct SpatialFilterFactory {
+    literal_bounders: Vec<Arc<dyn LiteralBounder>>,
+}
+
+impl Default for SpatialFilterFactory {
+    fn default() -> Self {
+        Self {
+            literal_bounders: vec![Arc::new(DefaultLiteralBounder)],
+        }
+    }
+}
+
+impl SpatialFilterFactory {
+    /// Add a [LiteralBounder] (e.g., that can handle non-planar edges)
+    pub fn with_bounder(mut self, literal_bounder: Arc<dyn LiteralBounder>) -> 
Self {
+        self.literal_bounders.push(literal_bounder);
+        self
+    }
 
     /// Construct a SpatialPredicate from a [PhysicalExpr]
     ///
     /// Parses expr to extract known expressions we can evaluate against 
statistics.
-    pub fn try_from_expr(expr: &Arc<dyn PhysicalExpr>) -> Result<Self> {
-        if let Some(spatial_filter) = Self::try_from_range_predicate(expr)? {
+    pub fn try_from_expr(&self, expr: &Arc<dyn PhysicalExpr>) -> 
Result<SpatialFilter> {
+        if let Some(spatial_filter) = self.try_from_range_predicate(expr)? {
             Ok(spatial_filter)
-        } else if let Some(spatial_filter) = 
Self::try_from_distance_predicate(expr)? {
+        } else if let Some(spatial_filter) = 
self.try_from_distance_predicate(expr)? {
             Ok(spatial_filter)

Review Comment:
   `SpatialFilter::try_from_expr` used to be a public constructor, but this 
change moves construction behind `SpatialFilterFactory::try_from_expr`, which 
is a breaking API change for downstream crates. Consider restoring 
`SpatialFilter::try_from_expr` as a thin wrapper around 
`SpatialFilterFactory::default().try_from_expr(...)` (optionally deprecated) so 
existing callers keep working while still enabling pluggable bounders via the 
factory.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to