Copilot commented on code in PR #2946:
URL: https://github.com/apache/sedona/pull/2946#discussion_r3220679842


##########
spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialFilterPushDownForGeoParquet.scala:
##########
@@ -42,17 +43,24 @@ import 
org.apache.spark.sql.execution.datasources.PushableColumnBase
 import 
org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetFileFormatBase
 import 
org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetSpatialFilter
 import 
org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetSpatialFilter.AndFilter
+import 
org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetSpatialFilter.Box2DLeafFilter
 import 
org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetSpatialFilter.LeafFilter
 import 
org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetSpatialFilter.OrFilter
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.sedona_sql.UDT.GeometryUDT
-import org.apache.spark.sql.sedona_sql.expressions.{ST_AsEWKT, ST_Buffer, 
ST_Contains, ST_CoveredBy, ST_Covers, ST_Crosses, ST_DWithin, ST_Distance, 
ST_DistanceSphere, ST_DistanceSpheroid, ST_Equals, ST_Intersects, 
ST_OrderingEquals, ST_Overlaps, ST_Touches, ST_Within}
+import org.apache.spark.sql.sedona_sql.expressions.{ST_AsEWKT, ST_BoxContains, 
ST_BoxIntersects, ST_Buffer, ST_Contains, ST_CoveredBy, ST_Covers, ST_Crosses, 
ST_DWithin, ST_Distance, ST_DistanceSphere, ST_DistanceSpheroid, ST_Equals, 
ST_Intersects, ST_OrderingEquals, ST_Overlaps, ST_Touches, ST_Within}
 import 
org.apache.spark.sql.sedona_sql.optimization.ExpressionUtils.splitConjunctivePredicates
 import org.apache.spark.sql.types.DoubleType
 import org.locationtech.jts.geom.Geometry
 import org.locationtech.jts.geom.Point
 
 class SpatialFilterPushDownForGeoParquet(sparkSession: SparkSession) extends 
Rule[LogicalPlan] {
 
+  private def enableBox2DFilterPushDown: Boolean =
+    sparkSession.conf
+      .get("spark.sedona.geoparquet.box2dFilterPushDown", "true")

Review Comment:
   The comments correctly note this pushdown can produce false negatives when 
the Box2D covering column is wider than the actual geometry envelope (or 
generally: when predicate semantics are on the Box2D column, but pruning is 
done using the geometry bbox). With the config defaulting to `"true"`, this can 
silently return incorrect query results for valid GeoParquet inputs produced by 
other writers (or user-supplied covering boxes). To prevent incorrect results 
by default, consider making `spark.sedona.geoparquet.box2dFilterPushDown` 
default to `false` (opt-in), or gate pushdown on a reliable “exact covering” 
signal in metadata (if/when available); otherwise keep-file (no pruning) is the 
only safe default.
   



##########
spark/common/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala:
##########
@@ -317,6 +318,87 @@ class GeoParquetSpatialFilterPushDownSuite extends 
TestBaseScala with TableDrive
         assert(getPushedDownSpatialFilter(dfFiltered).isEmpty)
       }
     }
+
+    it("Push down ST_BoxIntersects against a Box2D covering column") {

Review Comment:
   The PR description states reverse argument order is supported for both 
predicates, but the new tests only exercise the `ST_Box*(box_col, lit_box)` 
ordering. Add coverage for `ST_BoxIntersects(lit_box, geom_bbox)` and 
`ST_BoxContains(lit_box, geom_bbox)` to ensure the pushdown recognition path 
remains correct for the commuted argument form.



##########
spark/common/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala:
##########
@@ -317,6 +318,87 @@ class GeoParquetSpatialFilterPushDownSuite extends 
TestBaseScala with TableDrive
         assert(getPushedDownSpatialFilter(dfFiltered).isEmpty)
       }
     }
+
+    it("Push down ST_BoxIntersects against a Box2D covering column") {
+      val (box2dDf, box2dDir, box2dMetaMap) = setupBox2DCoveringFixture()
+      try {
+        // Q1 region only (region 1, center +10/+10)
+        val q1Filter =
+          "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(5.0, 5.0), 
ST_Point(15.0, 15.0)))"
+        verifyBox2DFilter(box2dDf, box2dMetaMap, q1Filter, Seq(1))
+
+        // Window covering Q2 and Q4 (negative X) — should preserve regions 0 
and 2
+        val leftHalfFilter =
+          "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(-20.0, -20.0), 
ST_Point(-1.0, 20.0)))"
+        verifyBox2DFilter(box2dDf, box2dMetaMap, leftHalfFilter, Seq(0, 2))
+
+        // Disjoint window prunes everything
+        val disjointFilter =
+          "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(100.0, 100.0), 
ST_Point(200.0, 200.0)))"
+        verifyBox2DFilter(box2dDf, box2dMetaMap, disjointFilter, Seq.empty)
+      } finally {
+        FileUtils.deleteDirectory(new File(box2dDir).getParentFile)
+      }
+    }
+
+    it("Push down ST_BoxContains against a Box2D covering column") {

Review Comment:
   The PR description states reverse argument order is supported for both 
predicates, but the new tests only exercise the `ST_Box*(box_col, lit_box)` 
ordering. Add coverage for `ST_BoxIntersects(lit_box, geom_bbox)` and 
`ST_BoxContains(lit_box, geom_bbox)` to ensure the pushdown recognition path 
remains correct for the commuted argument form.



##########
spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetSpatialFilter.scala:
##########
@@ -88,4 +89,68 @@ object GeoParquetSpatialFilter {
     }
     override def simpleString: String = s"$columnName ${predicateType.name} 
$queryWindow"
   }
+
+  /**
+   * Pushdown filter for predicates that operate on a Box2D-typed column (e.g.
+   * `ST_BoxIntersects(box_col, lit_box)` or `ST_BoxContains(box_col, 
lit_box)`).
+   *
+   * Per-file evaluation: walks the file's GeoParquet column metadata to find 
the geometry column
+   * whose covering metadata points at `box2dColumnName`, then prunes using 
that geometry column's
+   * recorded bbox.
+   *
+   * Both intersects and contains map to a file-level INTERSECTS check: 
per-row containment
+   * implies per-row intersection, so the file's recorded geom bbox must 
intersect the query box
+   * for any row to match.
+   *
+   * '''Soundness caveat.''' The GeoParquet 1.1 spec allows covering bboxes to 
be conservatively
+   * wider than per-row geometry envelopes (e.g. sedona-db's Float32 writer 
rounds outward via
+   * `next_after`). When that happens, the union of per-row Box2D values is a 
strict superset of
+   * the file's geom bbox, and pruning using the geom bbox can produce false 
negatives. Pushdown
+   * is sound when the Box2D column is exactly the per-row geometry envelope — 
which is the case
+   * for Sedona's own writer (`ST_Box2D(geom)` produces exact envelopes). A 
proper Parquet column
+   * statistics-based pruning that operates on the Box2D column's own 
xmin/ymin/xmax/ymax bounds
+   * is tracked as a follow-up; until then this filter is opt-out via
+   * `spark.sedona.geoparquet.box2dFilterPushDown`.

Review Comment:
   The PR description claims this pushdown is “sound (never produces false 
negatives)”, but the in-code Scaladoc explicitly documents that pruning via the 
geometry bbox can produce false negatives for conservatively widened covering 
bboxes. Please align the PR description (and/or the implementation/default 
enablement) with this documented behavior so users aren’t misled about 
correctness guarantees.



##########
spark/common/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala:
##########
@@ -317,6 +318,87 @@ class GeoParquetSpatialFilterPushDownSuite extends 
TestBaseScala with TableDrive
         assert(getPushedDownSpatialFilter(dfFiltered).isEmpty)
       }
     }
+
+    it("Push down ST_BoxIntersects against a Box2D covering column") {
+      val (box2dDf, box2dDir, box2dMetaMap) = setupBox2DCoveringFixture()
+      try {
+        // Q1 region only (region 1, center +10/+10)
+        val q1Filter =
+          "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(5.0, 5.0), 
ST_Point(15.0, 15.0)))"
+        verifyBox2DFilter(box2dDf, box2dMetaMap, q1Filter, Seq(1))
+
+        // Window covering Q2 and Q4 (negative X) — should preserve regions 0 
and 2
+        val leftHalfFilter =
+          "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(-20.0, -20.0), 
ST_Point(-1.0, 20.0)))"
+        verifyBox2DFilter(box2dDf, box2dMetaMap, leftHalfFilter, Seq(0, 2))
+

Review Comment:
   The PR description states reverse argument order is supported for both 
predicates, but the new tests only exercise the `ST_Box*(box_col, lit_box)` 
ordering. Add coverage for `ST_BoxIntersects(lit_box, geom_bbox)` and 
`ST_BoxContains(lit_box, geom_bbox)` to ensure the pushdown recognition path 
remains correct for the commuted argument form.
   



##########
spark/common/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala:
##########
@@ -317,6 +318,87 @@ class GeoParquetSpatialFilterPushDownSuite extends 
TestBaseScala with TableDrive
         assert(getPushedDownSpatialFilter(dfFiltered).isEmpty)
       }
     }
+
+    it("Push down ST_BoxIntersects against a Box2D covering column") {
+      val (box2dDf, box2dDir, box2dMetaMap) = setupBox2DCoveringFixture()
+      try {
+        // Q1 region only (region 1, center +10/+10)
+        val q1Filter =
+          "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(5.0, 5.0), 
ST_Point(15.0, 15.0)))"
+        verifyBox2DFilter(box2dDf, box2dMetaMap, q1Filter, Seq(1))
+
+        // Window covering Q2 and Q4 (negative X) — should preserve regions 0 
and 2
+        val leftHalfFilter =
+          "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(-20.0, -20.0), 
ST_Point(-1.0, 20.0)))"
+        verifyBox2DFilter(box2dDf, box2dMetaMap, leftHalfFilter, Seq(0, 2))
+
+        // Disjoint window prunes everything
+        val disjointFilter =
+          "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(100.0, 100.0), 
ST_Point(200.0, 200.0)))"
+        verifyBox2DFilter(box2dDf, box2dMetaMap, disjointFilter, Seq.empty)
+      } finally {
+        FileUtils.deleteDirectory(new File(box2dDir).getParentFile)
+      }
+    }
+
+    it("Push down ST_BoxContains against a Box2D covering column") {
+      val (box2dDf, box2dDir, box2dMetaMap) = setupBox2DCoveringFixture()
+      try {
+        // ST_BoxContains(box_col, lit_box) pushes down as INTERSECTS at the 
file level. A tiny
+        // query box inside Q1 prunes everything except region 1.
+        val containsFilter =
+          "ST_BoxContains(geom_bbox, ST_MakeBox2D(ST_Point(9.0, 9.0), 
ST_Point(10.0, 10.0)))"

Review Comment:
   The PR description states reverse argument order is supported for both 
predicates, but the new tests only exercise the `ST_Box*(box_col, lit_box)` 
ordering. Add coverage for `ST_BoxIntersects(lit_box, geom_bbox)` and 
`ST_BoxContains(lit_box, geom_bbox)` to ensure the pushdown recognition path 
remains correct for the commuted argument form.



##########
spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialFilterPushDownForGeoParquet.scala:
##########
@@ -144,6 +152,20 @@ class SpatialFilterPushDownForGeoParquet(sparkSession: 
SparkSession) extends Rul
             SpatialPredicate.INTERSECTS,
             GeometryUDT.deserialize(value))
 
+      // Box2D predicates push down to an INTERSECTS check on the geometry 
column whose covering
+      // metadata references this Box2D column. Both BoxIntersects and 
BoxContains use INTERSECTS
+      // semantics at the file level: per-row containment implies per-row 
intersection, which is
+      // only possible if the geometry column's file-level bbox intersects the 
query box.
+      // Soundness assumes the Box2D column is exactly the per-row geometry 
envelope (true for
+      // ST_Box2D(geom)-derived columns including the auto-generated 
<geom>_bbox path). When the
+      // covering column is conservatively wider than the geometry, pushdown 
can drop matching
+      // files; the box2dFilterPushDown conf lets users opt out in that case.
+      case ST_BoxIntersects(_) | ST_BoxContains(_) if 
enableBox2DFilterPushDown =>
+        for {
+          (name, value) <- resolveNameAndLiteral(predicate.children, 
pushableColumn)
+          queryBox <- extractBox2DLiteral(value)
+        } yield Box2DLeafFilter(unquote(name), queryBox)

Review Comment:
   The comments correctly note this pushdown can produce false negatives when 
the Box2D covering column is wider than the actual geometry envelope (or 
generally: when predicate semantics are on the Box2D column, but pruning is 
done using the geometry bbox). With the config defaulting to `"true"`, this can 
silently return incorrect query results for valid GeoParquet inputs produced by 
other writers (or user-supplied covering boxes). To prevent incorrect results 
by default, consider making `spark.sedona.geoparquet.box2dFilterPushDown` 
default to `false` (opt-in), or gate pushdown on a reliable “exact covering” 
signal in metadata (if/when available); otherwise keep-file (no pruning) is the 
only safe default.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to