Copilot commented on code in PR #2946:
URL: https://github.com/apache/sedona/pull/2946#discussion_r3220679842
##########
spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialFilterPushDownForGeoParquet.scala:
##########
@@ -42,17 +43,24 @@ import
org.apache.spark.sql.execution.datasources.PushableColumnBase
import
org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetFileFormatBase
import
org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetSpatialFilter
import
org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetSpatialFilter.AndFilter
+import
org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetSpatialFilter.Box2DLeafFilter
import
org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetSpatialFilter.LeafFilter
import
org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetSpatialFilter.OrFilter
+import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sedona_sql.UDT.GeometryUDT
-import org.apache.spark.sql.sedona_sql.expressions.{ST_AsEWKT, ST_Buffer,
ST_Contains, ST_CoveredBy, ST_Covers, ST_Crosses, ST_DWithin, ST_Distance,
ST_DistanceSphere, ST_DistanceSpheroid, ST_Equals, ST_Intersects,
ST_OrderingEquals, ST_Overlaps, ST_Touches, ST_Within}
+import org.apache.spark.sql.sedona_sql.expressions.{ST_AsEWKT, ST_BoxContains,
ST_BoxIntersects, ST_Buffer, ST_Contains, ST_CoveredBy, ST_Covers, ST_Crosses,
ST_DWithin, ST_Distance, ST_DistanceSphere, ST_DistanceSpheroid, ST_Equals,
ST_Intersects, ST_OrderingEquals, ST_Overlaps, ST_Touches, ST_Within}
import
org.apache.spark.sql.sedona_sql.optimization.ExpressionUtils.splitConjunctivePredicates
import org.apache.spark.sql.types.DoubleType
import org.locationtech.jts.geom.Geometry
import org.locationtech.jts.geom.Point
class SpatialFilterPushDownForGeoParquet(sparkSession: SparkSession) extends
Rule[LogicalPlan] {
+ private def enableBox2DFilterPushDown: Boolean =
+ sparkSession.conf
+ .get("spark.sedona.geoparquet.box2dFilterPushDown", "true")
Review Comment:
The comments correctly note this pushdown can produce false negatives when
the Box2D covering column is wider than the actual geometry envelope (or
generally: when predicate semantics are on the Box2D column, but pruning is
done using the geometry bbox). With the config defaulting to `"true"`, this can
silently return incorrect query results for valid GeoParquet inputs produced by
other writers (or user-supplied covering boxes). To prevent incorrect results
by default, consider making `spark.sedona.geoparquet.box2dFilterPushDown`
default to `false` (opt-in), or gate pushdown on a reliable “exact covering”
signal in metadata (if/when available); otherwise keep-file (no pruning) is the
only safe default.
##########
spark/common/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala:
##########
@@ -317,6 +318,87 @@ class GeoParquetSpatialFilterPushDownSuite extends
TestBaseScala with TableDrive
assert(getPushedDownSpatialFilter(dfFiltered).isEmpty)
}
}
+
+ it("Push down ST_BoxIntersects against a Box2D covering column") {
Review Comment:
The PR description states reverse argument order is supported for both
predicates, but the new tests only exercise the `ST_Box*(box_col, lit_box)`
ordering. Add coverage for `ST_BoxIntersects(lit_box, geom_bbox)` and
`ST_BoxContains(lit_box, geom_bbox)` to ensure the pushdown recognition path
remains correct for the commuted argument form.
##########
spark/common/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala:
##########
@@ -317,6 +318,87 @@ class GeoParquetSpatialFilterPushDownSuite extends
TestBaseScala with TableDrive
assert(getPushedDownSpatialFilter(dfFiltered).isEmpty)
}
}
+
+ it("Push down ST_BoxIntersects against a Box2D covering column") {
+ val (box2dDf, box2dDir, box2dMetaMap) = setupBox2DCoveringFixture()
+ try {
+ // Q1 region only (region 1, center +10/+10)
+ val q1Filter =
+ "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(5.0, 5.0),
ST_Point(15.0, 15.0)))"
+ verifyBox2DFilter(box2dDf, box2dMetaMap, q1Filter, Seq(1))
+
+ // Window covering Q2 and Q4 (negative X) — should preserve regions 0
and 2
+ val leftHalfFilter =
+ "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(-20.0, -20.0),
ST_Point(-1.0, 20.0)))"
+ verifyBox2DFilter(box2dDf, box2dMetaMap, leftHalfFilter, Seq(0, 2))
+
+ // Disjoint window prunes everything
+ val disjointFilter =
+ "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(100.0, 100.0),
ST_Point(200.0, 200.0)))"
+ verifyBox2DFilter(box2dDf, box2dMetaMap, disjointFilter, Seq.empty)
+ } finally {
+ FileUtils.deleteDirectory(new File(box2dDir).getParentFile)
+ }
+ }
+
+ it("Push down ST_BoxContains against a Box2D covering column") {
Review Comment:
The PR description states reverse argument order is supported for both
predicates, but the new tests only exercise the `ST_Box*(box_col, lit_box)`
ordering. Add coverage for `ST_BoxIntersects(lit_box, geom_bbox)` and
`ST_BoxContains(lit_box, geom_bbox)` to ensure the pushdown recognition path
remains correct for the commuted argument form.
##########
spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetSpatialFilter.scala:
##########
@@ -88,4 +89,68 @@ object GeoParquetSpatialFilter {
}
override def simpleString: String = s"$columnName ${predicateType.name}
$queryWindow"
}
+
+ /**
+ * Pushdown filter for predicates that operate on a Box2D-typed column (e.g.
+ * `ST_BoxIntersects(box_col, lit_box)` or `ST_BoxContains(box_col,
lit_box)`).
+ *
+ * Per-file evaluation: walks the file's GeoParquet column metadata to find
the geometry column
+ * whose covering metadata points at `box2dColumnName`, then prunes using
that geometry column's
+ * recorded bbox.
+ *
+ * Both intersects and contains map to a file-level INTERSECTS check:
per-row containment
+ * implies per-row intersection, so the file's recorded geom bbox must
intersect the query box
+ * for any row to match.
+ *
+ * '''Soundness caveat.''' The GeoParquet 1.1 spec allows covering bboxes to
be conservatively
+ * wider than per-row geometry envelopes (e.g. sedona-db's Float32 writer
rounds outward via
+ * `next_after`). When that happens, the union of per-row Box2D values is a
strict superset of
+ * the file's geom bbox, and pruning using the geom bbox can produce false
negatives. Pushdown
+ * is sound when the Box2D column is exactly the per-row geometry envelope —
which is the case
+ * for Sedona's own writer (`ST_Box2D(geom)` produces exact envelopes). A
proper Parquet column
+ * statistics-based pruning that operates on the Box2D column's own
xmin/ymin/xmax/ymax bounds
+ * is tracked as a follow-up; until then this filter is opt-out via
+ * `spark.sedona.geoparquet.box2dFilterPushDown`.
Review Comment:
The PR description claims this pushdown is “sound (never produces false
negatives)”, but the in-code Scaladoc explicitly documents that pruning via the
geometry bbox can produce false negatives for conservatively widened covering
bboxes. Please align the PR description (and/or the implementation/default
enablement) with this documented behavior so users aren’t misled about
correctness guarantees.
##########
spark/common/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala:
##########
@@ -317,6 +318,87 @@ class GeoParquetSpatialFilterPushDownSuite extends
TestBaseScala with TableDrive
assert(getPushedDownSpatialFilter(dfFiltered).isEmpty)
}
}
+
+ it("Push down ST_BoxIntersects against a Box2D covering column") {
+ val (box2dDf, box2dDir, box2dMetaMap) = setupBox2DCoveringFixture()
+ try {
+ // Q1 region only (region 1, center +10/+10)
+ val q1Filter =
+ "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(5.0, 5.0),
ST_Point(15.0, 15.0)))"
+ verifyBox2DFilter(box2dDf, box2dMetaMap, q1Filter, Seq(1))
+
+ // Window covering Q2 and Q4 (negative X) — should preserve regions 0
and 2
+ val leftHalfFilter =
+ "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(-20.0, -20.0),
ST_Point(-1.0, 20.0)))"
+ verifyBox2DFilter(box2dDf, box2dMetaMap, leftHalfFilter, Seq(0, 2))
+
Review Comment:
The PR description states reverse argument order is supported for both
predicates, but the new tests only exercise the `ST_Box*(box_col, lit_box)`
ordering. Add coverage for `ST_BoxIntersects(lit_box, geom_bbox)` and
`ST_BoxContains(lit_box, geom_bbox)` to ensure the pushdown recognition path
remains correct for the commuted argument form.
##########
spark/common/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala:
##########
@@ -317,6 +318,87 @@ class GeoParquetSpatialFilterPushDownSuite extends
TestBaseScala with TableDrive
assert(getPushedDownSpatialFilter(dfFiltered).isEmpty)
}
}
+
+ it("Push down ST_BoxIntersects against a Box2D covering column") {
+ val (box2dDf, box2dDir, box2dMetaMap) = setupBox2DCoveringFixture()
+ try {
+ // Q1 region only (region 1, center +10/+10)
+ val q1Filter =
+ "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(5.0, 5.0),
ST_Point(15.0, 15.0)))"
+ verifyBox2DFilter(box2dDf, box2dMetaMap, q1Filter, Seq(1))
+
+ // Window covering Q2 and Q4 (negative X) — should preserve regions 0
and 2
+ val leftHalfFilter =
+ "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(-20.0, -20.0),
ST_Point(-1.0, 20.0)))"
+ verifyBox2DFilter(box2dDf, box2dMetaMap, leftHalfFilter, Seq(0, 2))
+
+ // Disjoint window prunes everything
+ val disjointFilter =
+ "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(100.0, 100.0),
ST_Point(200.0, 200.0)))"
+ verifyBox2DFilter(box2dDf, box2dMetaMap, disjointFilter, Seq.empty)
+ } finally {
+ FileUtils.deleteDirectory(new File(box2dDir).getParentFile)
+ }
+ }
+
+ it("Push down ST_BoxContains against a Box2D covering column") {
+ val (box2dDf, box2dDir, box2dMetaMap) = setupBox2DCoveringFixture()
+ try {
+ // ST_BoxContains(box_col, lit_box) pushes down as INTERSECTS at the
file level. A tiny
+ // query box inside Q1 prunes everything except region 1.
+ val containsFilter =
+ "ST_BoxContains(geom_bbox, ST_MakeBox2D(ST_Point(9.0, 9.0),
ST_Point(10.0, 10.0)))"
Review Comment:
The PR description states reverse argument order is supported for both
predicates, but the new tests only exercise the `ST_Box*(box_col, lit_box)`
ordering. Add coverage for `ST_BoxIntersects(lit_box, geom_bbox)` and
`ST_BoxContains(lit_box, geom_bbox)` to ensure the pushdown recognition path
remains correct for the commuted argument form.
##########
spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialFilterPushDownForGeoParquet.scala:
##########
@@ -144,6 +152,20 @@ class SpatialFilterPushDownForGeoParquet(sparkSession:
SparkSession) extends Rul
SpatialPredicate.INTERSECTS,
GeometryUDT.deserialize(value))
+ // Box2D predicates push down to an INTERSECTS check on the geometry
column whose covering
+ // metadata references this Box2D column. Both BoxIntersects and
BoxContains use INTERSECTS
+ // semantics at the file level: per-row containment implies per-row
intersection, which is
+ // only possible if the geometry column's file-level bbox intersects the
query box.
+ // Soundness assumes the Box2D column is exactly the per-row geometry
envelope (true for
+ // ST_Box2D(geom)-derived columns including the auto-generated
<geom>_bbox path). When the
+ // covering column is conservatively wider than the geometry, pushdown
can drop matching
+ // files; the box2dFilterPushDown conf lets users opt out in that case.
+ case ST_BoxIntersects(_) | ST_BoxContains(_) if
enableBox2DFilterPushDown =>
+ for {
+ (name, value) <- resolveNameAndLiteral(predicate.children,
pushableColumn)
+ queryBox <- extractBox2DLiteral(value)
+ } yield Box2DLeafFilter(unquote(name), queryBox)
Review Comment:
The comments correctly note this pushdown can produce false negatives when
the Box2D covering column is wider than the actual geometry envelope (or
generally: when predicate semantics are on the Box2D column, but pruning is
done using the geometry bbox). With the config defaulting to `"true"`, this can
silently return incorrect query results for valid GeoParquet inputs produced by
other writers (or user-supplied covering boxes). To prevent incorrect results
by default, consider making `spark.sedona.geoparquet.box2dFilterPushDown`
default to `false` (opt-in), or gate pushdown on a reliable “exact covering”
signal in metadata (if/when available); otherwise keep-file (no pruning) is the
only safe default.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]