jiayuasu commented on code in PR #2946:
URL: https://github.com/apache/sedona/pull/2946#discussion_r3223783297
##########
spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialFilterPushDownForGeoParquet.scala:
##########
@@ -42,17 +43,24 @@ import
org.apache.spark.sql.execution.datasources.PushableColumnBase
import
org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetFileFormatBase
import
org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetSpatialFilter
import
org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetSpatialFilter.AndFilter
+import
org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetSpatialFilter.Box2DLeafFilter
import
org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetSpatialFilter.LeafFilter
import
org.apache.spark.sql.execution.datasources.geoparquet.GeoParquetSpatialFilter.OrFilter
+import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sedona_sql.UDT.GeometryUDT
-import org.apache.spark.sql.sedona_sql.expressions.{ST_AsEWKT, ST_Buffer,
ST_Contains, ST_CoveredBy, ST_Covers, ST_Crosses, ST_DWithin, ST_Distance,
ST_DistanceSphere, ST_DistanceSpheroid, ST_Equals, ST_Intersects,
ST_OrderingEquals, ST_Overlaps, ST_Touches, ST_Within}
+import org.apache.spark.sql.sedona_sql.expressions.{ST_AsEWKT, ST_BoxContains,
ST_BoxIntersects, ST_Buffer, ST_Contains, ST_CoveredBy, ST_Covers, ST_Crosses,
ST_DWithin, ST_Distance, ST_DistanceSphere, ST_DistanceSpheroid, ST_Equals,
ST_Intersects, ST_OrderingEquals, ST_Overlaps, ST_Touches, ST_Within}
import
org.apache.spark.sql.sedona_sql.optimization.ExpressionUtils.splitConjunctivePredicates
import org.apache.spark.sql.types.DoubleType
import org.locationtech.jts.geom.Geometry
import org.locationtech.jts.geom.Point
class SpatialFilterPushDownForGeoParquet(sparkSession: SparkSession) extends
Rule[LogicalPlan] {
+ private def enableBox2DFilterPushDown: Boolean =
+ sparkSession.conf
+ .get("spark.sedona.geoparquet.box2dFilterPushDown", "true")
Review Comment:
Done in 3c236660 — flipped the default to false. Sedona-written files (or
any tool that derives the covering from `getEnvelopeInternal()`) are safe to
opt in via `spark.sedona.geoparquet.box2dFilterPushDown=true`. The default-on
flip will land once the proper Parquet column-statistics-based pruning in #2949
makes pushdown sound for all spec-compliant inputs.
##########
spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialFilterPushDownForGeoParquet.scala:
##########
@@ -144,6 +152,20 @@ class SpatialFilterPushDownForGeoParquet(sparkSession:
SparkSession) extends Rul
SpatialPredicate.INTERSECTS,
GeometryUDT.deserialize(value))
+ // Box2D predicates push down to an INTERSECTS check on the geometry
column whose covering
+ // metadata references this Box2D column. Both BoxIntersects and
BoxContains use INTERSECTS
+ // semantics at the file level: per-row containment implies per-row
intersection, which is
+ // only possible if the geometry column's file-level bbox intersects the
query box.
+ // Soundness assumes the Box2D column is exactly the per-row geometry
envelope (true for
+ // ST_Box2D(geom)-derived columns including the auto-generated
<geom>_bbox path). When the
+ // covering column is conservatively wider than the geometry, pushdown
can drop matching
+ // files; the box2dFilterPushDown conf lets users opt out in that case.
+ case ST_BoxIntersects(_) | ST_BoxContains(_) if
enableBox2DFilterPushDown =>
+ for {
+ (name, value) <- resolveNameAndLiteral(predicate.children,
pushableColumn)
+ queryBox <- extractBox2DLiteral(value)
+ } yield Box2DLeafFilter(unquote(name), queryBox)
Review Comment:
Same fix as the other comment on this default — flipped to false (opt-in) in
3c236660.
##########
spark/common/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala:
##########
@@ -317,6 +318,87 @@ class GeoParquetSpatialFilterPushDownSuite extends
TestBaseScala with TableDrive
assert(getPushedDownSpatialFilter(dfFiltered).isEmpty)
}
}
+
+ it("Push down ST_BoxIntersects against a Box2D covering column") {
Review Comment:
Added in 3c236660 — both ST_BoxIntersects and ST_BoxContains tests now cover
`ST_Box*(lit_box, geom_bbox)` reverse-argument-order forms alongside the
original orientation.
##########
spark/common/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala:
##########
@@ -317,6 +318,87 @@ class GeoParquetSpatialFilterPushDownSuite extends
TestBaseScala with TableDrive
assert(getPushedDownSpatialFilter(dfFiltered).isEmpty)
}
}
+
+ it("Push down ST_BoxIntersects against a Box2D covering column") {
+ val (box2dDf, box2dDir, box2dMetaMap) = setupBox2DCoveringFixture()
+ try {
+ // Q1 region only (region 1, center +10/+10)
+ val q1Filter =
+ "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(5.0, 5.0),
ST_Point(15.0, 15.0)))"
+ verifyBox2DFilter(box2dDf, box2dMetaMap, q1Filter, Seq(1))
+
+ // Window covering Q2 and Q4 (negative X) — should preserve regions 0
and 2
+ val leftHalfFilter =
+ "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(-20.0, -20.0),
ST_Point(-1.0, 20.0)))"
+ verifyBox2DFilter(box2dDf, box2dMetaMap, leftHalfFilter, Seq(0, 2))
+
Review Comment:
Covered by the new reverse-arg-order test cases in 3c236660.
##########
spark/common/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala:
##########
@@ -317,6 +318,87 @@ class GeoParquetSpatialFilterPushDownSuite extends
TestBaseScala with TableDrive
assert(getPushedDownSpatialFilter(dfFiltered).isEmpty)
}
}
+
+ it("Push down ST_BoxIntersects against a Box2D covering column") {
+ val (box2dDf, box2dDir, box2dMetaMap) = setupBox2DCoveringFixture()
+ try {
+ // Q1 region only (region 1, center +10/+10)
+ val q1Filter =
+ "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(5.0, 5.0),
ST_Point(15.0, 15.0)))"
+ verifyBox2DFilter(box2dDf, box2dMetaMap, q1Filter, Seq(1))
+
+ // Window covering Q2 and Q4 (negative X) — should preserve regions 0
and 2
+ val leftHalfFilter =
+ "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(-20.0, -20.0),
ST_Point(-1.0, 20.0)))"
+ verifyBox2DFilter(box2dDf, box2dMetaMap, leftHalfFilter, Seq(0, 2))
+
+ // Disjoint window prunes everything
+ val disjointFilter =
+ "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(100.0, 100.0),
ST_Point(200.0, 200.0)))"
+ verifyBox2DFilter(box2dDf, box2dMetaMap, disjointFilter, Seq.empty)
+ } finally {
+ FileUtils.deleteDirectory(new File(box2dDir).getParentFile)
+ }
+ }
+
+ it("Push down ST_BoxContains against a Box2D covering column") {
Review Comment:
Covered by the new reverse-arg-order test cases in 3c236660.
##########
spark/common/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala:
##########
@@ -317,6 +318,87 @@ class GeoParquetSpatialFilterPushDownSuite extends
TestBaseScala with TableDrive
assert(getPushedDownSpatialFilter(dfFiltered).isEmpty)
}
}
+
+ it("Push down ST_BoxIntersects against a Box2D covering column") {
+ val (box2dDf, box2dDir, box2dMetaMap) = setupBox2DCoveringFixture()
+ try {
+ // Q1 region only (region 1, center +10/+10)
+ val q1Filter =
+ "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(5.0, 5.0),
ST_Point(15.0, 15.0)))"
+ verifyBox2DFilter(box2dDf, box2dMetaMap, q1Filter, Seq(1))
+
+ // Window covering Q2 and Q4 (negative X) — should preserve regions 0
and 2
+ val leftHalfFilter =
+ "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(-20.0, -20.0),
ST_Point(-1.0, 20.0)))"
+ verifyBox2DFilter(box2dDf, box2dMetaMap, leftHalfFilter, Seq(0, 2))
+
+ // Disjoint window prunes everything
+ val disjointFilter =
+ "ST_BoxIntersects(geom_bbox, ST_MakeBox2D(ST_Point(100.0, 100.0),
ST_Point(200.0, 200.0)))"
+ verifyBox2DFilter(box2dDf, box2dMetaMap, disjointFilter, Seq.empty)
+ } finally {
+ FileUtils.deleteDirectory(new File(box2dDir).getParentFile)
+ }
+ }
+
+ it("Push down ST_BoxContains against a Box2D covering column") {
+ val (box2dDf, box2dDir, box2dMetaMap) = setupBox2DCoveringFixture()
+ try {
+ // ST_BoxContains(box_col, lit_box) pushes down as INTERSECTS at the
file level. A tiny
+ // query box inside Q1 prunes everything except region 1.
+ val containsFilter =
+ "ST_BoxContains(geom_bbox, ST_MakeBox2D(ST_Point(9.0, 9.0),
ST_Point(10.0, 10.0)))"
Review Comment:
Covered by the new reverse-arg-order test cases in 3c236660.
##########
spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetSpatialFilter.scala:
##########
@@ -88,4 +89,68 @@ object GeoParquetSpatialFilter {
}
override def simpleString: String = s"$columnName ${predicateType.name}
$queryWindow"
}
+
+ /**
+ * Pushdown filter for predicates that operate on a Box2D-typed column (e.g.
+ * `ST_BoxIntersects(box_col, lit_box)` or `ST_BoxContains(box_col,
lit_box)`).
+ *
+ * Per-file evaluation: walks the file's GeoParquet column metadata to find
the geometry column
+ * whose covering metadata points at `box2dColumnName`, then prunes using
that geometry column's
+ * recorded bbox.
+ *
+ * Both intersects and contains map to a file-level INTERSECTS check:
per-row containment
+ * implies per-row intersection, so the file's recorded geom bbox must
intersect the query box
+ * for any row to match.
+ *
+ * '''Soundness caveat.''' The GeoParquet 1.1 spec allows covering bboxes to
be conservatively
+ * wider than per-row geometry envelopes (e.g. sedona-db's Float32 writer
rounds outward via
+ * `next_after`). When that happens, the union of per-row Box2D values is a
strict superset of
+ * the file's geom bbox, and pruning using the geom bbox can produce false
negatives. Pushdown
+ * is sound when the Box2D column is exactly the per-row geometry envelope —
which is the case
+ * for Sedona's own writer (`ST_Box2D(geom)` produces exact envelopes). A
proper Parquet column
+ * statistics-based pruning that operates on the Box2D column's own
xmin/ymin/xmax/ymax bounds
+ * is tracked as a follow-up; until then this filter is opt-out via
+ * `spark.sedona.geoparquet.box2dFilterPushDown`.
Review Comment:
Updated the PR description to remove the "sound (never produces false
negatives)" claim and document the soundness caveat prominently, in line with
the in-code Scaladoc. The default-off behavior now matches the documented
correctness guarantee.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]