paleolimbot commented on code in PR #737:
URL: https://github.com/apache/sedona-db/pull/737#discussion_r3012260066
##########
rust/sedona-spatial-join/src/index/default_spatial_index.rs:
##########
@@ -299,37 +292,6 @@ impl SpatialIndex for DefaultSpatialIndex {
&self.inner.indexed_batches[batch_idx].batch
}
- /// This method implements [`SpatialIndex::query`], which is a two-phase
spatial join:
- /// 1. **Filter phase**: Uses the R-tree index with the probe geometry's
bounding rectangle
- /// to quickly identify candidate geometries that might satisfy the
spatial predicate
- /// 2. **Refinement phase**: Evaluates the exact spatial predicate on
candidates to determine
- /// actual matches
- fn query(
- &self,
Review Comment:
This wasn't used (only `query_batch()` was), so I removed it
##########
rust/sedona-spatial-join/src/index/spatial_index_builder.rs:
##########
@@ -16,38 +16,25 @@
// under the License.
use datafusion_physical_plan::metrics::{self, ExecutionPlanMetricsSet,
MetricBuilder};
-use sedona_common::SpatialJoinOptions;
use sedona_expr::statistics::GeoStatistics;
-use std::sync::Arc;
+use
crate::evaluated_batch::evaluated_batch_stream::SendableEvaluatedBatchStream;
use crate::index::spatial_index::SpatialIndexRef;
-use crate::{
- evaluated_batch::evaluated_batch_stream::SendableEvaluatedBatchStream,
- spatial_predicate::SpatialPredicate,
-};
use async_trait::async_trait;
use datafusion_common::Result;
/// Builder for constructing a SpatialIndex from geometry batches.
#[async_trait]
-pub(crate) trait SpatialIndexBuilder {
- /// Estimate the amount of memory required by the R-tree index and
evaluating spatial predicates.
- /// The estimated memory usage does not include the memory required for
holding the build side
- /// batches.
- fn estimate_extra_memory_usage(
- &self,
- geo_stats: &GeoStatistics,
- spatial_predicate: &SpatialPredicate,
- options: &SpatialJoinOptions,
- ) -> usize;
Review Comment:
I moved this to the `SpatialJoinProvider` because we don't need to
instantiate an actual ready-to-go builder just to estimate hypothetical memory
usage
##########
rust/sedona-spatial-join/src/evaluated_batch.rs:
##########
@@ -55,19 +52,6 @@ impl EvaluatedBatch {
pub fn num_rows(&self) -> usize {
self.batch.num_rows()
}
-
- pub fn wkb(&self, idx: usize) -> Option<&Wkb<'_>> {
- let wkbs = self.geom_array.wkbs();
- wkbs[idx].as_ref()
- }
-
- pub fn rects(&self) -> &Vec<Option<Rect<f32>>> {
- &self.geom_array.rects
- }
-
- pub fn distance(&self) -> &Option<ColumnarValue> {
- &self.geom_array.distance
- }
Review Comment:
I moved these to the `EvaluatedGeometryArray` as part of the process of
figuring out the exact responsibility of these and mediating field access of
the evaluated array behind methods.
##########
rust/sedona-spatial-join/src/operand_evaluator.rs:
##########
@@ -73,32 +89,34 @@ pub(crate) trait OperandEvaluator: fmt::Debug + Send + Sync
{
/// Create a spatial predicate evaluator for the spatial predicate.
pub(crate) fn create_operand_evaluator(
predicate: &SpatialPredicate,
- options: SpatialJoinOptions,
+ evaluated_array_factory: Arc<dyn EvaluatedGeometryArrayFactory>,
) -> Arc<dyn OperandEvaluator> {
match predicate {
- SpatialPredicate::Distance(predicate) => {
- Arc::new(DistanceOperandEvaluator::new(predicate.clone(), options))
- }
- SpatialPredicate::Relation(predicate) => {
- Arc::new(RelationOperandEvaluator::new(predicate.clone(), options))
- }
- SpatialPredicate::KNearestNeighbors(predicate) => {
- Arc::new(KNNOperandEvaluator::new(predicate.clone()))
- }
+ SpatialPredicate::Distance(predicate) =>
Arc::new(DistanceOperandEvaluator::new(
+ predicate.clone(),
+ evaluated_array_factory,
+ )),
+ SpatialPredicate::Relation(predicate) =>
Arc::new(RelationOperandEvaluator::new(
+ predicate.clone(),
+ evaluated_array_factory,
+ )),
+ SpatialPredicate::KNearestNeighbors(predicate) =>
Arc::new(KNNOperandEvaluator::new(
+ predicate.clone(),
+ evaluated_array_factory,
+ )),
}
}
/// Result of evaluating a geometry batch.
pub struct EvaluatedGeometryArray {
- /// Type of geometry_array
- pub sedona_type: SedonaType,
+ sedona_type: SedonaType,
/// The array of geometries produced by evaluating the geometry expression.
- pub geometry_array: ArrayRef,
+ geometry_array: ArrayRef,
/// The rects of the geometries in the geometry array. The length of this
array is equal to the number of geometries.
/// The rects will be None for empty or null geometries.
- pub rects: Vec<Option<Rect<f32>>>,
+ rects: Vec<Option<Rect<f32>>>,
/// The distance value produced by evaluating the distance expression.
- pub distance: Option<ColumnarValue>,
+ distance: Option<ColumnarValue>,
Review Comment:
I made these all accessible by methods to better track places that these are
fetched or set
##########
rust/sedona-spatial-join/src/operand_evaluator.rs:
##########
@@ -30,38 +31,53 @@ use sedona_geo_generic_alg::BoundingRect;
use sedona_schema::datatypes::SedonaType;
use wkb::reader::Wkb;
-use sedona_common::option::SpatialJoinOptions;
use sedona_common::sedona_internal_err;
use crate::{
spatial_predicate::{DistancePredicate, KNNPredicate, RelationPredicate,
SpatialPredicate},
utils::arrow_utils::get_array_memory_size,
};
+/// Factory for [EvaluatedGeometryArray] instances given an evaluated geometry
column
+///
+/// Allows join providers to customize the eagerly evaluated representation of
geometry.
+/// This is currently limited to a concrete internal representation but may
expand to
+/// support non-Cartesian bounding or non-WKB backed geometry arrays. This
also may
+/// expand to support more compact or efficient serialization/deserialization
of the
+/// evaluated array when spilling.
+pub(crate) trait EvaluatedGeometryArrayFactory: fmt::Debug + Send + Sync {
+ /// Create a new [EvaluatedGeometryArray]
+ fn try_new_evaluated_array(
+ &self,
+ geometry_array: ArrayRef,
+ sedona_type: &SedonaType,
+ ) -> Result<EvaluatedGeometryArray>;
+}
Review Comment:
I added this because it most precisely encapsulates what custom joins need
to control. SpatialIndexBuilders and SpatialIndexes ultimately consume
EvaluatedGeometryArrays that are created by other parts of the machinery. What
exactly "eagerly evaluated" looks like depends on what the index will
ultimately need.
I think there are some improvements possible here by better encapsulating
what questions are being asked of this object by non-SpatialIndex internals
(e.g., update the spatial analyzer stats, repartition, serialize to record
batch, deserialize from record batch) but that is a future project once we have
a working Geography and GPU join with the current structure.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]