paleolimbot commented on code in PR #737:
URL: https://github.com/apache/sedona-db/pull/737#discussion_r3012260066


##########
rust/sedona-spatial-join/src/index/default_spatial_index.rs:
##########
@@ -299,37 +292,6 @@ impl SpatialIndex for DefaultSpatialIndex {
         &self.inner.indexed_batches[batch_idx].batch
     }
 
-    /// This method implements [`SpatialIndex::query`], which is a two-phase 
spatial join:
-    /// 1. **Filter phase**: Uses the R-tree index with the probe geometry's 
bounding rectangle
-    ///    to quickly identify candidate geometries that might satisfy the 
spatial predicate
-    /// 2. **Refinement phase**: Evaluates the exact spatial predicate on 
candidates to determine
-    ///    actual matches
-    fn query(
-        &self,

Review Comment:
   This wasn't used (only `query_batch()` was), so I removed it



##########
rust/sedona-spatial-join/src/index/spatial_index_builder.rs:
##########
@@ -16,38 +16,25 @@
 // under the License.
 
 use datafusion_physical_plan::metrics::{self, ExecutionPlanMetricsSet, 
MetricBuilder};
-use sedona_common::SpatialJoinOptions;
 use sedona_expr::statistics::GeoStatistics;
-use std::sync::Arc;
 
+use 
crate::evaluated_batch::evaluated_batch_stream::SendableEvaluatedBatchStream;
 use crate::index::spatial_index::SpatialIndexRef;
-use crate::{
-    evaluated_batch::evaluated_batch_stream::SendableEvaluatedBatchStream,
-    spatial_predicate::SpatialPredicate,
-};
 use async_trait::async_trait;
 use datafusion_common::Result;
 
 /// Builder for constructing a SpatialIndex from geometry batches.
 #[async_trait]
-pub(crate) trait SpatialIndexBuilder {
-    /// Estimate the amount of memory required by the R-tree index and 
evaluating spatial predicates.
-    /// The estimated memory usage does not include the memory required for 
holding the build side
-    /// batches.
-    fn estimate_extra_memory_usage(
-        &self,
-        geo_stats: &GeoStatistics,
-        spatial_predicate: &SpatialPredicate,
-        options: &SpatialJoinOptions,
-    ) -> usize;

Review Comment:
   I moved this to the `SpatialJoinProvider` because we don't need to 
instantiate an actual ready-to-go builder just to estimate hypothetical memory 
usage



##########
rust/sedona-spatial-join/src/evaluated_batch.rs:
##########
@@ -55,19 +52,6 @@ impl EvaluatedBatch {
     pub fn num_rows(&self) -> usize {
         self.batch.num_rows()
     }
-
-    pub fn wkb(&self, idx: usize) -> Option<&Wkb<'_>> {
-        let wkbs = self.geom_array.wkbs();
-        wkbs[idx].as_ref()
-    }
-
-    pub fn rects(&self) -> &Vec<Option<Rect<f32>>> {
-        &self.geom_array.rects
-    }
-
-    pub fn distance(&self) -> &Option<ColumnarValue> {
-        &self.geom_array.distance
-    }

Review Comment:
   I moved these to the `EvaluatedGeometryArray` as part of the process of 
figuring out the exact responsibility of these and mediating field access of 
the evaluated array behind methods.



##########
rust/sedona-spatial-join/src/operand_evaluator.rs:
##########
@@ -73,32 +89,34 @@ pub(crate) trait OperandEvaluator: fmt::Debug + Send + Sync 
{
 /// Create a spatial predicate evaluator for the spatial predicate.
 pub(crate) fn create_operand_evaluator(
     predicate: &SpatialPredicate,
-    options: SpatialJoinOptions,
+    evaluated_array_factory: Arc<dyn EvaluatedGeometryArrayFactory>,
 ) -> Arc<dyn OperandEvaluator> {
     match predicate {
-        SpatialPredicate::Distance(predicate) => {
-            Arc::new(DistanceOperandEvaluator::new(predicate.clone(), options))
-        }
-        SpatialPredicate::Relation(predicate) => {
-            Arc::new(RelationOperandEvaluator::new(predicate.clone(), options))
-        }
-        SpatialPredicate::KNearestNeighbors(predicate) => {
-            Arc::new(KNNOperandEvaluator::new(predicate.clone()))
-        }
+        SpatialPredicate::Distance(predicate) => 
Arc::new(DistanceOperandEvaluator::new(
+            predicate.clone(),
+            evaluated_array_factory,
+        )),
+        SpatialPredicate::Relation(predicate) => 
Arc::new(RelationOperandEvaluator::new(
+            predicate.clone(),
+            evaluated_array_factory,
+        )),
+        SpatialPredicate::KNearestNeighbors(predicate) => 
Arc::new(KNNOperandEvaluator::new(
+            predicate.clone(),
+            evaluated_array_factory,
+        )),
     }
 }
 
 /// Result of evaluating a geometry batch.
 pub struct EvaluatedGeometryArray {
-    /// Type of geometry_array
-    pub sedona_type: SedonaType,
+    sedona_type: SedonaType,
     /// The array of geometries produced by evaluating the geometry expression.
-    pub geometry_array: ArrayRef,
+    geometry_array: ArrayRef,
     /// The rects of the geometries in the geometry array. The length of this 
array is equal to the number of geometries.
     /// The rects will be None for empty or null geometries.
-    pub rects: Vec<Option<Rect<f32>>>,
+    rects: Vec<Option<Rect<f32>>>,
     /// The distance value produced by evaluating the distance expression.
-    pub distance: Option<ColumnarValue>,
+    distance: Option<ColumnarValue>,

Review Comment:
   I made these all accessible by methods to better track places that these are 
fetched or set



##########
rust/sedona-spatial-join/src/operand_evaluator.rs:
##########
@@ -30,38 +31,53 @@ use sedona_geo_generic_alg::BoundingRect;
 use sedona_schema::datatypes::SedonaType;
 use wkb::reader::Wkb;
 
-use sedona_common::option::SpatialJoinOptions;
 use sedona_common::sedona_internal_err;
 
 use crate::{
     spatial_predicate::{DistancePredicate, KNNPredicate, RelationPredicate, 
SpatialPredicate},
     utils::arrow_utils::get_array_memory_size,
 };
 
+/// Factory for [EvaluatedGeometryArray] instances given an evaluated geometry 
column
+///
+/// Allows join providers to customize the eagerly evaluated representation of 
geometry.
+/// This is currently limited to a concrete internal representation but may 
expand to
+/// support non-Cartesian bounding or non-WKB backed geometry arrays. This 
also may
+/// expand to support more compact or efficient serialization/deserialization 
of the
+/// evaluated array when spilling.
+pub(crate) trait EvaluatedGeometryArrayFactory: fmt::Debug + Send + Sync {
+    /// Create a new [EvaluatedGeometryArray]
+    fn try_new_evaluated_array(
+        &self,
+        geometry_array: ArrayRef,
+        sedona_type: &SedonaType,
+    ) -> Result<EvaluatedGeometryArray>;
+}

Review Comment:
   I added this because it most precisely encapsulates what custom joins need 
to control. SpatialIndexBuilders and SpatialIndexes ultimately consume 
EvaluatedGeometryArrays that are created by other parts of the machinery. What 
exactly "eagerly evaluated" looks like depends on what the index will 
ultimately need.
   
   I think there are some improvements possible here by better encapsulating 
what questions are being asked of this object by non-SpatialIndex internals 
(e.g., update the spatial analyzer stats, repartition, serialize to record 
batch, deserialize from record batch) but that is a future project once we have 
a working Geography and GPU join with the current structure.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to