paleolimbot commented on code in PR #775:
URL: https://github.com/apache/sedona-db/pull/775#discussion_r3140602505


##########
rust/sedona-spatial-join-geography/src/join_provider.rs:
##########
@@ -0,0 +1,204 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow_array::{ArrayRef, Float64Array};
+use datafusion_common::{exec_datafusion_err, JoinType, Result, ScalarValue};
+use datafusion_physical_plan::ColumnarValue;
+use geo_index::rtree::util::f64_box_to_f32;
+use geo_types::{coord, Rect};
+use sedona_common::sedona_internal_err;
+use sedona_expr::statistics::GeoStatistics;
+use sedona_functions::executor::IterGeo;
+use sedona_s2geography::{
+    geography::{Geography, GeographyFactory},
+    rect_bounder::RectBounder,
+};
+use sedona_schema::datatypes::SedonaType;
+use sedona_spatial_join::{
+    index::{
+        default_spatial_index_builder::DefaultSpatialIndexBuilder,
+        spatial_index_builder::SpatialJoinBuildMetrics, SpatialIndexBuilder,
+    },
+    join_provider::SpatialJoinProvider,
+    operand_evaluator::{EvaluatedGeometryArray, EvaluatedGeometryArrayFactory},
+    SpatialJoinOptions, SpatialPredicate,
+};
+
+use crate::{
+    refiner::GeographyRefinerFactory, 
spatial_index_builder::GeographySpatialIndexBuilder,
+};
+
+#[derive(Debug)]
+pub(crate) struct GeographyJoinProvider;
+
+impl SpatialJoinProvider for GeographyJoinProvider {
+    fn try_new_spatial_index_builder(
+        &self,
+        schema: arrow_schema::SchemaRef,
+        spatial_predicate: SpatialPredicate,
+        options: SpatialJoinOptions,
+        join_type: JoinType,
+        probe_threads_count: usize,
+        metrics: SpatialJoinBuildMetrics,
+    ) -> Result<Box<dyn SpatialIndexBuilder>> {
+        // Create the inner (default) builder
+        let builder = GeographySpatialIndexBuilder::new(
+            schema,
+            spatial_predicate.clone(),
+            options,
+            join_type,
+            probe_threads_count,
+            metrics,
+        )?;
+
+        Ok(Box::new(builder))
+    }
+
+    fn estimate_extra_memory_usage(
+        &self,
+        geo_stats: &GeoStatistics,
+        spatial_predicate: &SpatialPredicate,
+        options: &SpatialJoinOptions,
+    ) -> usize {
+        DefaultSpatialIndexBuilder::estimate_extra_memory_usage(
+            geo_stats,
+            spatial_predicate,
+            options,
+            Arc::new(GeographyRefinerFactory),
+        )
+    }
+
+    fn evaluated_array_factory(&self) -> Arc<dyn 
EvaluatedGeometryArrayFactory> {
+        Arc::new(GeographyEvaluatedArrayFactory)
+    }
+}
+
+#[derive(Debug)]
+struct GeographyEvaluatedArrayFactory;
+
+impl EvaluatedGeometryArrayFactory for GeographyEvaluatedArrayFactory {
+    fn try_new_evaluated_array(
+        &self,
+        geometry_array: ArrayRef,
+        sedona_type: &SedonaType,
+        distance_columnar_value: Option<&ColumnarValue>,
+    ) -> Result<EvaluatedGeometryArray> {
+        // Without distance expansion use the impl without a bounder modifier
+        let Some(distance_columnar_value) = distance_columnar_value else {
+            return try_new_evaluated_array_impl(geometry_array, sedona_type, 
|_bounder| {});
+        };
+
+        let result = match distance_columnar_value {
+            ColumnarValue::Scalar(ScalarValue::Float64(Some(distance))) => {
+                try_new_evaluated_array_impl(geometry_array, sedona_type, 
|bounder| {
+                    bounder.expand_by_distance(*distance);
+                })
+            }
+            ColumnarValue::Scalar(ScalarValue::Float64(None)) => {
+                try_new_evaluated_array_impl(geometry_array, sedona_type, 
|bounder| {
+                    bounder.clear();
+                })
+            }
+            ColumnarValue::Array(array) => {
+                if let Some(array) = 
array.as_any().downcast_ref::<Float64Array>() {
+                    let mut distance_iter = array.iter();
+                    try_new_evaluated_array_impl(geometry_array, sedona_type, 
|bounder| {
+                        if let Some(next_distance) = distance_iter
+                            .next()
+                            .expect("distance array has correct length")
+                        {
+                            // Non null distance
+                            bounder.expand_by_distance(next_distance);
+                        } else {
+                            // Null distance
+                            bounder.clear();
+                        }
+                    })
+                } else {
+                    return sedona_internal_err!("Distance columnar value is 
not a Float64Array");
+                }
+            }
+            _ => {
+                return sedona_internal_err!("Distance columnar value is not a 
Float64");
+            }
+        }?;
+
+        // Store the distance value so it can be used during refinement
+        Ok(result.with_distance(Some(distance_columnar_value.clone())))
+    }
+}
+
+fn try_new_evaluated_array_impl(
+    geometry_array: ArrayRef,
+    sedona_type: &SedonaType,
+    mut modify_bounder: impl FnMut(&mut RectBounder),
+) -> Result<EvaluatedGeometryArray> {
+    // Compute rectangles from wkb using the RectBounder from s2geography
+    let num_rows = geometry_array.len();
+    let mut bounder = RectBounder::new();
+    let mut factory = GeographyFactory::new();
+    let mut geog = Geography::new();
+    let mut rect_vec = Vec::with_capacity(num_rows);
+
+    geometry_array.iter_as_wkb_bytes(sedona_type, num_rows, |wkb_opt| {
+        if let Some(wkb) = wkb_opt {
+            bounder.clear();
+            factory.init_from_wkb(wkb, &mut geog).map_err(|e| {
+                exec_datafusion_err!("Failed to read WKB in evaluated array 
factory: {e}")
+            })?;
+            bounder.bound(&geog).map_err(|e| {
+                exec_datafusion_err!("Failed to bound geography in evaluated 
array factory: {e}")
+            })?;
+
+            // Account for distance (either by expanding the bounds or by 
emptying them for a null
+            // distance).
+            modify_bounder(&mut bounder);
+
+            let maybe_bounds = bounder.finish().map_err(|e| {
+                exec_datafusion_err!("Failed to finish bounding in evaluated 
array factory: {e}")
+            })?;
+            if let Some((mut min_x, min_y, mut max_x, max_y)) = maybe_bounds {
+                // The evaluated geometry array currently needs Cartesian 
rectangles; however
+                // we can still recalculate these when we ingest into the 
index. In the
+                // partitioned join we may want to ensure we can express 
bounds in a way that
+                // the partitioner understands (if it doesn't already) to do a 
better job
+                // partitioning wraparounds.
+                if min_x > max_x {

Review Comment:
   I added https://github.com/apache/sedona-db/issues/782 for this one...we 
basically need to replace the `Rect` in the evaluated geometry array with 
something that explicitly handles wraparound to make sure all of those usages 
don't do something invalid. But yes, for now, this is making the full longitude 
range from a wraparound to be defensive.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to