paleolimbot commented on code in PR #799:
URL: https://github.com/apache/sedona-db/pull/799#discussion_r3158518280
##########
rust/sedona-spatial-join-geography/tests/spatial_join_integration.rs:
##########
Review Comment:
These tests were copied by via LLM from the sedona-spatial-join crate,
taking the random geometry parameters from the Python tests. I needed this to
debug a deduplication issue but it is good to have regardless.
##########
rust/sedona-spatial-join-geography/src/join_provider.rs:
##########
@@ -177,28 +176,16 @@ fn try_new_evaluated_array_impl(
let maybe_bounds = bounder.finish().map_err(|e| {
exec_datafusion_err!("Failed to finish bounding in evaluated
array factory: {e}")
})?;
- if let Some((mut min_x, min_y, mut max_x, max_y)) = maybe_bounds {
- // The evaluated geometry array currently needs Cartesian
rectangles; however
- // we can still recalculate these when we ingest into the
index. In the
- // partitioned join we may want to ensure we can express
bounds in a way that
- // the partitioner understands (if it doesn't already) to do a
better job
- // partitioning wraparounds.
- if min_x > max_x {
- min_x = -180.0;
- max_x = 180.0;
- }
-
- let (min_x, min_y, max_x, max_y) = f64_box_to_f32(min_x,
min_y, max_x, max_y);
- let rect = Rect::new(coord!(x: min_x, y: min_y), coord!(x:
max_x, y: max_y));
- rect_vec.push(Some(rect));
+ if let Some((min_x, min_y, max_x, max_y)) = maybe_bounds {
+ rect_vec.push(Bounds2D::new((min_x, max_x), (min_y, max_y)));
Review Comment:
This is the main point of the PR...eliminate the expansion of potentially
very small feature bounds into the entire width of the earth.
##########
rust/sedona-spatial-join/src/index/default_spatial_index.rs:
##########
@@ -538,9 +564,21 @@ impl SpatialIndex for DefaultSpatialIndex {
};
// Sort and dedup candidates to avoid duplicate results when we
index one geometry
- // using several boxes.
- candidates.sort_unstable();
- candidates.dedup();
+ // using several boxes (e.g., for antimeridian-crossing
geometries).
+ // First dedup by data_idx (fast), then dedup by position (handles
wraparound case).
+ if self.inner.wraparound.is_some() {
+ candidates.sort_unstable();
+ candidates.dedup();
+
+ // Dedup by position: when a geometry spans the antimeridian,
it may be indexed
+ // as two separate boxes with different data_idx values that
map to the same position.
+ let mut seen_positions: std::collections::HashSet<(i32, i32)> =
+ std::collections::HashSet::new();
+ candidates.retain(|data_idx| {
+ let pos = self.inner.data_id_to_batch_pos[*data_idx as
usize];
+ seen_positions.insert(pos)
+ });
+ }
Review Comment:
...and when we query it we have to deduplicate the results. The
deduplication happened already before this PR but the DWithin tests exposed
that it didn't handle the multiple source rectangles properly.
We may want to solve this performance issue before merging this PR (a
`HashSet` is almost certainly slowing us down here).
##########
rust/sedona-spatial-join/src/utils/bounds.rs:
##########
@@ -0,0 +1,277 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use float_next_after::NextAfter;
+use sedona_geometry::{
+ bounding_box::BoundingBox,
+ interval::{Interval, IntervalTrait, WraparoundInterval},
+};
+
+/// A float32 bounding box with wraparound support
+///
+/// This struct is conceptually similar to the Rect<f32> but explicitly
supports
+/// wraparound x intervals to ensure raw xmin and xmax values are not misused.
+#[derive(Debug, Clone, PartialEq)]
+pub struct Bounds2D {
+ x: (f32, f32),
+ y: (f32, f32),
+}
Review Comment:
This is the main utility supporting the EvaluatedGeometryArray. I played
with making the `IntervalTrait` generic but it was a bit of a pain and I was
keen to keep this scoped.
##########
rust/sedona-spatial-join/src/index/default_spatial_index_builder.rs:
##########
@@ -142,25 +164,41 @@ impl DefaultSpatialIndexBuilder {
fn build_rtree(&mut self) -> Result<RTreeBuildResult> {
let build_timer = self.metrics.build_time.timer();
+ // Each item will add 0 (empty), 1 (regular) or 2 (wraparound)
+ // rectangles to the index.
let num_rects = self
.indexed_batches
.iter()
- .map(|batch| batch.geom_array.rects().iter().flatten().count())
- .sum::<usize>();
+ .flat_map(|batch| batch.geom_array.rects().iter())
+ .map(|rect| {
+ if rect.is_empty() {
+ 0
+ } else if rect.is_wraparound() {
+ 2
+ } else {
+ 1
+ }
+ })
+ .sum();
let mut rtree_builder = RTreeBuilder::<f32>::new(num_rects as u32);
let mut batch_pos_vec = vec![(-1, -1); num_rects];
for (batch_idx, batch) in self.indexed_batches.iter().enumerate() {
let rects = batch.geom_array.rects();
- for (idx, rect_opt) in rects.iter().enumerate() {
- let Some(rect) = rect_opt else {
- continue;
- };
- let min = rect.min();
- let max = rect.max();
- let data_idx = rtree_builder.add(min.x, min.y, max.x, max.y);
- batch_pos_vec[data_idx as usize] = (batch_idx as i32, idx as
i32);
+ for (idx, rect) in rects.iter().enumerate() {
+ let (left, right) =
rect.split(&self.wraparound.unwrap_or(Interval::empty()));
+ if !left.is_empty() {
+ let (x, y) = left.into_inner();
+ let data_idx = rtree_builder.add(x.0, y.0, x.1, y.1);
+ batch_pos_vec[data_idx as usize] = (batch_idx as i32, idx
as i32);
+ }
+
+ if !right.is_empty() {
+ let (x, y) = right.into_inner();
+ let data_idx = rtree_builder.add(x.0, y.0, x.1, y.1);
+ batch_pos_vec[data_idx as usize] = (batch_idx as i32, idx
as i32);
+ }
}
Review Comment:
This is the main change in the spatial index: we can now insert two
rectangles into the index instead of one.
##########
c/sedona-libgpuspatial/src/lib.rs:
##########
@@ -113,9 +112,10 @@ mod sys {
/// Inserts a batch of bounding boxes into the index.
/// Each rectangle is represented as a `Rect<f32>` with minimum and
maximum x and y coordinates.
/// This method accumulates these rectangles until `finish_building`
is called to finalize the index.
- /// The method can be called multiple times to insert data in batches
before finalizing.
- pub fn push_build(&mut self, rects: &[Rect<f32>]) -> Result<()> {
- // Re-interpreting Rect<f32> as flat f32 array (xmin, ymin, xmax,
ymax)
+ /// The method can be called multiple times to insert data in batches
before finalizing. The values
+ /// in rects are ordered (xmin, ymin, xmax, ymax).
+ pub fn push_build(&mut self, rects: &[(f32, f32, f32, f32)]) ->
Result<()> {
+ // Re-interpreting rects as a flat f32 array (xmin, ymin, xmax,
ymax)
Review Comment:
cc @pwrliang to make sure I didn't muck this up...the evaluated geometry
array is no longer using Rect so I updated this to just use f32s. I don't think
the GPU tests actually run here but this does build.
##########
rust/sedona-spatial-join-gpu/src/index/gpu_spatial_index_builder.rs:
##########
@@ -212,11 +208,12 @@ impl SpatialIndexBuilder for GPUSpatialIndexBuilder {
for (batch_idx, batch) in self.indexed_batches.iter().enumerate() {
let rects = batch.geom_array.rects();
- for (idx, rect_opt) in rects.iter().enumerate() {
- if let Some(rect) = rect_opt {
- native_rects.push(*rect);
- } else {
+ for (idx, rect) in rects.iter().enumerate() {
+ if rect.is_empty() {
native_rects.push(empty_rect);
+ } else {
+ let (x, y) = rect.clone().into_inner();
+ native_rects.push((x.0, y.0, x.1, y.1));
Review Comment:
cc @pwrliang...did I reinterpret this correctly?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]