Copilot commented on code in PR #799: URL: https://github.com/apache/sedona-db/pull/799#discussion_r3158619608
########## rust/sedona-spatial-join/src/utils/bounds.rs: ########## @@ -0,0 +1,277 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use float_next_after::NextAfter; +use sedona_geometry::{ + bounding_box::BoundingBox, + interval::{Interval, IntervalTrait, WraparoundInterval}, +}; + +/// A float32 bounding box with wraparound support +/// +/// This struct is conceptually similar to the Rect<f32> but explicitly supports +/// wraparound x intervals to ensure raw xmin and xmax values are not misused. +#[derive(Debug, Clone, PartialEq)] +pub struct Bounds2D { + x: (f32, f32), + y: (f32, f32), +} + +impl Bounds2D { + /// Create a new empty rectangle + pub fn empty() -> Self { + Self::new(WraparoundInterval::empty(), Interval::empty()) + } + + /// Returns true if this rectangle is empty + pub fn is_empty(&self) -> bool { + self.x().is_empty() && self.y().is_empty() Review Comment: `Bounds2D::is_empty()` currently returns true only when *both* x and y intervals are empty. A 2D bounds should be considered empty if either dimension is empty; otherwise you can end up treating a bounds with an empty x (or y) interval as non-empty (e.g., after `split()`/intersection), which can lead to invalid index inserts/queries and partitioning decisions. Consider changing this to use `||` (or otherwise ensure `Bounds2D::new` normalizes any partially-empty inputs to fully-empty). ```suggestion self.x().is_empty() || self.y().is_empty() ``` ########## rust/sedona-spatial-join/src/index/default_spatial_index_builder.rs: ########## @@ -142,25 +164,41 @@ impl DefaultSpatialIndexBuilder { fn build_rtree(&mut self) -> Result<RTreeBuildResult> { let build_timer = self.metrics.build_time.timer(); + // Each item will add 0 (empty), 1 (regular) or 2 (wraparound) + // rectangles to the index. let num_rects = self .indexed_batches .iter() - .map(|batch| batch.geom_array.rects().iter().flatten().count()) - .sum::<usize>(); + .flat_map(|batch| batch.geom_array.rects().iter()) + .map(|rect| { + if rect.is_empty() { + 0 + } else if rect.is_wraparound() { + 2 + } else { + 1 + } + }) + .sum(); let mut rtree_builder = RTreeBuilder::<f32>::new(num_rects as u32); let mut batch_pos_vec = vec![(-1, -1); num_rects]; for (batch_idx, batch) in self.indexed_batches.iter().enumerate() { let rects = batch.geom_array.rects(); - for (idx, rect_opt) in rects.iter().enumerate() { - let Some(rect) = rect_opt else { - continue; - }; - let min = rect.min(); - let max = rect.max(); - let data_idx = rtree_builder.add(min.x, min.y, max.x, max.y); - batch_pos_vec[data_idx as usize] = (batch_idx as i32, idx as i32); + for (idx, rect) in rects.iter().enumerate() { + let (left, right) = rect.split(&self.wraparound.unwrap_or(Interval::empty())); + if !left.is_empty() { Review Comment: `build_rtree()` uses `rect.split(&self.wraparound.unwrap_or(Interval::empty()))`. If a wraparound bounds is encountered while `self.wraparound` is `None`, this will silently split against an empty interval and drop the geometry from the index (false negatives). Since the docs say this configuration is invalid, it would be safer to detect `rect.is_wraparound() && self.wraparound.is_none()` and return an explicit error early. ########## rust/sedona-spatial-join/src/partitioning/stream_repartitioner.rs: ########## @@ -601,26 +600,27 @@ pub(crate) fn assign_rows( PartitionedSide::BuildSide => { let mut cnt = 0; let num_regular_partitions = partitioner.num_regular_partitions() as u32; - for rect_opt in batch.geom_array.rects() { - let partition = match rect_opt { - Some(rect) => partitioner.partition_no_multi(&geo_rect_to_bbox(rect))?, - None => { - // Round-robin empty geometries through regular partitions to avoid - // overloading a single slot when the build side is mostly empty. - let p = SpatialPartition::Regular(cnt); - cnt = (cnt + 1) % num_regular_partitions; - p - } + for rect in batch.geom_array.rects() { + let partition = if rect.is_empty() { + // Round-robin empty geometries through regular partitions to avoid + // overloading a single slot when the build side is mostly empty. + let p = SpatialPartition::Regular(cnt); + cnt = (cnt + 1) % num_regular_partitions; + p + } else { + partitioner.partition_no_multi(&BoundingBox::xy(rect.x(), rect.y()))? }; assignments.push(partition); } } PartitionedSide::ProbeSide => { - for rect_opt in batch.geom_array.rects() { - let partition = match rect_opt { - Some(rect) => partitioner.partition(&geo_rect_to_bbox(rect))?, - None => SpatialPartition::None, + for rect in batch.geom_array.rects() { + let partition = if rect.is_empty() { + SpatialPartition::None + } else { + partitioner.partition(&BoundingBox::xy(rect.x(), rect.y()))? }; Review Comment: `assign_rows()` now passes `BoundingBox::xy(rect.x(), rect.y())` directly into the partitioner. For wraparound x-intervals this will currently error in the partitioning utilities (e.g., `bbox_to_f32_rect` returns an Execution error when `bbox.x().is_wraparound()`). This means repartitioning/out-of-core paths can fail for geography antimeridian-crossing geometries. Consider making `assign_rows()` wraparound-aware (e.g., split bounds using the configured absolute wraparound interval and union the partition results / force `Multi`), or ensure geography joins avoid partitioners that require non-wraparound rectangles until wraparound partitioning is implemented. ########## rust/sedona-spatial-join/src/index/default_spatial_index.rs: ########## @@ -519,13 +532,26 @@ impl SpatialIndex for DefaultSpatialIndex { let mut current_row_idx = range.start; for row_idx in range { current_row_idx = row_idx; - let Some(probe_rect) = rects[row_idx] else { + let probe_rect = &rects[row_idx]; + if probe_rect.is_empty() { continue; + } + + let mut candidates = if let Some(wraparound) = &self.inner.wraparound { + let (left, right) = probe_rect.split(wraparound); + let (x, y) = left.into_inner(); + let mut candidates = self.inner.rtree.search(x.0, y.0, x.1, y.1); + if !right.is_empty() { + let (x, y) = right.into_inner(); + candidates.extend(self.inner.rtree.search(x.0, y.0, x.1, y.1)); + } Review Comment: `query_batch()` always performs an R-tree search using the `left` bounds returned by `probe_rect.split(wraparound)` even if `left` is empty. If `wraparound` is misconfigured or the split produces an empty half, this can result in searches with nonsensical extents (e.g., INF/-INF) and potentially incorrect results or panics in the underlying index. Guard the left search with `if !left.is_empty()` (mirroring the builder’s insertion logic) and return an empty candidate set if both halves are empty. ########## c/sedona-libgpuspatial/src/lib.rs: ########## @@ -113,9 +112,10 @@ mod sys { /// Inserts a batch of bounding boxes into the index. /// Each rectangle is represented as a `Rect<f32>` with minimum and maximum x and y coordinates. /// This method accumulates these rectangles until `finish_building` is called to finalize the index. - /// The method can be called multiple times to insert data in batches before finalizing. - pub fn push_build(&mut self, rects: &[Rect<f32>]) -> Result<()> { - // Re-interpreting Rect<f32> as flat f32 array (xmin, ymin, xmax, ymax) + /// The method can be called multiple times to insert data in batches before finalizing. The values + /// in rects are ordered (xmin, ymin, xmax, ymax). + pub fn push_build(&mut self, rects: &[(f32, f32, f32, f32)]) -> Result<()> { + // Re-interpreting rects as a flat f32 array (xmin, ymin, xmax, ymax) let raw_ptr = rects.as_ptr() as *const f32; self.inner.push_build(raw_ptr, rects.len() as u32) } Review Comment: `push_build()`/`probe()` cast `&[(f32, f32, f32, f32)]` to `*const f32` and pass it to the CUDA layer as a flat f32 array. Rust does not guarantee the memory layout of tuples for FFI / raw reinterpretation, so this cast can be undefined behavior. To make this sound, consider introducing a `#[repr(C)]` struct (e.g., `struct RawRect { xmin: f32, ymin: f32, xmax: f32, ymax: f32 }`) or using a proven-bytemuck `Pod` type for the buffer, and update the doc comment (it still mentions `Rect<f32>`). ########## rust/sedona-spatial-join/src/index/default_spatial_index.rs: ########## @@ -69,6 +70,15 @@ struct DefaultSpatialIndexInner { /// to get the prepared geometries array index. pub(crate) rtree: RTree<f32>, + /// Absolute bounds of the coordinate system + /// + /// If this value should be `None` if each input feature has exactly one item in the tree and + /// each probe input is guaranteed to have non-wraparound bounds. This is the case when using + /// non-wraparound bounds like those computed by the default evaluated geometry array + /// constructor. If this is some, it must contain the bounds of any rectangle in the tree. This + /// is needed to split probe rectangles into finite queries when querying the tree. Review Comment: Doc comment grammar: “If this value should be `None` if …” reads like a duplicated conditional and is confusing. Consider rewording to “This value should be `None` if …” and clarifying the invariants for `Some(_)` vs `None`. ```suggestion /// This value should be `None` when each input feature has exactly one item in the tree and /// each probe input is guaranteed to have non-wraparound bounds. This is the case when using /// non-wraparound bounds like those computed by the default evaluated geometry array /// constructor. If this is `Some(bounds)`, `bounds` must contain the bounds of every /// rectangle in the tree. These bounds are needed to split probe rectangles into finite /// queries when querying the tree. ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
