paleolimbot commented on code in PR #708:
URL: https://github.com/apache/sedona-db/pull/708#discussion_r3102191729


##########
c/sedona-s2geography/benches/s2geography-functions.rs:
##########
@@ -76,6 +76,94 @@ fn criterion_benchmark(c: &mut Criterion) {
         ),
     );
 
+    benchmark::scalar(
+        c,
+        &f,
+        "s2geography",
+        "st_distance",
+        BenchmarkArgs::ArrayScalar(
+            Transformed(Point.into(), to_geography()),
+            Transformed(Polygon(10).into(), to_geography()),
+        ),
+    );

Review Comment:
   These are the benchmarks I used to track performance. They aren't perfect 
but they did a good job guiding some of the choices and ensured that none of 
the reorganization that happened slowed things down. Mostly things became 
faster (particularly for the array/scalar cases).



##########
c/sedona-s2geography/src/geography.rs:
##########
@@ -0,0 +1,189 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::marker::PhantomData;
+use std::ptr;
+
+use crate::s2geog_call;
+use crate::s2geog_check;
+use crate::s2geography_c_bindgen::*;
+use crate::utils::S2GeogCError;
+
+/// Safe wrapper around an unbound S2Geog geography object
+///
+/// The Geography is a wrapper around a traditional geometry optmizied for
+/// a spherical interpretation of edges using the S2 geometry library. The
+/// geography caches some internal scratch space and may be reused when
+/// looping over many geographies derived from the same lifetime.
+///
+/// Some Geography objects are tied to the lifetime of their input (e.g.,
+/// when reading WKB); however, others own their coordinates (e.g., when
+/// reading WKT).
+///
+/// Geographies are thread safe. The internal index is built on the fly when
+/// required in a thread safe way (although it may result in faster evaluation
+/// to force a build early in a situation where it is known that a lot of
+/// evaluations are about to happen).
+pub struct Geography<'a> {

Review Comment:
   This is an addition made to the s2geography library via a new C API that was 
required for joins. We need a tg-like object that can be prepared and that 
prepares itself on the fly in a thread safe way.



##########
c/sedona-s2geography/CMakeLists.txt:
##########
@@ -27,134 +27,29 @@ set(CMAKE_CXX_STANDARD 17)
 find_package(OpenSSL REQUIRED)
 find_package(absl REQUIRED)
 
-# Use FetchContent to build nanoarrow and geoarrow, being careful to namespace
-# symbols so they don't conflict with other nanoarrows that might be statically
-# linked into the same executable (even though we're careful to align versions
-# within this workspace, we don't have control of anything outside that stack)
-if(POLICY CMP0135)
-  cmake_policy(SET CMP0135 NEW)
-endif()
-
-# S2's CMake is pretty awful so we just build the library ourselves. We pin the
-# version of s2geometry for predictability (although it is also available on
-# vcpkg and homebrew).
-add_library(s2
-            s2geometry/src/s2/encoded_s2cell_id_vector.cc
-            s2geometry/src/s2/encoded_s2point_vector.cc
-            s2geometry/src/s2/encoded_s2shape_index.cc
-            s2geometry/src/s2/encoded_string_vector.cc
-            s2geometry/src/s2/id_set_lexicon.cc
-            s2geometry/src/s2/mutable_s2shape_index.cc
-            s2geometry/src/s2/r2rect.cc
-            s2geometry/src/s2/s1angle.cc
-            s2geometry/src/s2/s1chord_angle.cc
-            s2geometry/src/s2/s1interval.cc
-            s2geometry/src/s2/s2boolean_operation.cc
-            s2geometry/src/s2/s2buffer_operation.cc
-            s2geometry/src/s2/s2builder.cc
-            s2geometry/src/s2/s2builder_graph.cc
-            s2geometry/src/s2/s2builderutil_closed_set_normalizer.cc
-            s2geometry/src/s2/s2builderutil_find_polygon_degeneracies.cc
-            s2geometry/src/s2/s2builderutil_get_snapped_winding_delta.cc
-            s2geometry/src/s2/s2builderutil_lax_polygon_layer.cc
-            s2geometry/src/s2/s2builderutil_lax_polyline_layer.cc
-            s2geometry/src/s2/s2builderutil_s2point_vector_layer.cc
-            s2geometry/src/s2/s2builderutil_s2polygon_layer.cc
-            s2geometry/src/s2/s2builderutil_s2polyline_layer.cc
-            s2geometry/src/s2/s2builderutil_s2polyline_vector_layer.cc
-            s2geometry/src/s2/s2builderutil_snap_functions.cc
-            s2geometry/src/s2/s2cap.cc
-            s2geometry/src/s2/s2cell.cc
-            s2geometry/src/s2/s2cell_id.cc
-            s2geometry/src/s2/s2cell_index.cc
-            s2geometry/src/s2/s2cell_union.cc
-            s2geometry/src/s2/s2centroids.cc
-            s2geometry/src/s2/s2closest_cell_query.cc
-            s2geometry/src/s2/s2closest_edge_query.cc
-            s2geometry/src/s2/s2closest_point_query.cc
-            s2geometry/src/s2/s2contains_vertex_query.cc
-            s2geometry/src/s2/s2convex_hull_query.cc
-            s2geometry/src/s2/s2coords.cc
-            s2geometry/src/s2/s2crossing_edge_query.cc
-            s2geometry/src/s2/s2debug.cc
-            s2geometry/src/s2/s2earth.cc
-            s2geometry/src/s2/s2edge_clipping.cc
-            s2geometry/src/s2/s2edge_crosser.cc
-            s2geometry/src/s2/s2edge_crossings.cc
-            s2geometry/src/s2/s2edge_distances.cc
-            s2geometry/src/s2/s2edge_tessellator.cc
-            s2geometry/src/s2/s2error.cc
-            s2geometry/src/s2/s2furthest_edge_query.cc
-            s2geometry/src/s2/s2hausdorff_distance_query.cc
-            s2geometry/src/s2/s2latlng.cc
-            s2geometry/src/s2/s2latlng_rect.cc
-            s2geometry/src/s2/s2latlng_rect_bounder.cc
-            s2geometry/src/s2/s2lax_loop_shape.cc
-            s2geometry/src/s2/s2lax_polygon_shape.cc
-            s2geometry/src/s2/s2lax_polyline_shape.cc
-            s2geometry/src/s2/s2loop.cc
-            s2geometry/src/s2/s2loop_measures.cc
-            s2geometry/src/s2/s2measures.cc
-            s2geometry/src/s2/s2memory_tracker.cc
-            s2geometry/src/s2/s2metrics.cc
-            s2geometry/src/s2/s2max_distance_targets.cc
-            s2geometry/src/s2/s2min_distance_targets.cc
-            s2geometry/src/s2/s2padded_cell.cc
-            s2geometry/src/s2/s2point_compression.cc
-            s2geometry/src/s2/s2point_region.cc
-            s2geometry/src/s2/s2pointutil.cc
-            s2geometry/src/s2/s2polygon.cc
-            s2geometry/src/s2/s2polyline.cc
-            s2geometry/src/s2/s2polyline_alignment.cc
-            s2geometry/src/s2/s2polyline_measures.cc
-            s2geometry/src/s2/s2polyline_simplifier.cc
-            s2geometry/src/s2/s2predicates.cc
-            s2geometry/src/s2/s2projections.cc
-            s2geometry/src/s2/s2r2rect.cc
-            s2geometry/src/s2/s2region.cc
-            s2geometry/src/s2/s2region_term_indexer.cc
-            s2geometry/src/s2/s2region_coverer.cc
-            s2geometry/src/s2/s2region_intersection.cc
-            s2geometry/src/s2/s2region_union.cc
-            s2geometry/src/s2/s2shape_index.cc
-            s2geometry/src/s2/s2shape_index_buffered_region.cc
-            s2geometry/src/s2/s2shape_index_measures.cc
-            s2geometry/src/s2/s2shape_measures.cc
-            s2geometry/src/s2/s2shape_nesting_query.cc
-            s2geometry/src/s2/s2shapeutil_build_polygon_boundaries.cc
-            s2geometry/src/s2/s2shapeutil_coding.cc
-            s2geometry/src/s2/s2shapeutil_contains_brute_force.cc
-            s2geometry/src/s2/s2shapeutil_conversion.cc
-            s2geometry/src/s2/s2shapeutil_edge_iterator.cc
-            s2geometry/src/s2/s2shapeutil_get_reference_point.cc
-            s2geometry/src/s2/s2shapeutil_visit_crossing_edge_pairs.cc
-            s2geometry/src/s2/s2text_format.cc
-            s2geometry/src/s2/s2wedge_relations.cc
-            s2geometry/src/s2/s2winding_operation.cc
-            s2geometry/src/s2/util/bits/bit-interleave.cc
-            s2geometry/src/s2/util/coding/coder.cc
-            s2geometry/src/s2/util/coding/varint.cc
-            s2geometry/src/s2/util/math/exactfloat/exactfloat.cc
-            s2geometry/src/s2/util/math/mathutil.cc
-            s2geometry/src/s2/util/units/length-units.cc)
+# Build s2geometry using its own CMake (v0.12.0 has proper add_subdirectory 
support).
+set(BUILD_SHARED_LIBS
+    OFF
+    CACHE BOOL "" FORCE)
+set(BUILD_TESTS
+    OFF
+    CACHE BOOL "" FORCE)
+set(BUILD_EXAMPLES
+    OFF
+    CACHE BOOL "" FORCE)
+add_subdirectory(s2geometry)

Review Comment:
   This is a structural change...before we had hard-coded a bunch of .cc files 
because the version of s2geometry we had in the submodule didn't support 
`add_subdirectory()`. I updated s2geometry to the latest version as part of 
this change and so now we can simplify this considerably.



##########
rust/sedona-testing/src/testers.rs:
##########
@@ -542,13 +542,13 @@ impl ScalarUdfTester {
     }
 
     fn invoke_scalar_arrays(&self, arg: impl Literal, arrays: Vec<ArrayRef>) 
-> Result<ArrayRef> {
-        let mut args = zip(arrays, &self.arg_types)
+        let scalar_arg = Self::scalar_arg(arg, &self.arg_types[0])?;
+        let mut args = zip(arrays, &self.arg_types[1..])

Review Comment:
   This fixes a bug discovered when running one of the benchmarks (which use 
this tester)



##########
c/sedona-s2geography/src/kernels.rs:
##########
@@ -0,0 +1,398 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use datafusion_common::Result;
+use sedona_common::sedona_internal_datafusion_err;
+use sedona_expr::scalar_udf::ScalarKernelRef;
+use sedona_extension::{extension::SedonaCScalarKernel, 
scalar_kernel::ImportedScalarKernel};
+
+use crate::{s2geog_check, s2geography_c_bindgen::*};
+
+pub fn s2_scalar_kernels() -> Result<Vec<(String, ScalarKernelRef)>> {
+    let mut ffi_scalar_kernels = Vec::<SedonaCScalarKernel>::new();
+    ffi_scalar_kernels.resize_with(unsafe { S2GeogNumKernels() }, 
Default::default);

Review Comment:
   This part didn't change, it just got split up because the C API is big 
enough that it was slightly nicer to split up the safe wrappers to different 
files



##########
c/sedona-s2geography/src/operator.rs:
##########
@@ -0,0 +1,236 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::ffi::c_int;
+use std::ptr;
+
+use crate::geography::Geography;
+use crate::s2geog_call;
+use crate::s2geog_check;
+use crate::s2geography_c_bindgen::*;
+use crate::utils::S2GeogCError;
+
+/// S2Geography operator types
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[non_exhaustive]
+#[repr(i32)]
+pub enum OpType {
+    /// Tests whether two geometries intersect
+    Intersects = S2GEOGRAPHY_OP_INTERSECTS,
+    /// Tests whether the first geometry contains the second
+    Contains = S2GEOGRAPHY_OP_CONTAINS,
+    /// Tests whether the first geometry is within the second
+    Within = S2GEOGRAPHY_OP_WITHIN,
+    /// Tests whether two geometries are equal
+    Equals = S2GEOGRAPHY_OP_EQUALS,
+    /// Tests whether two geometries are within some distance
+    DWithin = S2GEOGRAPHY_OP_DISTANCE_WITHIN,
+}
+
+impl OpType {
+    fn as_c_int(self) -> c_int {
+        self as c_int
+    }
+}
+
+/// S2Geography output types
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[non_exhaustive]
+#[repr(i32)]
+pub enum OutputType {
+    Unknown = 0,
+    Bool = S2GEOGRAPHY_OUTPUT_TYPE_BOOL,
+}
+
+/// Safe wrapper around S2GeogOp for evaluating spatial operations
+///
+/// This struct represents an operator that can be applied
+/// to various combinations of arguments. If possible one
+/// should use the kernels, which have lower overhead and operate more
+/// directly on the input when it arrives as Arrow batches.
+///
+/// Operations are an abstraction that help amortize overhead when evaluating
+/// the same operation many times and reduce the API surface of the s2geography
+/// C API when the Arrow interface is not sufficient.
+pub struct Op {
+    ptr: *mut S2GeogOp,
+}

Review Comment:
   This is needed because we need to evaluate the predicates directly from 
inside the join without the scalar functions



##########
c/sedona-s2geography/tests/mod.rs:
##########


Review Comment:
   These tests were just moved back to an inline `mod test { ... }`



##########
c/sedona-s2geography/src/rect_bounder.rs:
##########
@@ -0,0 +1,145 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::ptr;
+
+use crate::geography::Geography;
+use crate::s2geog_call;
+use crate::s2geog_check;
+use crate::s2geography_c_bindgen::*;
+use crate::utils::S2GeogCError;
+
+/// Safe wrapper around S2GeogRectBounder for computing bounding rectangles
+///
+/// This struct accumulates bounds from multiple geographies and can compute
+/// the minimum bounding rectangle that contains all of them.
+pub struct RectBounder {
+    ptr: *mut S2GeogRectBounder,
+}

Review Comment:
   We also need this for joins and also for pruning (to calculate the bounds of 
a ScalarValue for the geography type). Computing rectangle bounds is 
surprisingly complicated!



##########
c/sedona-s2geography/CMakeLists.txt:
##########
@@ -178,38 +73,19 @@ set(ABSL_LIBRARIES
     absl::str_format
     absl::strings
     absl::type_traits
-    absl::utility)
-
-target_include_directories(s2
-                           PUBLIC 
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/s2geometry/src>
-)
-target_link_libraries(s2 ${OPENSSL_LIBRARIES} ${ABSL_LIBRARIES} 
${CMAKE_THREAD_LIBS_INIT})
+    absl::utility
+    absl::vlog_is_on)
 
-install(TARGETS s2
-        EXPORT "s2geography-targets"
-        RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
-        ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
-        LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}")
-
-# Build s2geography
+# Build s2geography (needs the s2::s2 alias and version info)
 add_library(s2::s2 ALIAS s2)
 set(S2_VERSION_MAJOR 0)
-set(S2_VERSION_MINOR 11)
+set(S2_VERSION_MINOR 13)
 set(S2_VERSION_PATCH 1)
-set(BUILD_SHARED_LIBS OFF)
 add_subdirectory(s2geography)
 
-# Add our glue library
-add_library(geography_glue src/geography_glue.cc)
-target_link_libraries(geography_glue
-                      s2geography

Review Comment:
   I moved the C API into s2geography itself because there were better tools to 
test it/leak check it there, so we no longer need to build our own glue library 
from the C++ side



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to