Copilot commented on code in PR #814:
URL: https://github.com/apache/sedona-db/pull/814#discussion_r3189224273


##########
r/sedonadb/R/dataframe-join.R:
##########
@@ -0,0 +1,170 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Join two SedonaDB DataFrames
+#'
+#' Perform a join operation between two dataframes. Use [sd_join_by()] to
+#' specify join conditions using `x$column` and `y$column` syntax to
+#' reference columns from the left and right tables respectively.
+#'
+#' @param x The left dataframe
+#' @param y The right dataframe
+#' @param by Join specification. One of:
+#'   - A `sedonadb_join_by` object from [sd_join_by()]
+#'   - A `sedonadb_join_by` object from [sd_join_intersects()],
+#'     [sd_join_contains()], [sd_join_within()], [sd_join_covers()],
+#'     [sd_join_coveredby()], [sd_join_touches()], [sd_join_crosses()],
+#'     [sd_join_overlaps()], or [sd_join_equals()].
+#'   - A character vector of column names to join on in both tables
+#'   - A named character vector mapping left-table column names to
+#'     right-table column names, e.g. `c(x_val = "y_val")`
+#'   - `NULL` for a natural join on columns with matching names
+#' @param join_type The type of join to perform. One of "inner", "left", 
"right",
+#'   "full", "leftsemi", "rightsemi", "leftanti", "rightanti", "leftmark",
+#'   or "rightmark".
+#' @param select Post-join column selection. One of
+#'   - `NULL` for no modification, which may result in duplicate (unqualified)
+#'     column names. The column may still be
+#'     referred to with a qualifier in advanced usage using [sd_expr_column()].
+#'   - [sd_join_select_default()] for dplyr-like behaviour (equi-join keys
+#'     removed, intersecting names suffixed)
+#'   - [sd_join_select()] for a custom selection
+#' @param keep Use `TRUE` to keep all key column in an equijoin or spatial 
join.
+#'   This is only applied when using [sd_join_select_default()].
+#'
+#' @returns An object of class sedonadb_dataframe
+#' @export
+#'
+#' @examples
+#' df1 <- data.frame(x = letters[1:10], y = 1:10)
+#' df2 <- data.frame(y = 10:1, z = LETTERS[1:10])
+#' df1 |> sd_join(df2)
+#'
+sd_join <- function(
+  x,
+  y,
+  by = NULL,
+  join_type = "inner",
+  select = sd_join_select_default(),
+  keep = NULL
+) {
+  x <- as_sedonadb_dataframe(x)
+  y <- as_sedonadb_dataframe(y, ctx = x$ctx)
+
+  x_schema <- infer_nanoarrow_schema(x)
+  y_schema <- infer_nanoarrow_schema(y)
+  join_expr_ctx <- sd_join_expr_ctx(x_schema, y_schema, ctx = x$ctx)
+  join_conditions <- sd_build_join_conditions(join_expr_ctx, by, ctx = x$ctx)
+
+  df <- x$df$join(y$df, join_conditions, join_type, left_alias = "x", 
right_alias = "y")
+  out <- new_sedonadb_dataframe(x$ctx, df)
+
+  # Apply post-join column selection if needed
+  if (is.null(select)) {
+    projection <- NULL
+  } else if (inherits(select, "sedonadb_join_select_default")) {
+    # Default select: remove duplicate equijoin keys, apply suffixes
+    projection <- sd_build_default_select(
+      join_expr_ctx,
+      join_conditions,
+      select$suffix,
+      join_type,
+      keep = keep
+    )
+  } else if (inherits(select, "sedonadb_join_select")) {
+    # Custom select: evaluate user expressions
+    projection <- sd_eval_join_select_exprs(select, join_expr_ctx)
+  } else {
+    stop(
+      "`select` must be NULL, sd_join_select_default(), or sd_join_select()",
+      call. = FALSE
+    )
+  }
+
+  # NULL return from these functions means that no extra projecting is needed
+  if (is.null(projection)) {
+    out
+  } else {
+    sd_transmute(out, !!!projection)
+  }
+}
+
+#' @rdname sd_join
+#' @export
+sd_left_join <- function(
+  x,
+  y,
+  by = NULL,
+  select = sd_join_select_default(),
+  keep = NULL
+) {
+  sd_join(x, y, by = by, select = select, join_type = "left", keep = keep)
+}
+
+#' @rdname sd_join
+#' @export
+sd_right_join <- function(
+  x,
+  y,
+  by = NULL,
+  select = sd_join_select_default(),
+  keep = NULL
+) {
+  sd_join(x, y, by = by, select = select, join_type = "right", keep = keep)
+}
+
+#' @rdname sd_join
+#' @export
+sd_inner_join <- function(
+  x,
+  y,
+  by = NULL,
+  select = sd_join_select_default(),
+  keep = NULL
+) {
+  sd_join(x, y, by = by, select = select, join_type = "inner", keep = keep)
+}
+
+#' @rdname sd_join
+#' @export
+sd_full_join <- function(
+  x,
+  y,
+  by = NULL,
+  select = sd_join_select_default(),
+  keep = NULL
+) {
+  sd_join(x, y, by = by, select = select, join_type = "full", keep = keep)
+}
+
+#' @rdname sd_join
+#' @export
+sd_semi_join <- function(x, y, by = NULL) {
+  sd_join(x, y, by = by, join_type = "leftsemi")
+}
+
+#' @rdname sd_join
+#' @export
+sd_anti_join <- function(x, y, by = NULL) {
+  sd_join(x, y, by = by, join_type = "leftanti")
+}
+
+#' @rdname sd_join
+#' @export
+sd_cross_join <- function(x, y, by = NULL, select = sd_join_select_default()) {

Review Comment:
   `sd_cross_join()` advertises a `by` argument, but the implementation ignores 
whatever the caller passes and always delegates to `sd_join(..., by = 
character())`. If someone supplies `by` expecting it to constrain the join, 
they'll silently get a Cartesian product instead.
   



##########
r/sedonadb/R/join-expression.R:
##########
@@ -142,6 +241,83 @@ get_from_table_ref <- function(x, name) {
   x[[name]]
 }
 
+# Get the single geometry column from a table ref, or error if ambiguous
+get_geom_from_table_ref <- function(x) {
+  schema <- attr(x, "schema")
+  qualifier <- attr(x, "qualifier")
+  geom_cols <- get_geometry_columns(schema)
+
+  if (length(geom_cols) == 0) {
+    stop(
+      sprintf("No geometry columns found in table '%s'", qualifier),
+      call. = FALSE
+    )
+  }
+
+  if (length(geom_cols) > 1) {
+    stop(
+      sprintf(
+        "Ambiguous use of %s$geom()\n - Did you mean one of %s",
+        qualifier,
+        paste0(qualifier, "$", geom_cols, collapse = ", ")
+      ),
+      call. = FALSE
+    )
+  }
+
+  get_from_table_ref(x, geom_cols[[1]])
+}
+
+# Find geometry column names in a schema (columns with geoarrow extension type)
+get_geometry_columns <- function(schema) {
+  col_names <- names(schema$children)
+  is_geom <- vapply(
+    schema$children,
+    function(child) {
+      ext_name <- child$metadata[["ARROW:extension:name"]]
+      !is.null(ext_name) && grepl("^geoarrow\\.", ext_name)
+    },
+    logical(1)
+  )
+  col_names[is_geom]
+}
+
+# Match .tables$x$geom(), .tables$y$geom(), x$geom(), or y$geom() call pattern
+# Returns list(table_name, table_ref) or NULL if no match
+match_tables_geom_call <- function(expr, join_expr_ctx) {
+  # Match .tables$x$geom() or .tables$y$geom()
+  if (
+    rlang::is_call(expr[[1]], "$") &&
+      rlang::is_call(expr[[1]][[2]], "$") &&
+      rlang::is_symbol(expr[[1]][[2]][[2]], ".tables") &&
+      as.character(expr[[1]][[2]][[3]]) %in% c("x", "y") &&
+      rlang::is_symbol(expr[[1]][[3]], "geom")
+  ) {

Review Comment:
   This matcher treats any call whose function is `x$geom`/`.tables$x$geom` as 
the zero-argument geometry helper, but it never checks that the call actually 
has zero arguments. Inputs like `x$geom("foo")` will be accepted and behave 
exactly like `x$geom()`, so invalid user input is silently ignored instead of 
producing an error.



##########
r/sedonadb/R/join-expression.R:
##########
@@ -57,6 +73,60 @@ sd_join_by <- function(...) {
   )
 }
 
+#' @rdname sd_join_by
+#' @export
+sd_join_intersects <- function() {
+  sd_join_by(.fns$st_intersects(.tables$x$geom(), .tables$y$geom()))
+}
+
+#' @rdname sd_join_by
+#' @export
+sd_join_contains <- function() {
+  sd_join_by(.fns$st_contains(.tables$x$geom(), .tables$y$geom()))
+}

Review Comment:
   The new predicate-specific wrappers introduced in this block are not covered 
by tests anywhere in `test-join-expression.R` or `test-dataframe-join.R`; only 
`sd_join_intersects()` is exercised. Because several of these predicates are 
asymmetric, an x/y argument reversal here would currently go unnoticed.



##########
r/sedonadb/tests/testthat/test-dataframe-join.R:
##########
@@ -0,0 +1,189 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("sd_join() select argument is applied to join results", {
+  df1 <- data.frame(common = "from_x", letters_x = letters[1:6], key = 1:6)
+  df2 <- data.frame(common = "from_y", key = 10:4, letters_y = LETTERS[1:7])
+
+  # With select = NULL, columns are blindly stacked
+  joined <- sd_join(df1, df2, sd_join_by(x$key == y$key), select = NULL)
+  expect_identical(
+    colnames(joined),
+    c(names(df1), names(df2))
+  )
+
+  # With select = sd_join_select_default()
+  joined <- sd_join(
+    df1,
+    df2,
+    sd_join_by(x$key == y$key),
+    select = sd_join_select_default()
+  )
+  expect_identical(
+    colnames(joined),
+    c("common.x", "letters_x", "key", "common.y", "letters_y")
+  )
+
+  # Check at least one result
+  expect_identical(
+    as.data.frame(joined |> sd_arrange(key)),
+    merge(df1, df2, by = "key")[c(
+      "common.x",
+      "letters_x",
+      "key",
+      "common.y",
+      "letters_y"
+    )]
+  )
+
+  # Check that custom suffixes work
+  joined <- sd_join(
+    df1,
+    df2,
+    sd_join_by(x$key == y$key),
+    select = sd_join_select_default(suffix = c("_custom_x", "_custom_y"))
+  )
+  expect_identical(
+    colnames(joined),
+    c("common_custom_x", "letters_x", "key", "common_custom_y", "letters_y")
+  )
+
+  # Check that custom selections work
+  joined <- sd_join(
+    df1,
+    df2,
+    sd_join_by(x$key == y$key),
+    select = sd_join_select(letters_x, key = y$key, common = x$common, 
y$letters_y)
+  )
+  expect_identical(
+    colnames(joined),
+    c("letters_x", "key", "common", "letters_y")
+  )
+})
+
+test_that("sd_join() join_type argument is applied to join results", {
+  df1 <- data.frame(letters_x = letters[1:6], key = 1:6)
+  df2 <- data.frame(key = 10:4, letters_y = LETTERS[1:7])
+
+  joined <- df1 |> sd_join(df2, by = "key", join_type = "left")
+  expect_identical(
+    as.data.frame(joined |> sd_arrange(key)),
+    merge(df1, df2, by = "key", all.x = TRUE, all.y = FALSE)[c(
+      "letters_x",
+      "key",
+      "letters_y"
+    )]
+  )
+
+  joined <- df1 |> sd_join(df2, by = "key", join_type = "right")
+  expect_identical(
+    as.data.frame(joined |> sd_arrange(key)),
+    merge(df1, df2, by = "key", all.x = FALSE, all.y = TRUE)[c(
+      "letters_x",
+      "key",
+      "letters_y"
+    )]
+  )
+
+  joined <- df1 |> sd_join(df2, by = "key", join_type = "full")
+  expect_identical(
+    as.data.frame(joined |> sd_arrange(key)),
+    merge(df1, df2, by = "key", all.x = TRUE, all.y = TRUE)[c(
+      "letters_x",
+      "key",
+      "letters_y"
+    )]
+  )
+
+  df1$extra_column <- "foofy"
+  joined <- df1 |> sd_join(df2, by = "key", join_type = "full")
+  expect_identical(colnames(joined), c("letters_x", "key", "extra_column", 
"letters_y"))
+})
+
+test_that("sd_join() computes the correct columns for spatial predicate 
joins", {
+  cities <- sd_read_parquet(system.file(
+    "files/natural-earth_cities_geo.parquet",
+    package = "sedonadb"
+  ))
+  countries <- sd_read_parquet(system.file(
+    "files/natural-earth_countries_geo.parquet",
+    package = "sedonadb"
+  ))
+
+  # left_join: We should get one geometry column from the logical left side 
(points)
+  df <- cities |> sd_left_join(countries, by = sd_join_intersects())
+  expect_identical(
+    colnames(df),
+    c("name.x", "geometry", "name.y", "continent")
+  )
+  expect_identical(
+    df |> sd_transmute(dim = .fns$st_dimension(geometry)) |> head(1) |> 
as.data.frame(),
+    data.frame(dim = 0L)
+  )
+
+  # inner_join: We should get one geometry column from the logical left side 
(points)
+  df <- cities |> sd_inner_join(countries, by = sd_join_intersects())
+  expect_identical(
+    colnames(df),
+    c("name.x", "geometry", "name.y", "continent")
+  )
+  expect_identical(
+    df |> sd_transmute(dim = .fns$st_dimension(geometry)) |> head(1) |> 
as.data.frame(),
+    data.frame(dim = 0L)
+  )
+
+  # right_join: We should get one geometry column from the logical right side 
(polygons)
+  df <- cities |> sd_right_join(countries, by = sd_join_intersects())
+  expect_identical(
+    colnames(df),
+    c("name.x", "geometry", "name.y", "continent")
+  )
+  expect_identical(
+    df |> sd_transmute(dim = .fns$st_dimension(geometry)) |> head(1) |> 
as.data.frame(),
+    data.frame(dim = 2L)
+  )
+
+  # full_join: Keeps both geometry columns by default
+  df <- cities |> sd_full_join(countries, by = sd_join_intersects())
+  expect_identical(
+    colnames(df),
+    c("name.x", "geometry.x", "name.y", "continent", "geometry.y")
+  )
+  expect_identical(
+    df |> sd_transmute(dim = .fns$st_dimension(geometry.x)) |> head(1) |> 
as.data.frame(),
+    data.frame(dim = 0L)
+  )
+  expect_identical(
+    df |> sd_transmute(dim = .fns$st_dimension(geometry.y)) |> head(1) |> 
as.data.frame(),

Review Comment:
   These assertions depend on `head(1)` from an unordered full join. If the 
planner happens to return an unmatched row first, one of 
`geometry.x`/`geometry.y` will be NULL and `st_dimension()` will no longer be 
`0L`/`2L`, so this test can become flaky even when the column-selection logic 
is correct.
   



##########
r/sedonadb/R/join-expression.R:
##########
@@ -466,10 +688,42 @@ sd_build_default_select <- function(
   x_names <- names(join_expr_ctx$x_schema$children)
   y_names <- names(join_expr_ctx$y_schema$children)
 
-  # Extract equijoin key pairs (simple x$col == y$col conditions)
-  # and remove them from y_names. We do this even for right joins to match
+  # Extract simple key pairs (x$col == y$col or some_fun(x$col, y$col) 
conditions)
+  # if keep is not TRUE.
+  if (isTRUE(keep)) {
+    simple_join_keys <- list(x_cols = character(), y_cols = character(), op = 
character())
+  } else {
+    simple_join_keys <- sd_extract_simple_join_keys(join_conditions)
+  }
+
+  # For the purposes of computing how to choose output columns, we consider
+  # st predicates equijoin keys. We can't do this for a full join (coalescing
+  # things that might not be equal doesn't make sense).
+  if (join_type == "full") {

Review Comment:
   Adding spatial predicates to `equijoin_ops` makes right joins reuse the 
left-hand column name for the surviving geometry, just like an equality key. 
That is only safe when both geometry columns have the same name: with 
`x$geom()`/`y$geom()` helpers, a right spatial join between tables with 
differently named geometry columns will return the right geometry under the 
left column's name.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to