Re: [PR] feat(r/sedonadb): Add join expression evaluation [sedona-db]

via GitHub Fri, 24 Apr 2026 15:43:56 -0700


Copilot commented on code in PR #781:
URL: https://github.com/apache/sedona-db/pull/781#discussion_r3140648021



##########
r/sedonadb/R/join-expression.R:
##########
@@ -0,0 +1,602 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Specify join conditions
+#'
+#' Use `sd_join_by()` to specify join conditions for [sd_join()] using
+#' expressions that reference columns from both tables. Table references
+#' are specified using `x$column` and `y$column` syntax to disambiguate
+#' columns from the left and right tables.
+#'
+#' @param ... Expressions specifying join conditions. These should be
+#'   comparison expressions (e.g., `x$id == y$id`, `x$value > y$threshold`)
+#'   or spatial predicate expressions
+#'   (e.g., `st_intersects(x$geometry, y$geometry)`).
+#'   Multiple conditions are combined with AND.
+#'
+#' @returns An object of class `sedonadb_join_by` containing the unevaluated
+#'   join condition expressions.
+#' @export
+#'
+#' @examples
+#' # Equality join on id column
+#' sd_join_by(x$id == y$id)
+#'
+#' # Multiple conditions (combined with AND)
+#' sd_join_by(x$id == y$id, x$date >= y$start_date)
+#'
+#' # Inequality join
+#' sd_join_by(x$value > y$threshold)
+#'
+sd_join_by <- function(...) {
+  exprs <- rlang::enquos(...)
+
+  if (length(exprs) == 0) {
+    stop("sd_join_by() requires at least one join condition")
+  }
+
+  structure(
+    list(
+      exprs = exprs
+    ),
+    class = "sedonadb_join_by"
+  )
+}
+
+#' @export
+print.sedonadb_join_by <- function(x, ...) {
+  cat("<sedonadb_join_by>\n")
+  for (i in seq_along(x$exprs)) {
+    cat("  ", rlang::expr_deparse(rlang::quo_get_expr(x$exprs[[i]])), "\n", 
sep = "")
+  }
+  invisible(x)
+}
+
+#' Expression evaluation context for joins
+#'
+#' Creates a context for evaluating join conditions that can reference columns
+#' from two tables using qualified references (`x$col` and `y$col`).
+#'
+#' @param x_schema Schema for the left table
+#' @param y_schema Schema for the right table
+#' @param env The expression environment
+#' @param ctx A SedonaDB context
+#' @param x_qualifier Qualifier for left table columns (default "x")
+#' @param y_qualifier Qualifier for right table columns (default "y")
+#'
+#' @return An object of class sedonadb_join_expr_ctx
+#' @noRd
+sd_join_expr_ctx <- function(
+  x_schema,
+  y_schema,
+  env = parent.frame(),
+  ctx = NULL
+) {
+  x_schema <- nanoarrow::as_nanoarrow_schema(x_schema)
+  y_schema <- nanoarrow::as_nanoarrow_schema(y_schema)
+
+  x_names <- as.character(names(x_schema$children))
+  y_names <- as.character(names(y_schema$children))
+
+  factory <- sd_expr_factory(ctx = ctx)
+
+  # We hard-code these for the purposes of the join expression
+  x_qualifier <- "x"
+  y_qualifier <- "y"
+
+  # Create qualified column references for both tables
+  # These are accessed via x$col and y$col syntax
+  x_cols <- lapply(x_names, function(name) {
+    sd_expr_column(name, qualifier = x_qualifier, factory = factory)
+  })
+  names(x_cols) <- x_names
+
+  y_cols <- lapply(y_names, function(name) {
+    sd_expr_column(name, qualifier = y_qualifier, factory = factory)
+  })
+  names(y_cols) <- y_names
+
+  # Create table reference objects that support `$` access
+  x_ref <- structure(x_cols, class = "sedonadb_table_ref", qualifier = 
x_qualifier)
+  y_ref <- structure(y_cols, class = "sedonadb_table_ref", qualifier = 
y_qualifier)
+
+  # The data mask contains x and y as table references
+  data <- list(x = x_ref, y = y_ref)
+
+  # Also include unqualified column references for unambiguous columns
+  all_names <- unique(c(x_names, y_names))
+  ambiguous <- intersect(x_names, y_names)
+
+  for (name in all_names) {
+    if (!(name %in% ambiguous)) {
+      # Unambiguous column - add to data mask
+      if (name %in% x_names) {
+        data[[name]] <- x_cols[[name]]
+      } else {
+        data[[name]] <- y_cols[[name]]
+      }
+    }
+  }
+
+  structure(
+    list(
+      factory = factory,
+      x_schema = x_schema,
+      y_schema = y_schema,
+      x_qualifier = x_qualifier,
+      y_qualifier = y_qualifier,
+      x_ref = x_ref,
+      y_ref = y_ref,
+      ambiguous_columns = ambiguous,
+      data = rlang::as_data_mask(data),
+      env = env,
+      fns = default_fns
+    ),
+    class = c("sedonadb_join_expr_ctx", "sedonadb_expr_ctx")
+  )
+}
+
+#' @export
+`$.sedonadb_table_ref` <- function(x, name) {
+  if (!(name %in% names(x))) {
+    qualifier <- attr(x, "qualifier")
+    stop(
+      sprintf("Column '%s' not found in table '%s'", name, qualifier),
+      call. = FALSE
+    )
+  }
+  x[[name]]
+}
+
+#' Evaluate join conditions
+#'
+#' Evaluates join condition expressions captured by [sd_join_by()] into
+#' SedonaDB expressions using a join expression context.
+#'
+#' @param join_by A `sedonadb_join_by` object from [sd_join_by()]
+#' @param join_expr_ctx A `sedonadb_join_expr_ctx` from `sd_join_expr_ctx()`
+#'
+#' @returns A list of `SedonaDBExpr` objects representing the join conditions
+#' @noRd
+sd_eval_join_conditions <- function(join_by, join_expr_ctx) {
+  ensure_translations_registered()
+
+  stopifnot(inherits(join_by, "sedonadb_join_by"))
+
+  lapply(join_by$exprs, function(quo) {
+    expr <- rlang::quo_get_expr(quo)
+    env <- rlang::quo_get_env(quo)
+
+    # Before we even attempt evaluation, we intercept bare names so that
+    # sd_join_by(x, y, z) creates an equijoin
+    if (rlang::is_symbol(expr)) {
+      col <- as.character(expr)
+      return(
+        sd_expr_binary(
+          "==",
+          sd_expr_column(col, qualifier = "x", factory = 
join_expr_ctx$factory),
+          sd_expr_column(col, qualifier = "y", factory = 
join_expr_ctx$factory),
+          factory = join_expr_ctx$factory
+        )
+      )
+    }
+
+    rlang::try_fetch(
+      {
+        result <- sd_eval_join_expr_inner(expr, join_expr_ctx, env)
+        as_sd_expr(result, factory = join_expr_ctx$factory)
+      },
+      error = function(e) {
+        rlang::abort(
+          sprintf("Error evaluating join condition %s", 
rlang::expr_label(expr)),
+          parent = e
+        )
+      }
+    )
+  })
+}
+
+sd_eval_join_expr_inner <- function(expr, join_expr_ctx, env) {
+  if (rlang::is_call(expr)) {
+    # Special handling for x$col and y$col syntax
+    if (rlang::is_call(expr, "$")) {
+      lhs <- expr[[2]]
+      rhs <- expr[[3]]
+
+      # Check if this is x$col or y$col pattern
+      if (rlang::is_symbol(lhs) && as.character(lhs) %in% c("x", "y")) {
+        table_ref <- rlang::eval_tidy(lhs, data = join_expr_ctx$data, env = 
env)
+        col_name <- as.character(rhs)
+        # Use the $ S3 method to get proper error handling for missing columns
+        return(`$.sedonadb_table_ref`(table_ref, col_name))
+      }
+    }
+
+    # Check for ambiguous unqualified column reference
+    if (rlang::is_symbol(expr)) {
+      name <- as.character(expr)
+      if (name %in% join_expr_ctx$ambiguous_columns) {
+        stop(
+          sprintf("Column '%s' is ambiguous (exists in both tables). ", name),
+          sprintf("Use x$%s or y$%s to disambiguate.", name, name),
+          call. = FALSE
+        )
+      }
+    }
+

Review Comment:
   `sd_eval_join_expr_inner()` has an unreachable check: inside the
`rlang::is_call(expr)` branch it tests `rlang::is_symbol(expr)`, which can
never be true because a call object is never a symbol. This dead code makes
the control flow harder to follow; remove it. Nothing needs to be moved: the
existing `else if (rlang::is_symbol(expr))` branch of the same function
already performs the identical ambiguous-column check.
   ```suggestion
   
   ```



##########
r/sedonadb/tests/testthat/test-dataframe.R:
##########
@@ -523,3 +523,63 @@ test_that("sd_summarise() works with dplyr-like summarise 
syntax", {
     data.frame(x = sum(as.double(1:10)))
   )
 })
+
+test_that("sd_join() select argument is applied to join results", {
+  df1 <- data.frame(common = "from_x", letters_x = letters[1:6], key = 1:6)
+  df2 <- data.frame(common = "from_y", key = 10:4, letters_y = LETTERS[1:7])
+
+  # With select = NULL, columns are blindly stacked
+  joined <- sd_join(df1, df2, sd_join_by(x$key == y$key), select = NULL)
+  expect_identical(
+    colnames(joined),
+    c(names(df1), names(df2))
+  )
+
+  # With select = sd_join_select_default()
+  joined <- sd_join(
+    df1,
+    df2,
+    sd_join_by(x$key == y$key),
+    select = sd_join_select_default()
+  )
+  expect_identical(
+    colnames(joined),
+    c("common.x", "letters_x", "key", "common.y", "letters_y")
+  )
+
+  # Check at least one result
+  expect_identical(
+    as.data.frame(joined |> sd_arrange(key)),
+    merge(df1, df2, by = "key")[c(
+      "common.x",
+      "letters_x",
+      "key",
+      "common.y",
+      "letters_y"
+    )]
+  )
+
+  # Check that custom suffixes work
+  joined <- sd_join(
+    df1,
+    df2,
+    sd_join_by(x$key == y$key),
+    select = sd_join_select_default(suffix = c("_custom_x", "_custom_y"))
+  )

Review Comment:
   Test coverage for `sd_join()` currently exercises the `select` behaviors, 
but doesn’t cover `join_type` parsing/behavior (e.g., left/right/full) or the 
error message for an invalid `join_type`. Adding at least one non-inner join 
test and one invalid-value test would help catch regressions in the Rust 
`JoinType` parsing and the R-facing API.



##########
r/sedonadb/R/join-expression.R:
##########
@@ -0,0 +1,602 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Specify join conditions
+#'
+#' Use `sd_join_by()` to specify join conditions for [sd_join()] using
+#' expressions that reference columns from both tables. Table references
+#' are specified using `x$column` and `y$column` syntax to disambiguate
+#' columns from the left and right tables.
+#'
+#' @param ... Expressions specifying join conditions. These should be
+#'   comparison expressions (e.g., `x$id == y$id`, `x$value > y$threshold`)
+#'   or spatial predicate expressions
+#'   (e.g., `st_intersects(x$geometry, y$geometry)`).
+#'   Multiple conditions are combined with AND.
+#'
+#' @returns An object of class `sedonadb_join_by` containing the unevaluated
+#'   join condition expressions.
+#' @export
+#'
+#' @examples
+#' # Equality join on id column
+#' sd_join_by(x$id == y$id)
+#'
+#' # Multiple conditions (combined with AND)
+#' sd_join_by(x$id == y$id, x$date >= y$start_date)
+#'
+#' # Inequality join
+#' sd_join_by(x$value > y$threshold)
+#'
+sd_join_by <- function(...) {
+  exprs <- rlang::enquos(...)
+
+  if (length(exprs) == 0) {
+    stop("sd_join_by() requires at least one join condition")
+  }
+
+  structure(
+    list(
+      exprs = exprs
+    ),
+    class = "sedonadb_join_by"
+  )
+}
+
+#' @export
+print.sedonadb_join_by <- function(x, ...) {
+  cat("<sedonadb_join_by>\n")
+  for (i in seq_along(x$exprs)) {
+    cat("  ", rlang::expr_deparse(rlang::quo_get_expr(x$exprs[[i]])), "\n", 
sep = "")
+  }
+  invisible(x)
+}
+
+#' Expression evaluation context for joins
+#'
+#' Creates a context for evaluating join conditions that can reference columns
+#' from two tables using qualified references (`x$col` and `y$col`).
+#'
+#' @param x_schema Schema for the left table
+#' @param y_schema Schema for the right table
+#' @param env The expression environment
+#' @param ctx A SedonaDB context
+#' @param x_qualifier Qualifier for left table columns (default "x")
+#' @param y_qualifier Qualifier for right table columns (default "y")
+#'
+#' @return An object of class sedonadb_join_expr_ctx
+#' @noRd
+sd_join_expr_ctx <- function(
+  x_schema,
+  y_schema,
+  env = parent.frame(),
+  ctx = NULL
+) {
+  x_schema <- nanoarrow::as_nanoarrow_schema(x_schema)
+  y_schema <- nanoarrow::as_nanoarrow_schema(y_schema)
+
+  x_names <- as.character(names(x_schema$children))
+  y_names <- as.character(names(y_schema$children))
+
+  factory <- sd_expr_factory(ctx = ctx)
+
+  # We hard-code these for the purposes of the join expression
+  x_qualifier <- "x"
+  y_qualifier <- "y"
+
+  # Create qualified column references for both tables
+  # These are accessed via x$col and y$col syntax
+  x_cols <- lapply(x_names, function(name) {
+    sd_expr_column(name, qualifier = x_qualifier, factory = factory)
+  })
+  names(x_cols) <- x_names
+
+  y_cols <- lapply(y_names, function(name) {
+    sd_expr_column(name, qualifier = y_qualifier, factory = factory)
+  })
+  names(y_cols) <- y_names
+
+  # Create table reference objects that support `$` access
+  x_ref <- structure(x_cols, class = "sedonadb_table_ref", qualifier = 
x_qualifier)
+  y_ref <- structure(y_cols, class = "sedonadb_table_ref", qualifier = 
y_qualifier)
+
+  # The data mask contains x and y as table references
+  data <- list(x = x_ref, y = y_ref)
+
+  # Also include unqualified column references for unambiguous columns
+  all_names <- unique(c(x_names, y_names))
+  ambiguous <- intersect(x_names, y_names)
+
+  for (name in all_names) {
+    if (!(name %in% ambiguous)) {
+      # Unambiguous column - add to data mask
+      if (name %in% x_names) {
+        data[[name]] <- x_cols[[name]]
+      } else {
+        data[[name]] <- y_cols[[name]]
+      }
+    }
+  }
+
+  structure(
+    list(
+      factory = factory,
+      x_schema = x_schema,
+      y_schema = y_schema,
+      x_qualifier = x_qualifier,
+      y_qualifier = y_qualifier,
+      x_ref = x_ref,
+      y_ref = y_ref,
+      ambiguous_columns = ambiguous,
+      data = rlang::as_data_mask(data),
+      env = env,
+      fns = default_fns
+    ),
+    class = c("sedonadb_join_expr_ctx", "sedonadb_expr_ctx")
+  )
+}
+
+#' @export
+`$.sedonadb_table_ref` <- function(x, name) {
+  if (!(name %in% names(x))) {
+    qualifier <- attr(x, "qualifier")
+    stop(
+      sprintf("Column '%s' not found in table '%s'", name, qualifier),
+      call. = FALSE
+    )
+  }
+  x[[name]]
+}
+
+#' Evaluate join conditions
+#'
+#' Evaluates join condition expressions captured by [sd_join_by()] into
+#' SedonaDB expressions using a join expression context.
+#'
+#' @param join_by A `sedonadb_join_by` object from [sd_join_by()]
+#' @param join_expr_ctx A `sedonadb_join_expr_ctx` from `sd_join_expr_ctx()`
+#'
+#' @returns A list of `SedonaDBExpr` objects representing the join conditions
+#' @noRd
+sd_eval_join_conditions <- function(join_by, join_expr_ctx) {
+  ensure_translations_registered()
+
+  stopifnot(inherits(join_by, "sedonadb_join_by"))
+
+  lapply(join_by$exprs, function(quo) {
+    expr <- rlang::quo_get_expr(quo)
+    env <- rlang::quo_get_env(quo)
+
+    # Before we even attempt evaluation, we intercept bare names so that
+    # sd_join_by(x, y, z) creates an equijoin
+    if (rlang::is_symbol(expr)) {
+      col <- as.character(expr)
+      return(
+        sd_expr_binary(
+          "==",
+          sd_expr_column(col, qualifier = "x", factory = 
join_expr_ctx$factory),
+          sd_expr_column(col, qualifier = "y", factory = 
join_expr_ctx$factory),
+          factory = join_expr_ctx$factory
+        )
+      )
+    }
+
+    rlang::try_fetch(
+      {
+        result <- sd_eval_join_expr_inner(expr, join_expr_ctx, env)
+        as_sd_expr(result, factory = join_expr_ctx$factory)
+      },
+      error = function(e) {
+        rlang::abort(
+          sprintf("Error evaluating join condition %s", 
rlang::expr_label(expr)),
+          parent = e
+        )
+      }
+    )
+  })
+}
+
+sd_eval_join_expr_inner <- function(expr, join_expr_ctx, env) {
+  if (rlang::is_call(expr)) {
+    # Special handling for x$col and y$col syntax
+    if (rlang::is_call(expr, "$")) {
+      lhs <- expr[[2]]
+      rhs <- expr[[3]]
+
+      # Check if this is x$col or y$col pattern
+      if (rlang::is_symbol(lhs) && as.character(lhs) %in% c("x", "y")) {
+        table_ref <- rlang::eval_tidy(lhs, data = join_expr_ctx$data, env = 
env)
+        col_name <- as.character(rhs)
+        # Use the $ S3 method to get proper error handling for missing columns
+        return(`$.sedonadb_table_ref`(table_ref, col_name))
+      }
+    }
+
+    # Check for ambiguous unqualified column reference
+    if (rlang::is_symbol(expr)) {
+      name <- as.character(expr)
+      if (name %in% join_expr_ctx$ambiguous_columns) {
+        stop(
+          sprintf("Column '%s' is ambiguous (exists in both tables). ", name),
+          sprintf("Use x$%s or y$%s to disambiguate.", name, name),
+          call. = FALSE
+        )
+      }
+    }
+
+    # Extract function name
+    call_name <- rlang::call_name(expr)
+
+    # If we have a translation, use it (but with join-aware argument 
evaluation)
+    if (!is.null(call_name) && !is.null(join_expr_ctx$fns[[call_name]])) {
+      # Evaluate arguments with join context
+      evaluated_args <- lapply(
+        expr[-1],
+        sd_eval_join_expr_inner,
+        join_expr_ctx = join_expr_ctx,
+        env = env
+      )
+
+      # Build and evaluate the translated call
+      new_fn_expr <- rlang::call2("$", join_expr_ctx$fns, 
rlang::sym(call_name))
+      new_call <- rlang::call2(new_fn_expr, join_expr_ctx, !!!evaluated_args)
+      return(rlang::eval_tidy(new_call, data = join_expr_ctx$data, env = env))
+    }
+
+    # Default: evaluate with tidy eval
+    rlang::eval_tidy(expr, data = join_expr_ctx$data, env = env)
+  } else if (rlang::is_symbol(expr)) {
+    # Check for ambiguous column reference
+    name <- as.character(expr)
+    if (name %in% join_expr_ctx$ambiguous_columns) {
+      stop(
+        sprintf(
+          "Column '%s' is ambiguous (exists in both tables). ",
+          name
+        ),
+        sprintf("Use x$%s or y$%s to disambiguate.", name, name),
+        call. = FALSE
+      )
+    }
+    rlang::eval_tidy(expr, data = join_expr_ctx$data, env = env)
+  } else {
+    # Literal or other expression
+    rlang::eval_tidy(expr, data = join_expr_ctx$data, env = env)
+  }
+}
+
+#' Build join conditions from a `by` specification
+#'
+#' Evaluates the `by` argument to produce a list of join condition expressions.
+#' Supports natural joins (NULL) and explicit conditions via [sd_join_by()].
+#'
+#' @param join_expr_ctx Object produced by `sd_join_expr_ctx()`
+#' @param by A `sedonadb_join_by` object from [sd_join_by()], or `NULL` for
+#'   a natural join on columns with matching names.
+#' @param ctx A SedonaDB context
+#'
+#' @returns A list of `SedonaDBExpr` objects representing the join conditions
+#' @noRd
+sd_build_join_conditions <- function(join_expr_ctx, by = NULL, ctx = NULL) {
+  if (is.null(by)) {
+    # Natural join: find common column names
+    x_names <- names(join_expr_ctx$x_schema$children)
+    y_names <- names(join_expr_ctx$y_schema$children)
+    common <- intersect(x_names, y_names)
+
+    if (length(common) == 0) {
+      stop(
+        "No common columns found for natural join. ",
+        "Use sd_join_by() to specify join conditions."
+      )
+    }
+
+    # Message
+    join_by_syms <- vapply(rlang::syms(common), rlang::expr_deparse, 
character(1))
+    message(sprintf(
+      "Joining with `by = sd_join_by(%s)`",
+      paste0(join_by_syms, collapse = ", ")
+    ))
+
+    # Build equality conditions for common columns
+    join_conditions <- lapply(common, function(col) {
+      sd_expr_binary(
+        "==",
+        sd_expr_column(col, qualifier = "x", factory = join_expr_ctx$factory),
+        sd_expr_column(col, qualifier = "y", factory = join_expr_ctx$factory),
+        factory = join_expr_ctx$factory
+      )
+    })
+  } else if (is.character(by)) {
+    by_unnamed <- !rlang::have_name(by)
+    names(by)[by_unnamed] <- by[by_unnamed]
+    join_conditions <- lapply(seq_along(by), function(i) {
+      sd_expr_binary(
+        "==",
+        sd_expr_column(names(by)[i], qualifier = "x", factory = 
join_expr_ctx$factory),
+        sd_expr_column(by[i], qualifier = "y", factory = 
join_expr_ctx$factory),
+        factory = join_expr_ctx$factory
+      )
+    })
+  } else if (inherits(by, "sedonadb_join_by")) {
+    join_conditions <- sd_eval_join_conditions(by, join_expr_ctx)
+  } else {
+    stop("`by` must be NULL (natural join) or a sd_join_by() object")
+  }
+
+  join_conditions
+}
+
+#' Specify default post-join column selection
+#'
+#' Use `sd_join_select_default()` to specify that the join result should
+#' remove duplicate equijoin key columns (keeping the x-side version) and
+#' apply suffixes to any remaining overlapping column names.
+#'
+#' @param suffix A character vector of length 2 specifying suffixes to add
+#'   to overlapping column names from the left (x) and right (y) tables.
+#'
+#' @returns An object of class `sedonadb_join_select_default` specifying
+#'   the default column selection behavior.
+#' @export
+#'
+#' @examples
+#' # Default suffixes
+#' sd_join_select_default()
+#'
+#' # Custom suffixes
+#' sd_join_select_default(suffix = c("_left", "_right"))
+#'
+sd_join_select_default <- function(suffix = c(".x", ".y")) {
+  if (!is.character(suffix) || length(suffix) != 2) {
+    stop("`suffix` must be a character vector of length 2")
+  }
+
+  structure(
+    list(suffix = suffix),
+    class = "sedonadb_join_select_default"
+  )
+}
+
+#' @export
+print.sedonadb_join_select_default <- function(x, ...) {
+  cat("<sedonadb_join_select_default>\n")
+  cat("  suffix: c(\"", x$suffix[1], "\", \"", x$suffix[2], "\")\n", sep = "")
+  invisible(x)
+}
+
+#' Specify custom post-join column selection
+#'
+#' Use `sd_join_select()` to specify which columns to include in the join
+#' result and optionally rename them. Columns are referenced using `x$column`
+#' and `y$column` syntax to disambiguate columns from the left and right 
tables.
+#'
+#' @param ... Named expressions specifying output columns. Each expression
+#'   should reference a column using `x$column` or `y$column` syntax.
+#'   The name of the argument becomes the output column name. Unnamed
+#'   arguments use the original column name (without table prefix).
+#'
+#' @returns An object of class `sedonadb_join_select` containing the
+#'   unevaluated column selection expressions.
+#' @export
+#'
+#' @examples
+#' # Select and rename columns
+#' sd_join_select(id = x$id, left_value = x$value, right_value = y$value)
+#'
+#' # Unnamed arguments keep original column name
+#' sd_join_select(x$id, x$name, y$value)
+#'
+sd_join_select <- function(...) {
+  exprs <- rlang::enquos(...)
+
+  if (length(exprs) == 0) {
+    stop("sd_join_select() requires at least one column selection")
+  }
+
+  structure(
+    list(exprs = exprs),
+    class = "sedonadb_join_select"
+  )
+}
+
+#' @export
+print.sedonadb_join_select <- function(x, ...) {
+  cat("<sedonadb_join_select>\n")
+  for (i in seq_along(x$exprs)) {
+    name <- names(x$exprs)[i]
+    expr_str <- rlang::expr_deparse(rlang::quo_get_expr(x$exprs[[i]]))
+    if (!is.null(name) && nzchar(name)) {
+      cat("  ", name, " = ", expr_str, "\n", sep = "")
+    } else {
+      cat("  ", expr_str, "\n", sep = "")
+    }
+  }
+  invisible(x)
+}
+
+#' Evaluate custom join select expressions
+#'
+#' Evaluates column selection expressions captured by [sd_join_select()] into
+#' a list of output column specifications.
+#'
+#' @param join_select A `sedonadb_join_select` object from [sd_join_select()]
+#' @param join_expr_ctx A `sedonadb_join_expr_ctx` from `sd_join_expr_ctx()`
+#'
+#' @returns A named list of expressions
+#' @noRd
+sd_eval_join_select_exprs <- function(join_select, join_expr_ctx) {
+  stopifnot(inherits(join_select, "sedonadb_join_select"))
+
+  exprs <- lapply(join_select$exprs, function(quo) {
+    expr <- rlang::quo_get_expr(quo)
+    env <- rlang::quo_get_env(quo)
+
+    rlang::try_fetch(
+      {
+        # Evaluate the expression to get a column reference
+        sd_eval_join_select_expr_inner(expr, join_expr_ctx, env)
+      },
+      error = function(e) {
+        rlang::abort(
+          sprintf(
+            "Error evaluating select expression %s",
+            rlang::expr_label(expr)
+          ),
+          parent = e
+        )
+      }
+    )
+  })
+
+  is_unnamed <- names(exprs) == ""
+  names(exprs)[is_unnamed] <- lapply(exprs[is_unnamed], function(e) 
e$qualified_name()[2])
+  exprs
+}
+
+#' Evaluate a single join select expression
+#'
+#' @param expr An unevaluated R expression
+#' @param join_expr_ctx A join expression context
+#' @param env The expression environment
+#'
+#' @returns A `SedonaDBExpr` column expression
+#' @noRd
+sd_eval_join_select_expr_inner <- function(expr, join_expr_ctx, env) {
+  if (rlang::is_call(expr, "$")) {
+    # x$col or y$col syntax
+    lhs <- expr[[2]]
+    rhs <- expr[[3]]
+
+    if (rlang::is_symbol(lhs) && as.character(lhs) %in% c("x", "y")) {
+      table_ref <- rlang::eval_tidy(lhs, data = join_expr_ctx$data, env = env)
+      col_name <- as.character(rhs)
+      return(`$.sedonadb_table_ref`(table_ref, col_name))
+    }
+  }
+
+  if (rlang::is_symbol(expr)) {
+    name <- as.character(expr)
+    if (name %in% join_expr_ctx$ambiguous_columns) {
+      stop(
+        sprintf("Column '%s' is ambiguous (exists in both tables). ", name),
+        sprintf("Use x$%s or y$%s to disambiguate.", name, name),
+        call. = FALSE
+      )
+    }
+    # Unambiguous column reference
+    return(rlang::eval_tidy(expr, data = join_expr_ctx$data, env = env))
+  }
+
+  # For select, we only allow column references, not arbitrary expressions
+
+  stop(
+    "sd_join_select() expressions must be column references ",
+    "(e.g., x$col or y$col), not arbitrary expressions",
+    call. = FALSE
+  )
+}
+
+#' Build default column selection for join result
+#'
+#' Creates a column selection that:
+#' 1. Removes duplicate equijoin key columns (keeps x-side)
+#' 2. Applies suffixes to remaining overlapping column names
+#'
+#' @param join_expr_ctx A `sedonadb_join_expr_ctx` from `sd_join_expr_ctx()`
+#' @param join_conditions List of join condition expressions
+#' @param suffix Character vector of length 2 for left/right suffixes
+#'
+#' @returns A named list of expressions
+#' @noRd
+sd_build_default_select <- function(join_expr_ctx, join_conditions, suffix) {
+  x_names <- names(join_expr_ctx$x_schema$children)
+  y_names <- names(join_expr_ctx$y_schema$children)
+
+  # Extract equijoin key pairs (simple x$col == y$col conditions)
+  # and remove them from the y_names
+  equijoin_keys <- sd_extract_equijoin_keys(join_conditions)
+  y_names <- setdiff(y_names, equijoin_keys$y_cols)
+
+  # Calculate names that need suffixing
+  common_names <- intersect(x_names, y_names)
+  x_name_needs_suffix <- x_names %in% common_names
+  y_name_needs_suffix <- y_names %in% common_names
+
+  # Apply suffixes to column names that need it, but keep a copy of the input
+  # names unchanged since we'll need those to get the original column expr
+  x_names_out <- x_names
+  x_names_out[x_name_needs_suffix] <- paste0(x_names_out[x_name_needs_suffix], 
suffix[1])
+  y_names_out <- y_names
+  y_names_out[y_name_needs_suffix] <- paste0(y_names_out[y_name_needs_suffix], 
suffix[2])
+
+  # Create the expressions named with the appropriate output name
+  exprs <- c(
+    lapply(x_names, function(name) {
+      sd_expr_column(name, qualifier = "x", factory = join_expr_ctx$factory)
+    }),
+    lapply(y_names, function(name) {
+      sd_expr_column(name, qualifier = "y", factory = join_expr_ctx$factory)
+    })
+  )
+  names(exprs) <- c(x_names_out, y_names_out)
+  exprs
+}
+
+#' Extract equijoin key column pairs from join conditions
+#'
+#' Identifies simple equality conditions of the form `x$col == y$col` and
+#' returns the column names involved.
+#'
+#' @param join_conditions List of join condition expressions
+#'
+#' @returns A list with `x_cols` and `y_cols` character vectors of matching
+#'   column names from each side of equijoin conditions.
+#' @noRd
+sd_extract_equijoin_keys <- function(join_conditions) {
+  x_cols <- character()
+  y_cols <- character()
+
+  for (cond in join_conditions) {
+    stopifnot(inherits(cond, "SedonaDBExpr"))
+
+    parsed <- sd_expr_parse_binary(cond)
+    if (
+      is.null(parsed) ||
+        parsed$op != "=" ||
+        parsed$left$variant_name() != "Column" ||
+        parsed$right$variant_name() != "Column"
+    ) {
+      next
+    }
+
+    left <- parsed$left$qualified_name()
+    right <- parsed$right$qualified_name()
+    switch(
+      left[1],
+      x = x_cols <- append(x_cols, left[2]),
+      y = y_cols <- append(y_cols, left[2])
+    )
+    switch(
+      right[1],
+      x = x_cols <- append(x_cols, right[2]),
+      y = y_cols <- append(y_cols, right[2])
+    )

Review Comment:
   `sd_extract_equijoin_keys()` currently treats any `Column = Column` binary 
expression as an “equijoin key” and will drop any matching y-side columns 
later, even if both columns come from the same side (e.g., `y$a == y$b`). This 
can cause `sd_join_select_default()` to incorrectly remove non-join columns. 
Restrict key extraction to equalities where one side is qualified with `x` and 
the other with `y` (and skip/ignore unqualified columns).
   ```suggestion
   
       if (
         length(left) < 2 ||
           length(right) < 2 ||
           !left[1] %in% c("x", "y") ||
           !right[1] %in% c("x", "y")
       ) {
         next
       }
   
       if (left[1] == "x" && right[1] == "y") {
         x_cols <- append(x_cols, left[2])
         y_cols <- append(y_cols, right[2])
       } else if (left[1] == "y" && right[1] == "x") {
         x_cols <- append(x_cols, right[2])
         y_cols <- append(y_cols, left[2])
       }
   ```



##########
r/sedonadb/R/join-expression.R:
##########
@@ -0,0 +1,602 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Specify join conditions
+#'
+#' Use `sd_join_by()` to specify join conditions for [sd_join()] using
+#' expressions that reference columns from both tables. Table references
+#' are specified using `x$column` and `y$column` syntax to disambiguate
+#' columns from the left and right tables.
+#'
+#' @param ... Expressions specifying join conditions. These should be
+#'   comparison expressions (e.g., `x$id == y$id`, `x$value > y$threshold`)
+#'   or spatial predicate expressions
+#'   (e.g., `st_intersects(x$geometry, y$geometry)`).
+#'   Multiple conditions are combined with AND.
+#'
+#' @returns An object of class `sedonadb_join_by` containing the unevaluated
+#'   join condition expressions.
+#' @export
+#'
+#' @examples
+#' # Equality join on id column
+#' sd_join_by(x$id == y$id)
+#'
+#' # Multiple conditions (combined with AND)
+#' sd_join_by(x$id == y$id, x$date >= y$start_date)
+#'
+#' # Inequality join
+#' sd_join_by(x$value > y$threshold)
+#'
+sd_join_by <- function(...) {
+  exprs <- rlang::enquos(...)
+
+  if (length(exprs) == 0) {
+    stop("sd_join_by() requires at least one join condition")
+  }
+
+  structure(
+    list(
+      exprs = exprs
+    ),
+    class = "sedonadb_join_by"
+  )
+}
+
+#' @export
+print.sedonadb_join_by <- function(x, ...) {
+  cat("<sedonadb_join_by>\n")
+  for (i in seq_along(x$exprs)) {
+    cat("  ", rlang::expr_deparse(rlang::quo_get_expr(x$exprs[[i]])), "\n", sep = "")
+  }
+  invisible(x)
+}
+
+#' Expression evaluation context for joins
+#'
+#' Creates a context for evaluating join conditions that can reference columns
+#' from two tables using qualified references (`x$col` and `y$col`).
+#'
+#' @param x_schema Schema for the left table
+#' @param y_schema Schema for the right table
+#' @param env The expression environment
+#' @param ctx A SedonaDB context
+#' @param x_qualifier Qualifier for left table columns (default "x")
+#' @param y_qualifier Qualifier for right table columns (default "y")
+#'
+#' @return An object of class sedonadb_join_expr_ctx
+#' @noRd
+sd_join_expr_ctx <- function(
+  x_schema,
+  y_schema,
+  env = parent.frame(),
+  ctx = NULL
+) {
+  x_schema <- nanoarrow::as_nanoarrow_schema(x_schema)
+  y_schema <- nanoarrow::as_nanoarrow_schema(y_schema)
+
+  x_names <- as.character(names(x_schema$children))
+  y_names <- as.character(names(y_schema$children))
+
+  factory <- sd_expr_factory(ctx = ctx)
+
+  # We hard-code these for the purposes of the join expression
+  x_qualifier <- "x"
+  y_qualifier <- "y"
+
+  # Create qualified column references for both tables
+  # These are accessed via x$col and y$col syntax
+  x_cols <- lapply(x_names, function(name) {
+    sd_expr_column(name, qualifier = x_qualifier, factory = factory)
+  })
+  names(x_cols) <- x_names
+
+  y_cols <- lapply(y_names, function(name) {
+    sd_expr_column(name, qualifier = y_qualifier, factory = factory)
+  })
+  names(y_cols) <- y_names
+
+  # Create table reference objects that support `$` access
+  x_ref <- structure(x_cols, class = "sedonadb_table_ref", qualifier = x_qualifier)
+  y_ref <- structure(y_cols, class = "sedonadb_table_ref", qualifier = y_qualifier)
+
+  # The data mask contains x and y as table references
+  data <- list(x = x_ref, y = y_ref)
+
+  # Also include unqualified column references for unambiguous columns
+  all_names <- unique(c(x_names, y_names))
+  ambiguous <- intersect(x_names, y_names)
+
+  for (name in all_names) {
+    if (!(name %in% ambiguous)) {
+      # Unambiguous column - add to data mask
+      if (name %in% x_names) {
+        data[[name]] <- x_cols[[name]]
+      } else {
+        data[[name]] <- y_cols[[name]]
+      }
+    }
+  }
+
+  structure(
+    list(
+      factory = factory,
+      x_schema = x_schema,
+      y_schema = y_schema,
+      x_qualifier = x_qualifier,
+      y_qualifier = y_qualifier,
+      x_ref = x_ref,
+      y_ref = y_ref,
+      ambiguous_columns = ambiguous,
+      data = rlang::as_data_mask(data),
+      env = env,
+      fns = default_fns
+    ),
+    class = c("sedonadb_join_expr_ctx", "sedonadb_expr_ctx")
+  )
+}
+
+#' @export
+`$.sedonadb_table_ref` <- function(x, name) {
+  if (!(name %in% names(x))) {
+    qualifier <- attr(x, "qualifier")
+    stop(
+      sprintf("Column '%s' not found in table '%s'", name, qualifier),
+      call. = FALSE
+    )
+  }
+  x[[name]]
+}
+
+#' Evaluate join conditions
+#'
+#' Evaluates join condition expressions captured by [sd_join_by()] into
+#' SedonaDB expressions using a join expression context.
+#'
+#' @param join_by A `sedonadb_join_by` object from [sd_join_by()]
+#' @param join_expr_ctx A `sedonadb_join_expr_ctx` from `sd_join_expr_ctx()`
+#'
+#' @returns A list of `SedonaDBExpr` objects representing the join conditions
+#' @noRd
+sd_eval_join_conditions <- function(join_by, join_expr_ctx) {
+  ensure_translations_registered()
+
+  stopifnot(inherits(join_by, "sedonadb_join_by"))
+
+  lapply(join_by$exprs, function(quo) {
+    expr <- rlang::quo_get_expr(quo)
+    env <- rlang::quo_get_env(quo)
+
+    # Before we even attempt evaluation, we intercept bare names so that
+    # sd_join_by(x, y, z) creates an equijoin
+    if (rlang::is_symbol(expr)) {
+      col <- as.character(expr)
+      return(
+        sd_expr_binary(
+          "==",
+          sd_expr_column(col, qualifier = "x", factory = join_expr_ctx$factory),
+          sd_expr_column(col, qualifier = "y", factory = join_expr_ctx$factory),
+          factory = join_expr_ctx$factory
+        )
+      )
+    }
+
+    rlang::try_fetch(
+      {
+        result <- sd_eval_join_expr_inner(expr, join_expr_ctx, env)
+        as_sd_expr(result, factory = join_expr_ctx$factory)
+      },
+      error = function(e) {
+        rlang::abort(
+          sprintf("Error evaluating join condition %s", rlang::expr_label(expr)),
+          parent = e
+        )
+      }
+    )
+  })
+}
+
+sd_eval_join_expr_inner <- function(expr, join_expr_ctx, env) {
+  if (rlang::is_call(expr)) {
+    # Special handling for x$col and y$col syntax
+    if (rlang::is_call(expr, "$")) {
+      lhs <- expr[[2]]
+      rhs <- expr[[3]]
+
+      # Check if this is x$col or y$col pattern
+      if (rlang::is_symbol(lhs) && as.character(lhs) %in% c("x", "y")) {
+        table_ref <- rlang::eval_tidy(lhs, data = join_expr_ctx$data, env = env)
+        col_name <- as.character(rhs)
+        # Use the $ S3 method to get proper error handling for missing columns
+        return(`$.sedonadb_table_ref`(table_ref, col_name))
+      }
+    }
+
+    # Check for ambiguous unqualified column reference
+    if (rlang::is_symbol(expr)) {
+      name <- as.character(expr)
+      if (name %in% join_expr_ctx$ambiguous_columns) {
+        stop(
+          sprintf("Column '%s' is ambiguous (exists in both tables). ", name),
+          sprintf("Use x$%s or y$%s to disambiguate.", name, name),
+          call. = FALSE
+        )
+      }
+    }
+
+    # Extract function name
+    call_name <- rlang::call_name(expr)
+
+    # If we have a translation, use it (but with join-aware argument evaluation)
+    if (!is.null(call_name) && !is.null(join_expr_ctx$fns[[call_name]])) {
+      # Evaluate arguments with join context
+      evaluated_args <- lapply(
+        expr[-1],
+        sd_eval_join_expr_inner,
+        join_expr_ctx = join_expr_ctx,
+        env = env
+      )
+
+      # Build and evaluate the translated call
+      new_fn_expr <- rlang::call2("$", join_expr_ctx$fns, rlang::sym(call_name))
+      new_call <- rlang::call2(new_fn_expr, join_expr_ctx, !!!evaluated_args)
+      return(rlang::eval_tidy(new_call, data = join_expr_ctx$data, env = env))
+    }
+
+    # Default: evaluate with tidy eval
+    rlang::eval_tidy(expr, data = join_expr_ctx$data, env = env)
+  } else if (rlang::is_symbol(expr)) {
+    # Check for ambiguous column reference
+    name <- as.character(expr)
+    if (name %in% join_expr_ctx$ambiguous_columns) {
+      stop(
+        sprintf(
+          "Column '%s' is ambiguous (exists in both tables). ",
+          name
+        ),
+        sprintf("Use x$%s or y$%s to disambiguate.", name, name),
+        call. = FALSE
+      )
+    }
+    rlang::eval_tidy(expr, data = join_expr_ctx$data, env = env)
+  } else {
+    # Literal or other expression
+    rlang::eval_tidy(expr, data = join_expr_ctx$data, env = env)
+  }
+}
+
+#' Build join conditions from a `by` specification
+#'
+#' Evaluates the `by` argument to produce a list of join condition expressions.
+#' Supports natural joins (NULL) and explicit conditions via [sd_join_by()].
+#'
+#' @param join_expr_ctx Object produced by `sd_join_expr_ctx()`
+#' @param by A `sedonadb_join_by` object from [sd_join_by()], or `NULL` for
+#'   a natural join on columns with matching names.
+#' @param ctx A SedonaDB context
+#'
+#' @returns A list of `SedonaDBExpr` objects representing the join conditions
+#' @noRd
+sd_build_join_conditions <- function(join_expr_ctx, by = NULL, ctx = NULL) {
+  if (is.null(by)) {
+    # Natural join: find common column names
+    x_names <- names(join_expr_ctx$x_schema$children)
+    y_names <- names(join_expr_ctx$y_schema$children)
+    common <- intersect(x_names, y_names)
+
+    if (length(common) == 0) {
+      stop(
+        "No common columns found for natural join. ",
+        "Use sd_join_by() to specify join conditions."
+      )
+    }
+
+    # Message
+    join_by_syms <- vapply(rlang::syms(common), rlang::expr_deparse, character(1))
+    message(sprintf(
+      "Joining with `by = sd_join_by(%s)`",
+      paste0(join_by_syms, collapse = ", ")
+    ))
+
+    # Build equality conditions for common columns
+    join_conditions <- lapply(common, function(col) {
+      sd_expr_binary(
+        "==",
+        sd_expr_column(col, qualifier = "x", factory = join_expr_ctx$factory),
+        sd_expr_column(col, qualifier = "y", factory = join_expr_ctx$factory),
+        factory = join_expr_ctx$factory
+      )
+    })
+  } else if (is.character(by)) {
+    by_unnamed <- !rlang::have_name(by)
+    names(by)[by_unnamed] <- by[by_unnamed]
+    join_conditions <- lapply(seq_along(by), function(i) {
+      sd_expr_binary(
+        "==",
+        sd_expr_column(names(by)[i], qualifier = "x", factory = join_expr_ctx$factory),
+        sd_expr_column(by[i], qualifier = "y", factory = join_expr_ctx$factory),
+        factory = join_expr_ctx$factory
+      )
+    })
+  } else if (inherits(by, "sedonadb_join_by")) {
+    join_conditions <- sd_eval_join_conditions(by, join_expr_ctx)
+  } else {
+    stop("`by` must be NULL (natural join) or a sd_join_by() object")
+  }
+
+  join_conditions
+}
+
+#' Specify default post-join column selection
+#'
+#' Use `sd_join_select_default()` to specify that the join result should
+#' remove duplicate equijoin key columns (keeping the x-side version) and
+#' apply suffixes to any remaining overlapping column names.
+#'
+#' @param suffix A character vector of length 2 specifying suffixes to add
+#'   to overlapping column names from the left (x) and right (y) tables.
+#'
+#' @returns An object of class `sedonadb_join_select_default` specifying
+#'   the default column selection behavior.
+#' @export
+#'
+#' @examples
+#' # Default suffixes
+#' sd_join_select_default()
+#'
+#' # Custom suffixes
+#' sd_join_select_default(suffix = c("_left", "_right"))
+#'
+sd_join_select_default <- function(suffix = c(".x", ".y")) {
+  if (!is.character(suffix) || length(suffix) != 2) {
+    stop("`suffix` must be a character vector of length 2")
+  }
+
+  structure(
+    list(suffix = suffix),
+    class = "sedonadb_join_select_default"
+  )
+}
+
+#' @export
+print.sedonadb_join_select_default <- function(x, ...) {
+  cat("<sedonadb_join_select_default>\n")
+  cat("  suffix: c(\"", x$suffix[1], "\", \"", x$suffix[2], "\")\n", sep = "")
+  invisible(x)
+}
+
+#' Specify custom post-join column selection
+#'
+#' Use `sd_join_select()` to specify which columns to include in the join
+#' result and optionally rename them. Columns are referenced using `x$column`
+#' and `y$column` syntax to disambiguate columns from the left and right tables.
+#'
+#' @param ... Named expressions specifying output columns. Each expression
+#'   should reference a column using `x$column` or `y$column` syntax.
+#'   The name of the argument becomes the output column name. Unnamed

Review Comment:
   `sd_join_select()` documentation says selections “should reference a column 
using `x$column` or `y$column` syntax”, but the implementation also supports 
unqualified symbols when the column name is unambiguous (and tests rely on 
this, e.g., `sd_join_select(letters_x, ...)`). Update the docs to mention that 
bare column names are allowed when they exist on only one side, and that 
ambiguous names require `x$`/`y$`.
   ```suggestion
   #' result and optionally rename them. Columns may be referenced using
   #' `x$column` and `y$column` syntax to disambiguate columns from the left
   #' and right tables, or by bare column name when the name exists on only
   #' one side of the join.
   #'
   #' @param ... Named expressions specifying output columns. Each expression
   #'   may reference a column using `x$column` or `y$column` syntax, or use
   #'   a bare column name when it is unambiguous. If the same column name
   #'   exists on both sides of the join, it must be qualified with `x$` or
   #'   `y$`. The name of the argument becomes the output column name. Unnamed
   ```



##########
r/sedonadb/R/join-expression.R:
##########
@@ -0,0 +1,602 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Specify join conditions
+#'
+#' Use `sd_join_by()` to specify join conditions for [sd_join()] using
+#' expressions that reference columns from both tables. Table references
+#' are specified using `x$column` and `y$column` syntax to disambiguate
+#' columns from the left and right tables.
+#'
+#' @param ... Expressions specifying join conditions. These should be
+#'   comparison expressions (e.g., `x$id == y$id`, `x$value > y$threshold`)
+#'   or spatial predicate expressions
+#'   (e.g., `st_intersects(x$geometry, y$geometry)`).
+#'   Multiple conditions are combined with AND.
+#'
+#' @returns An object of class `sedonadb_join_by` containing the unevaluated
+#'   join condition expressions.
+#' @export
+#'
+#' @examples
+#' # Equality join on id column
+#' sd_join_by(x$id == y$id)
+#'
+#' # Multiple conditions (combined with AND)
+#' sd_join_by(x$id == y$id, x$date >= y$start_date)
+#'
+#' # Inequality join
+#' sd_join_by(x$value > y$threshold)
+#'
+sd_join_by <- function(...) {
+  exprs <- rlang::enquos(...)
+
+  if (length(exprs) == 0) {
+    stop("sd_join_by() requires at least one join condition")
+  }
+
+  structure(
+    list(
+      exprs = exprs
+    ),
+    class = "sedonadb_join_by"
+  )
+}
+
+#' @export
+print.sedonadb_join_by <- function(x, ...) {
+  cat("<sedonadb_join_by>\n")
+  for (i in seq_along(x$exprs)) {
+    cat("  ", rlang::expr_deparse(rlang::quo_get_expr(x$exprs[[i]])), "\n", sep = "")
+  }
+  invisible(x)
+}
+
+#' Expression evaluation context for joins
+#'
+#' Creates a context for evaluating join conditions that can reference columns
+#' from two tables using qualified references (`x$col` and `y$col`).
+#'
+#' @param x_schema Schema for the left table
+#' @param y_schema Schema for the right table
+#' @param env The expression environment
+#' @param ctx A SedonaDB context
+#' @param x_qualifier Qualifier for left table columns (default "x")
+#' @param y_qualifier Qualifier for right table columns (default "y")

Review Comment:
   The roxygen block for `sd_join_expr_ctx()` documents 
`x_qualifier`/`y_qualifier` parameters, but the function signature doesn’t 
accept them and the qualifiers are hard-coded. Either remove these parameters 
from the documentation or add them to the function signature so the docs match 
the actual API (even if it’s internal/noRd).
   ```suggestion
   
   ```



##########
r/sedonadb/R/dataframe.R:
##########
@@ -492,8 +492,78 @@ sd_summarise <- function(.data, ...) {
 
 #' @rdname sd_summarise
 #' @export
-sd_summarize <- function(.data, ...) {
-  sd_summarise(.data, ...)
+sd_summarize <- function(.data, ..., .env = parent.frame()) {
+  sd_summarise(.data, ..., .env = .env)
+}
+
+#' Join two SedonaDB DataFrames
+#'
+#' Perform a join operation between two dataframes. Use [sd_join_by()] to
+#' specify join conditions using `x$column` and `y$column` syntax to
+#' reference columns from the left and right tables respectively.
+#'
+#' @param x The left dataframe
+#' @param y The right dataframe (will use the same context as x)
+#' @param by A `sedonadb_join_by` object from [sd_join_by()], or `NULL` for
+#'   a natural join on columns with matching names.

Review Comment:
   `sd_build_join_conditions()` supports `by` as a character vector (including 
named mappings like `c(x_val = "y_val")`), but the `sd_join()` roxygen/docs 
currently describe `by` as only `sd_join_by()` or `NULL`. Either document the 
character-vector form (so users know it’s supported) or explicitly reject it in 
`sd_join()` to keep the public API aligned with the docs.
   ```suggestion
   #' @param by Join specification. One of:
   #'   - A `sedonadb_join_by` object from [sd_join_by()]
   #'   - A character vector of column names to join on in both tables
   #'   - A named character vector mapping left-table column names to
   #'     right-table column names, e.g. `c(x_val = "y_val")`
   #'   - `NULL` for a natural join on columns with matching names
   ```



##########
r/sedonadb/R/dataframe.R:
##########
@@ -492,8 +492,78 @@ sd_summarise <- function(.data, ...) {
 
 #' @rdname sd_summarise
 #' @export
-sd_summarize <- function(.data, ...) {
-  sd_summarise(.data, ...)
+sd_summarize <- function(.data, ..., .env = parent.frame()) {
+  sd_summarise(.data, ..., .env = .env)
+}
+
+#' Join two SedonaDB DataFrames
+#'
+#' Perform a join operation between two dataframes. Use [sd_join_by()] to
+#' specify join conditions using `x$column` and `y$column` syntax to
+#' reference columns from the left and right tables respectively.
+#'
+#' @param x The left dataframe
+#' @param y The right dataframe (will use the same context as x)
+#' @param by A `sedonadb_join_by` object from [sd_join_by()], or `NULL` for
+#'   a natural join on columns with matching names.
+#' @param join_type The type of join to perform. One of "inner", "left", "right",
+#'   "full", "leftsemi", "rightsemi", "leftanti", "rightanti", "leftmark",
+#'   or "rightmark".
+#' @param select Post-join column selection. One of
+#'   - `NULL` for no modification, which may result in duplicate (unqualified)
+#'     column names. The column may still be
+#'     referred to with a qualifier in advanced usage using [sd_expr_column()].
+#'   - [sd_join_select_default()] for dplyr-like behaviour (equi-join keys
+#'     removed, intersecting names suffixed)
+#'   - [sd_join_select()] for a custom selection
+#'
+#' @returns An object of class sedonadb_dataframe
+#' @export
+#'
+#' @examples
+#' df1 <- data.frame(x = letters[1:10], y = 1:10)
+#' df2 <- data.frame(y = 10:1, z = LETTERS[1:10])
+#' df1 |> sd_join(df2)
+#'
+sd_join <- function(
+  x,
+  y,
+  by = NULL,
+  join_type = "inner",
+  select = sd_join_select_default()
+) {
+  x <- as_sedonadb_dataframe(x)
+  y <- as_sedonadb_dataframe(y, ctx = x$ctx)
+
+  x_schema <- infer_nanoarrow_schema(x)
+  y_schema <- infer_nanoarrow_schema(y)
+  join_expr_ctx <- sd_join_expr_ctx(x_schema, y_schema, ctx = x$ctx)
+  join_conditions <- sd_build_join_conditions(join_expr_ctx, by, ctx = x$ctx)
+
+  df <- x$df$join(y$df, join_conditions, join_type, left_alias = "x", right_alias = "y")
+  out <- new_sedonadb_dataframe(x$ctx, df)

Review Comment:
   `sd_join()` passes `join_type` straight through to Rust where it’s parsed 
into `datafusion_expr::JoinType`. To make the R API more robust and provide 
clearer errors, validate `join_type` on the R side (type check + restrict to 
the documented set) and error with a user-facing message before calling into 
Rust.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat(r/sedonadb): Add join expression evaluation [sedona-db]

Reply via email to