paleolimbot commented on code in PR #807: URL: https://github.com/apache/sedona-db/pull/807#discussion_r3177469050
########## python/sedonadb/src/expr.rs: ########## @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Python-facing expression wrapper. +//! +//! This module is the Rust half of `sedonadb.expr.Expr`. It exposes a thin +//! PyO3 class (`PyExpr`, exported to Python as `_lib.InternalExpr`) that owns +//! a single `datafusion_expr::Expr` and a small set of factory `#[pyfunction]`s +//! used by the Python wrapper to construct columns and literals. +//! +//! The high-level shape: +//! +//! - The Python side (`sedonadb.expr.Expr`) holds a handle to a `PyExpr` and +//! provides operator overloading, docstrings, and Pythonic ergonomics. +//! - This Rust side stays minimal: build the right `Expr` variant, do the +//! minimum amount of validation that benefits from native types (e.g. +//! inspecting Arrow extension metadata for `cast`), and bubble up errors +//! via `PySedonaError`. +//! +//! The design mirrors the equivalent layer in the R bindings +//! (`r/sedonadb/src/rust/src/expression.rs`), where the Python `Expr` plays the +//! role of `SedonaDBExpr` and `expr_col` / `expr_lit` mirror +//! `SedonaDBExprFactory::column` / `::literal`. 
+ +use datafusion_common::Column; +use datafusion_expr::{expr::InList, Cast, Expr}; +use pyo3::prelude::*; + +use crate::error::PySedonaError; +use crate::import_from::{import_arrow_field, import_arrow_scalar}; + +/// PyO3 wrapper around a single DataFusion logical expression. +/// +/// `PyExpr` is exposed to Python as `_lib.InternalExpr` and is the value +/// type behind the user-facing `sedonadb.expr.Expr` Python class. It is +/// `Clone` so that PyO3 method signatures can take `Vec<PyExpr>` by value; +/// each clone is cheap because `Expr` itself owns its children behind `Box`, +/// and cloning copies only the small wrapper plus a few `Arc`s in the +/// underlying tree. +/// +/// The `inner` field is intentionally `pub` so that other PyO3 modules in +/// this crate (e.g. `dataframe.rs`) can take `Vec<PyExpr>` arguments and +/// move the inner `Expr` out without going through accessor methods. +#[pyclass(name = "InternalExpr")] +#[derive(Clone)] +pub struct PyExpr { + pub inner: Expr, +} + +impl PyExpr { + pub fn new(inner: Expr) -> Self { + Self { inner } + } +} + +#[pymethods] +impl PyExpr { + /// Pretty-print using DataFusion's `Display` impl (e.g. `"x AS y"`). + /// Used by Python's `repr()` / `str()` paths. + fn __repr__(&self) -> String { + format!("{}", self.inner) + } + + /// Debug-print the full Rust `Expr` tree. Useful for tests and + /// troubleshooting; not part of the user-facing surface. + fn debug_string(&self) -> String { + format!("{:?}", self.inner) + } + + /// Return the name of the `Expr` enum variant (e.g. `"Column"`, + /// `"Literal"`, `"BinaryExpr"`). This is a stable structural property + /// that tests can assert on without depending on Display formatting. + fn variant_name(&self) -> String { + self.inner.variant_name().to_string() + } + + /// Wrap this expression in `Expr::Alias { name }`. + /// + /// We use DataFusion's `alias_if_changed` helper so that aliasing an + /// expression to its existing name is a no-op. 
This avoids producing + /// e.g. `Expr::Alias("x", Expr::Column("x"))` which is redundant but + /// otherwise legal. + fn alias(&self, name: &str) -> Result<Self, PySedonaError> { + let inner = self.inner.clone().alias_if_changed(name.to_string())?; + Ok(Self { inner }) + } + + /// Wrap this expression in `Expr::Cast` to the storage type of the + /// provided Arrow field-like object. + /// + /// Steps: + /// + /// 1. Pull an `arrow_schema::Field` out of `target` via the Arrow + /// PyCapsule schema interface (any object exposing + /// `__arrow_c_schema__` works — `pyarrow.DataType`, + /// `pyarrow.Field`, etc.). + /// 2. Reject Arrow extension types up front. SedonaDB's spatial types + /// are extension types over WKB; users who reach for `cast` on + /// those almost certainly want a different operation, so we surface + /// a clear error rather than silently dropping the extension. + /// 3. Build `Expr::Cast { expr, data_type }` using only the storage + /// type. We don't carry field metadata through the cast. + fn cast(&self, target: Bound<'_, PyAny>) -> Result<Self, PySedonaError> { + let field = import_arrow_field(&target)?; + if let Some(type_name) = field.extension_type_name() { + return Err(PySedonaError::SedonaPython(format!( + "Can't cast to Arrow extension type '{type_name}'" + ))); + } + let inner = Expr::Cast(Cast::new( + Box::new(self.inner.clone()), + field.data_type().clone(), + )); + Ok(Self { inner }) + } + + /// Build `Expr::IsNull(self)`. SQL semantics — matches NULL only, + /// not floating-point NaN. The Pythonic NaN-aware accessor will live + /// on the future `Series` type. + fn is_null(&self) -> Self { + Self { + inner: Expr::IsNull(Box::new(self.inner.clone())), + } + } + + /// Build `Expr::IsNotNull(self)`. SQL semantics, mirror of `is_null`. + fn is_not_null(&self) -> Self { + Self { + inner: Expr::IsNotNull(Box::new(self.inner.clone())), + } + } + + /// Build `Expr::InList(self IN (values), negated)`. 
+ /// + /// The Python side already coerces every element of the user's list + /// to a `PyExpr` (using `lit()` for non-Expr scalars), so the Rust + /// side just needs to clone each `Expr` out of its handle and hand + /// the resulting `Vec<Expr>` to `InList::new`. `negated=true` + /// produces `NOT IN`, exposed through the optional kwarg below. + #[pyo3(signature = (values, negated=false))] + fn isin(&self, values: Vec<PyRef<'_, PyExpr>>, negated: bool) -> Self { + let list = values.iter().map(|e| e.inner.clone()).collect(); + Self { + inner: Expr::InList(InList::new(Box::new(self.inner.clone()), list, negated)), + } + } + + /// Build `Expr::Negative(self)` — arithmetic negation, the unary `-`. + fn negate(&self) -> Self { + Self { + inner: Expr::Negative(Box::new(self.inner.clone())), + } + } +} + +/// Construct an unqualified column reference: `Expr::Column("x")`. +/// +/// Qualified columns (e.g. `t.x`) are not exposed yet; the Python +/// `col()` helper takes only a single name. When we add joins and +/// multi-table references we can grow this to accept an optional +/// table qualifier, matching the R side's `column(name, qualifier)`. +#[pyfunction] +pub fn expr_col(name: &str) -> PyExpr { Review Comment: If it's easy you can probably just add a qualifier argument here while you're at it ########## python/sedonadb/src/expr.rs: ########## @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Python-facing expression wrapper. +//! +//! This module is the Rust half of `sedonadb.expr.Expr`. It exposes a thin +//! PyO3 class (`PyExpr`, exported to Python as `_lib.InternalExpr`) that owns +//! a single `datafusion_expr::Expr` and a small set of factory `#[pyfunction]`s +//! used by the Python wrapper to construct columns and literals. +//! +//! The high-level shape: +//! +//! - The Python side (`sedonadb.expr.Expr`) holds a handle to a `PyExpr` and +//! provides operator overloading, docstrings, and Pythonic ergonomics. +//! - This Rust side stays minimal: build the right `Expr` variant, do the +//! minimum amount of validation that benefits from native types (e.g. +//! inspecting Arrow extension metadata for `cast`), and bubble up errors +//! via `PySedonaError`. +//! +//! The design mirrors the equivalent layer in the R bindings +//! (`r/sedonadb/src/rust/src/expression.rs`), where the Python `Expr` plays the +//! role of `SedonaDBExpr` and `expr_col` / `expr_lit` mirror +//! `SedonaDBExprFactory::column` / `::literal`. + +use datafusion_common::Column; +use datafusion_expr::{expr::InList, Cast, Expr}; +use pyo3::prelude::*; + +use crate::error::PySedonaError; +use crate::import_from::{import_arrow_field, import_arrow_scalar}; + +/// PyO3 wrapper around a single DataFusion logical expression. +/// +/// `PyExpr` is exposed to Python as `_lib.InternalExpr` and is the value +/// type behind the user-facing `sedonadb.expr.Expr` Python class. 
It is +/// `Clone` so that PyO3 method signatures can take `Vec<PyExpr>` by value; +/// each clone is cheap because `Expr` itself owns its children behind `Box`, +/// and cloning copies only the small wrapper plus a few `Arc`s in the +/// underlying tree. +/// +/// The `inner` field is intentionally `pub` so that other PyO3 modules in +/// this crate (e.g. `dataframe.rs`) can take `Vec<PyExpr>` arguments and +/// move the inner `Expr` out without going through accessor methods. +#[pyclass(name = "InternalExpr")] +#[derive(Clone)] +pub struct PyExpr { + pub inner: Expr, +} + +impl PyExpr { + pub fn new(inner: Expr) -> Self { + Self { inner } + } +} + +#[pymethods] +impl PyExpr { + /// Pretty-print using DataFusion's `Display` impl (e.g. `"x AS y"`). + /// Used by Python's `repr()` / `str()` paths. + fn __repr__(&self) -> String { + format!("{}", self.inner) + } + + /// Debug-print the full Rust `Expr` tree. Useful for tests and + /// troubleshooting; not part of the user-facing surface. + fn debug_string(&self) -> String { + format!("{:?}", self.inner) + } + + /// Return the name of the `Expr` enum variant (e.g. `"Column"`, + /// `"Literal"`, `"BinaryExpr"`). This is a stable structural property + /// that tests can assert on without depending on Display formatting. + fn variant_name(&self) -> String { + self.inner.variant_name().to_string() + } + + /// Wrap this expression in `Expr::Alias { name }`. + /// + /// We use DataFusion's `alias_if_changed` helper so that aliasing an + /// expression to its existing name is a no-op. This avoids producing + /// e.g. `Expr::Alias("x", Expr::Column("x"))` which is redundant but + /// otherwise legal. + fn alias(&self, name: &str) -> Result<Self, PySedonaError> { + let inner = self.inner.clone().alias_if_changed(name.to_string())?; + Ok(Self { inner }) + } + + /// Wrap this expression in `Expr::Cast` to the storage type of the + /// provided Arrow field-like object. + /// + /// Steps: + /// + /// 1. 
Pull an `arrow_schema::Field` out of `target` via the Arrow + /// PyCapsule schema interface (any object exposing + /// `__arrow_c_schema__` works — `pyarrow.DataType`, + /// `pyarrow.Field`, etc.). + /// 2. Reject Arrow extension types up front. SedonaDB's spatial types + /// are extension types over WKB; users who reach for `cast` on + /// those almost certainly want a different operation, so we surface + /// a clear error rather than silently dropping the extension. Review Comment: 👍 (I added support for these in DataFusion 53 that we can transform using an optimizer rule to a scalar function call). We can support these before DataFusion 53 here if we need to by inserting the scalar function call directly (I may give this a go in the near future since it's maybe useful for geometry/geography). ########## python/sedonadb/python/sedonadb/expr/expression.py: ########## @@ -0,0 +1,116 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any, Iterable + +from sedonadb._lib import expr_col as _expr_col +from sedonadb._lib import expr_lit as _expr_lit +from sedonadb.expr.literal import Literal + + +class Expr: + """A column expression. 
+ + `Expr` represents a logical expression that will be evaluated against a + `DataFrame` when the frame is executed. Expressions are pure syntax — they + do not carry data and are not bound to a particular frame at construction + time. Errors such as referring to a column that does not exist surface only + when the expression is consumed (for example, by `DataFrame.select()` or + `DataFrame.filter()`). + + Construct an `Expr` with `col(name)` or `lit(value)`. + """ + + __slots__ = ("_impl",) + + def __init__(self, impl): + # impl is the underlying _lib.InternalExpr handle. Users normally + # do not construct Expr directly; use col() / lit() instead. + self._impl = impl + + def __repr__(self) -> str: + return f"Expr({self._impl!r})" + + def alias(self, name: str) -> "Expr": + """Return a copy of the expression with a new output name.""" + return Expr(self._impl.alias(name)) + + def cast(self, target) -> "Expr": + """Cast the expression to the given Arrow type. + + `target` must be an object exposing the Arrow C schema interface + (e.g. `pyarrow.int64()`, `pyarrow.string()`, a `pyarrow.Field`, or any + object with `__arrow_c_schema__`). Casting to Arrow extension types is + not supported. + """ + return Expr(self._impl.cast(target)) + + def is_null(self) -> "Expr": + """Return a boolean expression that is true where this expression + is SQL NULL. + + Note that floating-point NaN is *not* matched by `is_null` — the + SQL `IS NULL` predicate only matches NULL. A pandas-style + NaN-aware helper is planned on the future `Series` type. 
+ """ + return Expr(self._impl.is_null()) + + def is_not_null(self) -> "Expr": + """Return a boolean expression that is true where this expression is + not null.""" + return Expr(self._impl.is_not_null()) + + def isin(self, values: Iterable[Any]) -> "Expr": + """Return a boolean expression that is true where this expression + equals any of the given values.""" + coerced = [_to_expr(v) for v in values] + return Expr(self._impl.isin([e._impl for e in coerced], False)) + + def negate(self) -> "Expr": + """Return the arithmetic negation of this expression.""" + return Expr(self._impl.negate()) + + +def col(name: str) -> Expr: + """Reference a column by name. + + Examples: + >>> from sedonadb.expr import col + >>> col("x").alias("y") + Expr(...) + """ + return Expr(_expr_col(name)) + + +def lit(value: Any) -> Expr: + """Wrap a Python value as a literal expression. + + Accepts the same value types as `sedonadb.expr.literal.lit`, including + Python scalars, pyarrow arrays/scalars, and Shapely geometries. Returns an + `Expr` suitable for composition with column expressions. + """ Review Comment: FWIW there is already a `lit()` function that returns `Literal`. Literal is intentionally lazy so (the python wrappers around) different functions can interpret them differently if they want to (e.g., RS_Intersects() doesn't actually need to convert a whole rasterio object to an Arrow scalar, which is expensive, to compute a correct result). Purely theoretical, not currently used, and can change, but that's why it's like that 🙂 . Probably you can just fold this logic into `_to_expr()` and leave the existing `lit()` (which powers parameterized queries). ########## python/sedonadb/tests/expr/test_expression.py: ########## @@ -0,0 +1,147 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# These tests assert structural properties of constructed expressions — +# primarily `variant_name()`, child variants reachable via `debug_string()`, +# and the presence of user-supplied identifiers (column names, literal +# values) inside the rendered representation. Where possible we avoid +# pinning exact substrings of DataFusion's `Display` formatting so the +# suite is not coupled to a specific DataFusion version. + +import pyarrow as pa +import pytest + +from sedonadb.expr import Expr, col, lit + + +def test_col_returns_expr(): + e = col("x") + assert isinstance(e, Expr) + assert e._impl.variant_name() == "Column" + assert "x" in repr(e) Review Comment: Testing the exact repr output is probably easy to do and is a slightly better test ########## python/sedonadb/python/sedonadb/expr/expression.py: ########## @@ -0,0 +1,116 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any, Iterable + +from sedonadb._lib import expr_col as _expr_col +from sedonadb._lib import expr_lit as _expr_lit +from sedonadb.expr.literal import Literal + + +class Expr: + """A column expression. + + `Expr` represents a logical expression that will be evaluated against a + `DataFrame` when the frame is executed. Expressions are pure syntax — they + do not carry data and are not bound to a particular frame at construction + time. Errors such as referring to a column that does not exist surface only + when the expression is consumed (for example, by `DataFrame.select()` or + `DataFrame.filter()`). + + Construct an `Expr` with `col(name)` or `lit(value)`. + """ + + __slots__ = ("_impl",) + + def __init__(self, impl): + # impl is the underlying _lib.InternalExpr handle. Users normally + # do not construct Expr directly; use col() / lit() instead. + self._impl = impl + + def __repr__(self) -> str: + return f"Expr({self._impl!r})" + + def alias(self, name: str) -> "Expr": + """Return a copy of the expression with a new output name.""" + return Expr(self._impl.alias(name)) Review Comment: I forget if we have CI checks for this, but we have parameter docs and examples for most functions in the Python APIs ########## python/sedonadb/python/sedonadb/expr/expression.py: ########## @@ -0,0 +1,116 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Any, Iterable + +from sedonadb._lib import expr_col as _expr_col +from sedonadb._lib import expr_lit as _expr_lit +from sedonadb.expr.literal import Literal + + +class Expr: + """A column expression. + + `Expr` represents a logical expression that will be evaluated against a + `DataFrame` when the frame is executed. Expressions are pure syntax — they + do not carry data and are not bound to a particular frame at construction + time. Errors such as referring to a column that does not exist surface only + when the expression is consumed (for example, by `DataFrame.select()` or + `DataFrame.filter()`). + + Construct an `Expr` with `col(name)` or `lit(value)`. + """ + + __slots__ = ("_impl",) + + def __init__(self, impl): + # impl is the underlying _lib.InternalExpr handle. Users normally + # do not construct Expr directly; use col() / lit() instead. + self._impl = impl Review Comment: An `isinstance()` check here would be good so that this errors if used incorrectly -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
