Re: [PR] feat(python/sedonadb): add DataFrame.select [sedona-db]

via GitHub Sun, 10 May 2026 20:30:34 -0700


Copilot commented on code in PR #832:
URL: https://github.com/apache/sedona-db/pull/832#discussion_r3216204672



##########
python/sedonadb/tests/expr/test_dataframe_select.py:
##########
@@ -0,0 +1,106 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Tests for DataFrame.select(). Output is materialized to an Arrow table
+# and asserted with exact `column_names` and `to_pylist()` comparisons —
+# substring or partial-match assertions are deliberately avoided so the
+# tests fail loudly on any change in projection semantics.
+
+import pytest
+
+import sedonadb
+from sedonadb.expr import col
+
+
[email protected]
+def sd():
+    return sedonadb.connect()
+
+
[email protected]
+def df_xy(sd):
+    return sd.sql(

Review Comment:
   This test module creates a fresh `SedonaContext` via `sedonadb.connect()` 
for each test. The rest of the Python test suite generally reuses the shared 
`con` fixture from `tests/conftest.py`, which avoids repeated engine 
initialization and speeds up CI. Consider switching these fixtures to accept 
`con` and build `df_xy` from it instead of defining a new `sd` fixture here.
   



##########
python/sedonadb/tests/expr/test_dataframe_select.py:
##########
@@ -0,0 +1,106 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Tests for DataFrame.select(). Output is materialized to an Arrow table
+# and asserted with exact `column_names` and `to_pylist()` comparisons —
+# substring or partial-match assertions are deliberately avoided so the
+# tests fail loudly on any change in projection semantics.
+
+import pytest
+
+import sedonadb
+from sedonadb.expr import col
+
+
[email protected]
+def sd():
+    return sedonadb.connect()
+
+
[email protected]
+def df_xy(sd):
+    return sd.sql(
+        "SELECT * FROM (VALUES (1, 10), (2, 20), (3, 30), (4, 40)) AS t(x, y)"
+    )
+
+
+def test_select_by_string(df_xy):
+    out = df_xy.select("x").to_arrow_table()
+    assert out.column_names == ["x"]
+    assert out.column("x").to_pylist() == [1, 2, 3, 4]
+
+
+def test_select_multiple_strings(df_xy):
+    out = df_xy.select("x", "y").to_arrow_table()
+    assert out.column_names == ["x", "y"]
+    assert out.column("x").to_pylist() == [1, 2, 3, 4]
+    assert out.column("y").to_pylist() == [10, 20, 30, 40]
+
+
+def test_select_reorder_columns(df_xy):
+    out = df_xy.select("y", "x").to_arrow_table()
+    assert out.column_names == ["y", "x"]
+
+
+def test_select_by_col_expr(df_xy):
+    out = df_xy.select(col("x")).to_arrow_table()
+    assert out.column_names == ["x"]
+    assert out.column("x").to_pylist() == [1, 2, 3, 4]
+
+
+def test_select_arithmetic_expr(df_xy):
+    out = df_xy.select((col("x") + col("y")).alias("sum")).to_arrow_table()
+    assert out.column_names == ["sum"]
+    assert out.column("sum").to_pylist() == [11, 22, 33, 44]
+
+
+def test_select_mix_strings_and_exprs(df_xy):
+    out = df_xy.select("x", (col("y") * 2).alias("y2")).to_arrow_table()
+    assert out.column_names == ["x", "y2"]
+    assert out.column("y2").to_pylist() == [20, 40, 60, 80]
+
+
+def test_select_literal_via_isin_path(df_xy):
+    # No public lit() -> Expr in this PR; literals come in via operator
+    # coercion (and inside isin). This test exercises a literal-on-the-LHS
+    # arithmetic operation, where the int is auto-coerced to Expr.
+    out = df_xy.select((col("x") * 0 + 7).alias("seven")).to_arrow_table()
+    assert out.column("seven").to_pylist() == [7, 7, 7, 7]
+
+
+def test_select_returns_lazy_dataframe(df_xy):
+    out = df_xy.select("x")
+    # Plan should be lazy until materialization.
+    assert hasattr(out, "to_arrow_table")
+
+
+def test_select_empty_raises(df_xy):
+    with pytest.raises(ValueError, match="at least one"):
+        df_xy.select()
+
+
+def test_select_bad_arg_type_raises(df_xy):
+    with pytest.raises(TypeError, match="str or Expr"):
+        df_xy.select(123)
+
+
+def test_select_unknown_column_errors_at_plan_build(df_xy):
+    # DataFusion validates column references at plan-build time. The Expr
+    # itself is unbound (col("nonexistent") alone is fine), but selecting
+    # it against a frame that doesn't have that column fails immediately.
+    with pytest.raises(Exception, match="nonexistent"):

Review Comment:
   `df_xy.select(col("nonexistent"))` should raise `sedonadb._lib.SedonaError` 
(DataFusion errors are mapped to this type). Catching a bare `Exception` can 
hide unrelated failures and makes the test less specific; prefer asserting the 
concrete exception type used elsewhere in the suite.
   



##########
python/sedonadb/python/sedonadb/dataframe.py:
##########
@@ -85,6 +85,50 @@ def head(self, n: int = 5) -> "DataFrame":
         """
         return self.limit(n)
 
+    def select(self, *exprs) -> "DataFrame":
+        """Project a set of columns or expressions.
+

Review Comment:
   `select` is a new public API and currently leaves `*exprs` untyped. Since 
this module already uses type annotations elsewhere, consider annotating the 
varargs (e.g. `*exprs: Union[str, Expr]`) to improve editor/IDE support and 
static checking without changing runtime behavior.



##########
python/sedonadb/tests/expr/test_dataframe_select.py:
##########
@@ -0,0 +1,106 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Tests for DataFrame.select(). Output is materialized to an Arrow table
+# and asserted with exact `column_names` and `to_pylist()` comparisons —
+# substring or partial-match assertions are deliberately avoided so the
+# tests fail loudly on any change in projection semantics.
+
+import pytest
+
+import sedonadb
+from sedonadb.expr import col
+
+
[email protected]
+def sd():
+    return sedonadb.connect()
+
+
[email protected]
+def df_xy(sd):
+    return sd.sql(
+        "SELECT * FROM (VALUES (1, 10), (2, 20), (3, 30), (4, 40)) AS t(x, y)"
+    )
+
+
+def test_select_by_string(df_xy):
+    out = df_xy.select("x").to_arrow_table()
+    assert out.column_names == ["x"]
+    assert out.column("x").to_pylist() == [1, 2, 3, 4]
+
+
+def test_select_multiple_strings(df_xy):
+    out = df_xy.select("x", "y").to_arrow_table()
+    assert out.column_names == ["x", "y"]
+    assert out.column("x").to_pylist() == [1, 2, 3, 4]
+    assert out.column("y").to_pylist() == [10, 20, 30, 40]
+
+
+def test_select_reorder_columns(df_xy):
+    out = df_xy.select("y", "x").to_arrow_table()
+    assert out.column_names == ["y", "x"]
+
+
+def test_select_by_col_expr(df_xy):
+    out = df_xy.select(col("x")).to_arrow_table()
+    assert out.column_names == ["x"]
+    assert out.column("x").to_pylist() == [1, 2, 3, 4]
+
+
+def test_select_arithmetic_expr(df_xy):
+    out = df_xy.select((col("x") + col("y")).alias("sum")).to_arrow_table()
+    assert out.column_names == ["sum"]
+    assert out.column("sum").to_pylist() == [11, 22, 33, 44]
+
+
+def test_select_mix_strings_and_exprs(df_xy):
+    out = df_xy.select("x", (col("y") * 2).alias("y2")).to_arrow_table()
+    assert out.column_names == ["x", "y2"]
+    assert out.column("y2").to_pylist() == [20, 40, 60, 80]
+
+
+def test_select_literal_via_isin_path(df_xy):
+    # No public lit() -> Expr in this PR; literals come in via operator
+    # coercion (and inside isin). This test exercises a literal-on-the-LHS
+    # arithmetic operation, where the int is auto-coerced to Expr.
+    out = df_xy.select((col("x") * 0 + 7).alias("seven")).to_arrow_table()

Review Comment:
   The test name/comment mention the `isin` path and a literal on the LHS, but 
the expression `(col("x") * 0 + 7)` doesn’t call `isin` and only exercises RHS 
scalar coercion. Either rename/update the comment to match what’s being tested, 
or change the expression so it actually covers the reflected operator path 
(literal-on-LHS) if that’s the intent.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat(python/sedonadb): add DataFrame.select [sedona-db]

Reply via email to