jiayuasu commented on code in PR #832: URL: https://github.com/apache/sedona-db/pull/832#discussion_r3216293761
########## python/sedonadb/tests/expr/test_dataframe_select.py: ########## @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Tests for DataFrame.select(). Output is materialized to an Arrow table +# and asserted with exact `column_names` and `to_pylist()` comparisons — +# substring or partial-match assertions are deliberately avoided so the +# tests fail loudly on any change in projection semantics. + +import pytest + +import sedonadb +from sedonadb.expr import col + + +@pytest.fixture +def sd(): + return sedonadb.connect() + + +@pytest.fixture +def df_xy(sd): + return sd.sql( Review Comment: Done in 098393c1 — fixtures now consume the shared `con` fixture from `tests/conftest.py` and the local `sd` fixture is gone. ########## python/sedonadb/tests/expr/test_dataframe_select.py: ########## @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Tests for DataFrame.select(). Output is materialized to an Arrow table +# and asserted with exact `column_names` and `to_pylist()` comparisons — +# substring or partial-match assertions are deliberately avoided so the +# tests fail loudly on any change in projection semantics. + +import pytest + +import sedonadb +from sedonadb.expr import col + + +@pytest.fixture +def sd(): + return sedonadb.connect() + + +@pytest.fixture +def df_xy(sd): + return sd.sql( + "SELECT * FROM (VALUES (1, 10), (2, 20), (3, 30), (4, 40)) AS t(x, y)" + ) + + +def test_select_by_string(df_xy): + out = df_xy.select("x").to_arrow_table() + assert out.column_names == ["x"] + assert out.column("x").to_pylist() == [1, 2, 3, 4] + + +def test_select_multiple_strings(df_xy): + out = df_xy.select("x", "y").to_arrow_table() + assert out.column_names == ["x", "y"] + assert out.column("x").to_pylist() == [1, 2, 3, 4] + assert out.column("y").to_pylist() == [10, 20, 30, 40] + + +def test_select_reorder_columns(df_xy): + out = df_xy.select("y", "x").to_arrow_table() + assert out.column_names == ["y", "x"] + + +def test_select_by_col_expr(df_xy): + out = df_xy.select(col("x")).to_arrow_table() + assert out.column_names == ["x"] + assert out.column("x").to_pylist() == [1, 2, 3, 4] + + +def test_select_arithmetic_expr(df_xy): + out = df_xy.select((col("x") + col("y")).alias("sum")).to_arrow_table() + assert out.column_names == ["sum"] + assert out.column("sum").to_pylist() == [11, 22, 33, 44] + + +def test_select_mix_strings_and_exprs(df_xy): + out = df_xy.select("x", (col("y") * 
2).alias("y2")).to_arrow_table() + assert out.column_names == ["x", "y2"] + assert out.column("y2").to_pylist() == [20, 40, 60, 80] + + +def test_select_literal_via_isin_path(df_xy): + # No public lit() -> Expr in this PR; literals come in via operator + # coercion (and inside isin). This test exercises a literal-on-the-LHS + # arithmetic operation, where the int is auto-coerced to Expr. + out = df_xy.select((col("x") * 0 + 7).alias("seven")).to_arrow_table() + assert out.column("seven").to_pylist() == [7, 7, 7, 7] + + +def test_select_returns_lazy_dataframe(df_xy): + out = df_xy.select("x") + # Plan should be lazy until materialization. + assert hasattr(out, "to_arrow_table") + + +def test_select_empty_raises(df_xy): + with pytest.raises(ValueError, match="at least one"): + df_xy.select() + + +def test_select_bad_arg_type_raises(df_xy): + with pytest.raises(TypeError, match="str or Expr"): + df_xy.select(123) + + +def test_select_unknown_column_errors_at_plan_build(df_xy): + # DataFusion validates column references at plan-build time. The Expr + # itself is unbound (col("nonexistent") alone is fine), but selecting + # it against a frame that doesn't have that column fails immediately. + with pytest.raises(Exception, match="nonexistent"): Review Comment: Done in 098393c1 — `pytest.raises` now catches `sedonadb._lib.SedonaError` specifically. ########## python/sedonadb/tests/expr/test_dataframe_select.py: ########## @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Tests for DataFrame.select(). Output is materialized to an Arrow table +# and asserted with exact `column_names` and `to_pylist()` comparisons — +# substring or partial-match assertions are deliberately avoided so the +# tests fail loudly on any change in projection semantics. + +import pytest + +import sedonadb +from sedonadb.expr import col + + +@pytest.fixture +def sd(): + return sedonadb.connect() + + +@pytest.fixture +def df_xy(sd): + return sd.sql( + "SELECT * FROM (VALUES (1, 10), (2, 20), (3, 30), (4, 40)) AS t(x, y)" + ) + + +def test_select_by_string(df_xy): + out = df_xy.select("x").to_arrow_table() + assert out.column_names == ["x"] + assert out.column("x").to_pylist() == [1, 2, 3, 4] + + +def test_select_multiple_strings(df_xy): + out = df_xy.select("x", "y").to_arrow_table() + assert out.column_names == ["x", "y"] + assert out.column("x").to_pylist() == [1, 2, 3, 4] + assert out.column("y").to_pylist() == [10, 20, 30, 40] + + +def test_select_reorder_columns(df_xy): + out = df_xy.select("y", "x").to_arrow_table() + assert out.column_names == ["y", "x"] + + +def test_select_by_col_expr(df_xy): + out = df_xy.select(col("x")).to_arrow_table() + assert out.column_names == ["x"] + assert out.column("x").to_pylist() == [1, 2, 3, 4] + + +def test_select_arithmetic_expr(df_xy): + out = df_xy.select((col("x") + col("y")).alias("sum")).to_arrow_table() + assert out.column_names == ["sum"] + assert out.column("sum").to_pylist() == [11, 22, 33, 44] + + +def test_select_mix_strings_and_exprs(df_xy): + out = df_xy.select("x", (col("y") * 
2).alias("y2")).to_arrow_table() + assert out.column_names == ["x", "y2"] + assert out.column("y2").to_pylist() == [20, 40, 60, 80] + + +def test_select_literal_via_isin_path(df_xy): + # No public lit() -> Expr in this PR; literals come in via operator + # coercion (and inside isin). This test exercises a literal-on-the-LHS + # arithmetic operation, where the int is auto-coerced to Expr. + out = df_xy.select((col("x") * 0 + 7).alias("seven")).to_arrow_table() Review Comment: Renamed to `test_select_literal_via_operator_coercion` and rewrote the comment in 098393c1 to describe what's actually being exercised (right-hand scalar coercion through `_to_expr`), not the `isin` path the old name implied. ########## python/sedonadb/python/sedonadb/dataframe.py: ########## @@ -85,6 +85,50 @@ def head(self, n: int = 5) -> "DataFrame": """ return self.limit(n) + def select(self, *exprs) -> "DataFrame": + """Project a set of columns or expressions. + Review Comment: Done in 098393c1 — `*exprs: "Expr | str"` (string annotation so the forward reference works without restructuring imports). This also fixes the docs-and-deploy CI failure, which was griffe rejecting the untyped vararg under strict mode. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
