paleolimbot commented on code in PR #832:
URL: https://github.com/apache/sedona-db/pull/832#discussion_r3216272711


##########
python/sedonadb/python/sedonadb/dataframe.py:
##########
@@ -85,6 +85,50 @@ def head(self, n: int = 5) -> "DataFrame":
         """
         return self.limit(n)
 
+    def select(self, *exprs) -> "DataFrame":
+        """Project a set of columns or expressions.
+
+        Returns a new lazy `DataFrame` whose columns are exactly the
+        projection. Column-name strings are converted to column references
+        via `sedonadb.expr.col` internally, so `df.select("x", "y")` and
+        `df.select(col("x"), col("y"))` produce the same plan.
+
+        Args:
+            *exprs: One or more arguments. Each argument is either a column
+                name (`str`) or a `sedonadb.expr.Expr`. At least one argument
+                is required.
+
+        Examples:
+
+            >>> from sedonadb.expr import col
+            >>> sd = sedona.db.connect()
+            >>> df = sd.sql("SELECT 1 AS a, 2 AS b")
+            >>> df.select("a", (col("b") + 1).alias("b_plus_1")).show()
+            ┌───────┬──────────┐
+            │   a   │ b_plus_1 │
+            │ int64 │   int64  │
+            ╞═══════╪══════════╡
+            │     1 │        3 │
+            └───────┴──────────┘
+        """
+        from sedonadb.expr import Expr
+        from sedonadb.expr import col as _col
+
+        if not exprs:
+            raise ValueError("select() requires at least one column or 
expression")
+
+        coerced = []
+        for e in exprs:
+            if isinstance(e, Expr):
+                coerced.append(e._impl)
+            elif isinstance(e, str):
+                coerced.append(_col(e)._impl)
+            else:
+                raise TypeError(
+                    f"select() expects str or Expr arguments, got 
{type(e).__name__}"
+                )

Review Comment:
   Literals are missing here: we can check `isinstance(e, Literal)` (so that 
literals wrapped in `lit()` work).
   
   I think it will also be useful to ensure that the error that occurs when 
`str` is an invalid column lists the valid columns (DataFusion will do this for 
you in SQL and the error we get might already include it but also might not 
when calling via the DataFrame API).



##########
python/sedonadb/tests/expr/test_dataframe_select.py:
##########
@@ -0,0 +1,106 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Tests for DataFrame.select(). Output is materialized to an Arrow table
+# and asserted with exact `column_names` and `to_pylist()` comparisons —
+# substring or partial-match assertions are deliberately avoided so the
+# tests fail loudly on any change in projection semantics.
+
+import pytest
+
+import sedonadb
+from sedonadb.expr import col
+
+
[email protected]
+def sd():
+    return sedonadb.connect()
+
+
[email protected]
+def df_xy(sd):
+    return sd.sql(
+        "SELECT * FROM (VALUES (1, 10), (2, 20), (3, 30), (4, 40)) AS t(x, y)"
+    )
+
+
+def test_select_by_string(df_xy):
+    out = df_xy.select("x").to_arrow_table()
+    assert out.column_names == ["x"]
+    assert out.column("x").to_pylist() == [1, 2, 3, 4]
+
+
+def test_select_multiple_strings(df_xy):
+    out = df_xy.select("x", "y").to_arrow_table()
+    assert out.column_names == ["x", "y"]
+    assert out.column("x").to_pylist() == [1, 2, 3, 4]
+    assert out.column("y").to_pylist() == [10, 20, 30, 40]
+
+
+def test_select_reorder_columns(df_xy):
+    out = df_xy.select("y", "x").to_arrow_table()
+    assert out.column_names == ["y", "x"]
+
+
+def test_select_by_col_expr(df_xy):
+    out = df_xy.select(col("x")).to_arrow_table()
+    assert out.column_names == ["x"]
+    assert out.column("x").to_pylist() == [1, 2, 3, 4]
+
+
+def test_select_arithmetic_expr(df_xy):
+    out = df_xy.select((col("x") + col("y")).alias("sum")).to_arrow_table()
+    assert out.column_names == ["sum"]
+    assert out.column("sum").to_pylist() == [11, 22, 33, 44]
+
+
+def test_select_mix_strings_and_exprs(df_xy):
+    out = df_xy.select("x", (col("y") * 2).alias("y2")).to_arrow_table()
+    assert out.column_names == ["x", "y2"]
+    assert out.column("y2").to_pylist() == [20, 40, 60, 80]
+
+
+def test_select_literal_via_isin_path(df_xy):
+    # No public lit() -> Expr in this PR; literals come in via operator
+    # coercion (and inside isin). This test exercises a literal-on-the-LHS
+    # arithmetic operation, where the int is auto-coerced to Expr.
+    out = df_xy.select((col("x") * 0 + 7).alias("seven")).to_arrow_table()
+    assert out.column("seven").to_pylist() == [7, 7, 7, 7]
+
+
+def test_select_returns_lazy_dataframe(df_xy):
+    out = df_xy.select("x")
+    # Plan should be lazy until materialization.
+    assert hasattr(out, "to_arrow_table")
+
+

Review Comment:
   I think you can skip this one
   
   ```suggestion
   ```



##########
python/sedonadb/tests/expr/test_dataframe_select.py:
##########
@@ -0,0 +1,106 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Tests for DataFrame.select(). Output is materialized to an Arrow table
+# and asserted with exact `column_names` and `to_pylist()` comparisons —
+# substring or partial-match assertions are deliberately avoided so the
+# tests fail loudly on any change in projection semantics.
+
+import pytest
+
+import sedonadb
+from sedonadb.expr import col
+
+
[email protected]
+def sd():
+    return sedonadb.connect()
+
+
[email protected]
+def df_xy(sd):
+    return sd.sql(
+        "SELECT * FROM (VALUES (1, 10), (2, 20), (3, 30), (4, 40)) AS t(x, y)"
+    )

Review Comment:
   I know it seems repetitive, but it's worth the one extra line to keep all of 
the test context within the `test_` function for the day you have to debug a 
random failure:
   
   ```python
   df = sd.create_data_frame(pd.DataFrame({"x": [1, 2, 3]})
   ```



##########
python/sedonadb/tests/expr/test_dataframe_select.py:
##########
@@ -0,0 +1,106 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Tests for DataFrame.select(). Output is materialized to an Arrow table
+# and asserted with exact `column_names` and `to_pylist()` comparisons —
+# substring or partial-match assertions are deliberately avoided so the
+# tests fail loudly on any change in projection semantics.
+
+import pytest
+
+import sedonadb
+from sedonadb.expr import col
+
+
[email protected]
+def sd():
+    return sedonadb.connect()
+
+
[email protected]
+def df_xy(sd):
+    return sd.sql(
+        "SELECT * FROM (VALUES (1, 10), (2, 20), (3, 30), (4, 40)) AS t(x, y)"
+    )
+
+
+def test_select_by_string(df_xy):
+    out = df_xy.select("x").to_arrow_table()
+    assert out.column_names == ["x"]
+    assert out.column("x").to_pylist() == [1, 2, 3, 4]
+
+
+def test_select_multiple_strings(df_xy):
+    out = df_xy.select("x", "y").to_arrow_table()
+    assert out.column_names == ["x", "y"]
+    assert out.column("x").to_pylist() == [1, 2, 3, 4]
+    assert out.column("y").to_pylist() == [10, 20, 30, 40]
+
+
+def test_select_reorder_columns(df_xy):
+    out = df_xy.select("y", "x").to_arrow_table()
+    assert out.column_names == ["y", "x"]
+
+
+def test_select_by_col_expr(df_xy):
+    out = df_xy.select(col("x")).to_arrow_table()
+    assert out.column_names == ["x"]
+    assert out.column("x").to_pylist() == [1, 2, 3, 4]
+
+
+def test_select_arithmetic_expr(df_xy):
+    out = df_xy.select((col("x") + col("y")).alias("sum")).to_arrow_table()
+    assert out.column_names == ["sum"]
+    assert out.column("sum").to_pylist() == [11, 22, 33, 44]
+
+
+def test_select_mix_strings_and_exprs(df_xy):
+    out = df_xy.select("x", (col("y") * 2).alias("y2")).to_arrow_table()
+    assert out.column_names == ["x", "y2"]
+    assert out.column("y2").to_pylist() == [20, 40, 60, 80]
+
+
+def test_select_literal_via_isin_path(df_xy):
+    # No public lit() -> Expr in this PR; literals come in via operator
+    # coercion (and inside isin). This test exercises a literal-on-the-LHS
+    # arithmetic operation, where the int is auto-coerced to Expr.

Review Comment:
   I think there's no problem exposing `lit()` and accepting it here?



##########
python/sedonadb/tests/expr/test_dataframe_select.py:
##########
@@ -0,0 +1,106 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Tests for DataFrame.select(). Output is materialized to an Arrow table
+# and asserted with exact `column_names` and `to_pylist()` comparisons —
+# substring or partial-match assertions are deliberately avoided so the
+# tests fail loudly on any change in projection semantics.
+
+import pytest
+
+import sedonadb
+from sedonadb.expr import col
+
+
[email protected]
+def sd():
+    return sedonadb.connect()

Review Comment:
   We already have this (the fixture is called `con`, because it predates 
inventing of `sd = sedona.db.connect()` 🙂 )



##########
python/sedonadb/tests/expr/test_dataframe_select.py:
##########
@@ -0,0 +1,106 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Tests for DataFrame.select(). Output is materialized to an Arrow table
+# and asserted with exact `column_names` and `to_pylist()` comparisons —
+# substring or partial-match assertions are deliberately avoided so the
+# tests fail loudly on any change in projection semantics.
+
+import pytest
+
+import sedonadb
+from sedonadb.expr import col
+
+
[email protected]
+def sd():
+    return sedonadb.connect()
+
+
[email protected]
+def df_xy(sd):
+    return sd.sql(
+        "SELECT * FROM (VALUES (1, 10), (2, 20), (3, 30), (4, 40)) AS t(x, y)"
+    )
+
+
+def test_select_by_string(df_xy):
+    out = df_xy.select("x").to_arrow_table()
+    assert out.column_names == ["x"]
+    assert out.column("x").to_pylist() == [1, 2, 3, 4]

Review Comment:
   Unless there is a good reason not to, use 
`pd.testing.assert_dataframe_equal()` for these (the failures are generally 
very informative). GeoPandas has a similar API for the GeoDataFrame.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to