Re: [PR] Add missing Dataframe functions [datafusion-python]

via GitHub Mon, 06 Apr 2026 04:52:59 -0700


ntjohnson1 commented on code in PR #1472:
URL: 
https://github.com/apache/datafusion-python/pull/1472#discussion_r3039241492



##########
crates/core/src/dataframe.rs:
##########
@@ -804,9 +812,30 @@ impl PyDataFrame {
     }
 
     /// Print the query plan
-    #[pyo3(signature = (verbose=false, analyze=false))]
-    fn explain(&self, py: Python, verbose: bool, analyze: bool) -> 
PyDataFusionResult<()> {
-        let df = self.df.as_ref().clone().explain(verbose, analyze)?;
+    #[pyo3(signature = (verbose=false, analyze=false, format=None))]
+    fn explain(
+        &self,
+        py: Python,
+        verbose: bool,
+        analyze: bool,
+        format: Option<&str>,
+    ) -> PyDataFusionResult<()> {
+        let explain_format = match format {
+            Some(f) => f
+                .parse::<datafusion::common::format::ExplainFormat>()
+                .map_err(|_| {
+                    PyDataFusionError::Common(format!(
+                        "Invalid explain format: '{}'. Valid options: indent, 
tree, pgjson, graphviz",

Review Comment:
   NIT: It would be nice if the valid options weren't hard-coded in the error message, for easier maintainability.



##########
python/datafusion/dataframe.py:
##########
@@ -1036,6 +1105,166 @@ def except_all(self, other: DataFrame) -> DataFrame:
         """
         return DataFrame(self.df.except_all(other.df))
 
+    def except_distinct(self, other: DataFrame) -> DataFrame:
+        """Calculate the set difference with deduplication.
+
+        Returns rows that are in this DataFrame but not in ``other``,
+        removing any duplicates. In contrast, :py:meth:`except_all` preserves
+        duplicate rows.
+
+        The two :py:class:`DataFrame` must have exactly the same schema.
+
+        Args:
+            other: DataFrame to calculate exception with.
+
+        Returns:
+            DataFrame after set difference with deduplication.
+
+        Examples:
+            Remove rows present in ``df2`` and deduplicate:
+
+            >>> ctx = dfn.SessionContext()
+            >>> df1 = ctx.from_pydict({"a": [1, 2, 3, 1], "b": [10, 20, 30, 
10]})
+            >>> df2 = ctx.from_pydict({"a": [1, 2], "b": [10, 20]})
+            >>> df1.except_distinct(df2).sort("a").to_pydict()
+            {'a': [3], 'b': [30]}
+        """
+        return DataFrame(self.df.except_distinct(other.df))
+
+    def intersect_distinct(self, other: DataFrame) -> DataFrame:
+        """Calculate the intersection with deduplication.
+
+        Returns distinct rows that appear in both DataFrames. In contrast,
+        :py:meth:`intersect` preserves duplicate rows.
+
+        The two :py:class:`DataFrame` must have exactly the same schema.
+
+        Args:
+            other: DataFrame to intersect with.
+
+        Returns:
+            DataFrame after intersection with deduplication.
+
+        Examples:
+            Find rows common to both DataFrames:
+
+            >>> ctx = dfn.SessionContext()
+            >>> df1 = ctx.from_pydict({"a": [1, 2, 3], "b": [10, 20, 30]})
+            >>> df2 = ctx.from_pydict({"a": [1, 4], "b": [10, 40]})
+            >>> df1.intersect_distinct(df2).to_pydict()
+            {'a': [1], 'b': [10]}
+        """
+        return DataFrame(self.df.intersect_distinct(other.df))
+
+    def union_by_name(self, other: DataFrame) -> DataFrame:
+        """Union two :py:class:`DataFrame` matching columns by name.
+
+        Unlike :py:meth:`union` which matches columns by position, this method
+        matches columns by their names, allowing DataFrames with different
+        column orders to be combined.
+
+        Args:
+            other: DataFrame to union with.
+
+        Returns:
+            DataFrame after union by name.
+
+        Examples:
+            Combine DataFrames with different column orders:
+
+            >>> ctx = dfn.SessionContext()
+            >>> df1 = ctx.from_pydict({"a": [1], "b": [10]})
+            >>> df2 = ctx.from_pydict({"b": [20], "a": [2]})
+            >>> df1.union_by_name(df2).sort("a").to_pydict()
+            {'a': [1, 2], 'b': [10, 20]}
+        """
+        return DataFrame(self.df.union_by_name(other.df))
+
+    def union_by_name_distinct(self, other: DataFrame) -> DataFrame:

Review Comment:
   NIT: I don't know if the style throughout is to keep a clear 1-to-1 
mapping with Rust, but I would find
   `def union_by_name(self, other: DataFrame, distinct=False)` to be more 
intuitive/Pythonic.



##########
python/datafusion/dataframe.py:
##########
@@ -1296,23 +1525,52 @@ def count(self) -> int:
         return self.df.count()
 
     @deprecated("Use :py:func:`unnest_columns` instead.")
-    def unnest_column(self, column: str, preserve_nulls: bool = True) -> 
DataFrame:
+    def unnest_column(
+        self,
+        column: str,
+        preserve_nulls: bool = True,
+    ) -> DataFrame:
         """See :py:func:`unnest_columns`."""
         return DataFrame(self.df.unnest_column(column, 
preserve_nulls=preserve_nulls))
 
-    def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> 
DataFrame:
+    def unnest_columns(
+        self,
+        *columns: str,
+        preserve_nulls: bool = True,
+        recursions: list[tuple[str, str, int]] | None = None,
+    ) -> DataFrame:
         """Expand columns of arrays into a single row per array element.
 
         Args:
             columns: Column names to perform unnest operation on.
             preserve_nulls: If False, rows with null entries will not be
                 returned.
+            recursions: Optional list of ``(input_column, output_column, 
depth)``
+                tuples that control how deeply nested columns are unnested. Any
+                column not mentioned here is unnested with depth 1.
 
         Returns:
             A DataFrame with the columns expanded.
+
+        Examples:

Review Comment:
   NIT: The examples don't include one that uses `preserve_nulls`, but its behavior is pretty clear.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Add missing Dataframe functions [datafusion-python]

Reply via email to