huan233usc commented on code in PR #2592:
URL: https://github.com/apache/iceberg-rust/pull/2592#discussion_r3400080817


##########
crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs:
##########
@@ -225,17 +225,115 @@ fn to_iceberg_operation(op: Operator) -> 
OpTransformedResult {
 /// identified by name at runtime, so we need to handle them here.
 fn scalar_function_to_iceberg_predicate(func_name: &str, args: &[Expr]) -> 
TransformedResult {
     match func_name {
-        // TODO: support complex expression arguments to scalar functions
-        "isnan" if args.len() == 1 => {
-            let operand = to_iceberg_predicate(&args[0]);
-            match operand {
-                TransformedResult::Column(r) => 
TransformedResult::Predicate(Predicate::Unary(
-                    UnaryExpression::new(PredicateOperator::IsNan, r),
-                )),
-                _ => TransformedResult::NotTransformed,
+        "isnan" if args.len() == 1 => match 
resolve_nan_preserving_reference(&args[0]) {
+            Some(r) => TransformedResult::Predicate(r.is_nan()),
+            None => TransformedResult::NotTransformed,
+        },
+        _ => TransformedResult::NotTransformed,
+    }
+}
+
+/// Attempts to resolve a numeric expression argument down to a single column
+/// [`Reference`] such that `isnan(arg)` is logically equivalent to
+/// `isnan(reference)`.
+///
+/// Filter pushdown is reported as `Inexact` (see
+/// [`IcebergTableProvider::supports_filters_pushdown`]), so DataFusion
+/// re-applies the original predicate after scanning. We therefore only need 
the
+/// pushed-down predicate to be implied by the original filter (it may match
+/// extra rows, but must never drop a matching one). Every transformation 
handled
+/// here preserves NaN-ness *exactly* — the result is NaN if and only if the
+/// wrapped column is NaN — so both `isnan(arg)` and `NOT isnan(arg)` are 
sound:
+///
+/// * negation: `-x` is NaN iff `x` is NaN
+/// * `abs(x)`: `abs(x)` is NaN iff `x` is NaN
+/// * casts between numeric types preserve NaN
+/// * `x + c`, `c + x`, `x - c`, `c - x` for a finite literal `c`
+/// * `x * c`, `c * x`, `x / c` for a finite, non-zero literal `c`
+///
+/// Multiplication/division by zero and `c / x` are intentionally rejected: 
e.g.
+/// `x * 0` is NaN when `x` is `±inf`, so it does not imply `x` is NaN.
+///
+/// [`IcebergTableProvider::supports_filters_pushdown`]: 
crate::table::IcebergTableProvider
+fn resolve_nan_preserving_reference(expr: &Expr) -> Option<Reference> {
+    match expr {
+        Expr::Column(column) => Some(Reference::new(column.name())),
+        Expr::Negative(inner) => resolve_nan_preserving_reference(inner),
+        Expr::Cast(cast) => {
+            // Casts to date truncate the value and are not numeric, so they
+            // cannot be treated as NaN-preserving.
+            if cast.data_type == DataType::Date32 || cast.data_type == 
DataType::Date64 {
+                return None;
             }
+            resolve_nan_preserving_reference(&cast.expr)
         }
-        _ => TransformedResult::NotTransformed,
+        Expr::ScalarFunction(ScalarFunction { func, args })
+            if func.name() == "abs" && args.len() == 1 =>
+        {
+            resolve_nan_preserving_reference(&args[0])
+        }
+        Expr::BinaryExpr(binary) => resolve_nan_preserving_binary(binary),
+        _ => None,
+    }
+}
+
+/// Resolves the column reference from an arithmetic expression that combines a
+/// single column with a finite literal while preserving NaN-ness. See
+/// [`resolve_nan_preserving_reference`] for the soundness argument.
+fn resolve_nan_preserving_binary(binary: &BinaryExpr) -> Option<Reference> {

Review Comment:
   Yes, `(x + 1) * (x - 2)` won't be pushed down in this PR.
   
   It requires more work to support it safely. To support that we need to:
   
   1. Recurse into both operands and confirm they reduce to the same single 
column — otherwise e.g. `x + y` reduces to two different columns and can't be 
expressed as a single `col IS NAN` (it would need `x IS NAN OR y IS NAN`, and 
picking just one would drop rows).
   2. Verify the operator combination is NaN-preserving (result is NaN iff that 
column is NaN) — e.g. `(x + 1) * (x - 2)` is NaN iff `x` is NaN, so it's safe, 
but `(x + 1) - (x - 2)` is NaN when `x = inf` (`inf - inf`) even though `x` 
isn't, so it is not.
   
   Maybe we could do it in the follow up, but iiuc Spark won't push down it as 
well so in practice this might be a little bit less seen.



##########
crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs:
##########
@@ -225,17 +225,115 @@ fn to_iceberg_operation(op: Operator) -> 
OpTransformedResult {
 /// identified by name at runtime, so we need to handle them here.
 fn scalar_function_to_iceberg_predicate(func_name: &str, args: &[Expr]) -> 
TransformedResult {
     match func_name {
-        // TODO: support complex expression arguments to scalar functions
-        "isnan" if args.len() == 1 => {
-            let operand = to_iceberg_predicate(&args[0]);
-            match operand {
-                TransformedResult::Column(r) => 
TransformedResult::Predicate(Predicate::Unary(
-                    UnaryExpression::new(PredicateOperator::IsNan, r),
-                )),
-                _ => TransformedResult::NotTransformed,
+        "isnan" if args.len() == 1 => match 
resolve_nan_preserving_reference(&args[0]) {
+            Some(r) => TransformedResult::Predicate(r.is_nan()),
+            None => TransformedResult::NotTransformed,
+        },
+        _ => TransformedResult::NotTransformed,
+    }
+}
+
+/// Attempts to resolve a numeric expression argument down to a single column
+/// [`Reference`] such that `isnan(arg)` is logically equivalent to
+/// `isnan(reference)`.
+///
+/// Filter pushdown is reported as `Inexact` (see
+/// [`IcebergTableProvider::supports_filters_pushdown`]), so DataFusion
+/// re-applies the original predicate after scanning. We therefore only need 
the
+/// pushed-down predicate to be implied by the original filter (it may match
+/// extra rows, but must never drop a matching one). Every transformation 
handled
+/// here preserves NaN-ness *exactly* — the result is NaN if and only if the
+/// wrapped column is NaN — so both `isnan(arg)` and `NOT isnan(arg)` are 
sound:
+///
+/// * negation: `-x` is NaN iff `x` is NaN
+/// * `abs(x)`: `abs(x)` is NaN iff `x` is NaN
+/// * casts between numeric types preserve NaN
+/// * `x + c`, `c + x`, `x - c`, `c - x` for a finite literal `c`
+/// * `x * c`, `c * x`, `x / c` for a finite, non-zero literal `c`
+///
+/// Multiplication/division by zero and `c / x` are intentionally rejected: 
e.g.
+/// `x * 0` is NaN when `x` is `±inf`, so it does not imply `x` is NaN.
+///
+/// [`IcebergTableProvider::supports_filters_pushdown`]: 
crate::table::IcebergTableProvider
+fn resolve_nan_preserving_reference(expr: &Expr) -> Option<Reference> {
+    match expr {
+        Expr::Column(column) => Some(Reference::new(column.name())),
+        Expr::Negative(inner) => resolve_nan_preserving_reference(inner),
+        Expr::Cast(cast) => {
+            // Casts to date truncate the value and are not numeric, so they
+            // cannot be treated as NaN-preserving.
+            if cast.data_type == DataType::Date32 || cast.data_type == 
DataType::Date64 {
+                return None;
             }
+            resolve_nan_preserving_reference(&cast.expr)
         }
-        _ => TransformedResult::NotTransformed,
+        Expr::ScalarFunction(ScalarFunction { func, args })
+            if func.name() == "abs" && args.len() == 1 =>
+        {
+            resolve_nan_preserving_reference(&args[0])
+        }
+        Expr::BinaryExpr(binary) => resolve_nan_preserving_binary(binary),
+        _ => None,
+    }
+}
+
+/// Resolves the column reference from an arithmetic expression that combines a
+/// single column with a finite literal while preserving NaN-ness. See
+/// [`resolve_nan_preserving_reference`] for the soundness argument.
+fn resolve_nan_preserving_binary(binary: &BinaryExpr) -> Option<Reference> {
+    let (left, right) = (&binary.left, &binary.right);
+    match binary.op {
+        // `x + c`, `c + x`, `x - c` and `c - x` are NaN iff `x` is NaN, for 
any
+        // finite literal `c`. The column may be on either side.
+        Operator::Plus | Operator::Minus => {
+            if finite_literal(right).is_some() {
+                resolve_nan_preserving_reference(left)
+            } else if finite_literal(left).is_some() {
+                resolve_nan_preserving_reference(right)
+            } else {
+                None
+            }
+        }
+
+        // `x * c` and `c * x` are NaN iff `x` is NaN, but only when `c` is
+        // non-zero: `±inf * 0` is NaN even though `±inf` is not. The column 
may
+        // be on either side.
+        Operator::Multiply => {
+            if matches!(finite_literal(right), Some(c) if c != 0.0) {
+                resolve_nan_preserving_reference(left)
+            } else if matches!(finite_literal(left), Some(c) if c != 0.0) {
+                resolve_nan_preserving_reference(right)
+            } else {
+                None
+            }
+        }
+
+        // `x / c` is NaN iff `x` is NaN, for a finite non-zero literal `c`.
+        // `c / x` is rejected because it is not NaN-preserving (e.g. `0 / 0` 
is
+        // NaN while `0` is not), so the column must be the dividend (left 
side).

Review Comment:
   Reworded in a7e81bc6 to spell out the IEEE-754 facts (0 is not NaN, but 0 / 
0 is NaN).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to