huan233usc commented on code in PR #2592:
URL: https://github.com/apache/iceberg-rust/pull/2592#discussion_r3400081280
##########
crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs:
##########
@@ -225,17 +225,115 @@ fn to_iceberg_operation(op: Operator) ->
OpTransformedResult {
/// identified by name at runtime, so we need to handle them here.
fn scalar_function_to_iceberg_predicate(func_name: &str, args: &[Expr]) ->
TransformedResult {
match func_name {
- // TODO: support complex expression arguments to scalar functions
- "isnan" if args.len() == 1 => {
- let operand = to_iceberg_predicate(&args[0]);
- match operand {
- TransformedResult::Column(r) =>
TransformedResult::Predicate(Predicate::Unary(
- UnaryExpression::new(PredicateOperator::IsNan, r),
- )),
- _ => TransformedResult::NotTransformed,
+ "isnan" if args.len() == 1 => match
resolve_nan_preserving_reference(&args[0]) {
+ Some(r) => TransformedResult::Predicate(r.is_nan()),
+ None => TransformedResult::NotTransformed,
+ },
+ _ => TransformedResult::NotTransformed,
+ }
+}
+
+/// Attempts to resolve a numeric expression argument down to a single column
+/// [`Reference`] such that `isnan(arg)` is logically equivalent to
+/// `isnan(reference)`.
+///
+/// Filter pushdown is reported as `Inexact` (see
+/// [`IcebergTableProvider::supports_filters_pushdown`]), so DataFusion
+/// re-applies the original predicate after scanning. We therefore only need
the
+/// pushed-down predicate to be implied by the original filter (it may match
+/// extra rows, but must never drop a matching one). Every transformation
handled
+/// here preserves NaN-ness *exactly* — the result is NaN if and only if the
+/// wrapped column is NaN — so both `isnan(arg)` and `NOT isnan(arg)` are
sound:
+///
+/// * negation: `-x` is NaN iff `x` is NaN
+/// * `abs(x)`: `abs(x)` is NaN iff `x` is NaN
+/// * casts between numeric types preserve NaN
+/// * `x + c`, `c + x`, `x - c`, `c - x` for a finite literal `c`
+/// * `x * c`, `c * x`, `x / c` for a finite, non-zero literal `c`
+///
+/// Multiplication/division by zero and `c / x` are intentionally rejected:
e.g.
+/// `x * 0` is NaN when `x` is `±inf`, so it does not imply `x` is NaN.
+///
+/// [`IcebergTableProvider::supports_filters_pushdown`]:
crate::table::IcebergTableProvider
+fn resolve_nan_preserving_reference(expr: &Expr) -> Option<Reference> {
+ match expr {
+ Expr::Column(column) => Some(Reference::new(column.name())),
+ Expr::Negative(inner) => resolve_nan_preserving_reference(inner),
+ Expr::Cast(cast) => {
+ // Casts to date truncate the value and are not numeric, so they
+ // cannot be treated as NaN-preserving.
+ if cast.data_type == DataType::Date32 || cast.data_type ==
DataType::Date64 {
+ return None;
}
+ resolve_nan_preserving_reference(&cast.expr)
}
- _ => TransformedResult::NotTransformed,
+ Expr::ScalarFunction(ScalarFunction { func, args })
+ if func.name() == "abs" && args.len() == 1 =>
+ {
+ resolve_nan_preserving_reference(&args[0])
+ }
+ Expr::BinaryExpr(binary) => resolve_nan_preserving_binary(binary),
+ _ => None,
+ }
+}
+
+/// Resolves the column reference from an arithmetic expression that combines a
+/// single column with a finite literal while preserving NaN-ness. See
+/// [`resolve_nan_preserving_reference`] for the soundness argument.
+fn resolve_nan_preserving_binary(binary: &BinaryExpr) -> Option<Reference> {
+ let (left, right) = (&binary.left, &binary.right);
+ match binary.op {
+ // `x + c`, `c + x`, `x - c` and `c - x` are NaN iff `x` is NaN, for
any
+ // finite literal `c`. The column may be on either side.
+ Operator::Plus | Operator::Minus => {
+ if finite_literal(right).is_some() {
+ resolve_nan_preserving_reference(left)
+ } else if finite_literal(left).is_some() {
+ resolve_nan_preserving_reference(right)
+ } else {
+ None
+ }
+ }
+
+ // `x * c` and `c * x` are NaN iff `x` is NaN, but only when `c` is
+ // non-zero: `±inf * 0` is NaN even though `±inf` is not. The column
may
Review Comment:
Done in a7e81bc6 — reworded to: per IEEE-754, inf is not NaN but inf * 0 is
NaN, so multiplying by zero is rejected.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]