ZENOTME commented on code in PR #309: URL: https://github.com/apache/iceberg-rust/pull/309#discussion_r1869860138
########## crates/iceberg/src/spec/transform.rs: ########## @@ -261,6 +269,323 @@ impl Transform { _ => self == other, } } + + /// Projects a given predicate according to the transformation + /// specified by the `Transform` instance. + /// + /// This allows predicates to be effectively applied to data + /// that has undergone transformation, enabling efficient querying + /// and filtering based on the original, untransformed data. + /// + /// # Example + /// Suppose, we have row filter `a = 10`, and a partition spec + /// `bucket(a, 37) as bs`, if one row matches `a = 10`, then its partition + /// value should match `bucket(10, 37) as bs`, and we project `a = 10` to + /// `bs = bucket(10, 37)` + pub fn project(&self, name: String, predicate: &BoundPredicate) -> Result<Option<Predicate>> { + let func = create_transform_function(self)?; + + match self { + Transform::Identity => match predicate { + BoundPredicate::Unary(expr) => Self::project_unary(expr.op(), name), + BoundPredicate::Binary(expr) => Ok(Some(Predicate::Binary(BinaryExpression::new( + expr.op(), + Reference::new(name), + expr.literal().to_owned(), + )))), + BoundPredicate::Set(expr) => Ok(Some(Predicate::Set(SetExpression::new( + expr.op(), + Reference::new(name), + expr.literals().to_owned(), + )))), + _ => Ok(None), + }, + Transform::Bucket(_) => match predicate { + BoundPredicate::Unary(expr) => Self::project_unary(expr.op(), name), + BoundPredicate::Binary(expr) => self.project_eq_operator(name, expr, &func), + BoundPredicate::Set(expr) => self.project_in_operator(expr, name, &func), + _ => Ok(None), + }, + Transform::Truncate(width) => match predicate { + BoundPredicate::Unary(expr) => Self::project_unary(expr.op(), name), + BoundPredicate::Binary(expr) => { + self.project_binary_with_adjusted_boundary(name, expr, &func, Some(*width)) + } + BoundPredicate::Set(expr) => self.project_in_operator(expr, name, &func), + _ => Ok(None), + }, + Transform::Year | Transform::Month | Transform::Day | 
Transform::Hour => { + match predicate { + BoundPredicate::Unary(expr) => Self::project_unary(expr.op(), name), + BoundPredicate::Binary(expr) => { + self.project_binary_with_adjusted_boundary(name, expr, &func, None) + } + BoundPredicate::Set(expr) => self.project_in_operator(expr, name, &func), + _ => Ok(None), + } + } + _ => Ok(None), + } + } + + /// Check if `Transform` is applicable on datum's `PrimitiveType` + fn can_transform(&self, datum: &Datum) -> bool { + let input_type = datum.data_type().clone(); + self.result_type(&Type::Primitive(input_type)).is_ok() + } + + /// Creates a unary predicate from a given operator and a reference name. + fn project_unary(op: PredicateOperator, name: String) -> Result<Option<Predicate>> { + Ok(Some(Predicate::Unary(UnaryExpression::new( + op, + Reference::new(name), + )))) + } + + /// Attempts to create a binary predicate based on a binary expression, + /// if applicable. + /// + /// This method evaluates a given binary expression and, if the operation + /// is equality (`Eq`) and the literal can be transformed, constructs a + /// `Predicate::Binary`variant representing the binary operation. + fn project_eq_operator( + &self, + name: String, + expr: &BinaryExpression<BoundReference>, + func: &BoxedTransformFunction, + ) -> Result<Option<Predicate>> { + if expr.op() != PredicateOperator::Eq || !self.can_transform(expr.literal()) { + return Ok(None); + } + + Ok(Some(Predicate::Binary(BinaryExpression::new( + expr.op(), + Reference::new(name), + func.transform_literal_result(expr.literal())?, + )))) + } + + /// Projects a binary expression to a predicate with an adjusted boundary. + /// + /// Checks if the literal within the given binary expression is + /// transformable. If transformable, it proceeds to potentially adjust + /// the boundary of the expression based on the comparison operator (`op`). 
+ /// The potential adjustments involve incrementing or decrementing the + /// literal value and changing the `PredicateOperator` itself to its + /// inclusive variant. + fn project_binary_with_adjusted_boundary( + &self, + name: String, + expr: &BinaryExpression<BoundReference>, + func: &BoxedTransformFunction, + width: Option<u32>, + ) -> Result<Option<Predicate>> { + if !self.can_transform(expr.literal()) { + return Ok(None); + } + + let op = &expr.op(); + let datum = &expr.literal(); + + if let Some(boundary) = Self::adjust_boundary(op, datum)? { + let transformed_projection = func.transform_literal_result(&boundary)?; + + let adjusted_projection = + self.adjust_time_projection(op, datum, &transformed_projection); + + let adjusted_operator = Self::adjust_operator(op, datum, width); + + if let Some(op) = adjusted_operator { + let predicate = match adjusted_projection { + None => Predicate::Binary(BinaryExpression::new( + op, + Reference::new(name), + transformed_projection, + )), + Some(AdjustedProjection::Single(d)) => { + Predicate::Binary(BinaryExpression::new(op, Reference::new(name), d)) + } + Some(AdjustedProjection::Set(d)) => Predicate::Set(SetExpression::new( + PredicateOperator::In, + Reference::new(name), + d, + )), + }; + return Ok(Some(predicate)); + } + }; + + Ok(None) + } + + /// Projects a set expression to a predicate, + /// applying a transformation to each literal in the set. 
+ fn project_in_operator( + &self, + expr: &SetExpression<BoundReference>, + name: String, + func: &BoxedTransformFunction, + ) -> Result<Option<Predicate>> { + if expr.op() != PredicateOperator::In + || expr.literals().iter().any(|d| !self.can_transform(d)) + { + return Ok(None); + } + + let mut new_set = FnvHashSet::default(); + + for lit in expr.literals() { + let datum = func.transform_literal_result(lit)?; + + if let Some(AdjustedProjection::Single(d)) = + self.adjust_time_projection(&PredicateOperator::In, lit, &datum) + { + new_set.insert(d); + }; + + new_set.insert(datum); + } + + Ok(Some(Predicate::Set(SetExpression::new( + expr.op(), + Reference::new(name), + new_set, + )))) + } + + /// Adjusts the boundary value for comparison operations + /// based on the specified `PredicateOperator` and `Datum`. + /// + /// This function modifies the boundary value for certain comparison + /// operators (`LessThan`, `GreaterThan`) by incrementing or decrementing + /// the literal value within the given `Datum`. For operators that do not + /// imply a boundary shift (`Eq`, `LessThanOrEq`, `GreaterThanOrEq`, + /// `StartsWith`, `NotStartsWith`), the original datum is returned + /// unmodified. 
+ fn adjust_boundary(op: &PredicateOperator, datum: &Datum) -> Result<Option<Datum>> { + let literal = datum.literal(); + + let adjusted_boundary = match op { + PredicateOperator::LessThan => match literal { + PrimitiveLiteral::Int(v) => Some(Datum::int(v - 1)), + PrimitiveLiteral::Long(v) => Some(Datum::long(v - 1)), + PrimitiveLiteral::Decimal(v) => Some(Datum::decimal(v - 1)?), + PrimitiveLiteral::Date(v) => Some(Datum::date(v - 1)), + PrimitiveLiteral::Timestamp(v) => Some(Datum::timestamp_micros(v - 1)), + _ => Some(datum.to_owned()), + }, + PredicateOperator::GreaterThan => match literal { + PrimitiveLiteral::Int(v) => Some(Datum::int(v + 1)), + PrimitiveLiteral::Long(v) => Some(Datum::long(v + 1)), + PrimitiveLiteral::Decimal(v) => Some(Datum::decimal(v + 1)?), + PrimitiveLiteral::Date(v) => Some(Datum::date(v + 1)), + PrimitiveLiteral::Timestamp(v) => Some(Datum::timestamp_micros(v + 1)), + _ => Some(datum.to_owned()), + }, + PredicateOperator::Eq + | PredicateOperator::LessThanOrEq + | PredicateOperator::GreaterThanOrEq + | PredicateOperator::StartsWith + | PredicateOperator::NotStartsWith => Some(datum.to_owned()), + _ => None, + }; + + Ok(adjusted_boundary) + } + + /// Adjusts the comparison operator based on the specified datum and an + /// optional width constraint. + /// + /// This function modifies the comparison operator for `LessThan` and + /// `GreaterThan` cases to their inclusive counterparts (`LessThanOrEq`, + /// `GreaterThanOrEq`) unconditionally. For `StartsWith` and + /// `NotStartsWith` operators acting on string literals, the operator may + /// be adjusted to `Eq` or `NotEq` if the string length matches the + /// specified width, indicating a precise match rather than a prefix + /// condition. 
+ fn adjust_operator( + op: &PredicateOperator, + datum: &Datum, + width: Option<u32>, + ) -> Option<PredicateOperator> { + match op { + PredicateOperator::LessThan => Some(PredicateOperator::LessThanOrEq), + PredicateOperator::GreaterThan => Some(PredicateOperator::GreaterThanOrEq), + PredicateOperator::StartsWith => match datum.literal() { + PrimitiveLiteral::String(s) => { + if let Some(w) = width { + if s.len() == w as usize { + return Some(PredicateOperator::Eq); + }; + }; + Some(*op) + } + _ => Some(*op), + }, + PredicateOperator::NotStartsWith => match datum.literal() { + PrimitiveLiteral::String(s) => { + if let Some(w) = width { + let w = w as usize; + + if s.len() == w { + return Some(PredicateOperator::NotEq); + } + + if s.len() < w { + return Some(*op); + } + + return None; + }; + Some(*op) + } + _ => Some(*op), + }, + _ => Some(*op), + } + } + + /// Adjust projection for temporal transforms, align with Java + /// implementation: https://github.com/apache/iceberg/blob/main/api/src/main/java/org/apache/iceberg/transforms/ProjectionUtil.java#L275 + fn adjust_time_projection( Review Comment: It seems this function is used to fix a bug from earlier Iceberg versions, but I don't see the same thing in the pyiceberg transform. Do we really need this fix? cc @liurenjie1024 @Fokko -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org