liurenjie1024 commented on code in PR #404: URL: https://github.com/apache/iceberg-rust/pull/404#discussion_r1640748380
########## crates/iceberg/src/arrow/reader.rs: ########## @@ -741,42 +742,98 @@ impl<'a> BoundPredicateVisitor for PredicateConverter<'a> { fn starts_with( &mut self, - _reference: &BoundReference, - _literal: &Datum, + reference: &BoundReference, + literal: &Datum, _predicate: &BoundPredicate, ) -> Result<Box<PredicateResult>> { - // TODO: Implement starts_with - self.build_always_true() + if let Some(idx) = self.bound_reference(reference)? { + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + starts_with(&left, literal.as_ref()) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } } fn not_starts_with( &mut self, - _reference: &BoundReference, - _literal: &Datum, + reference: &BoundReference, + literal: &Datum, _predicate: &BoundPredicate, ) -> Result<Box<PredicateResult>> { - // TODO: Implement not_starts_with - self.build_always_true() + if let Some(idx) = self.bound_reference(reference)? { + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + + // update here if arrow ever adds a native not_starts_with + not(&starts_with(&left, literal.as_ref())?) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } } fn r#in( &mut self, - _reference: &BoundReference, - _literals: &FnvHashSet<Datum>, + reference: &BoundReference, + literals: &FnvHashSet<Datum>, _predicate: &BoundPredicate, ) -> Result<Box<PredicateResult>> { - // TODO: Implement in - self.build_always_true() + if let Some(idx) = self.bound_reference(reference)? { + let literals: Vec<_> = literals + .iter() + .map(|lit| get_arrow_datum(lit).unwrap()) + .collect(); + + Ok(Box::new(move |batch| { + // update this if arrow ever adds a native is_in kernel + let left = project_column(&batch, idx)?; + let mut acc = BooleanArray::from(vec![false; batch.num_rows()]); + for literal in &literals { + acc = or(&acc, &eq(&left, literal.as_ref())?)? + } + + Ok(acc) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } } fn not_in( &mut self, - _reference: &BoundReference, - _literals: &FnvHashSet<Datum>, + reference: &BoundReference, + literals: &FnvHashSet<Datum>, _predicate: &BoundPredicate, ) -> Result<Box<PredicateResult>> { - // TODO: Implement not_in - self.build_always_true() + if let Some(idx) = self.bound_reference(reference)? { + let literals: Vec<_> = literals + .iter() + .map(|lit| get_arrow_datum(lit).unwrap()) + .collect(); + + Ok(Box::new(move |batch| { + // update this if arrow ever adds a native not_in kernel + let left = project_column(&batch, idx)?; + let mut acc = BooleanArray::from(vec![true; batch.num_rows()]); + for literal in &literals { + acc = and(&acc, &neq(&left, literal.as_ref())?)? + } + + Ok(acc) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() Review Comment: Ditto, this should be true? ########## crates/iceberg/src/arrow/reader.rs: ########## @@ -741,42 +742,98 @@ impl<'a> BoundPredicateVisitor for PredicateConverter<'a> { fn starts_with( &mut self, - _reference: &BoundReference, - _literal: &Datum, + reference: &BoundReference, + literal: &Datum, _predicate: &BoundPredicate, ) -> Result<Box<PredicateResult>> { - // TODO: Implement starts_with - self.build_always_true() + if let Some(idx) = self.bound_reference(reference)? { + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + starts_with(&left, literal.as_ref()) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } } fn not_starts_with( &mut self, - _reference: &BoundReference, - _literal: &Datum, + reference: &BoundReference, + literal: &Datum, _predicate: &BoundPredicate, ) -> Result<Box<PredicateResult>> { - // TODO: Implement not_starts_with - self.build_always_true() + if let Some(idx) = self.bound_reference(reference)? { + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + + // update here if arrow ever adds a native not_starts_with + not(&starts_with(&left, literal.as_ref())?) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() Review Comment: I think this should be true? ########## crates/iceberg/src/arrow/reader.rs: ########## @@ -741,42 +742,98 @@ impl<'a> BoundPredicateVisitor for PredicateConverter<'a> { fn starts_with( &mut self, - _reference: &BoundReference, - _literal: &Datum, + reference: &BoundReference, + literal: &Datum, _predicate: &BoundPredicate, ) -> Result<Box<PredicateResult>> { - // TODO: Implement starts_with - self.build_always_true() + if let Some(idx) = self.bound_reference(reference)? { + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + starts_with(&left, literal.as_ref()) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } } fn not_starts_with( &mut self, - _reference: &BoundReference, - _literal: &Datum, + reference: &BoundReference, + literal: &Datum, _predicate: &BoundPredicate, ) -> Result<Box<PredicateResult>> { - // TODO: Implement not_starts_with - self.build_always_true() + if let Some(idx) = self.bound_reference(reference)? { + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + + // update here if arrow ever adds a native not_starts_with + not(&starts_with(&left, literal.as_ref())?) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } } fn r#in( &mut self, - _reference: &BoundReference, - _literals: &FnvHashSet<Datum>, + reference: &BoundReference, + literals: &FnvHashSet<Datum>, _predicate: &BoundPredicate, ) -> Result<Box<PredicateResult>> { - // TODO: Implement in - self.build_always_true() + if let Some(idx) = self.bound_reference(reference)? { + let literals: Vec<_> = literals + .iter() + .map(|lit| get_arrow_datum(lit).unwrap()) + .collect(); + + Ok(Box::new(move |batch| { + // update this if arrow ever adds a native is_in kernel + let left = project_column(&batch, idx)?; + let mut acc = BooleanArray::from(vec![false; batch.num_rows()]); + for literal in &literals { + acc = or(&acc, &eq(&left, literal.as_ref())?)? Review Comment: This is interesting design. The advantage of this approach is that it could make cpu vectorized execution easier, but it involves more memory allocation and computation. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org