sdd commented on code in PR #404: URL: https://github.com/apache/iceberg-rust/pull/404#discussion_r1640927249
########## crates/iceberg/src/arrow/reader.rs: ########## @@ -741,42 +742,98 @@ impl<'a> BoundPredicateVisitor for PredicateConverter<'a> { fn starts_with( &mut self, - _reference: &BoundReference, - _literal: &Datum, + reference: &BoundReference, + literal: &Datum, _predicate: &BoundPredicate, ) -> Result<Box<PredicateResult>> { - // TODO: Implement starts_with - self.build_always_true() + if let Some(idx) = self.bound_reference(reference)? { + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + starts_with(&left, literal.as_ref()) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } } fn not_starts_with( &mut self, - _reference: &BoundReference, - _literal: &Datum, + reference: &BoundReference, + literal: &Datum, _predicate: &BoundPredicate, ) -> Result<Box<PredicateResult>> { - // TODO: Implement not_starts_with - self.build_always_true() + if let Some(idx) = self.bound_reference(reference)? { + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + + // update here if arrow ever adds a native not_starts_with + not(&starts_with(&left, literal.as_ref())?) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } } fn r#in( &mut self, - _reference: &BoundReference, - _literals: &FnvHashSet<Datum>, + reference: &BoundReference, + literals: &FnvHashSet<Datum>, _predicate: &BoundPredicate, ) -> Result<Box<PredicateResult>> { - // TODO: Implement in - self.build_always_true() + if let Some(idx) = self.bound_reference(reference)? { + let literals: Vec<_> = literals + .iter() + .map(|lit| get_arrow_datum(lit).unwrap()) + .collect(); + + Ok(Box::new(move |batch| { + // update this if arrow ever adds a native is_in kernel + let left = project_column(&batch, idx)?; + let mut acc = BooleanArray::from(vec![false; batch.num_rows()]); + for literal in &literals { + acc = or(&acc, &eq(&left, literal.as_ref())?)? Review Comment: Agree. We might want to consider even pre-filtering with a bloom filter when the list of literals reaches a certain length or something like that, in the future. I was more keen on getting us compliant with the spec first so that everything works, and we can then revisit for improved performance. Is Arrow open to contribs to add features like handling this that aren't implemented by them already? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org