sdd commented on code in PR #404:
URL: https://github.com/apache/iceberg-rust/pull/404#discussion_r1640927249


##########
crates/iceberg/src/arrow/reader.rs:
##########
@@ -741,42 +742,98 @@ impl<'a> BoundPredicateVisitor for PredicateConverter<'a> 
{
 
     fn starts_with(
         &mut self,
-        _reference: &BoundReference,
-        _literal: &Datum,
+        reference: &BoundReference,
+        literal: &Datum,
         _predicate: &BoundPredicate,
     ) -> Result<Box<PredicateResult>> {
-        // TODO: Implement starts_with
-        self.build_always_true()
+        if let Some(idx) = self.bound_reference(reference)? {
+            let literal = get_arrow_datum(literal)?;
+
+            Ok(Box::new(move |batch| {
+                let left = project_column(&batch, idx)?;
+                starts_with(&left, literal.as_ref())
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_false()
+        }
     }
 
     fn not_starts_with(
         &mut self,
-        _reference: &BoundReference,
-        _literal: &Datum,
+        reference: &BoundReference,
+        literal: &Datum,
         _predicate: &BoundPredicate,
     ) -> Result<Box<PredicateResult>> {
-        // TODO: Implement not_starts_with
-        self.build_always_true()
+        if let Some(idx) = self.bound_reference(reference)? {
+            let literal = get_arrow_datum(literal)?;
+
+            Ok(Box::new(move |batch| {
+                let left = project_column(&batch, idx)?;
+
+                // update here if arrow ever adds a native not_starts_with
+                not(&starts_with(&left, literal.as_ref())?)
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_false()
+        }
     }
 
     fn r#in(
         &mut self,
-        _reference: &BoundReference,
-        _literals: &FnvHashSet<Datum>,
+        reference: &BoundReference,
+        literals: &FnvHashSet<Datum>,
         _predicate: &BoundPredicate,
     ) -> Result<Box<PredicateResult>> {
-        // TODO: Implement in
-        self.build_always_true()
+        if let Some(idx) = self.bound_reference(reference)? {
+            let literals: Vec<_> = literals
+                .iter()
+                .map(|lit| get_arrow_datum(lit).unwrap())
+                .collect();
+
+            Ok(Box::new(move |batch| {
+                // update this if arrow ever adds a native is_in kernel
+                let left = project_column(&batch, idx)?;
+                let mut acc = BooleanArray::from(vec![false; 
batch.num_rows()]);
+                for literal in &literals {
+                    acc = or(&acc, &eq(&left, literal.as_ref())?)?

Review Comment:
   Agree. We might want to consider even pre-filtering with a bloom filter when 
the list of literals reaches a certain length or something like that, in the 
future. I was more keen on getting us compliant with the spec first so that 
everything works, and we can then revisit for improved performance. Is Arrow 
open to contribs to add features like handling this that aren't implemented by 
them already?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to