sdd commented on code in PR #1017: URL: https://github.com/apache/iceberg-rust/pull/1017#discussion_r2179413795
########## crates/iceberg/src/arrow/caching_delete_file_loader.rs: ########## @@ -308,28 +319,231 @@ impl CachingDeleteFileLoader { Ok(result) } - /// Parses record batch streams from individual equality delete files - /// - /// Returns an unbound Predicate for each batch stream async fn parse_equality_deletes_record_batch_stream( - streams: ArrowRecordBatchStream, + mut stream: ArrowRecordBatchStream, + equality_ids: HashSet<i32>, ) -> Result<Predicate> { - // TODO + let mut result_predicate = AlwaysTrue; + + while let Some(record_batch) = stream.next().await { + let record_batch = record_batch?; + + if record_batch.num_columns() == 0 { + return Ok(AlwaysTrue); + } + + let batch_schema_arrow = record_batch.schema(); + let batch_schema_iceberg = arrow_schema_to_schema(batch_schema_arrow.as_ref())?; + + let mut datum_columns_with_names: Vec<_> = record_batch + .columns() + .iter() + .zip(batch_schema_iceberg.as_struct().fields()) + // only use columns that are in the set of equality_ids for this delete file + .filter(|(field, value)| equality_ids.contains(&value.id)) + .map(|(column, field)| { + let col_as_datum_vec = arrow_array_to_datum_iterator(column, field); + col_as_datum_vec.map(|c| (c, field.name.to_string())) + }) + .try_collect()?; + + // consume all the iterators in lockstep, creating per-row predicates that get combined + // into a single final predicate + + // (2025-06-12) can't use `is_empty` as it depends on unstable library feature `exact_size_is_empty` + #[allow(clippy::len_zero)] + while datum_columns_with_names[0].0.len() > 0 { + let mut row_predicate = AlwaysTrue; + for &mut (ref mut column, ref field_name) in &mut datum_columns_with_names { + if let Some(item) = column.next() { + if let Some(datum) = item? 
{ + row_predicate = row_predicate + .and(Reference::new(field_name.clone()).equal_to(datum.clone())); + } + } + } + result_predicate = result_predicate.and(row_predicate.not()); + } + } + Ok(result_predicate.rewrite_not()) + } +} + +macro_rules! prim_to_datum { Review Comment: There is a `get_arrow_datum` in `arrow/schema.rs` [here](https://github.com/apache/iceberg-rust/blob/36cc12087bc118f1fe10efa6b30db98bd9655ba7/crates/iceberg/src/arrow/schema.rs#L652) which does the inverse, or at least close to it. `arrow_struct_to_literal` in `arrow/value.rs` does something similar, especially in the `primitive` function of `ArrowArrayToIcebergStructConverter`'s visitor implementation [here](https://github.com/apache/iceberg-rust/blob/36cc12087bc118f1fe10efa6b30db98bd9655ba7/crates/iceberg/src/arrow/value.rs#L203). But `arrow_struct_to_literal` returns Iceberg `Literal`s rather than `Datum`s, and we need `Datum`s because we're building an unbound predicate. As far as I can tell, nothing that already exists is an exact match for what we need to do here. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org