laskoviymishka commented on code in PR #1053:
URL: https://github.com/apache/iceberg-go/pull/1053#discussion_r3257821937
##########
table/arrow_scanner.go:
##########
@@ -193,6 +323,44 @@ func processPositionalDeletes(ctx context.Context, deletes
set[int64]) recProces
}
}
+// filterByDeletionVector returns a pipeline step that drops rows present in
+// the bitmap by building a per-batch Arrow Boolean keep-mask and applying
+// compute.Filter. Preferred over processPositionalDeletes for DV-sourced
+// deletes because RoaringPositionBitmap.Contains is O(1) and Filter with a
+// bit-packed boolean mask is more vectorized than Take with an int64 index
+// array. Tracks absolute row position across batches via a closure-captured
+// counter, identical pattern to processPositionalDeletes.
+func filterByDeletionVector(ctx context.Context, bitmap
*dv.RoaringPositionBitmap) recProcessFn {
+ nextIdx, mem := int64(0), compute.GetAllocator(ctx)
+
+ return func(r arrow.RecordBatch) (arrow.RecordBatch, error) {
+ defer r.Release()
+
+ currentIdx := nextIdx
+ nextIdx += r.NumRows()
+
+ maskBuilder := array.NewBooleanBuilder(mem)
+ defer maskBuilder.Release()
+ maskBuilder.Reserve(int(r.NumRows()))
+ for i := int64(0); i < r.NumRows(); i++ {
Review Comment:
good catch, fixed
##########
table/arrow_scanner.go:
##########
@@ -193,6 +323,44 @@ func processPositionalDeletes(ctx context.Context, deletes
set[int64]) recProces
}
}
+// filterByDeletionVector returns a pipeline step that drops rows present in
+// the bitmap by building a per-batch Arrow Boolean keep-mask and applying
+// compute.Filter. Preferred over processPositionalDeletes for DV-sourced
+// deletes because RoaringPositionBitmap.Contains is O(1) and Filter with a
+// bit-packed boolean mask is more vectorized than Take with an int64 index
+// array. Tracks absolute row position across batches via a closure-captured
+// counter, identical pattern to processPositionalDeletes.
+func filterByDeletionVector(ctx context.Context, bitmap
*dv.RoaringPositionBitmap) recProcessFn {
+ nextIdx, mem := int64(0), compute.GetAllocator(ctx)
+
+ return func(r arrow.RecordBatch) (arrow.RecordBatch, error) {
+ defer r.Release()
+
+ currentIdx := nextIdx
+ nextIdx += r.NumRows()
+
+ maskBuilder := array.NewBooleanBuilder(mem)
+ defer maskBuilder.Release()
+ maskBuilder.Reserve(int(r.NumRows()))
+ for i := int64(0); i < r.NumRows(); i++ {
+ // mask[i] = keep row i? → row i of the batch is at
absolute
+ // position currentIdx+i in the source file; keep if
NOT in
+ // the deletion vector.
+ maskBuilder.Append(!bitmap.Contains(uint64(currentIdx +
i)))
+ }
+ mask := maskBuilder.NewBooleanArray()
+ defer mask.Release()
+
+ out, err := compute.Filter(ctx,
compute.NewDatumWithoutOwning(r),
Review Comment:
done
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]