robreeves commented on code in PR #3046:
URL: https://github.com/apache/iceberg-python/pull/3046#discussion_r2824558512
##########
pyiceberg/io/pyarrow.py:
##########
@@ -1789,54 +1834,114 @@ def to_table(self, tasks: Iterable[FileScanTask]) -> pa.Table:
return result
- def to_record_batches(self, tasks: Iterable[FileScanTask]) -> Iterator[pa.RecordBatch]:
+ def to_record_batches(
+ self,
+ tasks: Iterable[FileScanTask],
+ batch_size: int | None = None,
+ order: ScanOrder = ScanOrder.TASK,
+ concurrent_files: int = 1,
+ ) -> Iterator[pa.RecordBatch]:
"""Scan the Iceberg table and return an Iterator[pa.RecordBatch].
Returns an Iterator of pa.RecordBatch with data from the Iceberg table
by resolving the right columns that match the current table schema.
Only data that matches the provided row_filter expression is returned.
+ Ordering semantics:
+ - ScanOrder.TASK (default): Batches are grouped by file in task
submission order.
+ - ScanOrder.ARRIVAL: Batches may be interleaved across files.
Within each file,
+ batch ordering follows row order.
+
Args:
tasks: FileScanTasks representing the data files and delete files
to read from.
+ batch_size: The number of rows per batch. If None, PyArrow's
default is used.
+ order: Controls the order in which record batches are returned.
+ ScanOrder.TASK (default) returns batches in task order, with
each task
+ fully materialized before proceeding to the next. Allows
parallel file
+ reads via executor. ScanOrder.ARRIVAL yields batches as they
are
+ produced without materializing entire files into memory.
+ concurrent_files: Number of files to read concurrently when
order=ScanOrder.ARRIVAL.
+ Must be >= 1. When > 1, batches may arrive interleaved across
files.
+ Ignored when order=ScanOrder.TASK.
Returns:
An Iterator of PyArrow RecordBatches.
Total number of rows will be capped if specified.
Raises:
ResolveError: When a required field cannot be found in the file
- ValueError: When a field type in the file cannot be projected to
the schema type
+ ValueError: When a field type in the file cannot be projected to
the schema type,
+ or when an invalid order value is provided, or when
concurrent_files < 1.
"""
- deletes_per_file = _read_all_delete_files(self._io, tasks)
+ if not isinstance(order, ScanOrder):
+ raise ValueError(f"Invalid order: {order!r}. Must be a ScanOrder enum value (ScanOrder.TASK or ScanOrder.ARRIVAL).")
- total_row_count = 0
+ if concurrent_files < 1:
+ raise ValueError(f"concurrent_files must be >= 1, got {concurrent_files}")
+
+ task_list, deletes_per_file = self._prepare_tasks_and_deletes(tasks)
+
+ if order == ScanOrder.ARRIVAL:
Review Comment:
LGTM
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]