Fokko commented on code in PR #1388: URL: https://github.com/apache/iceberg-python/pull/1388#discussion_r1944751408
##########
pyiceberg/table/__init__.py:
##########
@@ -1624,6 +1659,40 @@ def to_ray(self) -> ray.data.dataset.Dataset:
 
         return ray.data.from_arrow(self.to_arrow())
 
+    def count(self) -> int:
+        # Usage: Calculates the total number of records in a Scan that haven't had positional deletes.
+        res = 0
+        # every task is a FileScanTask
+        tasks = self.plan_files()
+
+        for task in tasks:
+            # task.residual is a Boolean Expression if the filter condition is fully satisfied by the
+            # partition value and task.delete_files represents that positional delete haven't been merged yet
+            # hence those files have to read as a pyarrow table applying the filter and deletes
+            if task.residual == AlwaysTrue() and len(task.delete_files) == 0:
+                # Every File has a metadata stat that stores the file record count
+                res += task.file.record_count
+            else:
+                arrow_scan = ArrowScan(
+                    table_metadata=self.table_metadata,
+                    io=self.io,
+                    projected_schema=self.projection(),
+                    row_filter=self.row_filter,
+                    case_sensitive=self.case_sensitive,
+                )
+                if task.file.file_size_in_bytes > 512 * 1024 * 1024:
+                    target_schema = schema_to_pyarrow(self.projection())
+                    batches = arrow_scan.to_record_batches([task])
+                    from pyarrow import RecordBatchReader
+
+                    reader = RecordBatchReader.from_batches(target_schema, batches)
+                    for batch in reader:
+                        res += batch.num_rows
+                else:
+                    tbl = arrow_scan.to_table([task])
+                    res += len(tbl)

Review Comment:
   Let's keep it simple for now, I don't think we cover the other case in a test

   ```suggestion
                   tbl = arrow_scan.to_table([task])
                   res += len(tbl)
   ```

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org