Fokko commented on code in PR #6775:
URL: https://github.com/apache/iceberg/pull/6775#discussion_r1129941505
##########
python/pyiceberg/io/pyarrow.py:
##########
@@ -484,12 +490,82 @@ def expression_to_pyarrow(expr: BooleanExpression) ->
pc.Expression:
return boolean_expression_visit(expr, _ConvertToArrowExpression())
-def _file_to_table(
+@lru_cache
+def _get_file_format(file_format: FileFormat, **kwargs: Dict[str, Any]) ->
ds.FileFormat:
+ if file_format == FileFormat.PARQUET:
+ return ds.ParquetFileFormat(**kwargs)
+ else:
+ raise ValueError(f"Unsupported file format: {file_format}")
+
+
+def _construct_fragment(fs: FileSystem, data_file: DataFile,
file_format_kwargs: Dict[str, Any] = EMPTY_DICT) -> ds.Fragment:
+ _, path = PyArrowFileIO.parse_location(data_file.file_path)
+ return _get_file_format(data_file.file_format,
**file_format_kwargs).make_fragment(path, fs)
+
+
+def _read_deletes(fs: FileSystem, data_file: DataFile) -> Dict[str,
pa.ChunkedArray]:
+ delete_fragment = _construct_fragment(
+ fs, data_file, file_format_kwargs={"dictionary_columns":
("file_path",), "pre_buffer": True, "buffer_size": ONE_MEGABYTE}
Review Comment:
Good question. I noticed that my deletes parquet file is only 81kb on disk.
Looking at it in memory:
```
>>>
pq.read_table(source='00098-7715-aedfa79b-f3b0-423d-8753-0224852e5aa2-00001-deletes.parquet').nbytes
7358065
>>>
pq.read_table(source='00098-7715-aedfa79b-f3b0-423d-8753-0224852e5aa2-00001-deletes.parquet').to_pandas().info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51455 entries, 0 to 51454
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 file_path 51455 non-null object
1 pos 51455 non-null int64
dtypes: int64(1), object(1)
memory usage: 804.1+ KB
```
With `read_dictionary`:
```
>>>
pq.read_table(source='00098-7715-aedfa79b-f3b0-423d-8753-0224852e5aa2-00001-deletes.parquet',
read_dictionary=['file_path']).nbytes
617595
>>>
pq.read_table(source='00098-7715-aedfa79b-f3b0-423d-8753-0224852e5aa2-00001-deletes.parquet',
read_dictionary=['file_path']).to_pandas().info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51455 entries, 0 to 51454
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 file_path 51455 non-null category
1 pos 51455 non-null int64
dtypes: category(1), int64(1)
memory usage: 452.5 KB
```
It seems to be quite efficient. Also, reading the `file_path` into a
`DictionaryArray` gives us a smaller footprint as well. But I'm also comfortable
bumping this to 8 megabytes. Keep in mind that this is unrelated to the
datafiles, I agree that we should keep those at least 8 megabytes.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]