hombit opened a new issue, #46329:
URL: https://github.com/apache/arrow/issues/46329
### Describe the enhancement requested
Currently, the `columns` argument of `pyarrow.parquet.read_table` supports
selecting nested fields with dot notation. For example, `columns=["a.b"]`
selects field "b" from the struct column "a". It would be really nice if this
also worked for list columns whose elements are structs.
For example, the following code fails with `ArrowInvalid`:
```python
import pyarrow as pa
import pyarrow.parquet as pq
list_struct = pa.ListArray.from_arrays(
values = [{"b": 1, "c": 1}] * 10,
offsets = [0, 4, 10],
)
pq.write_table(pa.table({"a": list_struct}), "/tmp/table.parquet")
pq.read_table("/tmp/table.parquet", columns=["a.b"])
```
<details><summary>Traceback</summary>
```
File
~/.virtualenvs/v/lib/python3.13/site-packages/pyarrow/parquet/core.py:1824, in
read_table(source, columns, use_threads, schema, use_pandas_metadata,
read_dictionary, memory_map, buffer_size, partitioning, filesystem, filters,
ignore_prefixes, pre_buffer, coerce_int96_timestamp_unit,
decryption_properties, thrift_string_size_limit, thrift_container_size_limit,
page_checksum_verification)
1812 # TODO test that source is not a directory or a list
1813 dataset = ParquetFile(
1814 source, read_dictionary=read_dictionary,
1815 memory_map=memory_map, buffer_size=buffer_size,
(...) 1821
page_checksum_verification=page_checksum_verification,
1822 )
-> 1824 return dataset.read(columns=columns, use_threads=use_threads,
1825 use_pandas_metadata=use_pandas_metadata)
File
~/.virtualenvs/v/lib/python3.13/site-packages/pyarrow/parquet/core.py:1475, in
ParquetDataset.read(self, columns, use_threads, use_pandas_metadata)
1467 index_columns = [
1468 col for col in _get_pandas_index_columns(metadata)
1469 if not isinstance(col, dict)
1470 ]
1471 columns = (
1472 list(columns) + list(set(index_columns) - set(columns))
1473 )
-> 1475 table = self._dataset.to_table(
1476 columns=columns, filter=self._filter_expression,
1477 use_threads=use_threads
1478 )
1480 # if use_pandas_metadata, restore the pandas metadata (which gets
1481 # lost if doing a specific `columns` selection in to_table)
1482 if use_pandas_metadata:
File ~/.virtualenvs/v/lib/python3.13/site-packages/pyarrow/_dataset.pyx:579,
in pyarrow._dataset.Dataset.to_table()
File ~/.virtualenvs/v/lib/python3.13/site-packages/pyarrow/_dataset.pyx:415,
in pyarrow._dataset.Dataset.scanner()
File
~/.virtualenvs/v/lib/python3.13/site-packages/pyarrow/_dataset.pyx:3676, in
pyarrow._dataset.Scanner.from_dataset()
File
~/.virtualenvs/v/lib/python3.13/site-packages/pyarrow/_dataset.pyx:3589, in
pyarrow._dataset.Scanner._make_scan_options()
File
~/.virtualenvs/v/lib/python3.13/site-packages/pyarrow/_dataset.pyx:3539, in
pyarrow._dataset._populate_builder()
File ~/.virtualenvs/v/lib/python3.13/site-packages/pyarrow/error.pxi:92, in
pyarrow.lib.check_status()
ArrowInvalid: No match for FieldRef.Nested(FieldRef.Name(a)
FieldRef.Name(b)) in a: list<element: struct<b: int64, c: int64>>
__fragment_index: int32
__batch_index: int32
__last_in_fragment: bool
__filename: string
```
</details>
### Component(s)
Python
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]