adamreeve opened a new issue, #48168:
URL: https://github.com/apache/arrow/issues/48168
### Describe the bug, including details regarding any error messages, version, and platform.
When writing a Parquet dataset with an `encryption_config` and with statistics enabled only for specific fields (`write_statistics` given as a list of column names), no fields have statistics written at all. The same settings work correctly when writing a single Parquet file directly, without the Dataset API.
Python repro:
```python
import base64
import datetime
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow.parquet.encryption as pe

dataset_path = Path("./tmp_dataset")
data_file_path = Path("./tmp_data.parquet")


class MockKmsClient(pe.KmsClient):
    def __init__(self, _kms_connection_configuration):
        super().__init__()

    def wrap_key(self, key_bytes, _master_key_identifier):
        return base64.b64encode(key_bytes)

    def unwrap_key(self, wrapped_key, _master_key_identifier):
        return base64.b64decode(wrapped_key)


crypto_factory = pe.CryptoFactory(lambda config: MockKmsClient(config))
config = pe.KmsConnectionConfig()
encryption_config = pe.EncryptionConfiguration(
    "kf",
    uniform_encryption=True,
    plaintext_footer=False)
decryption_config = pe.DecryptionConfiguration()
decryption_properties = crypto_factory.file_decryption_properties(
    config, decryption_config)

num_rows = 100
data = pa.Table.from_pydict({
    'id': pa.array(list(range(num_rows)), type=pa.int32()),
    'timestamp': pa.array(
        [datetime.datetime(2025, 9, 16, 0, 0, 0) + datetime.timedelta(seconds=i)
         for i in range(num_rows)],
        type=pa.timestamp('ms')),
    'x': pa.array(list(range(num_rows)), type=pa.float32()),
})

ds_encryption_config = ds.ParquetEncryptionConfig(
    crypto_factory, config, encryption_config)
encryption_properties = crypto_factory.file_encryption_properties(
    config, encryption_config)

# Write with the Dataset API
pq.write_to_dataset(
    data,
    dataset_path,
    compression=None,
    encryption_config=ds_encryption_config,
    write_statistics=["id", "timestamp"])

# Write a single Parquet file directly
pq.write_table(
    data,
    data_file_path,
    encryption_properties=encryption_properties,
    write_statistics=["id", "timestamp"])

# Read back the statistics from both files
ds_file_path = next(f for f in dataset_path.iterdir() if f.suffix == ".parquet")
for file_path in [ds_file_path, data_file_path]:
    print(f"\nReading {file_path}")
    with pq.ParquetFile(
            file_path,
            decryption_properties=decryption_properties) as f:
        rg = f.metadata.row_group(0)
        for col_idx in range(f.metadata.num_columns):
            col = rg.column(col_idx)
            statistics = "None" if col.statistics is None else str(col.statistics)
            print(f"Column '{col.path_in_schema}' statistics:\n{statistics}")
```
This outputs:
```
Reading tmp_dataset/02779199901f4b51a6eb343881ba1a0f-0.parquet
Column 'id' statistics:
None
Column 'timestamp' statistics:
None
Column 'x' statistics:
None

Reading tmp_data.parquet
Column 'id' statistics:
<pyarrow._parquet.Statistics object at 0x7f11cc9e5580>
  has_min_max: True
  min: 0
  max: 99
  null_count: 0
  distinct_count: None
  num_values: 100
  physical_type: INT32
  logical_type: None
  converted_type (legacy): NONE
Column 'timestamp' statistics:
<pyarrow._parquet.Statistics object at 0x7f11cc9e5670>
  has_min_max: True
  min: 2025-09-16 00:00:00
  max: 2025-09-16 00:01:39
  null_count: 0
  distinct_count: None
  num_values: 100
  physical_type: INT64
  logical_type: Timestamp(isAdjustedToUTC=false, timeUnit=milliseconds, is_from_converted_type=false, force_set_converted_type=false)
  converted_type (legacy): NONE
Column 'x' statistics:
None
```
This is caused by the file encryption properties being added to the writer properties here:
https://github.com/apache/arrow/blob/5a480444da35fa26bc6952755510ad39df9f7002/cpp/src/arrow/dataset/file_parquet.cc#L736-L739
The conversion from `WriterProperties` to a `WriterProperties::Builder` is lossy: it drops any settings that were overridden for specific columns, which is why the per-column `write_statistics` configuration disappears.
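To make the failure mode concrete, here is a minimal standalone C++ sketch of the suspected lossy round-trip. It only assumes the `parquet::WriterProperties::Builder(const WriterProperties&)` constructor used at the linked call site; the variable names are just for illustration:

```cpp
#include <iostream>
#include <memory>

#include <parquet/properties.h>
#include <parquet/schema.h>

int main() {
  // Per-column statistics override, mirroring what
  // write_statistics=["id", "timestamp"] produces on the Python side.
  parquet::WriterProperties::Builder builder;
  builder.disable_statistics()->enable_statistics("id");
  std::shared_ptr<parquet::WriterProperties> original = builder.build();

  // Round-trip through a fresh Builder, as file_parquet.cc does when it
  // attaches the file encryption properties before building again.
  parquet::WriterProperties::Builder round_trip(*original);
  std::shared_ptr<parquet::WriterProperties> copied = round_trip.build();

  auto id = parquet::schema::ColumnPath::FromDotString("id");
  std::cout << original->statistics_enabled(id) << std::endl;  // 1
  // Under the reported behaviour this prints 0: the per-column
  // statistics override is dropped by the Builder conversion.
  std::cout << copied->statistics_enabled(id) << std::endl;
  return 0;
}
```

If that holds, a fix would need the `Builder` conversion (or the dataset writer itself) to carry over the per-column overrides rather than only the default settings.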
### Component(s)
Parquet, C++