This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new ed91f6f7a5 GH-47441: [Python][Parquet] Allow passing
write_time_adjusted_to_utc to Python's ParquetWriter (#47745)
ed91f6f7a5 is described below
commit ed91f6f7a5998d74a60d590fea17d89adef49039
Author: Bogdan Romenskii <[email protected]>
AuthorDate: Tue Oct 14 10:47:30 2025 +0200
GH-47441: [Python][Parquet] Allow passing write_time_adjusted_to_utc to
Python's ParquetWriter (#47745)
### Rationale for this change
Please see #47441 and #41476.
The `ArrowWriterProperties.write_time_adjusted_to_utc` flag is available in
C++, yet isn't accessible from Python. This PR exposes that flag in the
Python API as well.
### What changes are included in this PR?
Exposure of the `write_time_adjusted_to_utc` boolean argument in Python's API.
### Are these changes tested?
Yes, roundtrip parquet tests for all combinations of time types and their
respective time units.
### Are there any user-facing changes?
Users will be able to set this flag directly from the Python API.
* GitHub Issue: #47441
Lead-authored-by: Bogdan Romenskii <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
python/pyarrow/_parquet.pxd | 1 +
python/pyarrow/_parquet.pyx | 9 +++--
python/pyarrow/includes/libparquet.pxd | 1 +
python/pyarrow/parquet/core.py | 10 ++++++
.../pyarrow/tests/parquet/test_parquet_writer.py | 40 ++++++++++++++++++++++
5 files changed, 59 insertions(+), 2 deletions(-)
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index 94365f0f7c..704eb06cc3 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -66,6 +66,7 @@ cdef shared_ptr[ArrowWriterProperties]
_create_arrow_writer_properties(
writer_engine_version=*,
use_compliant_nested_type=*,
store_schema=*,
+ write_time_adjusted_to_utc=*,
) except *
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index d59c70a274..14cd3e363a 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -2202,7 +2202,8 @@ cdef shared_ptr[ArrowWriterProperties]
_create_arrow_writer_properties(
allow_truncated_timestamps=False,
writer_engine_version=None,
use_compliant_nested_type=True,
- store_schema=True) except *:
+ store_schema=True,
+ write_time_adjusted_to_utc=False) except *:
"""Arrow writer properties"""
cdef:
shared_ptr[ArrowWriterProperties] arrow_properties
@@ -2251,6 +2252,8 @@ cdef shared_ptr[ArrowWriterProperties]
_create_arrow_writer_properties(
elif writer_engine_version != "V2":
raise ValueError(f"Unsupported Writer Engine Version:
{writer_engine_version}")
+ arrow_props.set_time_adjusted_to_utc(write_time_adjusted_to_utc)
+
arrow_properties = arrow_props.build()
return arrow_properties
@@ -2312,7 +2315,8 @@ cdef class ParquetWriter(_Weakrefable):
write_page_checksum=False,
sorting_columns=None,
store_decimal_as_integer=False,
- use_content_defined_chunking=False):
+ use_content_defined_chunking=False,
+ write_time_adjusted_to_utc=False):
cdef:
shared_ptr[WriterProperties] properties
shared_ptr[ArrowWriterProperties] arrow_properties
@@ -2356,6 +2360,7 @@ cdef class ParquetWriter(_Weakrefable):
writer_engine_version=writer_engine_version,
use_compliant_nested_type=use_compliant_nested_type,
store_schema=store_schema,
+ write_time_adjusted_to_utc=write_time_adjusted_to_utc,
)
pool = maybe_unbox_memory_pool(memory_pool)
diff --git a/python/pyarrow/includes/libparquet.pxd
b/python/pyarrow/includes/libparquet.pxd
index d9dd9d1aec..42d48ba050 100644
--- a/python/pyarrow/includes/libparquet.pxd
+++ b/python/pyarrow/includes/libparquet.pxd
@@ -519,6 +519,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet"
nogil:
Builder* enable_compliant_nested_types()
Builder* disable_compliant_nested_types()
Builder* set_engine_version(ArrowWriterEngineVersion version)
+ Builder* set_time_adjusted_to_utc(c_bool adjusted)
shared_ptr[ArrowWriterProperties] build()
c_bool support_deprecated_int96_timestamps()
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index aaf15c2028..24cb586c82 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -939,6 +939,12 @@ use_content_defined_chunking : bool or dict, default False
balance between deduplication ratio and fragmentation. Use norm_level=1
or
norm_level=2 to reach a higher deduplication ratio at the expense of
fragmentation.
+write_time_adjusted_to_utc : bool, default False
+ Set the value of isAdjustedToUTC when writing a TIME column.
+ If True, this tells the Parquet reader that the TIME columns
+ are expressed in reference to midnight in the UTC timezone.
+ If False (the default), the TIME columns are assumed to be expressed
+ in reference to midnight in an unknown, presumably local, timezone.
"""
_parquet_writer_example_doc = """\
@@ -1035,6 +1041,7 @@ Examples
write_page_checksum=False,
sorting_columns=None,
store_decimal_as_integer=False,
+ write_time_adjusted_to_utc=False,
**options):
if use_deprecated_int96_timestamps is None:
# Use int96 timestamps for Spark
@@ -1088,6 +1095,7 @@ Examples
write_page_checksum=write_page_checksum,
sorting_columns=sorting_columns,
store_decimal_as_integer=store_decimal_as_integer,
+ write_time_adjusted_to_utc=write_time_adjusted_to_utc,
**options)
self.is_open = True
@@ -1949,6 +1957,7 @@ def write_table(table, where, row_group_size=None,
version='2.6',
write_page_checksum=False,
sorting_columns=None,
store_decimal_as_integer=False,
+ write_time_adjusted_to_utc=False,
**kwargs):
# Implementor's note: when adding keywords here / updating defaults, also
# update it in write_to_dataset and _dataset_parquet.pyx
ParquetFileWriteOptions
@@ -1980,6 +1989,7 @@ def write_table(table, where, row_group_size=None,
version='2.6',
write_page_checksum=write_page_checksum,
sorting_columns=sorting_columns,
store_decimal_as_integer=store_decimal_as_integer,
+ write_time_adjusted_to_utc=write_time_adjusted_to_utc,
**kwargs) as writer:
writer.write_table(table, row_group_size=row_group_size)
except Exception:
diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py
b/python/pyarrow/tests/parquet/test_parquet_writer.py
index f9df90b135..3e7352428c 100644
--- a/python/pyarrow/tests/parquet/test_parquet_writer.py
+++ b/python/pyarrow/tests/parquet/test_parquet_writer.py
@@ -447,3 +447,43 @@ def
test_parquet_content_defined_chunking_parameters(tempdir):
# using min_chunk_size, max_chunk_size and norm_level
cdc_options = {"min_chunk_size": 32_768, "max_chunk_size": 65_536,
"norm_level": 1}
pq.write_table(table, path, use_content_defined_chunking=cdc_options)
+
+
+@pytest.mark.parametrize("time_type, time_unit", [
+ (pa.time32, "s"),
+ (pa.time32, "ms"),
+ (pa.time64, "us"),
+ (pa.time64, "ns"),
+])
+@pytest.mark.parametrize("utc_flag_val", [False, True])
+def test_arrow_writer_props_time_adjusted_to_utc(
+ tempdir,
+ utc_flag_val,
+ time_type,
+ time_unit,
+):
+ # GH-47441
+ filename = tempdir / "time_adjusted_to_utc.parquet"
+
+ time_values = [0, 123, 10_000, 86_399]
+
+ table = pa.table({
+ "time_col": pa.array(time_values, type=time_type(time_unit)),
+ })
+
+ schema = pa.schema([
+ ("time_col", time_type(time_unit)),
+ ])
+
+ with pq.ParquetWriter(
+ where=filename,
+ schema=schema,
+ write_time_adjusted_to_utc=utc_flag_val,
+ ) as writer:
+ writer.write_table(table)
+
+ result = pq.read_table(filename, schema=schema)
+
+ result.validate(full=True)
+
+ assert result.equals(table)