This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new ed91f6f7a5 GH-47441: [Python][Parquet] Allow passing 
write_time_adjusted_to_utc to Python's ParquetWriter (#47745)
ed91f6f7a5 is described below

commit ed91f6f7a5998d74a60d590fea17d89adef49039
Author: Bogdan Romenskii <[email protected]>
AuthorDate: Tue Oct 14 10:47:30 2025 +0200

    GH-47441: [Python][Parquet] Allow passing write_time_adjusted_to_utc to 
Python's ParquetWriter (#47745)
    
    ### Rationale for this change
    Please see #47441 and #41476.
    The `ArrowWriterProperties.write_time_adjusted_to_utc` flag is available in 
C++, yet isn't accessible from Python. This PR introduces the said flag to 
Python API as well.
    
    ### What changes are included in this PR?
    Exposure of `use_time_adjusted_to_utc` boolean argument in Python's API.
    
    ### Are these changes tested?
    Yes, roundtrip parquet tests for all combinations of time types and their 
respective time units.
    
    ### Are there any user-facing changes?
    The users will be able to adjust the said flag directly from Python API.
    * GitHub Issue: #47441
    
    Lead-authored-by: Bogdan Romenskii <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 python/pyarrow/_parquet.pxd                        |  1 +
 python/pyarrow/_parquet.pyx                        |  9 +++--
 python/pyarrow/includes/libparquet.pxd             |  1 +
 python/pyarrow/parquet/core.py                     | 10 ++++++
 .../pyarrow/tests/parquet/test_parquet_writer.py   | 40 ++++++++++++++++++++++
 5 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index 94365f0f7c..704eb06cc3 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -66,6 +66,7 @@ cdef shared_ptr[ArrowWriterProperties] 
_create_arrow_writer_properties(
     writer_engine_version=*,
     use_compliant_nested_type=*,
     store_schema=*,
+    write_time_adjusted_to_utc=*,
 ) except *
 
 
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index d59c70a274..14cd3e363a 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -2202,7 +2202,8 @@ cdef shared_ptr[ArrowWriterProperties] 
_create_arrow_writer_properties(
         allow_truncated_timestamps=False,
         writer_engine_version=None,
         use_compliant_nested_type=True,
-        store_schema=True) except *:
+        store_schema=True,
+        write_time_adjusted_to_utc=False) except *:
     """Arrow writer properties"""
     cdef:
         shared_ptr[ArrowWriterProperties] arrow_properties
@@ -2251,6 +2252,8 @@ cdef shared_ptr[ArrowWriterProperties] 
_create_arrow_writer_properties(
     elif writer_engine_version != "V2":
         raise ValueError(f"Unsupported Writer Engine Version: 
{writer_engine_version}")
 
+    arrow_props.set_time_adjusted_to_utc(write_time_adjusted_to_utc)
+
     arrow_properties = arrow_props.build()
 
     return arrow_properties
@@ -2312,7 +2315,8 @@ cdef class ParquetWriter(_Weakrefable):
                   write_page_checksum=False,
                   sorting_columns=None,
                   store_decimal_as_integer=False,
-                  use_content_defined_chunking=False):
+                  use_content_defined_chunking=False,
+                  write_time_adjusted_to_utc=False):
         cdef:
             shared_ptr[WriterProperties] properties
             shared_ptr[ArrowWriterProperties] arrow_properties
@@ -2356,6 +2360,7 @@ cdef class ParquetWriter(_Weakrefable):
             writer_engine_version=writer_engine_version,
             use_compliant_nested_type=use_compliant_nested_type,
             store_schema=store_schema,
+            write_time_adjusted_to_utc=write_time_adjusted_to_utc,
         )
 
         pool = maybe_unbox_memory_pool(memory_pool)
diff --git a/python/pyarrow/includes/libparquet.pxd 
b/python/pyarrow/includes/libparquet.pxd
index d9dd9d1aec..42d48ba050 100644
--- a/python/pyarrow/includes/libparquet.pxd
+++ b/python/pyarrow/includes/libparquet.pxd
@@ -519,6 +519,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" 
nogil:
             Builder* enable_compliant_nested_types()
             Builder* disable_compliant_nested_types()
             Builder* set_engine_version(ArrowWriterEngineVersion version)
+            Builder* set_time_adjusted_to_utc(c_bool adjusted)
             shared_ptr[ArrowWriterProperties] build()
         c_bool support_deprecated_int96_timestamps()
 
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index aaf15c2028..24cb586c82 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -939,6 +939,12 @@ use_content_defined_chunking : bool or dict, default False
       balance between deduplication ratio and fragmentation. Use norm_level=1 
or
       norm_level=2 to reach a higher deduplication ratio at the expense of
       fragmentation.
+write_time_adjusted_to_utc : bool, default False
+    Set the value of isAdjustedToUTC when writing a TIME column.
+    If True, this tells the Parquet reader that the TIME columns
+    are expressed in reference to midnight in the UTC timezone.
+    If False (the default), the TIME columns are assumed to be expressed
+    in reference to midnight in an unknown, presumably local, timezone.
 """
 
 _parquet_writer_example_doc = """\
@@ -1035,6 +1041,7 @@ Examples
                  write_page_checksum=False,
                  sorting_columns=None,
                  store_decimal_as_integer=False,
+                 write_time_adjusted_to_utc=False,
                  **options):
         if use_deprecated_int96_timestamps is None:
             # Use int96 timestamps for Spark
@@ -1088,6 +1095,7 @@ Examples
             write_page_checksum=write_page_checksum,
             sorting_columns=sorting_columns,
             store_decimal_as_integer=store_decimal_as_integer,
+            write_time_adjusted_to_utc=write_time_adjusted_to_utc,
             **options)
         self.is_open = True
 
@@ -1949,6 +1957,7 @@ def write_table(table, where, row_group_size=None, 
version='2.6',
                 write_page_checksum=False,
                 sorting_columns=None,
                 store_decimal_as_integer=False,
+                write_time_adjusted_to_utc=False,
                 **kwargs):
     # Implementor's note: when adding keywords here / updating defaults, also
     # update it in write_to_dataset and _dataset_parquet.pyx 
ParquetFileWriteOptions
@@ -1980,6 +1989,7 @@ def write_table(table, where, row_group_size=None, 
version='2.6',
                 write_page_checksum=write_page_checksum,
                 sorting_columns=sorting_columns,
                 store_decimal_as_integer=store_decimal_as_integer,
+                write_time_adjusted_to_utc=write_time_adjusted_to_utc,
                 **kwargs) as writer:
             writer.write_table(table, row_group_size=row_group_size)
     except Exception:
diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py 
b/python/pyarrow/tests/parquet/test_parquet_writer.py
index f9df90b135..3e7352428c 100644
--- a/python/pyarrow/tests/parquet/test_parquet_writer.py
+++ b/python/pyarrow/tests/parquet/test_parquet_writer.py
@@ -447,3 +447,43 @@ def 
test_parquet_content_defined_chunking_parameters(tempdir):
     # using min_chunk_size, max_chunk_size and norm_level
     cdc_options = {"min_chunk_size": 32_768, "max_chunk_size": 65_536, 
"norm_level": 1}
     pq.write_table(table, path, use_content_defined_chunking=cdc_options)
+
+
[email protected]("time_type, time_unit", [
+    (pa.time32, "s"),
+    (pa.time32, "ms"),
+    (pa.time64, "us"),
+    (pa.time64, "ns"),
+])
[email protected]("utc_flag_val", [False, True])
+def test_arrow_writer_props_time_adjusted_to_utc(
+    tempdir,
+    utc_flag_val,
+    time_type,
+    time_unit,
+):
+    # GH-47441
+    filename = tempdir / "time_adjusted_to_utc.parquet"
+
+    time_values = [0, 123, 10_000, 86_399]
+
+    table = pa.table({
+        "time_col": pa.array(time_values, type=time_type(time_unit)),
+    })
+
+    schema = pa.schema([
+        ("time_col", time_type(time_unit)),
+    ])
+
+    with pq.ParquetWriter(
+        where=filename,
+        schema=schema,
+        write_time_adjusted_to_utc=utc_flag_val,
+    ) as writer:
+        writer.write_table(table)
+
+    result = pq.read_table(filename, schema=schema)
+
+    result.validate(full=True)
+
+    assert result.equals(table)

Reply via email to