kevinjqliu commented on code in PR #1799:
URL: https://github.com/apache/iceberg-python/pull/1799#discussion_r1999324401


##########
tests/io/test_pyarrow_stats.py:
##########
@@ -72,7 +72,7 @@
     StringType,
 )
 from pyiceberg.utils.datetime import date_to_days, datetime_to_micros, 
time_to_micros
-
+from decimal import Decimal

Review Comment:
   i think we should use 
[pyarrow.decimal128](https://arrow.apache.org/docs/python/generated/pyarrow.decimal128.html)
 instead



##########
tests/io/test_pyarrow_stats.py:
##########
@@ -470,6 +473,9 @@ def construct_test_table_primitive_types() -> 
Tuple[pq.FileMetaData, Union[Table
     strings = ["hello", "world"]
     uuids = [uuid.uuid3(uuid.NAMESPACE_DNS, "foo").bytes, 
uuid.uuid3(uuid.NAMESPACE_DNS, "bar").bytes]
     binaries = [b"hello", b"world"]
+    decimal8 = [Decimal("123.45"), Decimal("678.91")]
+    decimal16 = [Decimal("123456789.123456"), Decimal("678912345.678912")]
+    decimal32 = [Decimal("12345678901234.123456"), 
Decimal("98765432109870.654321")]

Review Comment:
   these should use the proper precision for decimal128



##########
tests/io/test_pyarrow_stats.py:
##########
@@ -529,8 +538,12 @@ def test_metrics_primitive_types() -> None:
     assert datafile.lower_bounds[10] == b"he"
     assert datafile.lower_bounds[11] == uuid.uuid3(uuid.NAMESPACE_DNS, 
"foo").bytes
     assert datafile.lower_bounds[12] == b"he"
+    assert int.from_bytes(datafile.lower_bounds[13], byteorder="big", 
signed=True) == int(12345)
+    assert int.from_bytes(datafile.lower_bounds[14], byteorder="big", 
signed=True) == int(123456789123456)
+    assert int.from_bytes(datafile.lower_bounds[15], byteorder="big", 
signed=True) == int(12345678901234123456)

Review Comment:
   we should test the physical type, such as `STRUCT_INT32`, `STRUCT_INT64`, 
and  `b"he"` (`FIXED_LEN_BYTE_ARRAY`)



##########
tests/io/test_pyarrow_stats.py:
##########
@@ -485,14 +491,17 @@ def construct_test_table_primitive_types() -> 
Tuple[pq.FileMetaData, Union[Table
             "strings": strings,
             "uuids": uuids,
             "binaries": binaries,
+            "decimal8": decimal8,
+            "decimal16": decimal16,
+            "decimal32": decimal32,
         },
         schema=arrow_schema,
     )
 
     metadata_collector: List[Any] = []
 
     with pa.BufferOutputStream() as f:
-        with pq.ParquetWriter(f, table.schema, 
metadata_collector=metadata_collector) as writer:
+        with pq.ParquetWriter(f, table.schema, 
metadata_collector=metadata_collector, store_decimal_as_integer=True) as writer:

Review Comment:
   do we need `store_decimal_as_integer=True`?  



##########
pyiceberg/io/pyarrow.py:
##########
@@ -2335,9 +2340,18 @@ def data_file_statistics_from_parquet_metadata(
                         col_aggs[field_id] = StatsAggregator(
                             stats_col.iceberg_type, statistics.physical_type, 
stats_col.mode.length
                         )
-
-                    col_aggs[field_id].update_min(statistics.min)
-                    col_aggs[field_id].update_max(statistics.max)
+                    matches=DECIMAL_REGEX.search(str(stats_col.iceberg_type))
+                    if matches and statistics.physical_type != 
"FIXED_LEN_BYTE_ARRAY":
+                        precision=int(matches.group(1))
+                        scale=int(matches.group(2))
+                        local_context = Context(prec=precision)
+                        decoded_min = 
local_context.create_decimal(Decimal(statistics.min_raw)/ (10 ** scale))
+                        decoded_max = 
local_context.create_decimal(Decimal(statistics.max_raw)/ (10 ** scale))
+                        col_aggs[field_id].update_min(decoded_min)
+                        col_aggs[field_id].update_max(decoded_max)
+                    else:
+                        col_aggs[field_id].update_min(statistics.min)
+                        col_aggs[field_id].update_max(statistics.max)

Review Comment:
   im confused why we need to use the regex here



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to