kevinjqliu commented on code in PR #1799:
URL: https://github.com/apache/iceberg-python/pull/1799#discussion_r1999324401
########## tests/io/test_pyarrow_stats.py: ##########

@@ -72,7 +72,7 @@
     StringType,
 )
 from pyiceberg.utils.datetime import date_to_days, datetime_to_micros, time_to_micros
-
+from decimal import Decimal

Review Comment: I think we should use [pyarrow.decimal128](https://arrow.apache.org/docs/python/generated/pyarrow.decimal128.html) instead.


########## tests/io/test_pyarrow_stats.py: ##########

@@ -470,6 +473,9 @@ def construct_test_table_primitive_types() -> Tuple[pq.FileMetaData, Union[Table
     strings = ["hello", "world"]
     uuids = [uuid.uuid3(uuid.NAMESPACE_DNS, "foo").bytes, uuid.uuid3(uuid.NAMESPACE_DNS, "bar").bytes]
     binaries = [b"hello", b"world"]
+    decimal8 = [Decimal("123.45"), Decimal("678.91")]
+    decimal16 = [Decimal("123456789.123456"), Decimal("678912345.678912")]
+    decimal32 = [Decimal("12345678901234.123456"), Decimal("98765432109870.654321")]

Review Comment: These should use the proper precision for decimal128.


########## tests/io/test_pyarrow_stats.py: ##########

@@ -529,8 +538,12 @@ def test_metrics_primitive_types() -> None:
     assert datafile.lower_bounds[10] == b"he"
     assert datafile.lower_bounds[11] == uuid.uuid3(uuid.NAMESPACE_DNS, "foo").bytes
     assert datafile.lower_bounds[12] == b"he"
+    assert int.from_bytes(datafile.lower_bounds[13], byteorder="big", signed=True) == int(12345)
+    assert int.from_bytes(datafile.lower_bounds[14], byteorder="big", signed=True) == int(123456789123456)
+    assert int.from_bytes(datafile.lower_bounds[15], byteorder="big", signed=True) == int(12345678901234123456)

Review Comment: We should test the physical type, such as `STRUCT_INT32`, `STRUCT_INT64`, and `b"he"` (`FIXED_LEN_BYTE_ARRAY`).


########## tests/io/test_pyarrow_stats.py: ##########

@@ -485,14 +491,17 @@ def construct_test_table_primitive_types() -> Tuple[pq.FileMetaData, Union[Table
             "strings": strings,
             "uuids": uuids,
             "binaries": binaries,
+            "decimal8": decimal8,
+            "decimal16": decimal16,
+            "decimal32": decimal32,
         },
         schema=arrow_schema,
     )
     metadata_collector: List[Any] = []
     with pa.BufferOutputStream() as f:
-        with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector) as writer:
+        with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector, store_decimal_as_integer=True) as writer:

Review Comment: Do we need `store_decimal_as_integer=True`?


########## pyiceberg/io/pyarrow.py: ##########

@@ -2335,9 +2340,18 @@ def data_file_statistics_from_parquet_metadata(
                 col_aggs[field_id] = StatsAggregator(
                     stats_col.iceberg_type, statistics.physical_type, stats_col.mode.length
                 )
-
-            col_aggs[field_id].update_min(statistics.min)
-            col_aggs[field_id].update_max(statistics.max)
+            matches = DECIMAL_REGEX.search(str(stats_col.iceberg_type))
+            if matches and statistics.physical_type != "FIXED_LEN_BYTE_ARRAY":
+                precision = int(matches.group(1))
+                scale = int(matches.group(2))
+                local_context = Context(prec=precision)
+                decoded_min = local_context.create_decimal(Decimal(statistics.min_raw) / (10 ** scale))
+                decoded_max = local_context.create_decimal(Decimal(statistics.max_raw) / (10 ** scale))
+                col_aggs[field_id].update_min(decoded_min)
+                col_aggs[field_id].update_max(decoded_max)
+            else:
+                col_aggs[field_id].update_min(statistics.min)
+                col_aggs[field_id].update_max(statistics.max)

Review Comment: I'm confused why we need to use the regex here.
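
To make the pyarrow.decimal128 suggestion concrete, here is a minimal sketch (not the PR's code) of how the three test columns could be declared with explicit precision and scale sized to the sample values in the hunks above:

    import pyarrow as pa
    from decimal import Decimal

    # Declare the test columns as decimal128 so Arrow enforces precision/scale.
    # Precisions are sized to the sample values from the review hunks above.
    arrow_schema = pa.schema(
        [
            pa.field("decimal8", pa.decimal128(5, 2)),    # 123.45: 5 digits, scale 2
            pa.field("decimal16", pa.decimal128(15, 6)),  # 123456789.123456: 15 digits, scale 6
            pa.field("decimal32", pa.decimal128(20, 6)),  # 12345678901234.123456: 20 digits, scale 6
        ]
    )

    table = pa.table(
        {
            "decimal8": [Decimal("123.45"), Decimal("678.91")],
            "decimal16": [Decimal("123456789.123456"), Decimal("678912345.678912")],
            "decimal32": [Decimal("12345678901234.123456"), Decimal("98765432109870.654321")],
        },
        schema=arrow_schema,
    )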
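
On the physical-type comment: one way to cover all three Parquet representations is to assert on the physical types recorded in the file metadata. This is a sketch with a hypothetical helper name; `metadata` is assumed to be the pq.FileMetaData returned by construct_test_table_primitive_types(), and column indices 13-15 follow the ordering in the hunks above:

    import pyarrow.parquet as pq

    def assert_decimal_physical_types(metadata: pq.FileMetaData) -> None:
        # With store_decimal_as_integer=True, pyarrow stores decimals with
        # precision <= 9 as INT32 and precision <= 18 as INT64; wider
        # decimals fall back to FIXED_LEN_BYTE_ARRAY.
        row_group = metadata.row_group(0)
        assert row_group.column(13).physical_type == "INT32"                  # decimal(5, 2)
        assert row_group.column(14).physical_type == "INT64"                  # decimal(15, 6)
        assert row_group.column(15).physical_type == "FIXED_LEN_BYTE_ARRAY"   # decimal(20, 6)

This also bears on the store_decimal_as_integer question: without that writer flag, all three columns would be written as FIXED_LEN_BYTE_ARRAY, and the INT32/INT64 branches of the statistics code would go untested.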
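
And on the regex question: a possible regex-free sketch that reads precision and scale from the Iceberg type itself, assuming pyiceberg's DecimalType exposes them as attributes (it does in current releases). The surrounding names (stats_col, statistics, col_aggs, field_id) come from the hunk above; this is an illustration, not the PR's implementation:

    from decimal import Decimal

    from pyiceberg.types import DecimalType

    if isinstance(stats_col.iceberg_type, DecimalType) and statistics.physical_type != "FIXED_LEN_BYTE_ARRAY":
        # min_raw/max_raw are the unscaled integers Parquet stored for the
        # column; scaleb(-scale) shifts the decimal point without parsing
        # str(iceberg_type) through a regex.
        scale = stats_col.iceberg_type.scale
        col_aggs[field_id].update_min(Decimal(statistics.min_raw).scaleb(-scale))
        col_aggs[field_id].update_max(Decimal(statistics.max_raw).scaleb(-scale))
    else:
        col_aggs[field_id].update_min(statistics.min)
        col_aggs[field_id].update_max(statistics.max)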