kevinjqliu commented on code in PR #1354:
URL: https://github.com/apache/iceberg-python/pull/1354#discussion_r1852506143
##########
tests/io/test_pyarrow_stats.py:
##########
@@ -681,6 +681,73 @@ def test_stats_types(table_schema_nested: Schema) -> None:
     ]


+def construct_test_table_without_stats() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:

Review Comment:
   nit: this seems similar to `construct_test_table` above. What do you think of adding a parameter to `construct_test_table` to control statistics generation?
   ```
   def construct_test_table(write_statistics: bool = True) -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
   ```
   That way, we don't need this extra function. (A sketch of the parameterized helper follows below.)



##########
tests/io/test_pyarrow_stats.py:
##########
@@ -681,6 +681,73 @@ def test_stats_types(table_schema_nested: Schema) -> None:
     ]


+def construct_test_table_without_stats() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
+    table_metadata = {
+        "format-version": 2,
+        "location": "s3://bucket/test/location",
+        "last-column-id": 7,
+        "current-schema-id": 0,
+        "schemas": [
+            {
+                "type": "struct",
+                "schema-id": 0,
+                "fields": [
+                    {"id": 1, "name": "strings", "required": False, "type": "string"},
+                    {"id": 2, "name": "floats", "required": False, "type": "float"}
+                ]
+            }
+        ],
+        "default-spec-id": 0,
+        "partition-specs": [{"spec-id": 0, "fields": []}],
+        "properties": {},
+    }
+
+    table_metadata = TableMetadataUtil.parse_obj(table_metadata)
+    arrow_schema = schema_to_pyarrow(table_metadata.schemas[0])
+    _strings = ["zzzzzzzzzzzzzzzzzzzz", "rrrrrrrrrrrrrrrrrrrr", None, "aaaaaaaaaaaaaaaaaaaa"]
+    _floats = [3.14, math.nan, 1.69, 100]
+
+    table = pa.Table.from_pydict(
+        {
+            "strings": _strings,
+            "floats": _floats
+        },
+        schema=arrow_schema,
+    )
+
+    metadata_collector: List[Any] = []
+
+    with pa.BufferOutputStream() as f:
+        with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector, write_statistics=False) as writer:
+            writer.write_table(table)
+
+    return metadata_collector[0], table_metadata
+
+
+def test_is_stats_set_false() -> None:
+    metadata, table_metadata = construct_test_table_without_stats()
+    schema = get_current_schema(table_metadata)
+    statistics = data_file_statistics_from_parquet_metadata(
+        parquet_metadata=metadata,
+        stats_columns=compute_statistics_plan(schema, table_metadata.properties),
+        parquet_column_mapping=parquet_path_to_id_mapping(schema),
+    )
+    datafile = DataFile(**statistics.to_serialized_dict())
+
+    # assert attributes except for column_aggregates and null_value_counts are present

Review Comment:
   I think the error case is when the `parquet_metadata` passed to `data_file_statistics_from_parquet_metadata` does not have `col_aggs` and `null_value_counts`, causing the function to error. In that case, can we assert that the input (`metadata`) does not contain them, and maybe also assert that the output (`datafile`) doesn't contain them? This test should fail without the `pyarrow` change above. (A sketch of such assertions follows below.)
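Regarding the first comment's suggestion, here is a minimal, hypothetical sketch of how a `write_statistics` parameter could be threaded through to `pq.ParquetWriter`. To stay self-contained it uses plain pyarrow types instead of the PR's Iceberg metadata helpers, so the schema handling and return type differ from the real `construct_test_table`:

```python
from typing import Any, List, Tuple

import pyarrow as pa
import pyarrow.parquet as pq


def construct_test_table(write_statistics: bool = True) -> Tuple[pq.FileMetaData, pa.Schema]:
    # Toy two-column table; the real helper builds this from Iceberg table metadata.
    arrow_schema = pa.schema([("strings", pa.string()), ("floats", pa.float32())])
    table = pa.Table.from_pydict(
        {"strings": ["aaa", None, "zzz"], "floats": [3.14, 1.69, None]},
        schema=arrow_schema,
    )

    metadata_collector: List[Any] = []
    sink = pa.BufferOutputStream()
    # Forward the flag to ParquetWriter: True (the default) writes per-column
    # min/max/null-count statistics into the footer, False omits them.
    with pq.ParquetWriter(
        sink,
        table.schema,
        metadata_collector=metadata_collector,
        write_statistics=write_statistics,
    ) as writer:
        writer.write_table(table)

    return metadata_collector[0], arrow_schema
```

Existing tests would keep calling `construct_test_table()`, and the no-stats test would call `construct_test_table(write_statistics=False)`.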
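And for the assertion suggestion just above, a rough sketch of what the test body could check, reusing the helper and imports from the hunk. Whether the stats-derived `DataFile` fields come back as empty dicts (rather than, say, `None`) depends on the accompanying pyiceberg change, so the expected values below are an assumption:

```python
def test_is_stats_set_false() -> None:
    metadata, table_metadata = construct_test_table_without_stats()

    # Input side: the Parquet footer should carry no column statistics at all.
    for rg in range(metadata.num_row_groups):
        for col in range(metadata.num_columns):
            column = metadata.row_group(rg).column(col)
            assert column.is_stats_set is False
            assert column.statistics is None

    schema = get_current_schema(table_metadata)
    statistics = data_file_statistics_from_parquet_metadata(
        parquet_metadata=metadata,
        stats_columns=compute_statistics_plan(schema, table_metadata.properties),
        parquet_column_mapping=parquet_path_to_id_mapping(schema),
    )
    datafile = DataFile(**statistics.to_serialized_dict())

    # Output side: with no footer stats there is nothing to aggregate, so the
    # stats-derived fields should be empty (assumed semantics).
    assert datafile.null_value_counts == {}
    assert datafile.lower_bounds == {}
    assert datafile.upper_bounds == {}
```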
##########
tests/io/test_pyarrow_stats.py:
##########
@@ -681,6 +681,73 @@ def test_stats_types(table_schema_nested: Schema) -> None:
     ]


+def construct_test_table_without_stats() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:

Review Comment:
   Looks like `write_statistics` can also take a list of column names: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html (a sketch of the list form follows at the end of this message)



##########
tests/io/test_pyarrow_stats.py:
##########
@@ -681,6 +681,73 @@ def test_stats_types(table_schema_nested: Schema) -> None:
     ]


+def construct_test_table_without_stats() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
+    table_metadata = {
+        "format-version": 2,
+        "location": "s3://bucket/test/location",
+        "last-column-id": 7,
+        "current-schema-id": 0,
+        "schemas": [
+            {
+                "type": "struct",
+                "schema-id": 0,
+                "fields": [
+                    {"id": 1, "name": "strings", "required": False, "type": "string"},
+                    {"id": 2, "name": "floats", "required": False, "type": "float"}
+                ]
+            }
+        ],
+        "default-spec-id": 0,
+        "partition-specs": [{"spec-id": 0, "fields": []}],
+        "properties": {},
+    }
+
+    table_metadata = TableMetadataUtil.parse_obj(table_metadata)
+    arrow_schema = schema_to_pyarrow(table_metadata.schemas[0])
+    _strings = ["zzzzzzzzzzzzzzzzzzzz", "rrrrrrrrrrrrrrrrrrrr", None, "aaaaaaaaaaaaaaaaaaaa"]
+    _floats = [3.14, math.nan, 1.69, 100]
+
+    table = pa.Table.from_pydict(
+        {
+            "strings": _strings,
+            "floats": _floats
+        },
+        schema=arrow_schema,
+    )
+
+    metadata_collector: List[Any] = []
+
+    with pa.BufferOutputStream() as f:
+        with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector, write_statistics=False) as writer:
+            writer.write_table(table)
+
+    return metadata_collector[0], table_metadata
+
+
+def test_is_stats_set_false() -> None:

Review Comment:
   nit: maybe something like `test_read_missing_statistics`
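As referenced in the `write_statistics` comment above, a small self-contained sketch of the list form. The column names here are toy data; per the linked pyarrow docs, passing a list writes statistics only for the named columns:

```python
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"strings": ["a", "b", None], "floats": [1.0, 2.0, 3.0]})

sink = pa.BufferOutputStream()
# A list of column names restricts statistics to just those columns.
with pq.ParquetWriter(sink, table.schema, write_statistics=["strings"]) as writer:
    writer.write_table(table)

footer = pq.read_metadata(pa.BufferReader(sink.getvalue()))
assert footer.row_group(0).column(0).is_stats_set       # "strings": stats present
assert not footer.row_group(0).column(1).is_stats_set   # "floats": no stats
```

That could be handy if a test ever needs a file with statistics on only a subset of columns.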