HonahX commented on code in PR #906: URL: https://github.com/apache/iceberg-python/pull/906#discussion_r1671571771
########## tests/io/test_pyarrow.py: ########## @@ -1718,3 +1720,85 @@ def test_bin_pack_arrow_table(arrow_table_with_null: pa.Table) -> None: # and will produce half the number of files if we double the target size bin_packed = bin_pack_arrow_table(bigger_arrow_tbl, target_file_size=arrow_table_with_null.nbytes * 2) assert len(list(bin_packed)) == 5 + + +def test_partition_for_demo() -> None: + import pyarrow as pa Review Comment: ```suggestion ``` Since the test is moved to `test_pyarrow.py`, we do not need to import pyarrow separately in the test. pyarrow is imported for the whole test file. ########## tests/io/test_pyarrow.py: ########## @@ -1718,3 +1720,85 @@ def test_bin_pack_arrow_table(arrow_table_with_null: pa.Table) -> None: # and will produce half the number of files if we double the target size bin_packed = bin_pack_arrow_table(bigger_arrow_tbl, target_file_size=arrow_table_with_null.nbytes * 2) assert len(list(bin_packed)) == 5 + + +def test_partition_for_demo() -> None: + import pyarrow as pa + + test_pa_schema = pa.schema([("year", pa.int64()), ("n_legs", pa.int64()), ("animal", pa.string())]) + test_schema = Schema( + NestedField(field_id=1, name="year", field_type=StringType(), required=False), + NestedField(field_id=2, name="n_legs", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="animal", field_type=StringType(), required=False), + schema_id=1, + ) + test_data = { + "year": [2020, 2022, 2022, 2022, 2021, 2022, 2022, 2019, 2021], + "n_legs": [2, 2, 2, 4, 4, 4, 4, 5, 100], + "animal": ["Flamingo", "Parrot", "Parrot", "Horse", "Dog", "Horse", "Horse", "Brittle stars", "Centipede"], + } + arrow_table = pa.Table.from_pydict(test_data, schema=test_pa_schema) + partition_spec = PartitionSpec( + PartitionField(source_id=2, field_id=1002, transform=IdentityTransform(), name="n_legs_identity"), + PartitionField(source_id=1, field_id=1001, transform=IdentityTransform(), name="year_identity"), + ) + result = 
_determine_partitions(partition_spec, test_schema, arrow_table) + assert {table_partition.partition_key.partition for table_partition in result} == { + Record(n_legs_identity=2, year_identity=2020), + Record(n_legs_identity=100, year_identity=2021), + Record(n_legs_identity=4, year_identity=2021), + Record(n_legs_identity=4, year_identity=2022), + Record(n_legs_identity=2, year_identity=2022), + Record(n_legs_identity=5, year_identity=2019), + } + assert ( + pa.concat_tables([table_partition.arrow_table_partition for table_partition in result]).num_rows == arrow_table.num_rows + ) + + +def test_identity_partition_on_multi_columns() -> None: + import pyarrow as pa Review Comment: ```suggestion ``` ########## pyiceberg/io/pyarrow.py: ########## @@ -2102,3 +2100,100 @@ def _dataframe_to_data_files( for batches in bin_pack_arrow_table(partition.arrow_table_partition, target_file_size) ]), ) + + +@dataclass(frozen=True) +class TablePartition: Review Comment: ```suggestion class _TablePartition: ``` One minor request: I would prefer to make this class private, because it primarily serves as an intermediate state in our internal write-support logic -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org For additional commands, e-mail: issues-help@iceberg.apache.org