kevinjqliu commented on code in PR #2204: URL: https://github.com/apache/iceberg-python/pull/2204#discussion_r2206212298
########## pyiceberg/io/pyarrow.py: ########## @@ -2765,3 +2767,22 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T ) return table_partitions + + +def _get_field_from_arrow_table(arrow_table: pa.Table, field_path: str) -> pa.Array: + """Get a nested field from an Arrow table struct type field using dot notation. + + Args: + arrow_table: The Arrow table containing the field + field_path: Dot-separated field path (e.g., "name" or "bar.baz.timestamp") + + Returns: + The unnested field as a PyArrow Array + """ + if "." not in field_path: Review Comment: this is fine since we use `"."` to implicitly reference nested fields https://github.com/apache/iceberg-python/blob/f475b8e692bf50bb54f468bd5bb55906721a900c/pyiceberg/expressions/parser.py#L100-L102 https://github.com/apache/iceberg-python/blob/f475b8e692bf50bb54f468bd5bb55906721a900c/pyiceberg/table/update/schema.py#L167-L171 ########## tests/io/test_pyarrow.py: ########## @@ -2350,6 +2350,72 @@ def test_partition_for_demo() -> None: ) +def test_partition_for_nested_field() -> None: + schema = Schema( + NestedField(id=1, name="foo", field_type=StringType(), required=True), + NestedField( + id=2, + name="bar", + field_type=StructType( + NestedField(id=3, name="baz", field_type=TimestampType(), required=False), + NestedField(id=4, name="qux", field_type=IntegerType(), required=False), + ), + required=True, + ), + ) + + spec = PartitionSpec(PartitionField(source_id=3, field_id=1000, transform=HourTransform(), name="ts")) + + from datetime import datetime + + t1 = datetime(2025, 7, 11, 9, 30, 0) + t2 = datetime(2025, 7, 11, 10, 30, 0) + + test_data = [ + {"foo": "a", "bar": {"baz": t1, "qux": 1}}, + {"foo": "b", "bar": {"baz": t2, "qux": 2}}, + ] + + arrow_table = pa.Table.from_pylist(test_data, schema=schema.as_arrow()) + partitions = _determine_partitions(spec, schema, arrow_table) Review Comment: nit: instead of using `_determine_partitions` directly, what do you think of round-tripping a write/read
with `.append` and then reading the partition value back via `.inspect`? ########## pyiceberg/io/pyarrow.py: ########## @@ -2765,3 +2767,22 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T ) return table_partitions + + +def _get_field_from_arrow_table(arrow_table: pa.Table, field_path: str) -> pa.Array: + """Get a nested field from an Arrow table struct type field using dot notation. + + Args: + arrow_table: The Arrow table containing the field + field_path: Dot-separated field path (e.g., "name" or "bar.baz.timestamp") + + Returns: + The unnested field as a PyArrow Array + """ + if "." not in field_path: + return arrow_table[field_path] + + path_parts = field_path.split(".") + field_array = arrow_table[path_parts[0]] + field_array = pc.struct_field(field_array, path_parts[1:]) Review Comment: interesting, so we first reference the struct field in the pa.Table and then navigate to it using [`struct_field`'s indices by name](https://arrow.apache.org/docs/python/generated/pyarrow.compute.struct_field.html#pyarrow-compute-struct-field) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org