rdblue commented on code in PR #6437:
URL: https://github.com/apache/iceberg/pull/6437#discussion_r1059515855
##########
python/pyiceberg/io/pyarrow.py:
##########
@@ -437,3 +468,170 @@ def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> p
 def expression_to_pyarrow(expr: BooleanExpression) -> pc.Expression:
     return boolean_expression_visit(expr, _ConvertToArrowExpression())
+
+
+def project_table(
+    files: Iterable[FileScanTask], table: Table, row_filter: BooleanExpression, projected_schema: Schema, case_sensitive: bool
+) -> pa.Table:
+    """Reads the given file scan tasks and projects the data to the requested schema
+
+    Args:
+        files(Iterable[FileScanTask]): The file scan tasks to read
+        table(Table): The table that's being queried
+        row_filter(BooleanExpression): The expression for filtering rows
+        projected_schema(Schema): The output schema
+        case_sensitive(bool): Case sensitivity when looking up column names
+
+    Raises:
+        ResolveException: When an incompatible query is done
+    """
+
+    if isinstance(table.io, PyArrowFileIO):
+        scheme, path = PyArrowFileIO.parse_location(table.location())
+        fs = table.io.get_fs(scheme)
+    else:
+        raise ValueError(f"Expected PyArrowFileIO, got: {table.io}")
+
+    bound_row_filter = bind(table.schema(), row_filter, case_sensitive=case_sensitive)
+
+    projected_field_ids = {
+        id for id in projected_schema.field_ids if not isinstance(projected_schema.find_type(id), (MapType, ListType))
+    }.union(extract_field_ids(bound_row_filter))
+
+    tables = []
+    for task in files:
+        _, path = PyArrowFileIO.parse_location(task.file.file_path)
+
+        # Read the Iceberg schema from the Parquet footer metadata
+        with fs.open_input_file(path) as fout:
+            parquet_schema = pq.read_schema(fout)
+            schema_raw = parquet_schema.metadata.get(ICEBERG_SCHEMA)
+
+        if schema_raw is None:
+            raise ValueError(f"Missing Iceberg schema in Metadata for file: {path}")
+
+        file_schema = Schema.parse_raw(schema_raw)
+
+        pyarrow_filter = None
+        if row_filter is not AlwaysTrue():
+            translated_row_filter = translate_column_names(bound_row_filter, file_schema, case_sensitive=case_sensitive)
+            bound_file_filter = bind(file_schema, translated_row_filter, case_sensitive=case_sensitive)
+            pyarrow_filter = expression_to_pyarrow(bound_file_filter)
+
+        # Prune the file schema down to the columns we actually need
+        file_project_schema = prune_columns(file_schema, projected_field_ids, select_full_types=False)
+        file_project_schema_arrow = schema_to_pyarrow(file_project_schema)
+
+        arrow_table = ds.dataset(
+            source=[path], schema=file_project_schema_arrow, format=ds.ParquetFileFormat(), filesystem=fs
+        ).to_table(filter=pyarrow_filter)
+
+        tables.append(to_requested_schema(projected_schema, file_project_schema, arrow_table))
+
+    if len(tables) > 1:
+        return pa.concat_tables(tables)
+    else:
+        return tables[0]
+
+
+def to_requested_schema(requested_schema: Schema, file_schema: Schema, table: pa.Table) -> pa.Table:
+    struct_array = visit_with_partner(
+        requested_schema, table, ArrowProjectionVisitor(file_schema, len(table)), ArrowAccessor(file_schema)
+    )
+
+    arrays = []
+    fields = []
+    for pos, field in enumerate(requested_schema.fields):
+        array = struct_array.field(pos)
+        arrays.append(array)
+        fields.append(pa.field(field.name, array.type, field.optional))
+    return pa.Table.from_arrays(arrays, schema=pa.schema(fields))
+
+
+class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, Optional[pa.Array]]):
+    file_schema: Schema
+    table_length: int
+
+    def __init__(self, file_schema: Schema, table_length: int):
+        self.file_schema = file_schema
+        self.table_length = table_length
+
+    def cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
+        file_field = self.file_schema.find_field(field.field_id)
+        if field.field_type != file_field.field_type:
+            return values.cast(schema_to_pyarrow(promote(file_field.field_type, field.field_type)))
+
+        return values
+
+    def schema(self, schema: Schema, schema_partner: Optional[pa.Array], struct_result: Optional[pa.Array]) -> Optional[pa.Array]:
+        return struct_result
+
+    def struct(
+        self, struct: StructType, struct_array: Optional[pa.Array], field_results: List[Optional[pa.Array]]
+    ) -> Optional[pa.Array]:
+        if struct_array is None:
+            return None
+        return pa.StructArray.from_arrays(arrays=field_results, fields=pa.struct(schema_to_pyarrow(struct)))
+
+    def field(self, field: NestedField, _: Optional[pa.Array], field_array: Optional[pa.Array]) -> Optional[pa.Array]:
+        if field_array is not None:
+            return self.cast_if_needed(field, field_array)
+        elif field.optional:
+            arrow_type = schema_to_pyarrow(field.field_type)
+            return pa.nulls(self.table_length, type=arrow_type)

Review Comment:
This isn't correct. The array length is not necessarily the number of rows when the array is a list element, a map key, or a map value. Here's a test case that demonstrates the problem:

```python
@pytest.fixture
def schema_list_of_structs() -> Schema:
    return Schema(
        NestedField(5, "locations", ListType(51, StructType(
            NestedField(511, "lat", DoubleType()),
            NestedField(512, "long", DoubleType())
        ), element_required=False), required=False),
    )


@pytest.fixture
def file_list_of_structs(schema_list_of_structs: Schema, tmpdir: str) -> str:
    pyarrow_schema = pa.schema(
        schema_to_pyarrow(schema_list_of_structs),
        metadata={"iceberg.schema": schema_list_of_structs.json()})
    return _write_table_to_file(
        f"file:{tmpdir}/e.parquet",
        pyarrow_schema,
        pa.Table.from_pylist(
            [
                {"locations": [{"lat": 52.371807, "long": 4.896029}, {"lat": 52.387386, "long": 4.646219}]},
                {"locations": []},
                {"locations": [{"lat": 52.078663, "long": 4.288788}, {"lat": 52.387386, "long": 4.646219}]},
            ],
            schema=pyarrow_schema,
        ),
    )


def test_projection_list_of_structs(schema_list_of_structs: Schema, file_list_of_structs: str) -> None:
    schema = Schema(
        NestedField(5, "locations", ListType(51, StructType(
            NestedField(511, "latitude", DoubleType()),
            NestedField(512, "longitude", DoubleType()),
            NestedField(513, "altitude", DoubleType(), required=False)
        ), element_required=False), required=False),
    )
    result_table = project(schema, [file_list_of_structs])
    assert len(result_table.columns) == 1
    assert len(result_table.columns[0]) == 3
    for actual, expected in zip(result_table.columns[0], [
        [{"latitude": 52.371807, "longitude": 4.896029, "altitude": None},
         {"latitude": 52.387386, "longitude": 4.646219, "altitude": None}],
        [],
        [{"latitude": 52.078663, "longitude": 4.288788, "altitude": None},
         {"latitude": 52.387386, "longitude": 4.646219, "altitude": None}],
    ]):
        assert actual.as_py() == expected
    assert (
        repr(result_table.schema)
        == """locations: list<item: struct<latitude: double not null, longitude: double not null, altitude: double>>
  child 0, item: struct<latitude: double not null, longitude: double not null, altitude: double>
      child 0, latitude: double not null
      child 1, longitude: double not null
      child 2, altitude: double"""
    )
```

If the visitor uses the table length, the test fails with `pyarrow.lib.ArrowInvalid: Mismatching child array lengths`.
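For context, here is a minimal standalone sketch of why the row count is the wrong length for a nested array. It is not part of the PR; the column names and values are made up for illustration. The struct child of a `list<struct>` column holds one entry per list element, not one per row:

```python
import pyarrow as pa

# Three rows of list<struct<lat: double>>; the struct child array holds
# four elements in total (one per list element), not three (one per row).
arr = pa.array(
    [[{"lat": 1.0}, {"lat": 2.0}], [], [{"lat": 3.0}, {"lat": 4.0}]],
    type=pa.list_(pa.struct([("lat", pa.float64())])),
)
child = arr.flatten()  # the underlying struct array

assert len(arr) == 3    # number of rows
assert len(child) == 4  # number of list elements across all rows

# A null column sized by the row count cannot be combined with `child`:
# pa.StructArray.from_arrays([child.field("lat"), pa.nulls(3)], ["lat", "alt"])
# raises ArrowInvalid: Mismatching child array lengths.
```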
This works when you use `len(struct_array)`, which is why I originally had this logic inside the `struct` method:

```python
    def struct(
        self, struct: StructType, struct_array: Optional[pa.Array], field_results: List[Optional[pa.Array]]
    ) -> Optional[pa.Array]:
        if struct_array is None:
            return None
        field_arrays: List[pa.Array] = []
        fields: List[pa.Field] = []
        for field, field_array in zip(struct.fields, field_results):
            if field_array is not None:
                array = self.cast_if_needed(field, field_array)
                field_arrays.append(array)
                fields.append(pa.field(field.name, array.type, field.optional))
            elif field.optional:
                arrow_type = schema_to_pyarrow(field.field_type)
                field_arrays.append(pa.nulls(len(struct_array), type=arrow_type))
                fields.append(pa.field(field.name, arrow_type, field.optional))
            else:
                raise ResolveError(f"Field is required, and could not be found in the file: {field}")

        return pa.StructArray.from_arrays(arrays=field_arrays, fields=pa.struct(fields))
```

I also think it is better to put this logic in the `struct` method because it avoids calling `schema_to_pyarrow(struct)`: in the version above, the struct is built from each field's result rather than by converting another tree-like structure in the middle of visiting.
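To make the sizing concrete, here is a self-contained sketch of the null-filling that the `struct` method above performs for a field that is missing from the file. It is not part of the PR; the values are borrowed from the test fixture, and the point is that the null column is sized by the struct array being rebuilt (`len(struct_array)` in the visitor), never by the table's row count:

```python
import pyarrow as pa

# Two struct elements read from the file; "altitude" is absent there.
latitude = pa.array([52.371807, 52.387386])
longitude = pa.array([4.896029, 4.646219])

# Null-fill the missing optional field, sized by the present child arrays.
altitude = pa.nulls(len(latitude), type=pa.float64())

projected = pa.StructArray.from_arrays(
    arrays=[latitude, longitude, altitude],
    fields=[
        pa.field("latitude", pa.float64(), nullable=False),
        pa.field("longitude", pa.float64(), nullable=False),
        pa.field("altitude", pa.float64(), nullable=True),
    ],
)
assert projected.field("altitude").null_count == 2
```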