Fokko commented on code in PR #6437: URL: https://github.com/apache/iceberg/pull/6437#discussion_r1052610988
########## python/pyiceberg/io/pyarrow.py: ########## @@ -437,3 +459,120 @@ def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> p def expression_to_pyarrow(expr: BooleanExpression) -> pc.Expression: return boolean_expression_visit(expr, _ConvertToArrowExpression()) + + +class _ConstructFinalSchema(SchemaVisitor[pa.ChunkedArray]): + file_schema: Schema + table: pa.Table + + def __init__(self, file_schema: Schema, table: pa.Table): + self.file_schema = file_schema + self.table = table + + def schema(self, schema: Schema, struct_result: List[pa.ChunkedArray]) -> pa.Table: + return pa.table(struct_result, schema=schema_to_pyarrow(schema)) + + def struct(self, _: StructType, field_results: List[pa.ChunkedArray]) -> List[pa.ChunkedArray]: + return field_results + + def field(self, field: NestedField, _: pa.ChunkedArray) -> pa.ChunkedArray: + column_name = self.file_schema.find_column_name(field.field_id) + + if column_name: + column_idx = self.table.schema.get_field_index(column_name) + else: + column_idx = -1 + + expected_arrow_type = schema_to_pyarrow(field.field_type) + + # The idx will be -1 when the column can't be found + if column_idx >= 0: + column_field: pa.Field = self.table.schema[column_idx] + column_arrow_type: pa.DataType = column_field.type + column_data: pa.ChunkedArray = self.table[column_idx] + file_type = self.file_schema.find_type(field.field_id) + + # In case of schema evolution + if column_arrow_type != expected_arrow_type: + # To check if the promotion is allowed + _ = promote(file_type, field.field_type) + column_data = column_data.cast(expected_arrow_type) + else: + import numpy as np + + column_data = pa.array(np.full(shape=len(self.table), fill_value=None), type=expected_arrow_type) + return column_data + + def list(self, _: ListType, element_result: pa.ChunkedArray) -> pa.ChunkedArray: + pass + + def map(self, _: MapType, key_result: pa.ChunkedArray, value_result: pa.ChunkedArray) -> pa.DataType: + pass + + def 
primitive(self, primitive: PrimitiveType) -> pa.ChunkedArray: + pass + + +def to_final_schema(final_schema: Schema, schema: Schema, table: pa.Table) -> pa.Table: + return visit(final_schema, _ConstructFinalSchema(schema, table)) + + +def project_table( + files: Iterable["FileScanTask"], table: "Table", row_filter: BooleanExpression, projected_schema: Schema, case_sensitive: bool +) -> pa.Table: + """Resolves the right columns based on the identifier + + Args: + files(Iterable[FileScanTask]): A URI or a path to a local file + table(Table): The table that's being queried + row_filter(BooleanExpression): The expression for filtering rows + projected_schema(Schema): The output schema + case_sensitive(bool): Case sensitivity when looking up column names + + Raises: + ResolveException: When an incompatible query is done + """ + + if isinstance(table.io, PyArrowFileIO): Review Comment: Should we keep that for another PR? ########## python/pyiceberg/io/pyarrow.py: ########## @@ -437,3 +459,120 @@ def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> p def expression_to_pyarrow(expr: BooleanExpression) -> pc.Expression: return boolean_expression_visit(expr, _ConvertToArrowExpression()) + + +class _ConstructFinalSchema(SchemaVisitor[pa.ChunkedArray]): + file_schema: Schema + table: pa.Table + + def __init__(self, file_schema: Schema, table: pa.Table): + self.file_schema = file_schema + self.table = table + + def schema(self, schema: Schema, struct_result: List[pa.ChunkedArray]) -> pa.Table: + return pa.table(struct_result, schema=schema_to_pyarrow(schema)) + + def struct(self, _: StructType, field_results: List[pa.ChunkedArray]) -> List[pa.ChunkedArray]: + return field_results + + def field(self, field: NestedField, _: pa.ChunkedArray) -> pa.ChunkedArray: + column_name = self.file_schema.find_column_name(field.field_id) + + if column_name: + column_idx = self.table.schema.get_field_index(column_name) + else: + column_idx = -1 + + expected_arrow_type 
= schema_to_pyarrow(field.field_type) + + # The idx will be -1 when the column can't be found + if column_idx >= 0: + column_field: pa.Field = self.table.schema[column_idx] + column_arrow_type: pa.DataType = column_field.type + column_data: pa.ChunkedArray = self.table[column_idx] + file_type = self.file_schema.find_type(field.field_id) + + # In case of schema evolution + if column_arrow_type != expected_arrow_type: + # To check if the promotion is allowed + _ = promote(file_type, field.field_type) + column_data = column_data.cast(expected_arrow_type) + else: + import numpy as np + + column_data = pa.array(np.full(shape=len(self.table), fill_value=None), type=expected_arrow_type) + return column_data + + def list(self, _: ListType, element_result: pa.ChunkedArray) -> pa.ChunkedArray: + pass + + def map(self, _: MapType, key_result: pa.ChunkedArray, value_result: pa.ChunkedArray) -> pa.DataType: + pass + + def primitive(self, primitive: PrimitiveType) -> pa.ChunkedArray: + pass + + +def to_final_schema(final_schema: Schema, schema: Schema, table: pa.Table) -> pa.Table: Review Comment: I like `requested` a lot, thanks! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org