rdblue commented on code in PR #6437: URL: https://github.com/apache/iceberg/pull/6437#discussion_r1056562592
########## python/pyiceberg/io/pyarrow.py: ########## @@ -437,3 +465,198 @@ def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> p def expression_to_pyarrow(expr: BooleanExpression) -> pc.Expression: return boolean_expression_visit(expr, _ConvertToArrowExpression()) + + +def project_table( + files: Iterable[FileScanTask], table: Table, row_filter: BooleanExpression, projected_schema: Schema, case_sensitive: bool +) -> pa.Table: + """Resolves the right columns based on the identifier + + Args: + files(Iterable[FileScanTask]): A URI or a path to a local file + table(Table): The table that's being queried + row_filter(BooleanExpression): The expression for filtering rows + projected_schema(Schema): The output schema + case_sensitive(bool): Case sensitivity when looking up column names + + Raises: + ResolveException: When an incompatible query is done + """ + + if isinstance(table.io, PyArrowFileIO): + scheme, path = PyArrowFileIO.parse_location(table.location()) + fs = table.io.get_fs(scheme) + else: + raise ValueError(f"Expected PyArrowFileIO, got: {table.io}") + + bound_row_filter = bind(table.schema(), row_filter, case_sensitive=case_sensitive) + + projected_field_ids = { + id for id in projected_schema.field_ids if not isinstance(projected_schema.find_type(id), (MapType, ListType)) + }.union(extract_field_ids(bound_row_filter)) + + tables = [] + for task in files: + _, path = PyArrowFileIO.parse_location(task.file.file_path) + + # Get the schema + with fs.open_input_file(path) as fout: + parquet_schema = pq.read_schema(fout) + schema_raw = parquet_schema.metadata.get(ICEBERG_SCHEMA) + file_schema = Schema.parse_raw(schema_raw) + + pyarrow_filter = None + if row_filter is not AlwaysTrue(): + translated_row_filter = translate_column_names(bound_row_filter, file_schema, case_sensitive=case_sensitive) + bound_row_filter = bind(file_schema, translated_row_filter, case_sensitive=case_sensitive) + pyarrow_filter = expression_to_pyarrow(bound_row_filter) + + file_project_schema = prune_columns(file_schema, projected_field_ids, select_full_types=False) + + if file_schema is None: + raise ValueError(f"Missing Iceberg schema in Metadata for file: {path}") + + # Prune the stuff that we don't need anyway + file_project_schema_arrow = schema_to_pyarrow(file_project_schema) + + arrow_table = ds.dataset( + source=[path], schema=file_project_schema_arrow, format=ds.ParquetFileFormat(), filesystem=fs + ).to_table(filter=pyarrow_filter) + + tables.append(to_requested_schema(projected_schema, file_project_schema, arrow_table)) + + if len(tables) > 1: + return pa.concat_tables(tables) + else: + return tables[0] + + +def to_requested_schema(requested_schema: Schema, file_schema: Schema, table: pa.Table) -> pa.Table: + return VisitWithArrow(requested_schema, file_schema, table).visit() + + +class VisitWithArrow: + requested_schema: Schema + file_schema: Schema + table: pa.Table + + def __init__(self, requested_schema: Schema, file_schema: Schema, table: pa.Table) -> None: + self.requested_schema = requested_schema + self.file_schema = file_schema + self.table = table + + def visit(self) -> pa.Table: + return self.visit_with_arrow(self.requested_schema, self.file_schema) + + @singledispatchmethod + def visit_with_arrow(self, requested_schema: Union[Schema, IcebergType], file_schema: Union[Schema, IcebergType]) -> pa.Table: + """A generic function for applying a schema visitor to any point within a schema + + The function traverses the schema in post-order fashion + + Args: + obj(Schema | IcebergType): An instance of a Schema or an IcebergType + visitor (VisitWithArrow[T]): An instance of an implementation of the generic VisitWithArrow base class + + Raises: + NotImplementedError: If attempting to visit an unrecognized object type + """ + raise NotImplementedError(f"Cannot visit non-type: {requested_schema}") Review Comment: Unsupported type? We can't necessarily conclude that it is a non-type. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org