rdblue commented on code in PR #6437: URL: https://github.com/apache/iceberg/pull/6437#discussion_r1059496896
########## python/pyiceberg/io/pyarrow.py: ########## @@ -437,3 +468,170 @@ def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> p def expression_to_pyarrow(expr: BooleanExpression) -> pc.Expression: return boolean_expression_visit(expr, _ConvertToArrowExpression()) + + +def project_table( + files: Iterable[FileScanTask], table: Table, row_filter: BooleanExpression, projected_schema: Schema, case_sensitive: bool +) -> pa.Table: + """Resolves the right columns based on the identifier + + Args: + files(Iterable[FileScanTask]): A URI or a path to a local file + table(Table): The table that's being queried + row_filter(BooleanExpression): The expression for filtering rows + projected_schema(Schema): The output schema + case_sensitive(bool): Case sensitivity when looking up column names + + Raises: + ResolveException: When an incompatible query is done + """ + + if isinstance(table.io, PyArrowFileIO): + scheme, path = PyArrowFileIO.parse_location(table.location()) + fs = table.io.get_fs(scheme) + else: + raise ValueError(f"Expected PyArrowFileIO, got: {table.io}") + + bound_row_filter = bind(table.schema(), row_filter, case_sensitive=case_sensitive) + + projected_field_ids = { + id for id in projected_schema.field_ids if not isinstance(projected_schema.find_type(id), (MapType, ListType)) + }.union(extract_field_ids(bound_row_filter)) + + tables = [] + for task in files: + _, path = PyArrowFileIO.parse_location(task.file.file_path) + + # Get the schema + with fs.open_input_file(path) as fout: + parquet_schema = pq.read_schema(fout) + schema_raw = parquet_schema.metadata.get(ICEBERG_SCHEMA) + file_schema = Schema.parse_raw(schema_raw) + + pyarrow_filter = None + if row_filter is not AlwaysTrue(): + translated_row_filter = translate_column_names(bound_row_filter, file_schema, case_sensitive=case_sensitive) + bound_row_filter = bind(file_schema, translated_row_filter, case_sensitive=case_sensitive) Review Comment: Looks like the name 
`bound_row_filter` is reused, which is going to cause problems because it leaks the filter from the previous iteration into the current iteration. Probably best to use `bound_file_filter` or something. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org