Fokko commented on code in PR #6437: URL: https://github.com/apache/iceberg/pull/6437#discussion_r1059517229
########## python/pyiceberg/io/pyarrow.py: ########## @@ -437,3 +468,170 @@ def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> p def expression_to_pyarrow(expr: BooleanExpression) -> pc.Expression: return boolean_expression_visit(expr, _ConvertToArrowExpression()) + + +def project_table( + files: Iterable[FileScanTask], table: Table, row_filter: BooleanExpression, projected_schema: Schema, case_sensitive: bool +) -> pa.Table: + """Resolves the right columns based on the identifier + + Args: + files(Iterable[FileScanTask]): A URI or a path to a local file + table(Table): The table that's being queried + row_filter(BooleanExpression): The expression for filtering rows + projected_schema(Schema): The output schema + case_sensitive(bool): Case sensitivity when looking up column names + + Raises: + ResolveException: When an incompatible query is done + """ + + if isinstance(table.io, PyArrowFileIO): + scheme, path = PyArrowFileIO.parse_location(table.location()) + fs = table.io.get_fs(scheme) + else: + raise ValueError(f"Expected PyArrowFileIO, got: {table.io}") + + bound_row_filter = bind(table.schema(), row_filter, case_sensitive=case_sensitive) + + projected_field_ids = { + id for id in projected_schema.field_ids if not isinstance(projected_schema.find_type(id), (MapType, ListType)) + }.union(extract_field_ids(bound_row_filter)) + + tables = [] + for task in files: + _, path = PyArrowFileIO.parse_location(task.file.file_path) + + # Get the schema + with fs.open_input_file(path) as fout: + parquet_schema = pq.read_schema(fout) + schema_raw = parquet_schema.metadata.get(ICEBERG_SCHEMA) + file_schema = Schema.parse_raw(schema_raw) + + pyarrow_filter = None + if row_filter is not AlwaysTrue(): + translated_row_filter = translate_column_names(bound_row_filter, file_schema, case_sensitive=case_sensitive) + bound_row_filter = bind(file_schema, translated_row_filter, case_sensitive=case_sensitive) + pyarrow_filter = 
expression_to_pyarrow(bound_row_filter) Review Comment: That's a good question. Currently it throws: ``` _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ test_pyarrow.py:740: in project return project_table( ../../pyiceberg/io/pyarrow.py:495: in project_table bound_row_filter = bind(table.schema(), row_filter, case_sensitive=case_sensitive) ../../pyiceberg/expressions/visitors.py:203: in bind return visit(expression, BindVisitor(schema, case_sensitive)) /Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/functools.py:877: in wrapper return dispatch(args[0].__class__)(*args, **kw) ../../pyiceberg/expressions/visitors.py:175: in _ return visitor.visit_unbound_predicate(predicate=obj) ../../pyiceberg/expressions/visitors.py:240: in visit_unbound_predicate return predicate.bind(self.schema, case_sensitive=self.case_sensitive) ../../pyiceberg/expressions/__init__.py:615: in bind bound_term = self.term.bind(schema, case_sensitive) ../../pyiceberg/expressions/__init__.py:180: in bind field = schema.find_field(name_or_id=self.name, case_sensitive=case_sensitive) _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ self = Schema(NestedField(field_id=1, name='id', field_type=IntegerType(), required=False), NestedField(field_id=2, name='data', field_type=StringType(), required=False), schema_id=0, identifier_field_ids=[]) name_or_id = 'unknown_field', case_sensitive = True def find_field(self, name_or_id: Union[str, int], case_sensitive: bool = True) -> NestedField: """Find a field using a field name or field ID Args: name_or_id (str | int): Either a field name or a field ID case_sensitive (bool, optional): Whether to perform a case-sensitive lookup using a field name. Defaults to True. 
Raises: ValueError: When the value cannot be found Returns: NestedField: The matched NestedField """ if isinstance(name_or_id, int): if name_or_id not in self._lazy_id_to_field: raise ValueError(f"Could not find field with id: {name_or_id}") return self._lazy_id_to_field[name_or_id] if case_sensitive: field_id = self._name_to_id.get(name_or_id) else: field_id = self._lazy_name_to_id_lower.get(name_or_id.lower()) if field_id is None: > raise ValueError(f"Could not find field with name {name_or_id}, case_sensitive={case_sensitive}") E ValueError: Could not find field with name unknown_field, case_sensitive=True ``` I have also embedded this in a test case. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org