Fokko commented on code in PR #6437:
URL: https://github.com/apache/iceberg/pull/6437#discussion_r1059517229
##########
python/pyiceberg/io/pyarrow.py:
##########
@@ -437,3 +468,170 @@ def visit_or(self, left_result: pc.Expression,
right_result: pc.Expression) -> p
def expression_to_pyarrow(expr: BooleanExpression) -> pc.Expression:
return boolean_expression_visit(expr, _ConvertToArrowExpression())
+
+
+def project_table(
+ files: Iterable[FileScanTask], table: Table, row_filter:
BooleanExpression, projected_schema: Schema, case_sensitive: bool
+) -> pa.Table:
+ """Resolves the right columns based on the identifier
+
+ Args:
+ files(Iterable[FileScanTask]): A URI or a path to a local file
+ table(Table): The table that's being queried
+ row_filter(BooleanExpression): The expression for filtering rows
+ projected_schema(Schema): The output schema
+ case_sensitive(bool): Case sensitivity when looking up column names
+
+ Raises:
+ ResolveException: When an incompatible query is done
+ """
+
+ if isinstance(table.io, PyArrowFileIO):
+ scheme, path = PyArrowFileIO.parse_location(table.location())
+ fs = table.io.get_fs(scheme)
+ else:
+ raise ValueError(f"Expected PyArrowFileIO, got: {table.io}")
+
+ bound_row_filter = bind(table.schema(), row_filter,
case_sensitive=case_sensitive)
+
+ projected_field_ids = {
+ id for id in projected_schema.field_ids if not
isinstance(projected_schema.find_type(id), (MapType, ListType))
+ }.union(extract_field_ids(bound_row_filter))
+
+ tables = []
+ for task in files:
+ _, path = PyArrowFileIO.parse_location(task.file.file_path)
+
+ # Get the schema
+ with fs.open_input_file(path) as fout:
+ parquet_schema = pq.read_schema(fout)
+ schema_raw = parquet_schema.metadata.get(ICEBERG_SCHEMA)
+ file_schema = Schema.parse_raw(schema_raw)
+
+ pyarrow_filter = None
+ if row_filter is not AlwaysTrue():
+ translated_row_filter = translate_column_names(bound_row_filter,
file_schema, case_sensitive=case_sensitive)
+ bound_row_filter = bind(file_schema, translated_row_filter,
case_sensitive=case_sensitive)
+ pyarrow_filter = expression_to_pyarrow(bound_row_filter)
Review Comment:
That's a good question. Currently, binding a row filter that references a field missing from the table schema raises a `ValueError`:
```
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _
test_pyarrow.py:740: in project
return project_table(
../../pyiceberg/io/pyarrow.py:495: in project_table
bound_row_filter = bind(table.schema(), row_filter,
case_sensitive=case_sensitive)
../../pyiceberg/expressions/visitors.py:203: in bind
return visit(expression, BindVisitor(schema, case_sensitive))
/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/functools.py:877:
in wrapper
return dispatch(args[0].__class__)(*args, **kw)
../../pyiceberg/expressions/visitors.py:175: in _
return visitor.visit_unbound_predicate(predicate=obj)
../../pyiceberg/expressions/visitors.py:240: in visit_unbound_predicate
return predicate.bind(self.schema, case_sensitive=self.case_sensitive)
../../pyiceberg/expressions/__init__.py:615: in bind
bound_term = self.term.bind(schema, case_sensitive)
../../pyiceberg/expressions/__init__.py:180: in bind
field = schema.find_field(name_or_id=self.name,
case_sensitive=case_sensitive)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
_ _
self = Schema(NestedField(field_id=1, name='id', field_type=IntegerType(),
required=False), NestedField(field_id=2, name='data', field_type=StringType(),
required=False), schema_id=0, identifier_field_ids=[])
name_or_id = 'unknown_field', case_sensitive = True
def find_field(self, name_or_id: Union[str, int], case_sensitive: bool =
True) -> NestedField:
"""Find a field using a field name or field ID
Args:
name_or_id (str | int): Either a field name or a field ID
case_sensitive (bool, optional): Whether to perform a
case-sensitive lookup using a field name. Defaults to True.
Raises:
ValueError: When the value cannot be found
Returns:
NestedField: The matched NestedField
"""
if isinstance(name_or_id, int):
if name_or_id not in self._lazy_id_to_field:
raise ValueError(f"Could not find field with id:
{name_or_id}")
return self._lazy_id_to_field[name_or_id]
if case_sensitive:
field_id = self._name_to_id.get(name_or_id)
else:
field_id = self._lazy_name_to_id_lower.get(name_or_id.lower())
if field_id is None:
> raise ValueError(f"Could not find field with name {name_or_id},
case_sensitive={case_sensitive}")
E ValueError: Could not find field with name unknown_field,
case_sensitive=True
```
I've also added a test case that covers this scenario.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]