[GitHub] [iceberg] rdblue commented on a diff in pull request #6437: Python: Projection by Field ID

GitBox Fri, 23 Dec 2022 10:34:10 -0800


rdblue commented on code in PR #6437:
URL: https://github.com/apache/iceberg/pull/6437#discussion_r1056562592



##########
python/pyiceberg/io/pyarrow.py:
##########
@@ -437,3 +465,198 @@ def visit_or(self, left_result: pc.Expression, 
right_result: pc.Expression) -> p
 
 def expression_to_pyarrow(expr: BooleanExpression) -> pc.Expression:
     return boolean_expression_visit(expr, _ConvertToArrowExpression())
+
+
+def project_table(
+    files: Iterable[FileScanTask], table: Table, row_filter: 
BooleanExpression, projected_schema: Schema, case_sensitive: bool
+) -> pa.Table:
+    """Resolves the right columns based on the identifier
+
+    Args:
+        files(Iterable[FileScanTask]): A URI or a path to a local file
+        table(Table): The table that's being queried
+        row_filter(BooleanExpression): The expression for filtering rows
+        projected_schema(Schema): The output schema
+        case_sensitive(bool): Case sensitivity when looking up column names
+
+    Raises:
+        ResolveException: When an incompatible query is done
+    """
+
+    if isinstance(table.io, PyArrowFileIO):
+        scheme, path = PyArrowFileIO.parse_location(table.location())
+        fs = table.io.get_fs(scheme)
+    else:
+        raise ValueError(f"Expected PyArrowFileIO, got: {table.io}")
+
+    bound_row_filter = bind(table.schema(), row_filter, 
case_sensitive=case_sensitive)
+
+    projected_field_ids = {
+        id for id in projected_schema.field_ids if not 
isinstance(projected_schema.find_type(id), (MapType, ListType))
+    }.union(extract_field_ids(bound_row_filter))
+
+    tables = []
+    for task in files:
+        _, path = PyArrowFileIO.parse_location(task.file.file_path)
+
+        # Get the schema
+        with fs.open_input_file(path) as fout:
+            parquet_schema = pq.read_schema(fout)
+            schema_raw = parquet_schema.metadata.get(ICEBERG_SCHEMA)
+            file_schema = Schema.parse_raw(schema_raw)
+
+        pyarrow_filter = None
+        if row_filter is not AlwaysTrue():
+            translated_row_filter = translate_column_names(bound_row_filter, 
file_schema, case_sensitive=case_sensitive)
+            bound_row_filter = bind(file_schema, translated_row_filter, 
case_sensitive=case_sensitive)
+            pyarrow_filter = expression_to_pyarrow(bound_row_filter)
+
+        file_project_schema = prune_columns(file_schema, projected_field_ids, 
select_full_types=False)
+
+        if file_schema is None:
+            raise ValueError(f"Missing Iceberg schema in Metadata for file: 
{path}")
+
+        # Prune the stuff that we don't need anyway
+        file_project_schema_arrow = schema_to_pyarrow(file_project_schema)
+
+        arrow_table = ds.dataset(
+            source=[path], schema=file_project_schema_arrow, 
format=ds.ParquetFileFormat(), filesystem=fs
+        ).to_table(filter=pyarrow_filter)
+
+        tables.append(to_requested_schema(projected_schema, 
file_project_schema, arrow_table))
+
+    if len(tables) > 1:
+        return pa.concat_tables(tables)
+    else:
+        return tables[0]
+
+
+def to_requested_schema(requested_schema: Schema, file_schema: Schema, table: 
pa.Table) -> pa.Table:
+    return VisitWithArrow(requested_schema, file_schema, table).visit()
+
+
+class VisitWithArrow:
+    requested_schema: Schema
+    file_schema: Schema
+    table: pa.Table
+
+    def __init__(self, requested_schema: Schema, file_schema: Schema, table: 
pa.Table) -> None:
+        self.requested_schema = requested_schema
+        self.file_schema = file_schema
+        self.table = table
+
+    def visit(self) -> pa.Table:
+        return self.visit_with_arrow(self.requested_schema, self.file_schema)
+
+    @singledispatchmethod
+    def visit_with_arrow(self, requested_schema: Union[Schema, IcebergType], 
file_schema: Union[Schema, IcebergType]) -> pa.Table:
+        """A generic function for applying a schema visitor to any point 
within a schema
+
+        The function traverses the schema in post-order fashion
+
+        Args:
+            obj(Schema | IcebergType): An instance of a Schema or an 
IcebergType
+            visitor (VisitWithArrow[T]): An instance of an implementation of 
the generic VisitWithArrow base class
+
+        Raises:
+            NotImplementedError: If attempting to visit an unrecognized object 
type
+        """
+        raise NotImplementedError(f"Cannot visit non-type: {requested_schema}")

Review Comment:
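   Should this message say "unsupported type" instead? Reaching this fallback only means no handler is registered for the argument's type; we can't necessarily conclude that it is a non-type.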



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [iceberg] rdblue commented on a diff in pull request #6437: Python: Projection by Field ID

Reply via email to