rdblue commented on code in PR #6437:
URL: https://github.com/apache/iceberg/pull/6437#discussion_r1059515855
##########
python/pyiceberg/io/pyarrow.py:
##########
@@ -437,3 +468,170 @@ def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> p
 def expression_to_pyarrow(expr: BooleanExpression) -> pc.Expression:
     return boolean_expression_visit(expr, _ConvertToArrowExpression())
+
+
+def project_table(
+    files: Iterable[FileScanTask], table: Table, row_filter: BooleanExpression, projected_schema: Schema, case_sensitive: bool
+) -> pa.Table:
+    """Reads the given file scan tasks and projects the data to the requested schema
+
+    Args:
+        files(Iterable[FileScanTask]): The file scan tasks to read
+        table(Table): The table that's being queried
+        row_filter(BooleanExpression): The expression for filtering rows
+        projected_schema(Schema): The output schema
+        case_sensitive(bool): Case sensitivity when looking up column names
+
+    Raises:
+        ResolveException: When an incompatible query is done
+    """
+
+    if isinstance(table.io, PyArrowFileIO):
+        scheme, path = PyArrowFileIO.parse_location(table.location())
+        fs = table.io.get_fs(scheme)
+    else:
+        raise ValueError(f"Expected PyArrowFileIO, got: {table.io}")
+
+    bound_row_filter = bind(table.schema(), row_filter, case_sensitive=case_sensitive)
+
+    projected_field_ids = {
+        id for id in projected_schema.field_ids if not isinstance(projected_schema.find_type(id), (MapType, ListType))
+    }.union(extract_field_ids(bound_row_filter))
+
+    tables = []
+    for task in files:
+        _, path = PyArrowFileIO.parse_location(task.file.file_path)
+
+        # Read the Iceberg schema from the Parquet footer metadata
+        with fs.open_input_file(path) as fout:
+            parquet_schema = pq.read_schema(fout)
+            schema_raw = parquet_schema.metadata.get(ICEBERG_SCHEMA)
+
+        if schema_raw is None:
+            raise ValueError(f"Missing Iceberg schema in Metadata for file: {path}")
+
+        file_schema = Schema.parse_raw(schema_raw)
+
+        pyarrow_filter = None
+        if row_filter is not AlwaysTrue():
+            translated_row_filter = translate_column_names(bound_row_filter, file_schema, case_sensitive=case_sensitive)
+            bound_file_filter = bind(file_schema, translated_row_filter, case_sensitive=case_sensitive)
+            pyarrow_filter = expression_to_pyarrow(bound_file_filter)
+
+        # Prune the file schema down to the columns we actually need
+        file_project_schema = prune_columns(file_schema, projected_field_ids, select_full_types=False)
+        file_project_schema_arrow = schema_to_pyarrow(file_project_schema)
+
+        arrow_table = ds.dataset(
+            source=[path], schema=file_project_schema_arrow, format=ds.ParquetFileFormat(), filesystem=fs
+        ).to_table(filter=pyarrow_filter)
+
+        tables.append(to_requested_schema(projected_schema, file_project_schema, arrow_table))
+
+    if len(tables) > 1:
+        return pa.concat_tables(tables)
+    else:
+        return tables[0]
+
+
+def to_requested_schema(requested_schema: Schema, file_schema: Schema, table: pa.Table) -> pa.Table:
+    struct_array = visit_with_partner(
+        requested_schema, table, ArrowProjectionVisitor(file_schema, len(table)), ArrowAccessor(file_schema)
+    )
+
+    arrays = []
+    fields = []
+    for pos, field in enumerate(requested_schema.fields):
+        array = struct_array.field(pos)
+        arrays.append(array)
+        fields.append(pa.field(field.name, array.type, field.optional))
+    return pa.Table.from_arrays(arrays, schema=pa.schema(fields))
+
+
+class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, Optional[pa.Array]]):
+    file_schema: Schema
+    table_length: int
+
+    def __init__(self, file_schema: Schema, table_length: int):
+        self.file_schema = file_schema
+        self.table_length = table_length
+
+    def cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
+        file_field = self.file_schema.find_field(field.field_id)
+        if field.field_type != file_field.field_type:
+            return values.cast(schema_to_pyarrow(promote(file_field.field_type, field.field_type)))
+
+        return values
+
+    def schema(self, schema: Schema, schema_partner: Optional[pa.Array], struct_result: Optional[pa.Array]) -> Optional[pa.Array]:
+        return struct_result
+
+    def struct(
+        self, struct: StructType, struct_array: Optional[pa.Array], field_results: List[Optional[pa.Array]]
+    ) -> Optional[pa.Array]:
+        if struct_array is None:
+            return None
+        return pa.StructArray.from_arrays(arrays=field_results, fields=pa.struct(schema_to_pyarrow(struct)))
+
+    def field(self, field: NestedField, _: Optional[pa.Array], field_array: Optional[pa.Array]) -> Optional[pa.Array]:
+        if field_array is not None:
+            return self.cast_if_needed(field, field_array)
+        elif field.optional:
+            arrow_type = schema_to_pyarrow(field.field_type)
+            return pa.nulls(self.table_length, type=arrow_type)

Review Comment:
This isn't correct. The array length is not necessarily the number of rows when the array is a list element, a map key, or a map value. Here's a test case that demonstrates the problem:

```python
@pytest.fixture
def schema_list_of_structs() -> Schema:
    return Schema(
        NestedField(5, "locations", ListType(51, StructType(
            NestedField(511, "lat", DoubleType()),
            NestedField(512, "long", DoubleType())
        ), element_required=False), required=False),
    )


@pytest.fixture
def file_list_of_structs(schema_list_of_structs: Schema, tmpdir: str) -> str:
    pyarrow_schema = pa.schema(
        schema_to_pyarrow(schema_list_of_structs),
        metadata={"iceberg.schema": schema_list_of_structs.json()})
    return _write_table_to_file(
        f"file:{tmpdir}/e.parquet",
        pyarrow_schema,
        pa.Table.from_pylist(
            [
                {"locations": [{"lat": 52.371807, "long": 4.896029}, {"lat": 52.387386, "long": 4.646219}]},
                {"locations": []},
                {"locations": [{"lat": 52.078663, "long": 4.288788}, {"lat": 52.387386, "long": 4.646219}]},
            ],
            schema=pyarrow_schema,
        ),
    )


def test_projection_list_of_structs(schema_list_of_structs: Schema, file_list_of_structs: str) -> None:
    schema = Schema(
        NestedField(5, "locations", ListType(51, StructType(
            NestedField(511, "latitude", DoubleType()),
            NestedField(512, "longitude", DoubleType()),
            NestedField(513, "altitude", DoubleType(), required=False)
        ), element_required=False), required=False),
    )
    result_table = project(schema, [file_list_of_structs])
    assert len(result_table.columns) == 1
    assert len(result_table.columns[0]) == 3
    for actual, expected in zip(result_table.columns[0], [
        [{"latitude": 52.371807, "longitude": 4.896029, "altitude": None},
         {"latitude": 52.387386, "longitude": 4.646219, "altitude": None}],
        [],
        [{"latitude": 52.078663, "longitude": 4.288788, "altitude": None},
         {"latitude": 52.387386, "longitude": 4.646219, "altitude": None}],
    ]):
        assert actual.as_py() == expected
    assert (
        repr(result_table.schema)
        == """locations: list<item: struct<latitude: double not null, longitude: double not null, altitude: double>>
  child 0, item: struct<latitude: double not null, longitude: double not null, altitude: double>
      child 0, latitude: double not null
      child 1, longitude: double not null
      child 2, altitude: double"""
    )
```

If the visitor uses the table length, the test fails with `pyarrow.lib.ArrowInvalid: Mismatching child array lengths`.
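For context, here is a minimal standalone sketch of why the row count is the wrong length for a nested array. It is not part of the PR; the column names and values are made up for illustration. The struct child of a `list<struct>` column holds one entry per list element, not one per row:

```python
import pyarrow as pa

# Three rows of list<struct<lat: double>>; the struct child array holds
# four elements in total (one per list element), not three (one per row).
arr = pa.array(
    [[{"lat": 1.0}, {"lat": 2.0}], [], [{"lat": 3.0}, {"lat": 4.0}]],
    type=pa.list_(pa.struct([("lat", pa.float64())])),
)
child = arr.flatten()  # the underlying struct array

assert len(arr) == 3    # number of rows
assert len(child) == 4  # number of list elements across all rows

# A null column sized by the row count cannot be combined with `child`:
# pa.StructArray.from_arrays([child.field("lat"), pa.nulls(3)], ["lat", "alt"])
# raises ArrowInvalid: Mismatching child array lengths.
```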
This works when you use `len(struct_array)`, which is why I originally had this logic inside the `struct` method:

```python
    def struct(
        self, struct: StructType, struct_array: Optional[pa.Array], field_results: List[Optional[pa.Array]]
    ) -> Optional[pa.Array]:
        if struct_array is None:
            return None
        field_arrays: List[pa.Array] = []
        fields: List[pa.Field] = []
        for field, field_array in zip(struct.fields, field_results):
            if field_array is not None:
                array = self.cast_if_needed(field, field_array)
                field_arrays.append(array)
                fields.append(pa.field(field.name, array.type, field.optional))
            elif field.optional:
                arrow_type = schema_to_pyarrow(field.field_type)
                field_arrays.append(pa.nulls(len(struct_array), type=arrow_type))
                fields.append(pa.field(field.name, arrow_type, field.optional))
            else:
                raise ResolveError(f"Field is required, and could not be found in the file: {field}")

        return pa.StructArray.from_arrays(arrays=field_arrays, fields=pa.struct(fields))
```

I also think it is better to put this logic in the `struct` method because it avoids calling `schema_to_pyarrow(struct)`: in the version above, the struct is built from each field's result rather than by converting another tree-like structure in the middle of visiting.
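To make the sizing concrete, here is a self-contained sketch of the null-filling that the `struct` method above performs for a field that is missing from the file. It is not part of the PR; the values are borrowed from the test fixture, and the point is that the null column is sized by the struct array being rebuilt (`len(struct_array)` in the visitor), never by the table's row count:

```python
import pyarrow as pa

# Two struct elements read from the file; "altitude" is absent there.
latitude = pa.array([52.371807, 52.387386])
longitude = pa.array([4.896029, 4.646219])

# Null-fill the missing optional field, sized by the present child arrays.
altitude = pa.nulls(len(latitude), type=pa.float64())

projected = pa.StructArray.from_arrays(
    arrays=[latitude, longitude, altitude],
    fields=[
        pa.field("latitude", pa.float64(), nullable=False),
        pa.field("longitude", pa.float64(), nullable=False),
        pa.field("altitude", pa.float64(), nullable=True),
    ],
)
assert projected.field("altitude").null_count == 2
```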