Fokko commented on code in PR #6437: URL: https://github.com/apache/iceberg/pull/6437#discussion_r1056878066
########## python/pyiceberg/schema.py: ########## @@ -1046,3 +1055,79 @@ def _project_map(map_type: MapType, value_result: IcebergType) -> MapType: value_type=value_result, value_required=map_type.value_required, ) + + +@singledispatch +def promote(file_type: IcebergType, read_type: IcebergType) -> IcebergType: + """Promotes reading a file type to a read type + + Args: + file_type (IcebergType): The type of the Avro file + read_type (IcebergType): The requested read type + + Raises: + ResolveException: If attempting to resolve an unrecognized object type + """ + raise ResolveException(f"Cannot promote {file_type} to {read_type}") + + +@promote.register(IntegerType) +def _(file_type: IntegerType, read_type: IcebergType) -> IcebergType: + if isinstance(read_type, LongType): + # Ints/Longs are binary compatible in Avro, so this is okay + return read_type + else: + raise ResolveException(f"Cannot promote an int to {read_type}") + + +@promote.register(FloatType) +def _(file_type: FloatType, read_type: IcebergType) -> IcebergType: + if isinstance(read_type, DoubleType): + # A double type is wider + return read_type + else: + raise ResolveException(f"Cannot promote an float to {read_type}") + + +@promote.register(StringType) +def _(file_type: StringType, read_type: IcebergType) -> IcebergType: + if isinstance(read_type, BinaryType): + return read_type + else: + raise ResolveException(f"Cannot promote an string to {read_type}") + + +@promote.register(BinaryType) +def _(file_type: BinaryType, read_type: IcebergType) -> IcebergType: + if isinstance(read_type, StringType): + return read_type + else: + raise ResolveException(f"Cannot promote an binary to {read_type}") + + +@promote.register(DecimalType) +def _(file_type: DecimalType, read_type: IcebergType) -> IcebergType: + if isinstance(read_type, DecimalType): + if file_type.precision <= read_type.precision and file_type.scale == file_type.scale: + return read_type + else: + raise ResolveException(f"Cannot reduce precision from {file_type} to {read_type}") + else: + raise ResolveException(f"Cannot promote an decimal to {read_type}") + + +@promote.register(StructType) +def _(file_type: StructType, read_type: IcebergType) -> IcebergType: Review Comment: I would say we can rewrite: ```python def struct( self, struct: StructType, struct_array: Optional[pa.Array], field_results: List[Optional[pa.Array]] ) -> Optional[pa.Array]: if struct_array is None: return None field_arrays: List[pa.Array] = [] fields: List[pa.Field] = [] for pos, field_array in enumerate(field_results): field = struct.fields[pos] if field_array is not None: array = self.cast_if_needed(field, field_array) field_arrays.append(array) fields.append(pa.field(field.name, array.type, field.optional)) elif field.optional: arrow_type = schema_to_pyarrow(field.field_type) field_arrays.append(pa.nulls(len(struct_array)).cast(arrow_type)) fields.append(pa.field(field.name, arrow_type, field.optional)) else: raise ResolveException(f"Field is required, and could not be found in the file: {field}") return pa.StructArray.from_arrays(arrays=field_arrays, fields=pa.struct(fields)) def field(self, field: NestedField, _: Optional[pa.Array], field_array: Optional[pa.Array]) -> Optional[pa.Array]: return field_array ``` I think this is cleaner because we just handle the field on the field method: ```python def struct( self, struct: StructType, struct_array: Optional[pa.Array], field_results: List[Optional[pa.Array]] ) -> Optional[pa.Array]: if struct_array is None: return None return pa.StructArray.from_arrays(arrays=field_results, fields=pa.struct(schema_to_pyarrow(struct))) def field(self, field: NestedField, _: Optional[pa.Array], field_array: Optional[pa.Array]) -> Optional[pa.Array]: if field_array is not None: return self.cast_if_needed(field, field_array) elif field.optional: arrow_type = schema_to_pyarrow(field.field_type) return pa.nulls(3, type=arrow_type) # We need to find the length somehow else: raise ResolveError(f"Field is required, and could not be found in the file: {field}") ``` But then the field_type can still be a StructType -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org