Fokko commented on code in PR #219: URL: https://github.com/apache/iceberg-python/pull/219#discussion_r1457510803
########## pyiceberg/io/pyarrow.py: ########## @@ -733,42 +854,178 @@ def _get_field_id(field: pa.Field) -> Optional[int]: ) -class _ConvertToIceberg(PyArrowSchemaVisitor[Union[IcebergType, Schema]]): - def _convert_fields(self, arrow_fields: Iterable[pa.Field], field_results: List[Optional[IcebergType]]) -> List[NestedField]: - fields = [] - for i, field in enumerate(arrow_fields): - field_id = _get_field_id(field) - field_doc = doc_str.decode() if (field.metadata and (doc_str := field.metadata.get(PYARROW_FIELD_DOC_KEY))) else None - field_type = field_results[i] - if field_type is not None and field_id is not None: - fields.append(NestedField(field_id, field.name, field_type, required=not field.nullable, doc=field_doc)) - return fields - - def schema(self, schema: pa.Schema, field_results: List[Optional[IcebergType]]) -> Schema: - return Schema(*self._convert_fields(schema, field_results)) - - def struct(self, struct: pa.StructType, field_results: List[Optional[IcebergType]]) -> IcebergType: - return StructType(*self._convert_fields(struct, field_results)) - - def list(self, list_type: pa.ListType, element_result: Optional[IcebergType]) -> Optional[IcebergType]: +class _HasIds(PyArrowSchemaVisitor[bool]): + def schema(self, schema: pa.Schema, struct_result: bool) -> bool: + return struct_result + + def struct(self, struct: pa.StructType, field_results: List[bool]) -> bool: + return all(field_results) + + def field(self, field: pa.Field, field_result: bool) -> bool: + return all([_get_field_id(field) is not None, field_result]) + + def list(self, list_type: pa.ListType, element_result: bool) -> bool: element_field = list_type.value_field element_id = _get_field_id(element_field) - if element_result is not None and element_id is not None: - return ListType(element_id, element_result, element_required=not element_field.nullable) - return None + return element_result and element_id is not None - def map( - self, map_type: pa.MapType, key_result: 
Optional[IcebergType], value_result: Optional[IcebergType] - ) -> Optional[IcebergType]: + def map(self, map_type: pa.MapType, key_result: bool, value_result: bool) -> bool: key_field = map_type.key_field key_id = _get_field_id(key_field) value_field = map_type.item_field value_id = _get_field_id(value_field) - if key_result is not None and value_result is not None and key_id is not None and value_id is not None: - return MapType(key_id, key_result, value_id, value_result, value_required=not value_field.nullable) - return None + return all([key_id is not None, value_id is not None, key_result, value_result]) + + def primitive(self, primitive: pa.DataType) -> bool: + return True + + +class _ConvertToIceberg(PyArrowSchemaVisitor[Union[IcebergType, Schema]]): + """Converts PyArrowSchema to Iceberg Schema. Applies the IDs from name_mapping if provided.""" + + _field_names: List[str] + _name_mapping: Optional[NameMapping] + + def __init__(self, name_mapping: Optional[NameMapping] = None) -> None: + self._field_names = [] + self._name_mapping = name_mapping + + def _current_path(self) -> str: + return ".".join(self._field_names) + + def _field_id(self, field: pa.Field) -> int: + if self._name_mapping: + return self._name_mapping.find(self._current_path()).field_id + elif (field_id := _get_field_id(field)) is not None: + return field_id + else: + raise ValueError(f"Cannot convert {field} to Iceberg Field as field_id is empty.") + + def schema(self, schema: pa.Schema, struct_result: StructType) -> Schema: + return Schema(*struct_result.fields) + + def struct(self, struct: pa.StructType, field_results: List[NestedField]) -> StructType: + return StructType(*field_results) + + def field(self, field: pa.Field, field_result: IcebergType) -> NestedField: + field_id = self._field_id(field) + field_doc = doc_str.decode() if (field.metadata and (doc_str := field.metadata.get(PYARROW_FIELD_DOC_KEY))) else None + field_type = field_result + return NestedField(field_id, field.name, 
field_type, required=not field.nullable, doc=field_doc) + + def list(self, list_type: pa.ListType, element_result: IcebergType) -> ListType: + element_field = list_type.value_field + self._field_names.append(LIST_ELEMENT_NAME) + element_id = self._field_id(element_field) + self._field_names.pop() + return ListType(element_id, element_result, element_required=not element_field.nullable) - def primitive(self, primitive: pa.DataType) -> IcebergType: + def map(self, map_type: pa.MapType, key_result: IcebergType, value_result: IcebergType) -> MapType: + key_field = map_type.key_field + self._field_names.append(MAP_KEY_NAME) + key_id = self._field_id(key_field) + self._field_names.pop() + value_field = map_type.item_field + self._field_names.append(MAP_VALUE_NAME) + value_id = self._field_id(value_field) + self._field_names.pop() + return MapType(key_id, key_result, value_id, value_result, value_required=not value_field.nullable) + + def primitive(self, primitive: pa.DataType) -> PrimitiveType: + if pa.types.is_boolean(primitive): + return BooleanType() + elif pa.types.is_int32(primitive): + return IntegerType() + elif pa.types.is_int64(primitive): + return LongType() + elif pa.types.is_float32(primitive): + return FloatType() + elif pa.types.is_float64(primitive): + return DoubleType() + elif isinstance(primitive, pa.Decimal128Type): + primitive = cast(pa.Decimal128Type, primitive) + return DecimalType(primitive.precision, primitive.scale) + elif pa.types.is_string(primitive): + return StringType() + elif pa.types.is_date32(primitive): + return DateType() + elif isinstance(primitive, pa.Time64Type) and primitive.unit == "us": + return TimeType() + elif pa.types.is_timestamp(primitive): + primitive = cast(pa.TimestampType, primitive) + if primitive.unit == "us": + if primitive.tz == "UTC" or primitive.tz == "+00:00": + return TimestamptzType() + elif primitive.tz is None: + return TimestampType() + elif pa.types.is_binary(primitive): + return BinaryType() + elif 
pa.types.is_fixed_size_binary(primitive): + primitive = cast(pa.FixedSizeBinaryType, primitive) + return FixedType(primitive.byte_width) + + raise TypeError(f"Unsupported type: {primitive}") + + def before_field(self, field: pa.Field) -> None: + self._field_names.append(field.name) + + def after_field(self, field: pa.Field) -> None: + self._field_names.pop() + + def before_list_element(self, element: pa.Field) -> None: + self._field_names.append(LIST_ELEMENT_NAME) + + def after_list_element(self, element: pa.Field) -> None: + self._field_names.pop() + + def before_map_key(self, key: pa.Field) -> None: + self._field_names.append(MAP_KEY_NAME) + + def after_map_key(self, element: pa.Field) -> None: + self._field_names.pop() + + def before_map_value(self, value: pa.Field) -> None: + self._field_names.append(MAP_VALUE_NAME) + + def after_map_value(self, element: pa.Field) -> None: + self._field_names.pop() + + +class _ConvertToIcebergWithFreshIds(PreOrderPyArrowSchemaVisitor[Union[IcebergType, Schema]]): Review Comment: I share your concern. Currently, we have an API like: ```python tbl.write(df: pa.Table) ``` I would say at some point we will get something like: ```python tbl.write(df: pa.Table, merge_schema=True) # actual name TBD, could also be a property ``` Assuming that the Arrow dataframe doesn't have a schema, we'll use name mapping to set the field IDs and convert it to an Iceberg schema, and that's all safe. So we need to have the ability to apply name-mapping on a PyArrow schema. It gets dangerous when people start doing: ```python new_schema = new_schema_for_table(df: pa.Table) with tbl.update_schema() as update: update.union_with_schema(new_schema) ``` Which seems reasonable to do if you're new to Iceberg. This is the [Java equivalent of `UnionByNameVisitor`](https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/schema/UnionByNameVisitor.java). 
If you have something like: ``` 1: name (str) 2: age (int) ``` And you add a field: ``` 1: name (str) 2: phonenumber (int) 3: age (int) ``` Then the existing `age` field (ID 2) will be renamed to `phonenumber` and a new field named `age` will be added. Therefore we want to hide this behind an API like we're doing when creating a new table. I think that @HonahX made a good point about the `_SetFreshIds` visitor. Interestingly enough, the implementation on the Java side is also different, where [it does a lookup on the full column name](https://github.com/apache/iceberg/blob/main/api/src/main/java/org/apache/iceberg/types/AssignFreshIds.java) of the `baseSchema`. This `baseSchema` is null when creating a new table. I think the problem here is that we don't have an API like in Spark where we can [nicely hide things](https://github.com/apache/iceberg/blob/main/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java#L225-L242). I'm almost tempted to allow creating a table from a PyArrow table `create_table_from_table(df: pa.Table)`; that mixes PyArrow into the main API, but it spares us from exposing these things to the user (which isn't super user-friendly in general). WDYT @syun64 @HonahX? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org