kevinjqliu commented on code in PR #1498: URL: https://github.com/apache/iceberg-python/pull/1498#discussion_r1911622483
########## pyiceberg/io/pyarrow.py: ########## @@ -1098,8 +1109,10 @@ class _ConvertToIceberg(PyArrowSchemaVisitor[Union[IcebergType, Schema]]): """Converts PyArrowSchema to Iceberg Schema. Applies the IDs from name_mapping if provided.""" _field_names: List[str] + _field: Optional[pa.Field] Review Comment: we dont need `_field` anymore ########## pyiceberg/io/pyarrow.py: ########## @@ -1003,6 +1000,20 @@ def _(obj: pa.DictionaryType, visitor: PyArrowSchemaVisitor[T]) -> T: return visit_pyarrow(obj.value_type, visitor) +@visit_pyarrow.register(pa.Field) Review Comment: So I realized that `@visit_pyarrow.register(pa.Field)` is not widely used in this repo. Most visitors use the `StructType` visit function to process `Field`, accessing the field directly in the visit: ``` @visit_pyarrow.register(pa.StructType) def _(obj: pa.StructType, visitor: PyArrowSchemaVisitor[T]) -> T: visitor.before_field(field) // do something here visit_pyarrow(field.type, visitor) visitor.after_field(field) ... ``` To be consistent, what do you think about changing the removed code above to something like this ``` @visit_pyarrow.register(pa.StructType) def _(obj: pa.StructType, visitor: PyArrowSchemaVisitor[T]) -> T: results = [] for field in obj: visitor.before_field(field) try: result = visit_pyarrow(field_type, visitor) except TypeError as e: raise UnsupportedPyArrowTypeException(obj, f"Column '{obj.name}' has an unsupported type: {field_type}") from e results.append(visitor.field(field, result)) return visitor.struct(obj, results) ``` ########## tests/io/test_pyarrow_visitor.py: ########## @@ -583,46 +584,79 @@ def test_pyarrow_schema_to_schema_fresh_ids_nested_schema( assert visit_pyarrow(pyarrow_schema_nested_without_ids, _ConvertToIcebergWithoutIDs()) == iceberg_schema_nested_no_ids -def test_pyarrow_schema_ensure_large_types(pyarrow_schema_nested_without_ids: pa.Schema) -> None: - expected_schema = pa.schema( +def test_pyarrow_schema_unsupported_type() -> None: Review Comment: we probably dont need the whole setup here. Testing `pyarrow_to_schema` is enough, for example: ``` unsupported_field = pa.field("latitude", pa.decimal256(20, 26), nullable=False) schema = pa.schema([unsupported_field]) pyarrow_to_schema(schema) # will raise here ``` ########## tests/io/test_pyarrow_visitor.py: ########## @@ -583,46 +584,79 @@ def test_pyarrow_schema_to_schema_fresh_ids_nested_schema( assert visit_pyarrow(pyarrow_schema_nested_without_ids, _ConvertToIcebergWithoutIDs()) == iceberg_schema_nested_no_ids -def test_pyarrow_schema_ensure_large_types(pyarrow_schema_nested_without_ids: pa.Schema) -> None: Review Comment: lets add a new test instead of removing this one ########## pyiceberg/exceptions.py: ########## @@ -122,3 +125,19 @@ class CommitStateUnknownException(RESTError): class WaitingForLockException(Exception): """Need to wait for a lock, try again.""" + + +class UnsupportedPyArrowTypeException(Exception): + """Cannot convert PyArrow type to corresponding Iceberg type.""" + + def __init__(self, field: pa.Field, *args: Any): + self.field = field Review Comment: do we need to include the `field` here? ########## pyiceberg/exceptions.py: ########## @@ -122,3 +125,19 @@ class CommitStateUnknownException(RESTError): class WaitingForLockException(Exception): """Need to wait for a lock, try again.""" + + +class UnsupportedPyArrowTypeException(Exception): + """Cannot convert PyArrow type to corresponding Iceberg type.""" + + def __init__(self, field: pa.Field, *args: Any): + self.field = field + super().__init__(*args) + + +class UnsupportedPyArrowIntegerTypeException(UnsupportedPyArrowTypeException): + """Cannot convert PyArrow integer type to corresponding Iceberg type.""" + + +class UnsupportedPyArrowTimestampTypeException(UnsupportedPyArrowTypeException): Review Comment: nit: these aren't used, lets remove them. I think `UnsupportedPyArrowTypeException` is enough for this usecase -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org