DevChrisCross commented on code in PR #1498:
URL: https://github.com/apache/iceberg-python/pull/1498#discussion_r1912229915


##########
tests/io/test_pyarrow_visitor.py:
##########
@@ -625,6 +626,91 @@ def 
test_pyarrow_schema_ensure_large_types(pyarrow_schema_nested_without_ids: pa
     assert 
_pyarrow_schema_ensure_large_types(pyarrow_schema_nested_without_ids) == 
expected_schema
 
 
+def test_pyarrow_schema_unsupported_type() -> None:
+    unsupported_field = pa.field("latitude", pa.decimal256(20, 26), 
nullable=False, metadata={"PARQUET:field_id": "2"})
+    schema = pa.schema(
+        [
+            pa.field("foo", pa.string(), nullable=False, 
metadata={"PARQUET:field_id": "1"}),
+            pa.field(
+                "location",
+                pa.large_list(
+                    pa.field(
+                        "item",
+                        pa.struct(
+                            [
+                                unsupported_field,
+                                pa.field("longitude", pa.float32(), 
nullable=False, metadata={"PARQUET:field_id": "3"}),
+                            ]
+                        ),
+                        metadata={"PARQUET:field_id": "4"},
+                    )
+                ),
+                nullable=False,
+                metadata={"PARQUET:field_id": "5"},
+            ),
+        ],
+        metadata={"PARQUET:field_id": "6"},
+    )
+    with pytest.raises(
+        UnsupportedPyArrowTypeException, match=re.escape("Column 'latitude' 
has an unsupported type: decimal256(20, 26)")
+    ) as exc_info:
+        pyarrow_to_schema(schema)
+    assert exc_info.value.field == unsupported_field
+    exception_cause = exc_info.value.__cause__
+    assert isinstance(exception_cause, TypeError)
+    assert "Unsupported type: decimal256(20, 26)" in exception_cause.args[0]
+
+    unsupported_field = pa.field(
+        "quux",
+        pa.map_(
+            pa.field("key", pa.string(), nullable=False, 
metadata={"PARQUET:field_id": "2"}),
+            pa.field(
+                "value",
+                pa.map_(
+                    pa.field("key", pa.string(), nullable=False, 
metadata={"PARQUET:field_id": "5"}),
+                    pa.field("value", pa.decimal256(2, 3), 
metadata={"PARQUET:field_id": "6"}),
+                ),
+                nullable=False,
+                metadata={"PARQUET:field_id": "4"},
+            ),
+        ),
+        nullable=False,
+        metadata={"PARQUET:field_id": "3"},
+    )
+    schema = pa.schema(
+        [
+            pa.field("foo", pa.string(), nullable=False, 
metadata={"PARQUET:field_id": "1"}),
+            unsupported_field,
+        ]
+    )
+    with pytest.raises(
+        UnsupportedPyArrowTypeException,
+        match=re.escape("Column 'quux' has an unsupported type: map<string, 
map<string, decimal256(2, 3)>>"),

Review Comment:
   Regarding that, it is because when `pa.MapTypes` are involved, only the root 
level is considered as the `field` (unless you introduce a `StructType` inside 
the map). The way it currently visit items is that it goes straight for the 
`item_type`, hence it will never visit the `pa.Field` implementation. See below 
for the concrete scenario:
   
   ```
   - Field<quux: map<string, map<string, decimal256(2, 3)>> not null>
        - Map<string, map<string, decimal256(2, 3)>>
             - key: string
                  - DataType string
             - value: map<string, decimal256(2, 3)>
                  - Map<string, decimal256(2, 3)>
                       - key: string
                            - DataType string
                       - value:  decimal256(2, 3)
                            - DataType decimal256(2, 3)   # primitive() throws 
TypeError
                            
   # Field quux catches the error then throws UnsupportedPyArrowTypeException
   ```
   
   You could see above that it never went for `pa.Field` since it is the entire 
value for the key-value pair, i.e. `pa.field("value", pa.decimal256(2, 3), 
metadata={"PARQUET:field_id": "6"})` and the code goes straight for the 
`field.type` always, discarding the actual `pa.Field` object used along the 
process.
   
   Whereas, in comparison to the test involving the `ListType` and 
`StructType`, it was able to return the specific `field` object directly 
because the `StructType` enabled it to pass through the `pa.Field`.
   ```
   - Field pyarrow.Field<location: large_list<item: struct<latitude: 
decimal256(20, 26) not null, longitude: float not null>> not null>
        - LargeListType<item: struct<latitude: decimal256(20, 26) not null, 
longitude: float not null>>
             - StructType<latitude: decimal256(20, 26) not null, longitude: 
float not null>
                  - Field<latitude: decimal256(20, 26) not null>
                       - DataType decimal256(20, 26)
   ```  
   
   On a personal note, I think the behavior is desirable since you would know 
as a developer what map specifically contains the unsupported type, especially 
for nested ones. Like if there is a field named `foo` that is both existent to 
multiple maps, at least you would know on the get go what to fix rather than 
checking them all together. :)
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to