Re: [PR] Fallback for upsert when arrow cannot compare source rows with target rows [iceberg-python]

via GitHub Thu, 24 Apr 2025 14:04:24 -0700


kevinjqliu commented on code in PR #1878:
URL: https://github.com/apache/iceberg-python/pull/1878#discussion_r2059226527



##########
tests/table/test_upsert.py:
##########
@@ -511,6 +511,163 @@ def test_upsert_without_identifier_fields(catalog: 
Catalog) -> None:
         tbl.upsert(df)
 
 
+def test_upsert_with_struct_field_as_non_join_key(catalog: Catalog) -> None:
+    identifier = "default.test_upsert_struct_field_fails"
+    _drop_table(catalog, identifier)
+
+    schema = Schema(
+        NestedField(1, "id", IntegerType(), required=True),
+        NestedField(
+            2,
+            "nested_type",
+            StructType(
+                NestedField(3, "sub1", StringType(), required=True),
+                NestedField(4, "sub2", StringType(), required=True),
+            ),
+            required=False,
+        ),
+        identifier_field_ids=[1],
+    )
+
+    tbl = catalog.create_table(identifier, schema=schema)
+
+    arrow_schema = pa.schema(
+        [
+            pa.field("id", pa.int32(), nullable=False),
+            pa.field(
+                "nested_type",
+                pa.struct(
+                    [
+                        pa.field("sub1", pa.large_string(), nullable=False),
+                        pa.field("sub2", pa.large_string(), nullable=False),
+                    ]
+                ),
+                nullable=True,
+            ),
+        ]
+    )
+
+    initial_data = pa.Table.from_pylist(
+        [
+            {
+                "id": 1,
+                "nested_type": {"sub1": "bla1", "sub2": "bla"},
+            }
+        ],
+        schema=arrow_schema,
+    )
+    tbl.append(initial_data)
+
+    update_data = pa.Table.from_pylist(
+        [
+            {
+                "id": 2,
+                "nested_type": {"sub1": "bla1", "sub2": "bla"},
+            },
+            {
+                "id": 1,
+                "nested_type": {"sub1": "bla1", "sub2": "bla2"},
+            },
+        ],
+        schema=arrow_schema,
+    )
+
+    res = tbl.upsert(update_data, join_cols=["id"])
+
+    expected_updated = 1
+    expected_inserted = 1
+
+    assert_upsert_result(res, expected_updated, expected_inserted)
+
+    update_data = pa.Table.from_pylist(
+        [
+            {
+                "id": 2,
+                "nested_type": {"sub1": "bla1", "sub2": "bla"},
+            },
+            {
+                "id": 1,
+                "nested_type": {"sub1": "bla1", "sub2": "bla2"},
+            },
+        ],
+        schema=arrow_schema,
+    )
+
+    res = tbl.upsert(update_data, join_cols=["id"])
+
+    expected_updated = 0
+    expected_inserted = 0
+
+    assert_upsert_result(res, expected_updated, expected_inserted)
+
+
+def test_upsert_with_struct_field_as_join_key(catalog: Catalog) -> None:
+    identifier = "default.test_upsert_with_struct_field_as_join_key"
+    _drop_table(catalog, identifier)
+
+    schema = Schema(
+        NestedField(1, "id", IntegerType(), required=True),
+        NestedField(
+            2,
+            "nested_type",
+            StructType(
+                NestedField(3, "sub1", StringType(), required=True),
+                NestedField(4, "sub2", StringType(), required=True),
+            ),
+            required=False,
+        ),
+        identifier_field_ids=[1],
+    )
+
+    tbl = catalog.create_table(identifier, schema=schema)
+
+    arrow_schema = pa.schema(
+        [
+            pa.field("id", pa.int32(), nullable=False),
+            pa.field(
+                "nested_type",
+                pa.struct(
+                    [
+                        pa.field("sub1", pa.large_string(), nullable=False),
+                        pa.field("sub2", pa.large_string(), nullable=False),
+                    ]
+                ),
+                nullable=True,
+            ),
+        ]
+    )
+
+    initial_data = pa.Table.from_pylist(
+        [
+            {
+                "id": 1,
+                "nested_type": {"sub1": "bla1", "sub2": "bla"},
+            }
+        ],
+        schema=arrow_schema,
+    )
+    tbl.append(initial_data)
+
+    update_data = pa.Table.from_pylist(
+        [
+            {
+                "id": 2,
+                "nested_type": {"sub1": "bla1", "sub2": "bla"},
+            },
+            {
+                "id": 1,
+                "nested_type": {"sub1": "bla1", "sub2": "bla"},
+            },
+        ],
+        schema=arrow_schema,
+    )
+
+    with pytest.raises(
+        pa.lib.ArrowNotImplementedError, match="Keys of type struct<sub1: 
large_string not null, sub2: large_string not null>"
+    ):
+        _ = tbl.upsert(update_data, join_cols=["nested_type"])

Review Comment:
   btw this fails not in `get_rows_to_update` but in `has_duplicate_rows` 
   `group_by` has the same limitation as `join`
   
   ```
   
../../Library/Caches/pypoetry/virtualenvs/pyiceberg-Is5Rt7Ah-py3.12/lib/python3.12/site-packages/pyarrow/acero.py:410:
 in _group_by
       return decl.to_table(use_threads=use_threads)
   pyarrow/_acero.pyx:590: in pyarrow._acero.Declaration.to_table
       ???
   pyarrow/error.pxi:155: in pyarrow.lib.pyarrow_internal_check_status
       ???
   _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
   
   >   ???
   E   pyarrow.lib.ArrowNotImplementedError: Keys of type struct<sub1: 
large_string not null, sub2: large_string not null>
   
   pyarrow/error.pxi:92: ArrowNotImplementedError
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Re: [PR] Fallback for upsert when arrow cannot compare source rows with target rows [iceberg-python]

Reply via email to