kevinjqliu commented on code in PR #1878: URL: https://github.com/apache/iceberg-python/pull/1878#discussion_r2059226527
########## tests/table/test_upsert.py: ########## @@ -511,6 +511,163 @@ def test_upsert_without_identifier_fields(catalog: Catalog) -> None: tbl.upsert(df) +def test_upsert_with_struct_field_as_non_join_key(catalog: Catalog) -> None: + identifier = "default.test_upsert_struct_field_fails" + _drop_table(catalog, identifier) + + schema = Schema( + NestedField(1, "id", IntegerType(), required=True), + NestedField( + 2, + "nested_type", + StructType( + NestedField(3, "sub1", StringType(), required=True), + NestedField(4, "sub2", StringType(), required=True), + ), + required=False, + ), + identifier_field_ids=[1], + ) + + tbl = catalog.create_table(identifier, schema=schema) + + arrow_schema = pa.schema( + [ + pa.field("id", pa.int32(), nullable=False), + pa.field( + "nested_type", + pa.struct( + [ + pa.field("sub1", pa.large_string(), nullable=False), + pa.field("sub2", pa.large_string(), nullable=False), + ] + ), + nullable=True, + ), + ] + ) + + initial_data = pa.Table.from_pylist( + [ + { + "id": 1, + "nested_type": {"sub1": "bla1", "sub2": "bla"}, + } + ], + schema=arrow_schema, + ) + tbl.append(initial_data) + + update_data = pa.Table.from_pylist( + [ + { + "id": 2, + "nested_type": {"sub1": "bla1", "sub2": "bla"}, + }, + { + "id": 1, + "nested_type": {"sub1": "bla1", "sub2": "bla2"}, + }, + ], + schema=arrow_schema, + ) + + res = tbl.upsert(update_data, join_cols=["id"]) + + expected_updated = 1 + expected_inserted = 1 + + assert_upsert_result(res, expected_updated, expected_inserted) + + update_data = pa.Table.from_pylist( + [ + { + "id": 2, + "nested_type": {"sub1": "bla1", "sub2": "bla"}, + }, + { + "id": 1, + "nested_type": {"sub1": "bla1", "sub2": "bla2"}, + }, + ], + schema=arrow_schema, + ) + + res = tbl.upsert(update_data, join_cols=["id"]) + + expected_updated = 0 + expected_inserted = 0 + + assert_upsert_result(res, expected_updated, expected_inserted) + + +def test_upsert_with_struct_field_as_join_key(catalog: Catalog) -> None: + identifier = 
"default.test_upsert_with_struct_field_as_join_key" + _drop_table(catalog, identifier) + + schema = Schema( + NestedField(1, "id", IntegerType(), required=True), + NestedField( + 2, + "nested_type", + StructType( + NestedField(3, "sub1", StringType(), required=True), + NestedField(4, "sub2", StringType(), required=True), + ), + required=False, + ), + identifier_field_ids=[1], + ) + + tbl = catalog.create_table(identifier, schema=schema) + + arrow_schema = pa.schema( + [ + pa.field("id", pa.int32(), nullable=False), + pa.field( + "nested_type", + pa.struct( + [ + pa.field("sub1", pa.large_string(), nullable=False), + pa.field("sub2", pa.large_string(), nullable=False), + ] + ), + nullable=True, + ), + ] + ) + + initial_data = pa.Table.from_pylist( + [ + { + "id": 1, + "nested_type": {"sub1": "bla1", "sub2": "bla"}, + } + ], + schema=arrow_schema, + ) + tbl.append(initial_data) + + update_data = pa.Table.from_pylist( + [ + { + "id": 2, + "nested_type": {"sub1": "bla1", "sub2": "bla"}, + }, + { + "id": 1, + "nested_type": {"sub1": "bla1", "sub2": "bla"}, + }, + ], + schema=arrow_schema, + ) + + with pytest.raises( + pa.lib.ArrowNotImplementedError, match="Keys of type struct<sub1: large_string not null, sub2: large_string not null>" + ): + _ = tbl.upsert(update_data, join_cols=["nested_type"]) Review Comment: btw, this fails not in `get_rows_to_update` but in `has_duplicate_rows`: `group_by` has the same limitation as `join`. ``` ../../Library/Caches/pypoetry/virtualenvs/pyiceberg-Is5Rt7Ah-py3.12/lib/python3.12/site-packages/pyarrow/acero.py:410: in _group_by return decl.to_table(use_threads=use_threads) pyarrow/_acero.pyx:590: in pyarrow._acero.Declaration.to_table ??? pyarrow/error.pxi:155: in pyarrow.lib.pyarrow_internal_check_status ??? _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ > ??? 
E pyarrow.lib.ArrowNotImplementedError: Keys of type struct<sub1: large_string not null, sub2: large_string not null> pyarrow/error.pxi:92: ArrowNotImplementedError ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org