kevinjqliu commented on code in PR #1878: URL: https://github.com/apache/iceberg-python/pull/1878#discussion_r2052855516
########## pyiceberg/table/upsert_util.py: ########## @@ -82,14 +82,54 @@ def get_rows_to_update(source_table: pa.Table, target_table: pa.Table, join_cols ], ) - return ( - source_table - # We already know that the schema is compatible, this is to fix large_ types - .cast(target_table.schema) - .join(target_table, keys=list(join_cols_set), join_type="inner", left_suffix="-lhs", right_suffix="-rhs") - .filter(diff_expr) - .drop_columns([f"{col}-rhs" for col in non_key_cols]) - .rename_columns({f"{col}-lhs" if col not in join_cols else col: col for col in source_table.column_names}) - # Finally cast to the original schema since it doesn't carry nullability: - # https://github.com/apache/arrow/issues/45557 - ).cast(target_table.schema) + try: + return ( + source_table + # We already know that the schema is compatible, this is to fix large_ types + .cast(target_table.schema) + .join(target_table, keys=list(join_cols_set), join_type="inner", left_suffix="-lhs", right_suffix="-rhs") + .filter(diff_expr) + .drop_columns([f"{col}-rhs" for col in non_key_cols]) + .rename_columns({f"{col}-lhs" if col not in join_cols else col: col for col in source_table.column_names}) + # Finally cast to the original schema since it doesn't carry nullability: + # https://github.com/apache/arrow/issues/45557 + ).cast(target_table.schema) + except pa.ArrowInvalid: + # When we are not able to compare (e.g. due to unsupported types), + # fall back to selecting only rows in the source table that do NOT already exist in the target. 
+ # See: https://github.com/apache/arrow/issues/35785 + MARKER_COLUMN_NAME = "__from_target" + INDEX_COLUMN_NAME = "__source_index" + + if MARKER_COLUMN_NAME in join_cols_set or INDEX_COLUMN_NAME in join_cols_set: + raise ValueError( + f"{MARKER_COLUMN_NAME} and {INDEX_COLUMN_NAME} are reserved for joining " + f"DataFrames, and cannot be used as column names" + ) from None + Review Comment: Since we cannot compare complex types, I would assume we cannot use any complex-type columns as the join keys. If that's the case, we should check whether any join key is a complex type and, if so, fail. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org