szehon-ho commented on code in PR #10547: URL: https://github.com/apache/iceberg/pull/10547#discussion_r1659431526
########## spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java: ########## @@ -366,15 +370,55 @@ public void pruneColumns(StructType requestedSchema) { private Schema schemaWithMetadataColumns() { // metadata columns - List<Types.NestedField> fields = + List<Types.NestedField> metadataFields = metaColumns.stream() .distinct() .map(name -> MetadataColumns.metadataColumn(table, name)) .collect(Collectors.toList()); - Schema meta = new Schema(fields); + // only calculate potential column id collision if metadata column was requested + Schema metadataSchema = + metadataFields.isEmpty() + ? new Schema(metadataFields) + : deduplicateSchemaIds(metadataFields); // schema or rows returned by readers - return TypeUtil.join(schema, meta); + return TypeUtil.join(schema, metadataSchema); + } + + private Schema deduplicateSchemaIds(List<Types.NestedField> metaColumnFields) { + Map<Integer, Types.NestedField> indexedMetadataColumnFields = + TypeUtil.indexById(Types.StructType.of(metaColumnFields)); + + // Calculate field ids to reassign from nested partition field + Set<Integer> idsToReassign = Review Comment: Hm, I wonder could we just go through the metadataColumnFields until we find id = PARTITION_COLUMN_ID, and then index that struct only? ########## spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java: ########## @@ -366,15 +370,55 @@ public void pruneColumns(StructType requestedSchema) { private Schema schemaWithMetadataColumns() { // metadata columns - List<Types.NestedField> fields = + List<Types.NestedField> metadataFields = metaColumns.stream() .distinct() .map(name -> MetadataColumns.metadataColumn(table, name)) .collect(Collectors.toList()); - Schema meta = new Schema(fields); + // only calculate potential column id collision if metadata column was requested + Schema metadataSchema = + metadataFields.isEmpty() + ? new Schema(metadataFields) + : deduplicateSchemaIds(metadataFields); // schema or rows returned by readers - return TypeUtil.join(schema, meta); + return TypeUtil.join(schema, metadataSchema); + } + + private Schema deduplicateSchemaIds(List<Types.NestedField> metaColumnFields) { + Map<Integer, Types.NestedField> indexedMetadataColumnFields = + TypeUtil.indexById(Types.StructType.of(metaColumnFields)); + + // Calculate field ids to reassign from nested partition field + Set<Integer> idsToReassign = + indexedMetadataColumnFields.values().stream() + .map(Types.NestedField::fieldId) + .filter(id -> !MetadataColumns.isMetadataColumn(id)) + .collect(Collectors.toSet()); + + // Calculate used ids by union metadata columns with all base table schemas + Set<Integer> currentlyUsedIds = indexedMetadataColumnFields.keySet(); Review Comment: optional: how about Sets.difference() (its more functional) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org