szehon-ho commented on code in PR #5376: URL: https://github.com/apache/iceberg/pull/5376#discussion_r1022594631
########## core/src/main/java/org/apache/iceberg/BaseFilesTable.java: ########## @@ -140,42 +142,75 @@ protected CloseableIterable<FileScanTask> doPlanFiles() { } static class ManifestReadTask extends BaseFileScanTask implements DataTask { + + private static final Set<Integer> READABLE_METRICS_FIELD_IDS = + TypeUtil.getProjectedIds(DataFile.READABLE_METRICS.type()); + private static final Schema MIN_PROJECTION_FOR_READABLE_METRICS = + new Schema( + DataFile.COLUMN_SIZES, + DataFile.VALUE_COUNTS, + DataFile.NULL_VALUE_COUNTS, + DataFile.NAN_VALUE_COUNTS, + DataFile.LOWER_BOUNDS, + DataFile.UPPER_BOUNDS); + private final FileIO io; private final Map<Integer, PartitionSpec> specsById; private final ManifestFile manifest; - private final Schema schema; + private final Schema dataTableSchema; + private final Schema projection; ManifestReadTask( Table table, ManifestFile manifest, - Schema schema, + Schema projection, String schemaString, String specString, ResidualEvaluator residuals) { super(DataFiles.fromManifest(manifest), null, schemaString, specString, residuals); this.io = table.io(); this.specsById = Maps.newHashMap(table.specs()); this.manifest = manifest; - this.schema = schema; + this.dataTableSchema = table.schema(); + this.projection = projection; } @Override public CloseableIterable<StructLike> rows() { - return CloseableIterable.transform(manifestEntries(), file -> (StructLike) file); + if (projection.findColumnName(DataFile.READABLE_METRICS.fieldId()) == null) { + return CloseableIterable.transform(files(projection), file -> (StructLike) file); + } else { + Schema fileProjection = TypeUtil.selectNot(projection, READABLE_METRICS_FIELD_IDS); + + // If readable_metrics is selected, + // original metrics columns need to be selected for derivation + Schema minProjection = TypeUtil.join(fileProjection, MIN_PROJECTION_FOR_READABLE_METRICS); Review Comment: Hm not sure what is a good name. 
To me, "max projection" is not right (that would just be all the columns of the files table). So I just put "projection" for now, and we can optimize it later? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org