gaborkaszab commented on code in PR #11216: URL: https://github.com/apache/iceberg/pull/11216#discussion_r1952317886
########## data/src/main/java/org/apache/iceberg/data/PartitionStatsHandler.java: ########## @@ -0,0 +1,281 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.data; + +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; + +import java.io.IOException; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.HasTableOperations; +import org.apache.iceberg.ImmutableGenericPartitionStatisticsFile; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.PartitionStatisticsFile; +import org.apache.iceberg.PartitionStats; +import org.apache.iceberg.PartitionStatsUtil; +import org.apache.iceberg.Partitioning; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.Table; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.avro.InternalReader; +import org.apache.iceberg.data.parquet.InternalWriter; +import org.apache.iceberg.io.CloseableIterable; +import 
org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.types.Types.IntegerType; +import org.apache.iceberg.types.Types.LongType; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.types.Types.StructType; +import org.apache.iceberg.util.SnapshotUtil; + +/** + * Computes, writes and reads the {@link PartitionStatisticsFile}. Uses generic readers and writers + * to support writing and reading of the stats in table default format. + */ +public class PartitionStatsHandler { + + private PartitionStatsHandler() {} + + public enum Column { + PARTITION(0), + SPEC_ID(1), + DATA_RECORD_COUNT(2), + DATA_FILE_COUNT(3), + TOTAL_DATA_FILE_SIZE_IN_BYTES(4), + POSITION_DELETE_RECORD_COUNT(5), + POSITION_DELETE_FILE_COUNT(6), + EQUALITY_DELETE_RECORD_COUNT(7), + EQUALITY_DELETE_FILE_COUNT(8), + TOTAL_RECORD_COUNT(9), + LAST_UPDATED_AT(10), + LAST_UPDATED_SNAPSHOT_ID(11); + + private final int id; + + Column(int id) { + this.id = id; + } + + public int id() { + return id; + } + } + + /** + * Generates the partition stats file schema based on a given partition type. + * + * <p>Note: Provide the unified partition schema type as mentioned in the spec. + * + * @param partitionType unified partition schema type. + * @return a schema that corresponds to the provided unified partition type. 
+ */ + public static Schema schema(StructType partitionType) { + Preconditions.checkState(!partitionType.fields().isEmpty(), "Table must be partitioned"); + return new Schema( + NestedField.required(1, Column.PARTITION.name(), partitionType), + NestedField.required(2, Column.SPEC_ID.name(), IntegerType.get()), + NestedField.required(3, Column.DATA_RECORD_COUNT.name(), LongType.get()), + NestedField.required(4, Column.DATA_FILE_COUNT.name(), IntegerType.get()), + NestedField.required(5, Column.TOTAL_DATA_FILE_SIZE_IN_BYTES.name(), LongType.get()), + NestedField.optional(6, Column.POSITION_DELETE_RECORD_COUNT.name(), LongType.get()), + NestedField.optional(7, Column.POSITION_DELETE_FILE_COUNT.name(), IntegerType.get()), + NestedField.optional(8, Column.EQUALITY_DELETE_RECORD_COUNT.name(), LongType.get()), + NestedField.optional(9, Column.EQUALITY_DELETE_FILE_COUNT.name(), IntegerType.get()), + NestedField.optional(10, Column.TOTAL_RECORD_COUNT.name(), LongType.get()), + NestedField.optional(11, Column.LAST_UPDATED_AT.name(), LongType.get()), + NestedField.optional(12, Column.LAST_UPDATED_SNAPSHOT_ID.name(), LongType.get())); + } + + /** + * Computes and writes the {@link PartitionStatisticsFile} for a given table's current snapshot. + * + * @param table The {@link Table} for which the partition statistics is computed. + * @return {@link PartitionStatisticsFile} for the current snapshot, or null if no statistics are + * present. + */ + public static PartitionStatisticsFile computeAndWriteStatsFile(Table table) throws IOException { + return computeAndWriteStatsFile(table, null); + } + + /** + * Computes and writes the {@link PartitionStatisticsFile} for a given table and branch. + * + * @param table The {@link Table} for which the partition statistics is computed. + * @param branch A branch information to select the required snapshot. + * @return {@link PartitionStatisticsFile} for the given branch, or null if no statistics are + * present. 
+ */ + public static PartitionStatisticsFile computeAndWriteStatsFile(Table table, String branch) + throws IOException { + Snapshot currentSnapshot = SnapshotUtil.latestSnapshot(table, branch); + if (currentSnapshot == null) { + Preconditions.checkArgument( + branch == null, "Couldn't find the snapshot for the branch %s", branch); + return null; + } + + StructType partitionType = Partitioning.partitionType(table); + Collection<PartitionStats> stats = PartitionStatsUtil.computeStats(table, currentSnapshot); + if (stats.isEmpty()) { + return null; + } + + List<PartitionStats> sortedStats = PartitionStatsUtil.sortStats(stats, partitionType); + return writePartitionStatsFile( + table, currentSnapshot.snapshotId(), schema(partitionType), sortedStats.iterator()); + } + + @VisibleForTesting + static PartitionStatisticsFile writePartitionStatsFile( + Table table, long snapshotId, Schema dataSchema, Iterator<PartitionStats> records) + throws IOException { + OutputFile outputFile = newPartitionStatsFile(table, snapshotId); + + try (DataWriter<StructLike> writer = dataWriter(dataSchema, outputFile)) { + records.forEachRemaining(writer::write); + } + + return ImmutableGenericPartitionStatisticsFile.builder() + .snapshotId(snapshotId) + .path(outputFile.location()) + .fileSizeInBytes(outputFile.toInputFile().getLength()) + .build(); + } + + /** + * Reads partition statistics from the specified {@link InputFile} using given schema. + * + * @param schema The {@link Schema} of the partition statistics file. + * @param inputFile An {@link InputFile} pointing to the partition stats file. 
+ */ + public static CloseableIterable<PartitionStats> readPartitionStatsFile( + Schema schema, InputFile inputFile) { + CloseableIterable<StructLike> records = dataReader(schema, inputFile); + return CloseableIterable.transform(records, PartitionStatsHandler::recordToPartitionStats); + } + + private static FileFormat fileFormat(String fileLocation) { + return FileFormat.fromString(fileLocation.substring(fileLocation.lastIndexOf(".") + 1)); + } + + private static OutputFile newPartitionStatsFile(Table table, long snapshotId) { + Preconditions.checkArgument( + table instanceof HasTableOperations, + "Table must have operations to retrieve metadata location"); + FileFormat fileFormat = + fileFormat( + table.properties().getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT)); + return table + .io() + .newOutputFile( + ((HasTableOperations) table) + .operations() + .metadataFileLocation( + fileFormat.addExtension( + String.format(Locale.ROOT, "partition-stats-%d", snapshotId)))); + } + + private static DataWriter<StructLike> dataWriter(Schema dataSchema, OutputFile outputFile) + throws IOException { + FileFormat fileFormat = fileFormat(outputFile.location()); + switch (fileFormat) { + case PARQUET: + return Parquet.writeData(outputFile) + .schema(dataSchema) + .createWriterFunc(InternalWriter::create) + .overwrite() Review Comment: > Snapshot id will be unique for each new commit. If users are trying to compute for stats for the same snapshot. It is overwriting the file as the contents doesn't change. I'm still not sure about this. Let's assume we update the stats for a previous snapshot that already had stats. Also assume that there is a reader actually reading that stat file at the same time. In this case, overwriting the file while the reader is reading it (e.g. reading the footer) might result in an incorrect read, right? There might be an instant when the reader tries to read from the file but the file is not there. 
Or does Iceberg have any guarantees that this couldn't happen with an overwrite? > the contents doesn't change Also, it's possible that we don't write the very same content into the stat file. E.g. when stat files are corrupted for some reason and we want to re-compute them. Another use-case that I have in mind for writing different content into an existing file is when we decide to extend the stats schema with some additional columns and then we decide to re-compute the stats so that the file also contains the new columns. Not saying that this is a scenario that happens frequently, but it's also not impossible. ########## data/src/main/java/org/apache/iceberg/data/PartitionStatsHandler.java: ########## @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.data; + +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.HasTableOperations; +import org.apache.iceberg.ImmutableGenericPartitionStatisticsFile; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.PartitionStatisticsFile; +import org.apache.iceberg.PartitionStats; +import org.apache.iceberg.PartitionStatsUtil; +import org.apache.iceberg.Partitioning; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.Table; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.avro.InternalReader; +import org.apache.iceberg.data.parquet.InternalWriter; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.types.Types.IntegerType; +import org.apache.iceberg.types.Types.LongType; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.types.Types.StructType; +import org.apache.iceberg.util.SnapshotUtil; + +/** + * Computes, writes and reads the {@link PartitionStatisticsFile}. Uses generic readers and writers + * to support writing and reading of the stats in table default format. 
+ */ +public final class PartitionStatsHandler { + + private PartitionStatsHandler() {} + + public enum Column { + PARTITION(0), + SPEC_ID(1), + DATA_RECORD_COUNT(2), + DATA_FILE_COUNT(3), + TOTAL_DATA_FILE_SIZE_IN_BYTES(4), + POSITION_DELETE_RECORD_COUNT(5), + POSITION_DELETE_FILE_COUNT(6), + EQUALITY_DELETE_RECORD_COUNT(7), + EQUALITY_DELETE_FILE_COUNT(8), + TOTAL_RECORD_COUNT(9), + LAST_UPDATED_AT(10), + LAST_UPDATED_SNAPSHOT_ID(11); + + private final int id; + + Column(int id) { + this.id = id; + } + + public int id() { + return id; + } + } + + /** + * Generates the partition stats file schema based on a given partition type. + * + * <p>Note: Provide the unified partition schema type as mentioned in the spec. + * + * @param partitionType unified partition schema type. + * @return a schema that corresponds to the provided unified partition type. + */ + public static Schema schema(StructType partitionType) { + Preconditions.checkState(!partitionType.fields().isEmpty(), "table must be partitioned"); + return new Schema( + NestedField.required(1, Column.PARTITION.name(), partitionType), + NestedField.required(2, Column.SPEC_ID.name(), IntegerType.get()), + NestedField.required(3, Column.DATA_RECORD_COUNT.name(), LongType.get()), + NestedField.required(4, Column.DATA_FILE_COUNT.name(), IntegerType.get()), + NestedField.required(5, Column.TOTAL_DATA_FILE_SIZE_IN_BYTES.name(), LongType.get()), + NestedField.optional(6, Column.POSITION_DELETE_RECORD_COUNT.name(), LongType.get()), + NestedField.optional(7, Column.POSITION_DELETE_FILE_COUNT.name(), IntegerType.get()), + NestedField.optional(8, Column.EQUALITY_DELETE_RECORD_COUNT.name(), LongType.get()), + NestedField.optional(9, Column.EQUALITY_DELETE_FILE_COUNT.name(), IntegerType.get()), + NestedField.optional(10, Column.TOTAL_RECORD_COUNT.name(), LongType.get()), + NestedField.optional(11, Column.LAST_UPDATED_AT.name(), LongType.get()), + NestedField.optional(12, Column.LAST_UPDATED_SNAPSHOT_ID.name(), 
LongType.get())); + } + + /** + * Computes and writes the {@link PartitionStatisticsFile} for a given table's current snapshot. + * + * @param table The {@link Table} for which the partition statistics is computed. + * @return {@link PartitionStatisticsFile} for the current snapshot. + */ + public static PartitionStatisticsFile computeAndWriteStatsFile(Table table) { + return computeAndWriteStatsFile(table, null); + } + + /** + * Computes and writes the {@link PartitionStatisticsFile} for a given table and branch. + * + * @param table The {@link Table} for which the partition statistics is computed. + * @param branch A branch information to select the required snapshot. + * @return {@link PartitionStatisticsFile} for the given branch. + */ + public static PartitionStatisticsFile computeAndWriteStatsFile(Table table, String branch) { + Snapshot currentSnapshot = SnapshotUtil.latestSnapshot(table, branch); + if (currentSnapshot == null) { + Preconditions.checkArgument( + branch == null, "Couldn't find the snapshot for the branch %s", branch); + return null; Review Comment: Just thinking out loud. With returning null we won't be able to have this easy usage of the functionality: `testTable.updatePartitionStatistics().setPartitionStatistics(PartitionStatsHandler.computeAndWriteStatsFile(testTable, "b1")).commit();` Not sure how common it is in Iceberg to do null checks after trying to create some resources. For instance, I recall table creation doesn't give null to users but a Table object, or it throws if it didn't succeed creating one. Throwing an exception that we weren't able to create stats might make sense. Still not confident with either approach. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org