stevenzwu commented on code in PR #12306: URL: https://github.com/apache/iceberg/pull/12306#discussion_r1960273582
########## core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java: ########## @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.Map; +import java.util.Set; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; + +/** + * A class for rewriting content file groups ({@link FileRewriteGroup}). The lifecycle for the + * executor looks like the following: + * + * <ul> + * <li>{@link #init(Map)} initializes the executor with the configuration parameters + * <li>{@link #rewrite(FileRewriteGroup)} called for every group in the plan to do the actual + * rewrite of the files, and returns the generated new files. + * </ul> + * + * A single executor could be used to rewrite multiple groups for the same plan. 
+ * + * @param <FGI> the Java type of the plan info like {@link RewriteDataFiles.FileGroupInfo} or {@link Review Comment: minor suggestions on the Javadoc ``` * @param <FGI> the Java type of the rewrite file group info like {@link RewriteDataFiles.FileGroupInfo} or {@link RewritePositionDeleteFiles.FileGroupInfo} * @param <T> the Java type of the input scan tasks (input) * @param <F> the Java type of the content files (input and output) * @param <G> the Java type of the rewrite file group like {@link ...} ``` ########## core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java: ########## @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.List; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; + +/** + * Container class representing a set of files to be rewritten by a {@link FileRewriteExecutor}. 
+ * + * @param <I> the Java type of the plan info member like {@link RewriteDataFiles.FileGroupInfo} or + * {@link RewritePositionDeleteFiles.FileGroupInfo} + * @param <T> the Java type of the tasks to read the files which are rewritten + * @param <F> the Java type of the content files which are rewritten + */ +public abstract class FileRewriteGroup<I, T extends ContentScanTask<F>, F extends ContentFile<F>> { + private final I info; + private final List<T> fileScanTasks; + private final long writeMaxFileSize; + private final long splitSize; + private final int expectedOutputFiles; + + FileRewriteGroup( + I info, + List<T> fileScanTasks, + long writeMaxFileSize, + long splitSize, + int expectedOutputFiles) { + this.info = info; + this.fileScanTasks = fileScanTasks; + this.writeMaxFileSize = writeMaxFileSize; + this.splitSize = splitSize; + this.expectedOutputFiles = expectedOutputFiles; + } + + /** Identifiers and partition information about the group. */ + public I info() { + return info; + } + + /** Input of the group. {@link ContentScanTask}s to read. */ + public List<T> fileScans() { + return fileScanTasks; + } + + /** + * While we create tasks that should all be smaller than our target size, there is a chance that + * the actual data will end up being larger than our target size due to various factors of + * compression, serialization, which are outside our control. If this occurs, instead of making a + * single file that is close in size to our target, we would end up producing one file of the + * target size, and then a small extra file with the remaining data. + * + * <p>For example, if our target is 512 MB, we may generate a rewrite task that should be 500 MB. + * When we write the data we may find we actually have to write out 530 MB. If we use the target + * size while writing, we would produce a 512 MB file and an 18 MB file. If instead we use a + * larger size estimated by this method, then we end up writing a single file. 
+ * + * @return the target size plus one half of the distance between max and target + */ + public long writeMaxFileSize() { + return writeMaxFileSize; + } + + /** + * Determines the reader split size as the input size divided by the desired number of output + * files. The final split size is adjusted to be at least as big as the target file size but less + * than the max write file size. + */ + public long splitSize() { + return splitSize; + } + + /** Expected number of the output files. */ + public int expectedOutputFiles() { + return expectedOutputFiles; + } + + /** Accumulated size for the input files. */ + public long sizeInBytes() { + return fileScanTasks.stream().mapToLong(T::length).sum(); + } + + /** Number of the input files. */ + public int numFiles() { Review Comment: nit: `numInputFiles` ########## core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java: ########## @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.Map; +import java.util.Set; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; + +/** + * A class for rewriting content file groups ({@link FileRewriteGroup}). 
The lifecycle for the + * executor looks like the following: + * + * <ul> + * <li>{@link #init(Map)} initializes the executor with the configuration parameters + * <li>{@link #rewrite(FileRewriteGroup)} called for every group in the plan to do the actual + * rewrite of the files, and returns the generated new files. + * </ul> + * + * A single executor could be used to rewrite multiple groups for the same plan. + * + * @param <FGI> the Java type of the plan info like {@link RewriteDataFiles.FileGroupInfo} or {@link + * RewritePositionDeleteFiles.FileGroupInfo} + * @param <T> the Java type of the tasks to read the files which are rewritten + * @param <F> the Java type of the content files which are rewritten + * @param <G> the Java type of the planned groups + */ +public interface FileRewriteExecutor< + FGI, + T extends ContentScanTask<F>, + F extends ContentFile<F>, + G extends FileRewriteGroup<FGI, T, F>> { + + /** Returns a description for this rewriter. */ + default String description() { + return getClass().getName(); + } + + /** + * Returns a set of supported options for this rewriter. Only options specified in this list will + * be accepted at runtime. Any other options will be rejected. + */ + Set<String> validOptions(); + + /** + * Initializes this rewriter using provided options. Review Comment: rewriter -> runner ########## core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java: ########## @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.Map; +import java.util.Set; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; + +/** + * A class for rewriting content file groups ({@link FileRewriteGroup}). The lifecycle for the + * executor looks like the following: + * + * <ul> + * <li>{@link #init(Map)} initializes the executor with the configuration parameters + * <li>{@link #rewrite(FileRewriteGroup)} called for every group in the plan to do the actual + * rewrite of the files, and returns the generated new files. + * </ul> + * + * A single executor could be used to rewrite multiple groups for the same plan. + * + * @param <FGI> the Java type of the plan info like {@link RewriteDataFiles.FileGroupInfo} or {@link + * RewritePositionDeleteFiles.FileGroupInfo} + * @param <T> the Java type of the tasks to read the files which are rewritten + * @param <F> the Java type of the content files which are rewritten + * @param <G> the Java type of the planned groups + */ +public interface FileRewriteExecutor< + FGI, + T extends ContentScanTask<F>, + F extends ContentFile<F>, + G extends FileRewriteGroup<FGI, T, F>> { + + /** Returns a description for this rewriter. */ + default String description() { + return getClass().getName(); + } + + /** + * Returns a set of supported options for this rewriter. 
Only options specified in this list will Review Comment: nit: rewriter -> runner ########## core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java: ########## @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.Map; +import java.util.Set; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; + +/** + * A class for rewriting content file groups ({@link FileRewriteGroup}). The lifecycle for the + * executor looks like the following: + * + * <ul> + * <li>{@link #init(Map)} initializes the executor with the configuration parameters + * <li>{@link #rewrite(FileRewriteGroup)} called for every group in the plan to do the actual + * rewrite of the files, and returns the generated new files. + * </ul> + * + * A single executor could be used to rewrite multiple groups for the same plan. 
+ * + * @param <FGI> the Java type of the plan info like {@link RewriteDataFiles.FileGroupInfo} or {@link + * RewritePositionDeleteFiles.FileGroupInfo} + * @param <T> the Java type of the tasks to read the files which are rewritten + * @param <F> the Java type of the content files which are rewritten + * @param <G> the Java type of the planned groups + */ +public interface FileRewriteExecutor< + FGI, Review Comment: Right now, it seems that {@link RewriteDataFiles.FileGroupInfo} and {@link RewritePositionDeleteFiles.FileGroupInfo} interfaces are identical. wondering if we should start without generics for `FGI`? it can affect future flexibility though. ########## core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java: ########## @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.List; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; + +/** + * Container class representing a set of files to be rewritten by a {@link FileRewriteExecutor}. 
+ * + * @param <I> the Java type of the plan info member like {@link RewriteDataFiles.FileGroupInfo} or + * {@link RewritePositionDeleteFiles.FileGroupInfo} + * @param <T> the Java type of the tasks to read the files which are rewritten + * @param <F> the Java type of the content files which are rewritten + */ +public abstract class FileRewriteGroup<I, T extends ContentScanTask<F>, F extends ContentFile<F>> { + private final I info; + private final List<T> fileScanTasks; + private final long writeMaxFileSize; + private final long splitSize; + private final int expectedOutputFiles; + + FileRewriteGroup( + I info, + List<T> fileScanTasks, + long writeMaxFileSize, + long splitSize, + int expectedOutputFiles) { + this.info = info; + this.fileScanTasks = fileScanTasks; + this.writeMaxFileSize = writeMaxFileSize; + this.splitSize = splitSize; + this.expectedOutputFiles = expectedOutputFiles; + } + + /** Identifiers and partition information about the group. */ + public I info() { + return info; + } + + /** Input of the group. {@link ContentScanTask}s to read. */ + public List<T> fileScans() { + return fileScanTasks; + } + + /** + * While we create tasks that should all be smaller than our target size, there is a chance that + * the actual data will end up being larger than our target size due to various factors of + * compression, serialization, which are outside our control. If this occurs, instead of making a + * single file that is close in size to our target, we would end up producing one file of the + * target size, and then a small extra file with the remaining data. + * + * <p>For example, if our target is 512 MB, we may generate a rewrite task that should be 500 MB. + * When we write the data we may find we actually have to write out 530 MB. If we use the target + * size while writing, we would produce a 512 MB file and an 18 MB file. If instead we use a + * larger size estimated by this method, then we end up writing a single file. 
+ * + * @return the target size plus one half of the distance between max and target + */ + public long writeMaxFileSize() { + return writeMaxFileSize; + } + + /** + * Determines the reader split size as the input size divided by the desired number of output + * files. The final split size is adjusted to be at least as big as the target file size but less + * than the max write file size. + */ + public long splitSize() { + return splitSize; + } + + /** Expected number of the output files. */ + public int expectedOutputFiles() { + return expectedOutputFiles; + } + + /** Accumulated size for the input files. */ + public long sizeInBytes() { Review Comment: `inputFilesSizeInBytes`? ########## core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java: ########## @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.List; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; + +/** + * Container class representing a set of files to be rewritten by a {@link FileRewriteExecutor}. 
+ * + * @param <I> the Java type of the plan info member like {@link RewriteDataFiles.FileGroupInfo} or + * {@link RewritePositionDeleteFiles.FileGroupInfo} + * @param <T> the Java type of the tasks to read the files which are rewritten + * @param <F> the Java type of the content files which are rewritten + */ +public abstract class FileRewriteGroup<I, T extends ContentScanTask<F>, F extends ContentFile<F>> { + private final I info; + private final List<T> fileScanTasks; + private final long writeMaxFileSize; + private final long splitSize; + private final int expectedOutputFiles; + + FileRewriteGroup( + I info, + List<T> fileScanTasks, + long writeMaxFileSize, + long splitSize, + int expectedOutputFiles) { + this.info = info; + this.fileScanTasks = fileScanTasks; + this.writeMaxFileSize = writeMaxFileSize; + this.splitSize = splitSize; + this.expectedOutputFiles = expectedOutputFiles; + } + + /** Identifiers and partition information about the group. */ + public I info() { + return info; + } + + /** Input of the group. {@link ContentScanTask}s to read. */ + public List<T> fileScans() { Review Comment: nit: I know the method name is copied from existing code. Since this is a new class, we have an opportunity to name it more accurately as `scanTasks` or `fileScanTasks`. ########## core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java: ########## @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.List; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; + +/** + * Container class representing a set of files to be rewritten by a {@link FileRewriteExecutor}. + * + * @param <I> the Java type of the plan info member like {@link RewriteDataFiles.FileGroupInfo} or + * {@link RewritePositionDeleteFiles.FileGroupInfo} + * @param <T> the Java type of the tasks to read the files which are rewritten + * @param <F> the Java type of the content files which are rewritten + */ +public abstract class FileRewriteGroup<I, T extends ContentScanTask<F>, F extends ContentFile<F>> { + private final I info; + private final List<T> fileScanTasks; + private final long writeMaxFileSize; + private final long splitSize; + private final int expectedOutputFiles; + + FileRewriteGroup( + I info, + List<T> fileScanTasks, + long writeMaxFileSize, + long splitSize, + int expectedOutputFiles) { + this.info = info; + this.fileScanTasks = fileScanTasks; + this.writeMaxFileSize = writeMaxFileSize; + this.splitSize = splitSize; + this.expectedOutputFiles = expectedOutputFiles; + } + + /** Identifiers and partition information about the group. */ + public I info() { + return info; + } + + /** Input of the group. {@link ContentScanTask}s to read. 
*/ + public List<T> fileScans() { + return fileScanTasks; + } + + /** + * While we create tasks that should all be smaller than our target size, there is a chance that + * the actual data will end up being larger than our target size due to various factors of + * compression, serialization, which are outside our control. If this occurs, instead of making a + * single file that is close in size to our target, we would end up producing one file of the + * target size, and then a small extra file with the remaining data. + * + * <p>For example, if our target is 512 MB, we may generate a rewrite task that should be 500 MB. + * When we write the data we may find we actually have to write out 530 MB. If we use the target + * size while writing, we would produce a 512 MB file and an 18 MB file. If instead we use a + * larger size estimated by this method, then we end up writing a single file. + * + * @return the target size plus one half of the distance between max and target + */ + public long writeMaxFileSize() { + return writeMaxFileSize; + } + + /** + * Determines the reader split size as the input size divided by the desired number of output + * files. The final split size is adjusted to be at least as big as the target file size but less + * than the max write file size. + */ + public long splitSize() { Review Comment: inputSplitSize? ########## core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java: ########## @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.List; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; + +/** + * Container class representing a set of files to be rewritten by a {@link FileRewriteExecutor}. + * + * @param <I> the Java type of the plan info member like {@link RewriteDataFiles.FileGroupInfo} or Review Comment: for consistency with the class above, should this generic type also be called `FGI`? ########## core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java: ########## @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.actions; + +import java.util.List; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; + +/** + * Container class representing a set of files to be rewritten by a {@link FileRewriteExecutor}. + * + * @param <I> the Java type of the plan info member like {@link RewriteDataFiles.FileGroupInfo} or + * {@link RewritePositionDeleteFiles.FileGroupInfo} + * @param <T> the Java type of the tasks to read the files which are rewritten + * @param <F> the Java type of the content files which are rewritten + */ +public abstract class FileRewriteGroup<I, T extends ContentScanTask<F>, F extends ContentFile<F>> { + private final I info; + private final List<T> fileScanTasks; + private final long writeMaxFileSize; + private final long splitSize; + private final int expectedOutputFiles; + + FileRewriteGroup( + I info, + List<T> fileScanTasks, + long writeMaxFileSize, + long splitSize, + int expectedOutputFiles) { + this.info = info; + this.fileScanTasks = fileScanTasks; + this.writeMaxFileSize = writeMaxFileSize; + this.splitSize = splitSize; + this.expectedOutputFiles = expectedOutputFiles; + } + + /** Identifiers and partition information about the group. */ + public I info() { Review Comment: info -> fileGroupInfo ########## core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java: ########## @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.List; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; + +/** + * Container class representing a set of files to be rewritten by a {@link FileRewriteExecutor}. + * + * @param <I> the Java type of the plan info member like {@link RewriteDataFiles.FileGroupInfo} or + * {@link RewritePositionDeleteFiles.FileGroupInfo} + * @param <T> the Java type of the tasks to read the files which are rewritten + * @param <F> the Java type of the content files which are rewritten + */ +public abstract class FileRewriteGroup<I, T extends ContentScanTask<F>, F extends ContentFile<F>> { + private final I info; + private final List<T> fileScanTasks; + private final long writeMaxFileSize; + private final long splitSize; + private final int expectedOutputFiles; + + FileRewriteGroup( + I info, + List<T> fileScanTasks, + long writeMaxFileSize, + long splitSize, + int expectedOutputFiles) { + this.info = info; + this.fileScanTasks = fileScanTasks; + this.writeMaxFileSize = writeMaxFileSize; + this.splitSize = splitSize; + this.expectedOutputFiles = expectedOutputFiles; + } + + /** Identifiers and partition information about the group. */ + public I info() { + return info; + } + + /** Input of the group. {@link ContentScanTask}s to read. 
*/ + public List<T> fileScans() { + return fileScanTasks; + } + + /** + * While we create tasks that should all be smaller than our target size, there is a chance that + * the actual data will end up being larger than our target size due to various factors of + * compression, serialization, which are outside our control. If this occurs, instead of making a + * single file that is close in size to our target, we would end up producing one file of the + * target size, and then a small extra file with the remaining data. + * + * <p>For example, if our target is 512 MB, we may generate a rewrite task that should be 500 MB. + * When we write the data we may find we actually have to write out 530 MB. If we use the target + * size while writing, we would produce a 512 MB file and an 18 MB file. If instead we use a + * larger size estimated by this method, then we end up writing a single file. + * + * @return the target size plus one half of the distance between max and target + */ + public long writeMaxFileSize() { Review Comment: maxWriteFileSize or maxOutputFileSize? ########## core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java: ########## @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.List; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; + +/** + * Container class representing a set of files to be rewritten by a {@link FileRewriteExecutor}. + * + * @param <I> the Java type of the plan info member like {@link RewriteDataFiles.FileGroupInfo} or + * {@link RewritePositionDeleteFiles.FileGroupInfo} + * @param <T> the Java type of the tasks to read the files which are rewritten + * @param <F> the Java type of the content files which are rewritten + */ +public abstract class FileRewriteGroup<I, T extends ContentScanTask<F>, F extends ContentFile<F>> { + private final I info; + private final List<T> fileScanTasks; + private final long writeMaxFileSize; + private final long splitSize; + private final int expectedOutputFiles; + + FileRewriteGroup( + I info, + List<T> fileScanTasks, + long writeMaxFileSize, + long splitSize, + int expectedOutputFiles) { + this.info = info; + this.fileScanTasks = fileScanTasks; + this.writeMaxFileSize = writeMaxFileSize; + this.splitSize = splitSize; + this.expectedOutputFiles = expectedOutputFiles; + } + + /** Identifiers and partition information about the group. */ + public I info() { + return info; + } + + /** Input of the group. {@link ContentScanTask}s to read. */ Review Comment: nit: maybe just `scan tasks for input files`? ########## core/src/main/java/org/apache/iceberg/actions/FileRewritePlanner.java: ########## @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.Map; +import java.util.Set; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; + +/** + * A class for planning content file rewrites. + * + * <p>The entire rewrite operation is broken down into pieces. The grouping is based on partitioning + * and the planning could create multiple groups within a partition. As a result {@link + * FileRewritePlan} is generated which contains the data need by the {@link FileRewriteExecutor}s + * which execute the actual file rewrite. + * + * <p>The lifecycle of the planner is: + * + * <ul> + * <li>{@link #init(Map)} initializes the planner with the configuration parameters + * <li>{@link #plan()} generates the plan for the given configuration + * </ul> + * + * @param <FGI> the Java type of the plan info like {@link RewriteDataFiles.FileGroupInfo} or {@link + * RewritePositionDeleteFiles.FileGroupInfo} + * @param <T> the Java type of the tasks to read the files which are rewritten + * @param <F> the Java type of the content files which are rewritten + * @param <G> the Java type of the planned groups + */ +public interface FileRewritePlanner< + FGI, + T extends ContentScanTask<F>, + F extends ContentFile<F>, + G extends FileRewriteGroup<FGI, T, F>> { + + /** Returns a description for this rewriter. */ + default String description() { + return getClass().getName(); + } + + /** + * Returns a set of supported options for this rewriter. 
Only options specified in this list will Review Comment: rewriter -> planner ########## core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java: ########## @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.actions; + +import java.util.Map; +import org.apache.iceberg.ContentFile; +import org.apache.iceberg.ContentScanTask; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.io.CloseableIterable; + +/** + * Result of the file rewrite planning as generated by the {@link FileRewritePlanner#plan()}. + * + * <p>The plan contains an iterable for the planned groups and statistics about the number of the + * generated groups, like the total number of the groups and the groups per partition. The plan also + * contains some calculated values required by the {@link FileRewriteExecutor}s where the values are + * based on the input data and the planning parameters. + * + * <p>Groups in a plan could be processed independently. For example, in Spark this means that each + * group would be rewritten in its own Spark job. 
+ * + * @param <I> the Java type of the plan info like {@link RewriteDataFiles.FileGroupInfo} or {@link + * RewritePositionDeleteFiles.FileGroupInfo} + * @param <T> the Java type of the tasks to read the files which are rewritten + * @param <F> the Java type of the content files which are rewritten + * @param <G> the Java type of the planned groups + */ +public class FileRewritePlan< + I, Review Comment: `FGI` for consistency? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org