stevenzwu commented on code in PR #12306:
URL: https://github.com/apache/iceberg/pull/12306#discussion_r1960273582


##########
core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java:
##########
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.actions;
+
+import java.util.Map;
+import java.util.Set;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ContentScanTask;
+
+/**
+ * A class for rewriting content file groups ({@link FileRewriteGroup}). The 
lifecycle for the
+ * executor looks like the following:
+ *
+ * <ul>
+ *   <li>{@link #init(Map)} initializes the executor with the configuration 
parameters
+ *   <li>{@link #rewrite(FileRewriteGroup)} called for every group in the plan 
to do the actual
+ *       rewrite of the files, and returns the generated new files.
+ * </ul>
+ *
+ * A single executor could be used to rewrite multiple groups for the same 
plan.
+ *
+ * @param <FGI> the Java type of the plan info like {@link 
RewriteDataFiles.FileGroupInfo} or {@link

Review Comment:
   minor suggestions on the Javadoc
   
   ```
    * @param <FGI> the Java type of the rewrite file group info like {@link 
RewriteDataFiles.FileGroupInfo} or {@link 
RewritePositionDeleteFiles.FileGroupInfo}
    * @param <T> the Java type of the input scan tasks (input)
    * @param <F> the Java type of the content files (input and output)
    * @param <G> the Java type of the rewrite file group like {@link ...} 
   ```



##########
core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java:
##########
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.actions;
+
+import java.util.List;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ContentScanTask;
+
+/**
+ * Container class representing a set of files to be rewritten by a {@link 
FileRewriteExecutor}.
+ *
+ * @param <I> the Java type of the plan info member like {@link 
RewriteDataFiles.FileGroupInfo} or
+ *     {@link RewritePositionDeleteFiles.FileGroupInfo}
+ * @param <T> the Java type of the tasks to read the files which are rewritten
+ * @param <F> the Java type of the content files which are rewritten
+ */
+public abstract class FileRewriteGroup<I, T extends ContentScanTask<F>, F 
extends ContentFile<F>> {
+  private final I info;
+  private final List<T> fileScanTasks;
+  private final long writeMaxFileSize;
+  private final long splitSize;
+  private final int expectedOutputFiles;
+
+  FileRewriteGroup(
+      I info,
+      List<T> fileScanTasks,
+      long writeMaxFileSize,
+      long splitSize,
+      int expectedOutputFiles) {
+    this.info = info;
+    this.fileScanTasks = fileScanTasks;
+    this.writeMaxFileSize = writeMaxFileSize;
+    this.splitSize = splitSize;
+    this.expectedOutputFiles = expectedOutputFiles;
+  }
+
+  /** Identifiers and partition information about the group. */
+  public I info() {
+    return info;
+  }
+
+  /** Input of the group. {@link ContentScanTask}s to read. */
+  public List<T> fileScans() {
+    return fileScanTasks;
+  }
+
+  /**
+   * While we create tasks that should all be smaller than our target size, 
there is a chance that
+   * the actual data will end up being larger than our target size due to 
various factors of
+   * compression, serialization, which are outside our control. If this 
occurs, instead of making a
+   * single file that is close in size to our target, we would end up 
producing one file of the
+   * target size, and then a small extra file with the remaining data.
+   *
+   * <p>For example, if our target is 512 MB, we may generate a rewrite task 
that should be 500 MB.
+   * When we write the data we may find we actually have to write out 530 MB. 
If we use the target
+   * size while writing, we would produce a 512 MB file and an 18 MB file. If 
instead we use a
+   * larger size estimated by this method, then we end up writing a single 
file.
+   *
+   * @return the target size plus one half of the distance between max and 
target
+   */
+  public long writeMaxFileSize() {
+    return writeMaxFileSize;
+  }
+
+  /**
+   * Determines the reader split size as the input size divided by the desired 
number of output
+   * files. The final split size is adjusted to be at least as big as the 
target file size but less
+   * than the max write file size.
+   */
+  public long splitSize() {
+    return splitSize;
+  }
+
+  /** Expected number of the output files. */
+  public int expectedOutputFiles() {
+    return expectedOutputFiles;
+  }
+
+  /** Accumulated size for the input files. */
+  public long sizeInBytes() {
+    return fileScanTasks.stream().mapToLong(T::length).sum();
+  }
+
+  /** Number of the input files. */
+  public int numFiles() {

Review Comment:
   nit: rename to `numInputFiles`, since this returns the number of input files.



##########
core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java:
##########
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.actions;
+
+import java.util.Map;
+import java.util.Set;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ContentScanTask;
+
+/**
+ * A class for rewriting content file groups ({@link FileRewriteGroup}). The 
lifecycle for the
+ * executor looks like the following:
+ *
+ * <ul>
+ *   <li>{@link #init(Map)} initializes the executor with the configuration 
parameters
+ *   <li>{@link #rewrite(FileRewriteGroup)} called for every group in the plan 
to do the actual
+ *       rewrite of the files, and returns the generated new files.
+ * </ul>
+ *
+ * A single executor could be used to rewrite multiple groups for the same 
plan.
+ *
+ * @param <FGI> the Java type of the plan info like {@link 
RewriteDataFiles.FileGroupInfo} or {@link
+ *     RewritePositionDeleteFiles.FileGroupInfo}
+ * @param <T> the Java type of the tasks to read the files which are rewritten
+ * @param <F> the Java type of the content files which are rewritten
+ * @param <G> the Java type of the planned groups
+ */
+public interface FileRewriteExecutor<
+    FGI,
+    T extends ContentScanTask<F>,
+    F extends ContentFile<F>,
+    G extends FileRewriteGroup<FGI, T, F>> {
+
+  /** Returns a description for this rewriter. */
+  default String description() {
+    return getClass().getName();
+  }
+
+  /**
+   * Returns a set of supported options for this rewriter. Only options 
specified in this list will
+   * be accepted at runtime. Any other options will be rejected.
+   */
+  Set<String> validOptions();
+
+  /**
+   * Initializes this rewriter using provided options.

Review Comment:
   rewriter -> runner



##########
core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java:
##########
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.actions;
+
+import java.util.Map;
+import java.util.Set;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ContentScanTask;
+
+/**
+ * A class for rewriting content file groups ({@link FileRewriteGroup}). The 
lifecycle for the
+ * executor looks like the following:
+ *
+ * <ul>
+ *   <li>{@link #init(Map)} initializes the executor with the configuration 
parameters
+ *   <li>{@link #rewrite(FileRewriteGroup)} called for every group in the plan 
to do the actual
+ *       rewrite of the files, and returns the generated new files.
+ * </ul>
+ *
+ * A single executor could be used to rewrite multiple groups for the same 
plan.
+ *
+ * @param <FGI> the Java type of the plan info like {@link 
RewriteDataFiles.FileGroupInfo} or {@link
+ *     RewritePositionDeleteFiles.FileGroupInfo}
+ * @param <T> the Java type of the tasks to read the files which are rewritten
+ * @param <F> the Java type of the content files which are rewritten
+ * @param <G> the Java type of the planned groups
+ */
+public interface FileRewriteExecutor<
+    FGI,
+    T extends ContentScanTask<F>,
+    F extends ContentFile<F>,
+    G extends FileRewriteGroup<FGI, T, F>> {
+
+  /** Returns a description for this rewriter. */
+  default String description() {
+    return getClass().getName();
+  }
+
+  /**
+   * Returns a set of supported options for this rewriter. Only options 
specified in this list will

Review Comment:
   nit: rewriter -> runner



##########
core/src/main/java/org/apache/iceberg/actions/FileRewriteExecutor.java:
##########
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.actions;
+
+import java.util.Map;
+import java.util.Set;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ContentScanTask;
+
+/**
+ * A class for rewriting content file groups ({@link FileRewriteGroup}). The 
lifecycle for the
+ * executor looks like the following:
+ *
+ * <ul>
+ *   <li>{@link #init(Map)} initializes the executor with the configuration 
parameters
+ *   <li>{@link #rewrite(FileRewriteGroup)} called for every group in the plan 
to do the actual
+ *       rewrite of the files, and returns the generated new files.
+ * </ul>
+ *
+ * A single executor could be used to rewrite multiple groups for the same 
plan.
+ *
+ * @param <FGI> the Java type of the plan info like {@link 
RewriteDataFiles.FileGroupInfo} or {@link
+ *     RewritePositionDeleteFiles.FileGroupInfo}
+ * @param <T> the Java type of the tasks to read the files which are rewritten
+ * @param <F> the Java type of the content files which are rewritten
+ * @param <G> the Java type of the planned groups
+ */
+public interface FileRewriteExecutor<
+    FGI,

Review Comment:
   Right now, it seems that the {@link RewriteDataFiles.FileGroupInfo} and {@link 
RewritePositionDeleteFiles.FileGroupInfo} interfaces are identical. I am wondering 
if we should start without the `FGI` generic type parameter. Dropping it could 
limit future flexibility, though.



##########
core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java:
##########
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.actions;
+
+import java.util.List;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ContentScanTask;
+
+/**
+ * Container class representing a set of files to be rewritten by a {@link 
FileRewriteExecutor}.
+ *
+ * @param <I> the Java type of the plan info member like {@link 
RewriteDataFiles.FileGroupInfo} or
+ *     {@link RewritePositionDeleteFiles.FileGroupInfo}
+ * @param <T> the Java type of the tasks to read the files which are rewritten
+ * @param <F> the Java type of the content files which are rewritten
+ */
+public abstract class FileRewriteGroup<I, T extends ContentScanTask<F>, F 
extends ContentFile<F>> {
+  private final I info;
+  private final List<T> fileScanTasks;
+  private final long writeMaxFileSize;
+  private final long splitSize;
+  private final int expectedOutputFiles;
+
+  FileRewriteGroup(
+      I info,
+      List<T> fileScanTasks,
+      long writeMaxFileSize,
+      long splitSize,
+      int expectedOutputFiles) {
+    this.info = info;
+    this.fileScanTasks = fileScanTasks;
+    this.writeMaxFileSize = writeMaxFileSize;
+    this.splitSize = splitSize;
+    this.expectedOutputFiles = expectedOutputFiles;
+  }
+
+  /** Identifiers and partition information about the group. */
+  public I info() {
+    return info;
+  }
+
+  /** Input of the group. {@link ContentScanTask}s to read. */
+  public List<T> fileScans() {
+    return fileScanTasks;
+  }
+
+  /**
+   * While we create tasks that should all be smaller than our target size, 
there is a chance that
+   * the actual data will end up being larger than our target size due to 
various factors of
+   * compression, serialization, which are outside our control. If this 
occurs, instead of making a
+   * single file that is close in size to our target, we would end up 
producing one file of the
+   * target size, and then a small extra file with the remaining data.
+   *
+   * <p>For example, if our target is 512 MB, we may generate a rewrite task 
that should be 500 MB.
+   * When we write the data we may find we actually have to write out 530 MB. 
If we use the target
+   * size while writing, we would produce a 512 MB file and an 18 MB file. If 
instead we use a
+   * larger size estimated by this method, then we end up writing a single 
file.
+   *
+   * @return the target size plus one half of the distance between max and 
target
+   */
+  public long writeMaxFileSize() {
+    return writeMaxFileSize;
+  }
+
+  /**
+   * Determines the reader split size as the input size divided by the desired 
number of output
+   * files. The final split size is adjusted to be at least as big as the 
target file size but less
+   * than the max write file size.
+   */
+  public long splitSize() {
+    return splitSize;
+  }
+
+  /** Expected number of the output files. */
+  public int expectedOutputFiles() {
+    return expectedOutputFiles;
+  }
+
+  /** Accumulated size for the input files. */
+  public long sizeInBytes() {

Review Comment:
   `inputFilesSizeInBytes`?



##########
core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java:
##########
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.actions;
+
+import java.util.List;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ContentScanTask;
+
+/**
+ * Container class representing a set of files to be rewritten by a {@link 
FileRewriteExecutor}.
+ *
+ * @param <I> the Java type of the plan info member like {@link 
RewriteDataFiles.FileGroupInfo} or
+ *     {@link RewritePositionDeleteFiles.FileGroupInfo}
+ * @param <T> the Java type of the tasks to read the files which are rewritten
+ * @param <F> the Java type of the content files which are rewritten
+ */
+public abstract class FileRewriteGroup<I, T extends ContentScanTask<F>, F 
extends ContentFile<F>> {
+  private final I info;
+  private final List<T> fileScanTasks;
+  private final long writeMaxFileSize;
+  private final long splitSize;
+  private final int expectedOutputFiles;
+
+  FileRewriteGroup(
+      I info,
+      List<T> fileScanTasks,
+      long writeMaxFileSize,
+      long splitSize,
+      int expectedOutputFiles) {
+    this.info = info;
+    this.fileScanTasks = fileScanTasks;
+    this.writeMaxFileSize = writeMaxFileSize;
+    this.splitSize = splitSize;
+    this.expectedOutputFiles = expectedOutputFiles;
+  }
+
+  /** Identifiers and partition information about the group. */
+  public I info() {
+    return info;
+  }
+
+  /** Input of the group. {@link ContentScanTask}s to read. */
+  public List<T> fileScans() {

Review Comment:
   nit: I know the method name is copied from existing code. Since this is a 
new class, we have an opportunity to name it more accurately as `scanTasks` or 
`fileScanTasks`.



##########
core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java:
##########
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.actions;
+
+import java.util.List;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ContentScanTask;
+
+/**
+ * Container class representing a set of files to be rewritten by a {@link 
FileRewriteExecutor}.
+ *
+ * @param <I> the Java type of the plan info member like {@link 
RewriteDataFiles.FileGroupInfo} or
+ *     {@link RewritePositionDeleteFiles.FileGroupInfo}
+ * @param <T> the Java type of the tasks to read the files which are rewritten
+ * @param <F> the Java type of the content files which are rewritten
+ */
+public abstract class FileRewriteGroup<I, T extends ContentScanTask<F>, F 
extends ContentFile<F>> {
+  private final I info;
+  private final List<T> fileScanTasks;
+  private final long writeMaxFileSize;
+  private final long splitSize;
+  private final int expectedOutputFiles;
+
+  FileRewriteGroup(
+      I info,
+      List<T> fileScanTasks,
+      long writeMaxFileSize,
+      long splitSize,
+      int expectedOutputFiles) {
+    this.info = info;
+    this.fileScanTasks = fileScanTasks;
+    this.writeMaxFileSize = writeMaxFileSize;
+    this.splitSize = splitSize;
+    this.expectedOutputFiles = expectedOutputFiles;
+  }
+
+  /** Identifiers and partition information about the group. */
+  public I info() {
+    return info;
+  }
+
+  /** Input of the group. {@link ContentScanTask}s to read. */
+  public List<T> fileScans() {
+    return fileScanTasks;
+  }
+
+  /**
+   * While we create tasks that should all be smaller than our target size, 
there is a chance that
+   * the actual data will end up being larger than our target size due to 
various factors of
+   * compression, serialization, which are outside our control. If this 
occurs, instead of making a
+   * single file that is close in size to our target, we would end up 
producing one file of the
+   * target size, and then a small extra file with the remaining data.
+   *
+   * <p>For example, if our target is 512 MB, we may generate a rewrite task 
that should be 500 MB.
+   * When we write the data we may find we actually have to write out 530 MB. 
If we use the target
+   * size while writing, we would produce a 512 MB file and an 18 MB file. If 
instead we use a
+   * larger size estimated by this method, then we end up writing a single 
file.
+   *
+   * @return the target size plus one half of the distance between max and 
target
+   */
+  public long writeMaxFileSize() {
+    return writeMaxFileSize;
+  }
+
+  /**
+   * Determines the reader split size as the input size divided by the desired 
number of output
+   * files. The final split size is adjusted to be at least as big as the 
target file size but less
+   * than the max write file size.
+   */
+  public long splitSize() {

Review Comment:
   inputSplitSize?



##########
core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java:
##########
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.actions;
+
+import java.util.List;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ContentScanTask;
+
+/**
+ * Container class representing a set of files to be rewritten by a {@link 
FileRewriteExecutor}.
+ *
+ * @param <I> the Java type of the plan info member like {@link 
RewriteDataFiles.FileGroupInfo} or

Review Comment:
   For consistency with `FileRewriteExecutor`, should this generic type 
parameter also be called `FGI`?



##########
core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java:
##########
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.actions;
+
+import java.util.List;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ContentScanTask;
+
+/**
+ * Container class representing a set of files to be rewritten by a {@link 
FileRewriteExecutor}.
+ *
+ * @param <I> the Java type of the plan info member like {@link 
RewriteDataFiles.FileGroupInfo} or
+ *     {@link RewritePositionDeleteFiles.FileGroupInfo}
+ * @param <T> the Java type of the tasks to read the files which are rewritten
+ * @param <F> the Java type of the content files which are rewritten
+ */
+public abstract class FileRewriteGroup<I, T extends ContentScanTask<F>, F 
extends ContentFile<F>> {
+  private final I info;
+  private final List<T> fileScanTasks;
+  private final long writeMaxFileSize;
+  private final long splitSize;
+  private final int expectedOutputFiles;
+
+  FileRewriteGroup(
+      I info,
+      List<T> fileScanTasks,
+      long writeMaxFileSize,
+      long splitSize,
+      int expectedOutputFiles) {
+    this.info = info;
+    this.fileScanTasks = fileScanTasks;
+    this.writeMaxFileSize = writeMaxFileSize;
+    this.splitSize = splitSize;
+    this.expectedOutputFiles = expectedOutputFiles;
+  }
+
+  /** Identifiers and partition information about the group. */
+  public I info() {

Review Comment:
   info -> fileGroupInfo



##########
core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java:
##########
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.actions;
+
+import java.util.List;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ContentScanTask;
+
+/**
+ * Container class representing a set of files to be rewritten by a {@link 
FileRewriteExecutor}.
+ *
+ * @param <I> the Java type of the plan info member like {@link 
RewriteDataFiles.FileGroupInfo} or
+ *     {@link RewritePositionDeleteFiles.FileGroupInfo}
+ * @param <T> the Java type of the tasks to read the files which are rewritten
+ * @param <F> the Java type of the content files which are rewritten
+ */
+public abstract class FileRewriteGroup<I, T extends ContentScanTask<F>, F 
extends ContentFile<F>> {
+  private final I info;
+  private final List<T> fileScanTasks;
+  private final long writeMaxFileSize;
+  private final long splitSize;
+  private final int expectedOutputFiles;
+
+  FileRewriteGroup(
+      I info,
+      List<T> fileScanTasks,
+      long writeMaxFileSize,
+      long splitSize,
+      int expectedOutputFiles) {
+    this.info = info;
+    this.fileScanTasks = fileScanTasks;
+    this.writeMaxFileSize = writeMaxFileSize;
+    this.splitSize = splitSize;
+    this.expectedOutputFiles = expectedOutputFiles;
+  }
+
+  /** Identifiers and partition information about the group. */
+  public I info() {
+    return info;
+  }
+
+  /** Input of the group. {@link ContentScanTask}s to read. */
+  public List<T> fileScans() {
+    return fileScanTasks;
+  }
+
+  /**
+   * While we create tasks that should all be smaller than our target size, 
there is a chance that
+   * the actual data will end up being larger than our target size due to 
various factors of
+   * compression, serialization, which are outside our control. If this 
occurs, instead of making a
+   * single file that is close in size to our target, we would end up 
producing one file of the
+   * target size, and then a small extra file with the remaining data.
+   *
+   * <p>For example, if our target is 512 MB, we may generate a rewrite task 
that should be 500 MB.
+   * When we write the data we may find we actually have to write out 530 MB. 
If we use the target
+   * size while writing, we would produce a 512 MB file and an 18 MB file. If 
instead we use a
+   * larger size estimated by this method, then we end up writing a single 
file.
+   *
+   * @return the target size plus one half of the distance between max and 
target
+   */
+  public long writeMaxFileSize() {

Review Comment:
   maxWriteFileSize or maxOutputFileSize? 
   



##########
core/src/main/java/org/apache/iceberg/actions/FileRewriteGroup.java:
##########
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.actions;
+
+import java.util.List;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ContentScanTask;
+
+/**
+ * Container class representing a set of files to be rewritten by a {@link 
FileRewriteExecutor}.
+ *
+ * @param <I> the Java type of the plan info member like {@link 
RewriteDataFiles.FileGroupInfo} or
+ *     {@link RewritePositionDeleteFiles.FileGroupInfo}
+ * @param <T> the Java type of the tasks to read the files which are rewritten
+ * @param <F> the Java type of the content files which are rewritten
+ */
+public abstract class FileRewriteGroup<I, T extends ContentScanTask<F>, F 
extends ContentFile<F>> {
+  private final I info;
+  private final List<T> fileScanTasks;
+  private final long writeMaxFileSize;
+  private final long splitSize;
+  private final int expectedOutputFiles;
+
+  FileRewriteGroup(
+      I info,
+      List<T> fileScanTasks,
+      long writeMaxFileSize,
+      long splitSize,
+      int expectedOutputFiles) {
+    this.info = info;
+    this.fileScanTasks = fileScanTasks;
+    this.writeMaxFileSize = writeMaxFileSize;
+    this.splitSize = splitSize;
+    this.expectedOutputFiles = expectedOutputFiles;
+  }
+
+  /** Identifiers and partition information about the group. */
+  public I info() {
+    return info;
+  }
+
+  /** Input of the group. {@link ContentScanTask}s to read. */

Review Comment:
   nit: maybe just `scan tasks for input files`?



##########
core/src/main/java/org/apache/iceberg/actions/FileRewritePlanner.java:
##########
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.actions;
+
+import java.util.Map;
+import java.util.Set;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ContentScanTask;
+
+/**
+ * A class for planning content file rewrites.
+ *
+ * <p>The entire rewrite operation is broken down into pieces. The grouping is 
based on partitioning
+ * and the planning could create multiple groups within a partition. As a 
result {@link
+ * FileRewritePlan} is generated which contains the data needed by the {@link 
FileRewriteExecutor}s
+ * which execute the actual file rewrite.
+ *
+ * <p>The lifecycle of the planner is:
+ *
+ * <ul>
+ *   <li>{@link #init(Map)} initializes the planner with the configuration 
parameters
+ *   <li>{@link #plan()} generates the plan for the given configuration
+ * </ul>
+ *
+ * @param <FGI> the Java type of the plan info like {@link 
RewriteDataFiles.FileGroupInfo} or {@link
+ *     RewritePositionDeleteFiles.FileGroupInfo}
+ * @param <T> the Java type of the tasks to read the files which are rewritten
+ * @param <F> the Java type of the content files which are rewritten
+ * @param <G> the Java type of the planned groups
+ */
+public interface FileRewritePlanner<
+    FGI,
+    T extends ContentScanTask<F>,
+    F extends ContentFile<F>,
+    G extends FileRewriteGroup<FGI, T, F>> {
+
+  /** Returns a description for this rewriter. */
+  default String description() {
+    return getClass().getName();
+  }
+
+  /**
+   * Returns a set of supported options for this rewriter. Only options 
specified in this list will

Review Comment:
   rewriter -> planner



##########
core/src/main/java/org/apache/iceberg/actions/FileRewritePlan.java:
##########
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.actions;
+
+import java.util.Map;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.ContentScanTask;
+import org.apache.iceberg.StructLike;
+import org.apache.iceberg.io.CloseableIterable;
+
+/**
+ * Result of the file rewrite planning as generated by the {@link 
FileRewritePlanner#plan()}.
+ *
+ * <p>The plan contains an iterable for the planned groups and statistics 
about the number of the
+ * generated groups, like the total number of the groups and the groups per 
partition. The plan also
+ * contains some calculated values required by the {@link 
FileRewriteExecutor}s where the values are
+ * based on the input data and the planning parameters.
+ *
+ * <p>Groups in a plan could be processed independently. For example, in Spark 
this means that each
+ * group would be rewritten in its own Spark job.
+ *
+ * @param <I> the Java type of the plan info like {@link 
RewriteDataFiles.FileGroupInfo} or {@link
+ *     RewritePositionDeleteFiles.FileGroupInfo}
+ * @param <T> the Java type of the tasks to read the files which are rewritten
+ * @param <F> the Java type of the content files which are rewritten
+ * @param <G> the Java type of the planned groups
+ */
+public class FileRewritePlan<
+    I,

Review Comment:
   `FGI` for consistency?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org


Reply via email to