stevenzwu commented on code in PR #12493:
URL: https://github.com/apache/iceberg/pull/12493#discussion_r2015095511


##########
core/src/main/java/org/apache/iceberg/actions/BinPackRewriteFileGroupPlanner.java:
##########
@@ -0,0 +1,310 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.actions;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import org.apache.iceberg.ContentFile;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.FileScanTask;
+import org.apache.iceberg.RewriteJobOrder;
+import org.apache.iceberg.StructLike;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.TableScan;
+import org.apache.iceberg.actions.RewriteDataFiles.FileGroupInfo;
+import org.apache.iceberg.data.GenericRecord;
+import org.apache.iceberg.expressions.Expression;
+import org.apache.iceberg.expressions.Expressions;
+import org.apache.iceberg.io.CloseableIterable;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
+import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.types.Types;
+import org.apache.iceberg.util.ContentFileUtil;
+import org.apache.iceberg.util.PropertyUtil;
+import org.apache.iceberg.util.StructLikeMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Groups specified data files in the {@link Table} into {@link RewriteFileGroup}s. The files are
+ * grouped by partitions based on their size using fixed-size bins. Extends {@link
+ * SizeBasedFileRewritePlanner} with delete file number and delete ratio thresholds and job {@link
+ * RewriteDataFiles#REWRITE_JOB_ORDER} handling.
+ */
+public class BinPackRewriteFileGroupPlanner
+    extends SizeBasedFileRewritePlanner<FileGroupInfo, FileScanTask, DataFile, 
RewriteFileGroup> {
+  /**
+   * The minimum number of deletes that needs to be associated with a data 
file for it to be
+   * considered for rewriting. If a data file has this number of deletes or 
more, it will be
+   * rewritten regardless of its file size determined by {@link 
#MIN_FILE_SIZE_BYTES} and {@link
+   * #MAX_FILE_SIZE_BYTES}. If a file group contains a file that satisfies 
this condition, the file
+   * group will be rewritten regardless of the number of files in the file 
group determined by
+   * {@link #MIN_INPUT_FILES}.
+   *
+   * <p>Defaults to Integer.MAX_VALUE, which means this feature is not enabled 
by default.
+   */
+  public static final String DELETE_FILE_THRESHOLD = "delete-file-threshold";
+
+  public static final int DELETE_FILE_THRESHOLD_DEFAULT = Integer.MAX_VALUE;
+
+  /**
+   * The minimum deletion ratio that needs to be associated with a data file 
for it to be considered
+   * for rewriting. If the deletion ratio of a data file is greater than or 
equal to this value, it
+   * will be rewritten regardless of its file size determined by {@link 
#MIN_FILE_SIZE_BYTES} and
+   * {@link #MAX_FILE_SIZE_BYTES}. If a file group contains a file that 
satisfies this condition,
+   * the file group will be rewritten regardless of the number of files in the 
file group determined
+   * by {@link #MIN_INPUT_FILES}.
+   *
+   * <p>Defaults to 0.3, which means that if the deletion ratio of a file 
reaches or exceeds 30%, it
+   * may trigger the rewriting operation.
+   */
+  public static final String DELETE_RATIO_THRESHOLD = "delete-ratio-threshold";
+
+  public static final double DELETE_RATIO_THRESHOLD_DEFAULT = 0.3;
+
+  private static final Logger LOG = 
LoggerFactory.getLogger(BinPackRewriteFileGroupPlanner.class);
+
+  private final Expression filter;
+  private final Long snapshotId;
+  private final boolean caseSensitive;
+
+  private int deleteFileThreshold;
+  private double deleteRatioThreshold;
+  private RewriteJobOrder rewriteJobOrder;
+
+  public BinPackRewriteFileGroupPlanner(Table table) {
+    this(table, Expressions.alwaysTrue());
+  }
+
+  public BinPackRewriteFileGroupPlanner(Table table, Expression filter) {
+    this(
+        table,
+        filter,
+        table.currentSnapshot() != null ? table.currentSnapshot().snapshotId() 
: null,
+        false);
+  }
+
+  /**
+   * Creates the planner for the given table.
+   *
+   * @param table to plan for
+   * @param filter used to remove files from the plan
+   * @param snapshotId used as a basis for planning - should be used as 
starting snapshot id at

Review Comment:
   Nit on the Javadoc: consider rewording the `snapshotId` parameter description as
   ```
   @param snapshotId a snapshot ID used for planning and as the starting snapshot ID for commit validation when replacing the files
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to