pvary commented on code in PR #15996:
URL: https://github.com/apache/iceberg/pull/15996#discussion_r3209390542


##########
flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/EqualityConvertPlanner.java:
##########
@@ -0,0 +1,564 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.flink.maintenance.operator;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import org.apache.flink.annotation.Internal;
+import org.apache.flink.api.common.state.ListState;
+import org.apache.flink.api.common.state.ListStateDescriptor;
+import org.apache.flink.api.common.typeinfo.Types;
+import org.apache.flink.metrics.Counter;
+import org.apache.flink.metrics.MetricGroup;
+import org.apache.flink.runtime.state.StateInitializationContext;
+import org.apache.flink.runtime.state.StateSnapshotContext;
+import org.apache.flink.streaming.api.operators.AbstractStreamOperator;
+import org.apache.flink.streaming.api.operators.OneInputStreamOperator;
+import org.apache.flink.streaming.api.watermark.Watermark;
+import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
+import org.apache.flink.util.OutputTag;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.DeleteFile;
+import org.apache.iceberg.FileContent;
+import org.apache.iceberg.ManifestFile;
+import org.apache.iceberg.ManifestFiles;
+import org.apache.iceberg.ManifestReader;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.SnapshotChanges;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.flink.TableLoader;
+import org.apache.iceberg.flink.maintenance.api.Trigger;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
+import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.util.ContentFileUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Planner for the equality delete conversion pipeline. For each trigger, it 
picks the oldest
+ * staging snapshot that hasn't been converted yet and emits {@link 
ReadCommand}s describing the
+ * files its downstream readers and workers must process.
+ *
+ * <p>Emissions are grouped into phases separated by watermarks so the 
downstream operators observe
+ * them in a consistent order:
+ *
+ * <ol>
+ *   <li>Main data — read once per new equality-field-set to seed the worker's 
key index.
+ *   <li>Equality deletes from the staging snapshot — resolve against the 
index.
+ *   <li>Positional deletes from the staging snapshot — bypass the worker, go 
straight to DVs.
+ *   <li>New staging data — added to the index for the next cycle.
+ * </ol>
+ *
+ * The planner also detects external commits on main (not produced by the 
converter itself) and
+ * re-emits main data so the worker can rebuild its index before resolving the 
current cycle's
+ * equality deletes.
+ *
+ * <p>An {@link EqualityConvertPlanResult} with the current cycle's metadata 
is emitted via the
+ * {@link #METADATA_STREAM} side output after the read commands.
+ */
+@Internal
+public class EqualityConvertPlanner extends AbstractStreamOperator<ReadCommand>
+    implements OneInputStreamOperator<Trigger, ReadCommand> {
+
+  private static final Logger LOG = 
LoggerFactory.getLogger(EqualityConvertPlanner.class);
+
+  public static final OutputTag<EqualityConvertPlanResult> METADATA_STREAM =
+      new OutputTag<>("metadata-stream") {};
+
+  private static final String PROCESSED_EQ_DELETE_FILE_NUM_METRIC = 
"processedEqDeleteFileNum";
+  private static final String PROCESSED_STAGING_SNAPSHOT_NUM_METRIC = 
"processedStagingSnapshotNum";
+  private static final String SKIPPED_NO_OP_CYCLES_METRIC = 
"skippedNoOpCycles";
+  private static final String REINDEX_COUNT_METRIC = "reindexCount";
+
+  private final String tableName;
+  private final String taskName;
+  private final int taskIndex;
+  private final TableLoader tableLoader;
+  private final String stagingBranch;
+  private final String targetBranch;
+
+  // Main snapshot id the worker's index reflects.
+  private transient ListState<Long> indexSnapshotState;
+  // Equality field sets for which main data has been emitted.
+  private transient ListState<List<Integer>> mainIndexEmittedState;
+
+  private transient Table table;
+
+  private transient Long lastMainSnapshotId;
+  private transient Long lastStagingSnapshotId;
+  private transient Long indexSnapshotId;
+  private transient Set<List<Integer>> mainIndexEmittedSet;

Review Comment:
   Why is this a set?
   Can we have multiple versions active? Shouldn't we just keep the last one?
   
   Basically we are saying that the "Key" could be dynamically changed. How do 
we handle these changes?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to