Re: [PR] A new implementation of an Iceberg Sink [WIP] that will be used with upcoming Flink Compaction jobs [iceberg]

via GitHub Tue, 07 May 2024 00:47:21 -0700


pvary commented on code in PR #10179:
URL: https://github.com/apache/iceberg/pull/10179#discussion_r1591967898



##########
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java:
##########
@@ -0,0 +1,789 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.flink.sink;
+
+import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION;
+import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION_LEVEL;
+import static org.apache.iceberg.TableProperties.ORC_COMPRESSION;
+import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY;
+import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION;
+import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL;
+import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.time.Duration;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
+import java.util.function.Function;
+import org.apache.flink.api.common.functions.MapFunction;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.connector.sink2.Committer;
+import org.apache.flink.api.connector.sink2.CommitterInitContext;
+import org.apache.flink.api.connector.sink2.Sink;
+import org.apache.flink.api.connector.sink2.SinkWriter;
+import org.apache.flink.api.connector.sink2.SupportsCommitter;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.ReadableConfig;
+import org.apache.flink.core.io.SimpleVersionedSerializer;
+import org.apache.flink.streaming.api.connector.sink2.CommittableMessage;
+import org.apache.flink.streaming.api.connector.sink2.CommittableMessageTypeInfo;
+import org.apache.flink.streaming.api.connector.sink2.SupportsPostCommitTopology;
+import org.apache.flink.streaming.api.connector.sink2.SupportsPreCommitTopology;
+import org.apache.flink.streaming.api.connector.sink2.SupportsPreWriteTopology;
+import org.apache.flink.streaming.api.connector.sink2.WithPostCommitTopology;
+import org.apache.flink.streaming.api.connector.sink2.WithPreCommitTopology;
+import org.apache.flink.streaming.api.connector.sink2.WithPreWriteTopology;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.datastream.DataStreamSink;
+import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
+import org.apache.flink.table.api.TableSchema;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.util.DataFormatConverters;
+import org.apache.flink.table.types.DataType;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.types.Row;
+import org.apache.flink.util.Preconditions;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.DistributionMode;
+import org.apache.iceberg.FileFormat;
+import org.apache.iceberg.PartitionField;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.SerializableTable;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.flink.FlinkSchemaUtil;
+import org.apache.iceberg.flink.FlinkWriteConf;
+import org.apache.iceberg.flink.FlinkWriteOptions;
+import org.apache.iceberg.flink.TableLoader;
+import org.apache.iceberg.flink.sink.committer.SinkAggregator;
+import org.apache.iceberg.flink.sink.committer.SinkCommittable;
+import org.apache.iceberg.flink.sink.committer.SinkCommittableSerializer;
+import org.apache.iceberg.flink.sink.committer.SinkCommitter;
+import org.apache.iceberg.flink.sink.writer.IcebergSinkWriter;
+import org.apache.iceberg.flink.sink.writer.IcebergStreamWriterMetrics;
+import org.apache.iceberg.flink.util.FlinkCompatibilityUtil;
+import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
+import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.util.SerializableSupplier;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Flink v2 sink offers different hooks to insert custom topologies into the sink. We will use the
+ * following:
+ *
+ * <ul>
+ *   <li>{@link WithPreWriteTopology} which redistributes the data to the writers based on the
+ *       {@link DistributionMode}
+ *   <li>{@link org.apache.flink.api.connector.sink2.SinkWriter} which writes data files, and
+ *       generates a {@link SinkCommittable} to store the {@link org.apache.iceberg.io.WriteResult}
+ *   <li>{@link WithPreCommitTopology} which we use to place the {@link SinkAggregator} which
+ *       merges the individual {@link org.apache.flink.api.connector.sink2.SinkWriter}'s {@link
+ *       org.apache.iceberg.io.WriteResult}s to a single {@link
+ *       org.apache.iceberg.flink.sink.DeltaManifests}
+ *   <li>{@link Committer} which stores the incoming {@link
+ *       org.apache.iceberg.flink.sink.DeltaManifests}s in state for recovery, and commits them to
+ *       the Iceberg table
+ *   <li>{@link WithPostCommitTopology} we could use for incremental compaction later. This is not
+ *       implemented yet.
+ * </ul>
+ *
+ * The job graph looks like below:
+ *
+ * <pre>{@code
+ *                            Flink sink
+ *               +-----------------------------------------------------------------------------------+
+ *               |                                                                                   |
+ * +-------+     | +----------+                               +-------------+      +---------------+ |
+ * | Map 1 | ==> | | writer 1 |                               | committer 1 | ---> | post commit 1 | |
+ * +-------+     | +----------+                               +-------------+      +---------------+ |
+ *               |             \                             /                \                      |
+ *               |              \                           /                  \                     |
+ *               |               \                         /                    \                    |
+ * +-------+     | +----------+   \ +-------------------+ /   +-------------+    \ +---------------+ |
+ * | Map 2 | ==> | | writer 2 | --->| commit aggregator |     | committer 2 |      | post commit 2 | |
+ * +-------+     | +----------+     +-------------------+     +-------------+      +---------------+ |
+ *               |                                             Commit only on                        |
+ *               |                                             committer 1                           |
+ *               +-----------------------------------------------------------------------------------+
+ * }</pre>
+ */
+public class IcebergSink
+    implements Sink<RowData>,
+        SupportsPreWriteTopology<RowData>,
+        SupportsCommitter<SinkCommittable>,
+        SupportsPreCommitTopology<SinkCommittable, SinkCommittable>,
+        SupportsPostCommitTopology<SinkCommittable> {
+  private Table initTable;
+  private final TableLoader tableLoader;
+  private final Map<String, String> snapshotProperties;
+  private String uidPrefix;
+  private Committer<SinkCommittable> sinkV2Committer;
+  private final String sinkId;
+  private transient IcebergSink.Builder builder;
+  private Map<String, String> writeProperties;
+  private RowType flinkRowType;
+  private SerializableSupplier<Table> tableSupplier;
+  private transient FlinkWriteConf flinkWriteConf;
+  private List<Integer> equalityFieldIds;
+  // FlinkWriteConf properties stored here for serialization
+  private boolean upsertMode;
+  private FileFormat dataFileFormat;
+  private long targetDataFileSize;
+  private String branch;
+  private boolean overwriteMode;
+  private int workerPoolSize;
+  private static final Logger LOG = LoggerFactory.getLogger(IcebergSink.class);
+
+  static RowType toFlinkRowType(Schema schema, TableSchema requestedSchema) {
+    if (requestedSchema != null) {
+      // Convert the flink schema to iceberg schema firstly, then reassign ids to match the existing
+      // iceberg schema.
+      Schema writeSchema = TypeUtil.reassignIds(FlinkSchemaUtil.convert(requestedSchema), schema);
+      TypeUtil.validateWriteSchema(schema, writeSchema, true, true);
+
+      // We use this flink schema to read values from RowData. The flink's TINYINT and SMALLINT will
+      // be promoted to iceberg INTEGER, that means if we use iceberg's table schema to read TINYINT
+      // (backed by 1 'byte'), we will read 4 bytes rather than 1 byte, it will mess up the byte
+      // array in BinaryRowData. So here we must use flink schema.
+      return (RowType) requestedSchema.toRowDataType().getLogicalType();
+    } else {
+      return FlinkSchemaUtil.convert(schema);
+    }
+  }
+
+  @Override
+  public SinkWriter<RowData> createWriter(InitContext context) throws IOException {
+    RowDataTaskWriterFactory taskWriterFactory =
+        new RowDataTaskWriterFactory(
+            tableSupplier,
+            flinkRowType,
+            targetDataFileSize,
+            dataFileFormat,
+            writeProperties,
+            equalityFieldIds,
+            upsertMode);
+    IcebergStreamWriterMetrics metrics =
+        new IcebergStreamWriterMetrics(context.metricGroup(), initTable.name());
+    return new IcebergSinkWriter(
+        tableSupplier,
+        taskWriterFactory,
+        metrics,
+        context.getSubtaskId(),
+        context.getAttemptNumber());
+  }
+
+  @Override
+  public Committer<SinkCommittable> createCommitter(CommitterInitContext context)
+      throws IOException {
+    return sinkV2Committer != null
+        ? sinkV2Committer
+        : new SinkCommitter(
+            tableLoader, branch, snapshotProperties, overwriteMode, workerPoolSize, sinkId);
+  }
+
+  @Override
+  public SimpleVersionedSerializer<SinkCommittable> getCommittableSerializer() {
+    return new SinkCommittableSerializer();
+  }
+
+  @Override
+  public void addPostCommitTopology(DataStream<CommittableMessage<SinkCommittable>> committables) {
+    // TODO Support small file compaction
+  }
+
+  @Override
+  public DataStream<RowData> addPreWriteTopology(DataStream<RowData> inputDataStream) {
+    return distributeDataStream(inputDataStream);
+  }
+
+  @Override
+  public DataStream<CommittableMessage<SinkCommittable>> addPreCommitTopology(
+      DataStream<CommittableMessage<SinkCommittable>> writeResults) {
+    TypeInformation<CommittableMessage<SinkCommittable>> typeInformation =
+        CommittableMessageTypeInfo.of(this::getCommittableSerializer);
+    return writeResults
+        .global()
+        .transform(
+            prefixIfNotNull(uidPrefix, initTable.name() + "-" + sinkId + "-pre-commit-topology"),
+            typeInformation,
+            new SinkAggregator(tableLoader, sinkId))
+        .uid(prefixIfNotNull(uidPrefix, "pre-commit-topology"))
+        .setParallelism(1)
+        .setMaxParallelism(1)
+        .global();
+  }
+
+  @Override
+  public SimpleVersionedSerializer<SinkCommittable> getWriteResultSerializer() {
+    return new SinkCommittableSerializer();
+  }
+
+  public static class Builder {
+    private Function<String, DataStream<RowData>> inputCreator = null;
+    private TableLoader tableLoader;
+    private Table table;
+    private TableSchema tableSchema;
+    private List<String> equalityFieldColumns = null;
+    private String uidPrefix = null;
+    private final Map<String, String> snapshotProperties = Maps.newHashMap();
+    private ReadableConfig readableConfig = new Configuration();
+    private final Map<String, String> writeOptions = Maps.newHashMap();
+
+    private Builder() {}
+
+    private IcebergSink.Builder forRowData(DataStream<RowData> newRowDataInput) {
+      this.inputCreator = ignored -> newRowDataInput;
+      return this;
+    }
+
+    private IcebergSink.Builder forRow(DataStream<Row> input, TableSchema inputTableSchema) {
+      RowType rowType = (RowType) inputTableSchema.toRowDataType().getLogicalType();
+      DataType[] fieldDataTypes = inputTableSchema.getFieldDataTypes();
+
+      DataFormatConverters.RowConverter rowConverter =
+          new DataFormatConverters.RowConverter(fieldDataTypes);
+      return this.forMapperOutputType(
+              input, rowConverter::toInternal, FlinkCompatibilityUtil.toTypeInfo(rowType))
+          .tableSchema(inputTableSchema);
+    }
+
+    private <T> IcebergSink.Builder forMapperOutputType(
+        DataStream<T> input, MapFunction<T, RowData> mapper, TypeInformation<RowData> outputType) {
+      this.inputCreator =
+          newUidPrefix -> {
+            // Input stream order is crucial for some situations (e.g. in cdc case). Therefore, we
+            // need to set the parallelism
+            // of map operator same as its input to keep map operator chaining its input, and avoid
+            // rebalanced by default.
+            SingleOutputStreamOperator<RowData> inputStream =
+                input.map(mapper, outputType).setParallelism(input.getParallelism());
+            if (newUidPrefix != null) {
+              inputStream.name(operatorName(newUidPrefix)).uid(newUidPrefix + "-mapper");
+            }
+            return inputStream;
+          };
+      return this;
+    }
+
+    /**
+     * This iceberg {@link Table} instance is used for initializing {@link IcebergStreamWriter}
+     * which will write all the records into {@link DataFile}s and emit them to downstream operator.
+     * Providing a table would avoid so many table loading from each separate task.
+     *
+     * @param newTable the loaded iceberg table instance.
+     * @return {@link IcebergSink.Builder} to connect the iceberg table.
+     */
+    public IcebergSink.Builder table(Table newTable) {
+      this.table = newTable;
+      return this;
+    }
+
+    /**
+     * The table loader is used for loading tables in {@link IcebergFilesCommitter} lazily, we need

Review Comment:
   This is not true - we use a different committer.
   Please check the comments



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] A new implementation of an Iceberg Sink [WIP] that will be used with upcoming Flink Compaction jobs [iceberg]

Reply via email to