stevenzwu commented on code in PR #10179:
URL: https://github.com/apache/iceberg/pull/10179#discussion_r1605607730
##########
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java:
##########

@@ -0,0 +1,780 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.flink.sink;
+
+import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION;
+import static org.apache.iceberg.TableProperties.AVRO_COMPRESSION_LEVEL;
+import static org.apache.iceberg.TableProperties.ORC_COMPRESSION;
+import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY;
+import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION;
+import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL;
+import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.time.Duration;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.UUID;
+import java.util.function.Function;
+import org.apache.flink.api.common.functions.MapFunction;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.connector.sink2.Committer;
+import org.apache.flink.api.connector.sink2.CommitterInitContext;
+import org.apache.flink.api.connector.sink2.Sink;
+import org.apache.flink.api.connector.sink2.SinkWriter;
+import org.apache.flink.api.connector.sink2.SupportsCommitter;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.ReadableConfig;
+import org.apache.flink.core.io.SimpleVersionedSerializer;
+import org.apache.flink.streaming.api.connector.sink2.CommittableMessage;
+import org.apache.flink.streaming.api.connector.sink2.CommittableMessageTypeInfo;
+import org.apache.flink.streaming.api.connector.sink2.SupportsPostCommitTopology;
+import org.apache.flink.streaming.api.connector.sink2.SupportsPreCommitTopology;
+import org.apache.flink.streaming.api.connector.sink2.SupportsPreWriteTopology;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.datastream.DataStreamSink;
+import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
+import org.apache.flink.table.api.TableSchema;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.util.DataFormatConverters;
+import org.apache.flink.table.types.DataType;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.types.Row;
+import org.apache.flink.util.Preconditions;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.DistributionMode;
+import org.apache.iceberg.FileFormat;
+import org.apache.iceberg.PartitionField;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.SerializableTable;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.flink.FlinkSchemaUtil;
+import org.apache.iceberg.flink.FlinkWriteConf;
+import org.apache.iceberg.flink.FlinkWriteOptions;
+import org.apache.iceberg.flink.TableLoader;
+import org.apache.iceberg.flink.sink.committer.SinkAggregator;
+import org.apache.iceberg.flink.sink.committer.SinkCommittable;
+import org.apache.iceberg.flink.sink.committer.SinkCommittableSerializer;
+import org.apache.iceberg.flink.sink.committer.SinkCommitter;
+import org.apache.iceberg.flink.sink.committer.WriteResultSerializer;
+import org.apache.iceberg.flink.sink.writer.IcebergSinkWriter;
+import org.apache.iceberg.flink.util.FlinkCompatibilityUtil;
+import org.apache.iceberg.io.WriteResult;
+import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.relocated.com.google.common.collect.Maps;
+import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.util.SerializableSupplier;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * The Flink v2 sink offers different hooks to insert custom topologies into the sink. We use the
+ * following:
+ *
+ * <ul>
+ *   <li>{@link SupportsPreWriteTopology} which redistributes the data to the writers based on the
+ *       {@link DistributionMode}
+ *   <li>{@link org.apache.flink.api.connector.sink2.SinkWriter} which writes data files, and
+ *       generates a {@link SinkCommittable} to store the {@link org.apache.iceberg.io.WriteResult}
+ *   <li>{@link SupportsPreCommitTopology} which we use to place the {@link SinkAggregator} which
+ *       merges the individual {@link org.apache.flink.api.connector.sink2.SinkWriter}'s {@link
+ *       org.apache.iceberg.io.WriteResult}s into a single {@link
+ *       org.apache.iceberg.flink.sink.DeltaManifests}
+ *   <li>{@link SinkCommitter} which stores the incoming {@link
+ *       org.apache.iceberg.flink.sink.DeltaManifests}s in state for recovery, and commits them to
+ *       the Iceberg table
+ *   <li>{@link SupportsPostCommitTopology} which we could use for incremental compaction later.
+ *       This is not implemented yet.
+ * </ul>
+ *
+ * <p>The job graph looks like below:
+ *
+ * <pre>{@code
+ *                            Flink sink
+ *               +-----------------------------------------------------------------------------------+
+ *               |                                                                                    |
+ * +-------+     | +----------+                               +-------------+      +---------------+ |
+ * | Map 1 | ==> | | writer 1 |                               | committer 1 | ---> | post commit 1 | |
+ * +-------+     | +----------+                               +-------------+      +---------------+ |
+ *               |             \                             /                \                      |
+ *               |              \                           /                  \                     |
+ *               |               \                         /                    \                    |
+ * +-------+     | +----------+   \ +-------------------+ /  +-------------+     \ +---------------+ |
+ * | Map 2 | ==> | | writer 2 | --->| commit aggregator |    | committer 2 |       | post commit 2 | |
+ * +-------+     | +----------+     +-------------------+    +-------------+       +---------------+ |
+ *               |                                             Commit only on                        |
+ *               |                                             committer 1                           |
+ *               +-----------------------------------------------------------------------------------+
+ * }</pre>
+ */
+public class IcebergSink
+    implements Sink<RowData>,
+        SupportsPreWriteTopology<RowData>,
+        SupportsCommitter<SinkCommittable>,
+        SupportsPreCommitTopology<WriteResult, SinkCommittable>,
+        SupportsPostCommitTopology<SinkCommittable> {
+  private Table initTable;
+  private final TableLoader tableLoader;
+  private final Map<String, String> snapshotProperties;

Review Comment:
   nit: `snapshotSummary` is probably more accurate; "properties" can cause a little confusion with table properties.
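A minimal sketch of why "summary" is the clearer name: such key/value pairs are applied to the snapshot summary at commit time via `SnapshotUpdate#set`, not to the table properties. The class and variable names below are placeholders:

```java
import java.util.Map;
import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Table;

class SnapshotSummarySketch {
  // Applies per-commit metadata to the snapshot summary; the entries never
  // touch the table properties, which is what the rename is meant to signal.
  static void appendWithSummary(Table table, DataFile file, Map<String, String> snapshotSummary) {
    AppendFiles append = table.newAppend();   // AppendFiles extends SnapshotUpdate
    snapshotSummary.forEach(append::set);     // SnapshotUpdate#set adds a summary entry
    append.appendFile(file);
    append.commit();                          // entries appear in the new snapshot's summary
  }
}
```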
##########
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java:
##########

@@ -0,0 +1,780 @@
+  private Table initTable;
+  private final TableLoader tableLoader;
+  private final Map<String, String> snapshotProperties;
+  private String uidPrefix;
+  private Committer<SinkCommittable> sinkCommitter;
+  private final String sinkId;
+  private transient IcebergSink.Builder builder;
+  private Map<String, String> writeProperties;
+  private RowType flinkRowType;
+  private SerializableSupplier<Table> tableSupplier;
+  private transient FlinkWriteConf flinkWriteConf;
+  private List<Integer> equalityFieldIds;
+  private boolean upsertMode;
+  private FileFormat dataFileFormat;
+  private long targetDataFileSize;
+  private String branch;
+  private boolean overwriteMode;
+  private int workerPoolSize;
+  private static final Logger LOG = LoggerFactory.getLogger(IcebergSink.class);
+
+  private IcebergSink(IcebergSink.Builder builder) {
+    this.builder = builder;
+    this.tableLoader = builder.tableLoader();
+    this.snapshotProperties = builder.snapshotProperties;
+
+    // We generate a random UUID every time a sink is created.
+    // This is used to separate files generated by different sinks writing to the same table.

Review Comment:
   Is it a valid requirement to have multiple sinks writing to the same Iceberg table in a single Flink job? Can't the streams be unioned first, before writing to Iceberg?
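The alternative suggested here could look like the sketch below. It assumes the `IcebergSink` builder mirrors the existing `FlinkSink` entry points (`forRowData`, `tableLoader`, `append`); the stream and loader variables are placeholders:

```java
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.flink.TableLoader;
import org.apache.iceberg.flink.sink.IcebergSink;

class UnionBeforeSinkSketch {
  // Union both inputs so the job contains exactly one Iceberg sink for the
  // table, instead of two sinks that must avoid clobbering each other's files.
  static void wire(DataStream<RowData> streamA, DataStream<RowData> streamB, TableLoader loader) {
    DataStream<RowData> combined = streamA.union(streamB);
    IcebergSink.forRowData(combined) // assumed to mirror FlinkSink.forRowData
        .tableLoader(loader)
        .append();
  }
}
```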
##########
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/committer/SinkCommittable.java:
##########

@@ -0,0 +1,69 @@
+package org.apache.iceberg.flink.sink.committer;
+
+import java.io.Serializable;
+import org.apache.iceberg.relocated.com.google.common.base.MoreObjects;
+
+/**
+ * The aggregated results of a single checkpoint which should be committed. Contains the
+ * serialized {@link org.apache.iceberg.flink.sink.DeltaManifests} file, which holds the commit
+ * data, and the jobId, operatorId, checkpointId triplet, which helps identify the specific commit.
+ *
+ * <p>{@link SinkCommittableSerializer} is used for serializing the objects between the Writer and
+ * the Aggregator operator, as well as between the Aggregator and the Committer.
+ */
+public class SinkCommittable implements Serializable {
+  private final byte[] manifest;
+  private final String jobId;
+  private final String operatorId;
+  private final long checkpointId;
+
+  public SinkCommittable(byte[] manifest, String jobId, String operatorId, long checkpointId) {

Review Comment:
   I think we should pass in `manifest` as `DeltaManifests` (not as `byte[]`). I know the current code uses `byte[]`; we can move the serialization and deserialization into the `SinkCommittableSerializer`.
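A sketch of the suggested split, with `DeltaManifests` carried as a typed field. The `deltaManifests()` accessor and the constructor shape are assumptions about the refactored `SinkCommittable`, the jobId/operatorId/checkpointId handling is elided, and package-level visibility of `DeltaManifests`/`DeltaManifestsSerializer` is ignored for brevity:

```java
import java.io.IOException;
import org.apache.flink.core.io.SimpleVersionedSerialization;
import org.apache.flink.core.io.SimpleVersionedSerializer;
import org.apache.iceberg.flink.sink.DeltaManifests;
import org.apache.iceberg.flink.sink.DeltaManifestsSerializer;

class SinkCommittableSerializerSketch implements SimpleVersionedSerializer<SinkCommittable> {
  @Override
  public int getVersion() {
    return 1;
  }

  @Override
  public byte[] serialize(SinkCommittable committable) throws IOException {
    // Only the serializer touches bytes; writers and aggregators stay typed.
    return SimpleVersionedSerialization.writeVersionAndSerialize(
        DeltaManifestsSerializer.INSTANCE, committable.deltaManifests());
  }

  @Override
  public SinkCommittable deserialize(int version, byte[] serialized) throws IOException {
    DeltaManifests manifests =
        SimpleVersionedSerialization.readVersionAndDeSerialize(
            DeltaManifestsSerializer.INSTANCE, serialized);
    // jobId/operatorId/checkpointId would be read alongside the manifests.
    return new SinkCommittable(manifests, "", "", 0L);
  }
}
```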
##########
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java:
##########

@@ -0,0 +1,780 @@
+ *               |                                             Commit only on                        |
+ *               |                                             committer 1                           |

Review Comment:
   > Commit only on committer 1

   This has always been a sticking point for me. It is confusing for users looking at the DAG to wonder why the other committer subtasks get no traffic, or why there are so many committer subtasks in the first place. I asked the Flink community about this before. There was a suggestion to support controlling the committer parallelism, which would be sufficient for this purpose, but I don't think it has been formally pursued and completed. https://lists.apache.org/thread/82bgvlton9olb591bfg2djv0cshj1bxj

   It would be great to fix this on the Flink side first.
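For readers following along, a simplified illustration of why only one committer subtask sees traffic: the pre-commit stage funnels every `WriteResult` through a single aggregator, so one committable per checkpoint flows to a single committer subtask while its peers stay idle. The method shape loosely follows `SupportsPreCommitTopology`; the aggregation function is a placeholder, not the actual `SinkAggregator` wiring:

```java
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.connector.sink2.CommittableMessage;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.iceberg.io.WriteResult;

class PreCommitFunnelSketch {
  DataStream<CommittableMessage<SinkCommittable>> addPreCommitTopology(
      DataStream<CommittableMessage<WriteResult>> writeResults,
      MapFunction<CommittableMessage<WriteResult>, CommittableMessage<SinkCommittable>> aggregate) {
    return writeResults
        .global()           // route every upstream message to a single subtask
        .map(aggregate)     // placeholder for the SinkAggregator operator
        .setParallelism(1); // one aggregator, hence one busy committer
  }
}
```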
##########
flink/v1.19/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergSink.java:
##########

@@ -0,0 +1,780 @@
+  private Table initTable;
+  private final TableLoader tableLoader;
+  private final Map<String, String> snapshotProperties;
+  private String uidPrefix;
+  private Committer<SinkCommittable> sinkCommitter;
+  private final String sinkId;
+  private transient IcebergSink.Builder builder;

Review Comment:
   nit: the `IcebergSink.` qualifier is not needed. Applicable to other places as well.

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org