JonasJ-ap commented on code in PR #6449:
URL: https://github.com/apache/iceberg/pull/6449#discussion_r1098097617


##########
delta-lake/src/main/java/org/apache/iceberg/delta/BaseSnapshotDeltaLakeTableAction.java:
##########
@@ -0,0 +1,398 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.delta;
+
+import io.delta.standalone.DeltaLog;
+import io.delta.standalone.VersionLog;
+import io.delta.standalone.actions.Action;
+import io.delta.standalone.actions.AddFile;
+import io.delta.standalone.actions.RemoveFile;
+import java.io.File;
+import java.net.URI;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import org.apache.commons.codec.DecoderException;
+import org.apache.commons.codec.net.URLCodec;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.iceberg.AppendFiles;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.DataFiles;
+import org.apache.iceberg.DeleteFiles;
+import org.apache.iceberg.FileFormat;
+import org.apache.iceberg.Metrics;
+import org.apache.iceberg.MetricsConfig;
+import org.apache.iceberg.OverwriteFiles;
+import org.apache.iceberg.PartitionField;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.SnapshotSummary;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.Transaction;
+import org.apache.iceberg.catalog.Catalog;
+import org.apache.iceberg.catalog.TableIdentifier;
+import org.apache.iceberg.exceptions.ValidationException;
+import org.apache.iceberg.io.InputFile;
+import org.apache.iceberg.io.ResolvingFileIO;
+import org.apache.iceberg.mapping.MappingUtil;
+import org.apache.iceberg.mapping.NameMapping;
+import org.apache.iceberg.mapping.NameMappingParser;
+import org.apache.iceberg.parquet.ParquetUtil;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.types.Type;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Takes a Delta Lake table's location and attempts to create an Iceberg table 
snapshot in an
+ * optional user-specified location (default to the Delta Lake table's 
location) with a different
+ * identifier.
+ */
+class BaseSnapshotDeltaLakeTableAction implements SnapshotDeltaLakeTable {
+
+  private static final Logger LOG = 
LoggerFactory.getLogger(BaseSnapshotDeltaLakeTableAction.class);
+
+  private static final String SNAPSHOT_SOURCE_PROP = "snapshot_source";
+  private static final String DELTA_SOURCE_VALUE = "delta";
+  private static final String ORIGINAL_LOCATION_PROP = "original_location";
+  private static final String PARQUET_SUFFIX = ".parquet";
+  private static final String AVRO_SUFFIX = ".avro";
+  private static final String ORC_SUFFIX = ".orc";
+  private final ImmutableMap.Builder<String, String> 
additionalPropertiesBuilder =
+      ImmutableMap.builder();
+  private DeltaLog deltaLog;
+  private Catalog icebergCatalog;
+  private final String deltaTableLocation;
+  private TableIdentifier newTableIdentifier;
+  private String newTableLocation;
+  private ResolvingFileIO deltaLakeFileIO;
+  private long deltaStartVersion;
+
+  /**
+   * Snapshot a delta lake table to be an iceberg table. The action will read 
the delta lake table's
+   * log through the table's path, create a new iceberg table using the given 
icebergCatalog and
+   * newTableIdentifier, and commit all changes in one iceberg transaction.
+   *
+   * <p>The new table will only be created if the snapshot is successful.
+   *
+   * @param deltaTableLocation the delta lake table's path
+   */
+  BaseSnapshotDeltaLakeTableAction(String deltaTableLocation) {
+    this.deltaTableLocation = deltaTableLocation;
+    this.newTableLocation = deltaTableLocation;
+  }
+
+  @Override
+  public SnapshotDeltaLakeTable tableProperties(Map<String, String> 
properties) {
+    additionalPropertiesBuilder.putAll(properties);
+    return this;
+  }
+
+  @Override
+  public SnapshotDeltaLakeTable tableProperty(String name, String value) {
+    additionalPropertiesBuilder.put(name, value);
+    return this;
+  }
+
+  @Override
+  public SnapshotDeltaLakeTable tableLocation(String location) {
+    this.newTableLocation = location;
+    return this;
+  }
+
+  @Override
+  public SnapshotDeltaLakeTable as(TableIdentifier identifier) {
+    this.newTableIdentifier = identifier;
+    return this;
+  }
+
+  @Override
+  public SnapshotDeltaLakeTable icebergCatalog(Catalog catalog) {
+    this.icebergCatalog = catalog;
+    return this;
+  }
+
+  @Override
+  public SnapshotDeltaLakeTable deltaLakeConfiguration(Configuration conf) {
+    this.deltaLog = DeltaLog.forTable(conf, deltaTableLocation);
+    this.deltaLakeFileIO = new ResolvingFileIO();
+    this.deltaLakeFileIO.initialize(ImmutableMap.of());
+    this.deltaLakeFileIO.setConf(conf);

Review Comment:
   I switched to use `ResolvingFileIO` here. The reason I directly initialize 
the IO here is that: based on previous discussion: 
https://github.com/apache/iceberg/pull/6449#discussion_r1061746850, we would 
like to maintain the consistency of the reader and the writer. Since the 
`deltaLog` relies on the hadoop configuration, `deltaLakeFileIo` should also 
use the hadoop configuration instead of other table properties. Hence, we shall 
initialize the io using an empty map.
   
   @jackye1995, @danielcweeks . Could you please help me confirm the 
`ResolvingFileIO` initialization process here? Thank you very much!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to