Fokko commented on code in PR #960:
URL: https://github.com/apache/iceberg-rust/pull/960#discussion_r1954458103


##########
crates/iceberg/src/transaction.rs:
##########
@@ -169,6 +175,172 @@ impl<'a> Transaction<'a> {
 
         catalog.update_table(table_commit).await
     }
+
+    /// Adds existing parquet files
+    pub async fn add_parquet_files(
+        self,
+        file_paths: Vec<String>,
+        check_duplicate_files: bool,
+    ) -> Result<Transaction<'a>> {
+        if check_duplicate_files {
+            let unique_paths: HashSet<_> = file_paths.iter().collect();
+            if unique_paths.len() != file_paths.len() {
+                return Err(Error::new(
+                    ErrorKind::DataInvalid,
+                    "Duplicate file paths provided",
+                ));
+            }
+        }
+        let table_metadata = self.table.metadata();
+
+        let data_files = Transaction::parquet_files_to_data_files(
+            &self,
+            self.table.file_io(),
+            file_paths,
+            table_metadata,
+        )
+        .await?;
+
+        let mut fast_append_action = self.fast_append(Some(Uuid::new_v4()), 
Vec::new())?;
+        fast_append_action.add_data_files(data_files)?;
+
+        fast_append_action.apply().await
+    }
+
+    async fn parquet_files_to_data_files(
+        &self,
+        file_io: &FileIO,
+        file_paths: Vec<String>,
+        table_metadata: &TableMetadata,
+    ) -> Result<Vec<DataFile>> {
+        let mut data_files: Vec<DataFile> = Vec::new();
+        let partition_value =
+            
self.create_default_partition_value(&table_metadata.default_partition_type)?;
+
+        for file_path in file_paths {
+            let input_file = file_io.new_input(&file_path)?;
+            if !input_file.exists().await? {
+                return Err(Error::new(
+                    ErrorKind::DataInvalid,
+                    "File does not exist".to_string(),
+                ));
+            }

Review Comment:
   I'm not a Rustacean, but I'm wondering whether we need this check at all. I 
would expect the `exists()` call to make a request to the object store, which 
we ideally want to avoid.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to