Fokko commented on code in PR #960: URL: https://github.com/apache/iceberg-rust/pull/960#discussion_r1954458103
##########
crates/iceberg/src/transaction.rs:
##########
@@ -169,6 +175,172 @@ impl<'a> Transaction<'a> {
         catalog.update_table(table_commit).await
     }
+
+    /// Adds existing parquet files
+    pub async fn add_parquet_files(
+        self,
+        file_paths: Vec<String>,
+        check_duplicate_files: bool,
+    ) -> Result<Transaction<'a>> {
+        if check_duplicate_files {
+            let unique_paths: HashSet<_> = file_paths.iter().collect();
+            if unique_paths.len() != file_paths.len() {
+                return Err(Error::new(
+                    ErrorKind::DataInvalid,
+                    "Duplicate file paths provided",
+                ));
+            }
+        }
+        let table_metadata = self.table.metadata();
+
+        let data_files = Transaction::parquet_files_to_data_files(
+            &self,
+            self.table.file_io(),
+            file_paths,
+            table_metadata,
+        )
+        .await?;
+
+        let mut fast_append_action = self.fast_append(Some(Uuid::new_v4()), Vec::new())?;
+        fast_append_action.add_data_files(data_files)?;
+
+        fast_append_action.apply().await
+    }
+
+    async fn parquet_files_to_data_files(
+        &self,
+        file_io: &FileIO,
+        file_paths: Vec<String>,
+        table_metadata: &TableMetadata,
+    ) -> Result<Vec<DataFile>> {
+        let mut data_files: Vec<DataFile> = Vec::new();
+        let partition_value =
+            self.create_default_partition_value(&table_metadata.default_partition_type)?;
+
+        for file_path in file_paths {
+            let input_file = file_io.new_input(&file_path)?;
+            if !input_file.exists().await? {
+                return Err(Error::new(
+                    ErrorKind::DataInvalid,
+                    "File does not exist".to_string(),
+                ));
+            }

Review Comment:
I'm not a rustacean, but I'm wondering if we need to do this check. I would expect that this would do a call to the object store, which we ideally want to avoid.

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org