liurenjie1024 commented on code in PR #960: URL: https://github.com/apache/iceberg-rust/pull/960#discussion_r1976795860
########## crates/iceberg/src/writer/file_writer/parquet_writer.rs: ########## @@ -391,6 +442,79 @@ impl ParquetWriter { ); Ok(builder) } + + /// `ParquetMetadata` to data file builder + pub fn parquet_to_data_file_builder( Review Comment: ```suggestion pub(crate) fn parquet_to_data_file_builder( ``` ########## crates/iceberg/src/transaction.rs: ########## @@ -205,8 +208,86 @@ impl<'a> FastAppendAction<'a> { Ok(self) } + /// Adds existing parquet files + pub async fn add_parquet_files(mut self, file_path: Vec<String>) -> Result<Transaction<'a>> { Review Comment: Given checking parquet file compatibility is an important feature, if we plan to address it in following pr, I think we should mark this as private function. ```suggestion async fn add_parquet_files(mut self, file_path: Vec<String>) -> Result<Transaction<'a>> { ``` ########## crates/iceberg/src/writer/file_writer/parquet_writer.rs: ########## @@ -301,12 +316,48 @@ impl MinMaxColAggregator { Ok(()) } + /// Returns lower and upper bounds fn produce(self) -> (HashMap<i32, Datum>, HashMap<i32, Datum>) { (self.lower_bounds, self.upper_bounds) } } impl ParquetWriter { + /// Converts parquet files to data files + pub async fn parquet_files_to_data_files( Review Comment: ```suggestion pub(crate) async fn parquet_files_to_data_files( ``` ########## crates/iceberg/src/transaction.rs: ########## @@ -205,8 +208,86 @@ impl<'a> FastAppendAction<'a> { Ok(self) } + /// Adds existing parquet files + pub async fn add_parquet_files(mut self, file_path: Vec<String>) -> Result<Transaction<'a>> { + if !self + .snapshot_produce_action + .tx + .table + .metadata() + .default_spec + .is_unpartitioned() + { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + "Appending to partitioned tables is not supported", + )); + } + + let table_metadata = self.snapshot_produce_action.tx.table.metadata(); + + let data_files = ParquetWriter::parquet_files_to_data_files( + self.snapshot_produce_action.tx.table.file_io(), + file_path, + table_metadata, + ) + .await?; + + self.add_data_files(data_files)?; + + self.apply().await + } + /// Finished building the action and apply it to the transaction. pub async fn apply(self) -> Result<Transaction<'a>> { + // Checks duplicate files Review Comment: Yes, my bad. @jonathanc-n added this option before, I suggested to remove it and always check it. I think we could have a following up issue to track it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org