ZENOTME commented on code in PR #902: URL: https://github.com/apache/iceberg-rust/pull/902#discussion_r2238325767
########## crates/iceberg/src/transaction/merge_append.rs: ########## @@ -0,0 +1,329 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::{BTreeMap, HashMap}; +use std::pin::Pin; +use std::sync::Arc; + +use async_trait::async_trait; +use uuid::Uuid; + +use crate::Result; +use crate::io::FileIO; +use crate::spec::{DataFile, ManifestContentType, ManifestFile, ManifestStatus, ManifestWriter}; +use crate::table::Table; +use crate::transaction::append::AppendOperation; +use crate::transaction::snapshot::{DefaultManifestProcess, ManifestProcess, SnapshotProducer}; +use crate::transaction::{ActionCommit, TransactionAction}; +use crate::utils::bin::ListPacker; +use crate::utils::parse_property; + +/// MergeAppendAction is a transaction action similar to fast append except that it will merge manifests +/// based on the target size. +pub struct MergeAppendAction { + target_size_bytes: u32, + min_count_to_merge: u32, + check_duplicate: bool, + merge_enabled: bool, + // below are properties used to create SnapshotProducer when commit + commit_uuid: Option<Uuid>, + key_metadata: Option<Vec<u8>>, + snapshot_properties: HashMap<String, String>, + added_data_files: Vec<DataFile>, +} + +/// Target size of manifest file when merging manifests. +pub const MANIFEST_TARGET_SIZE_BYTES: &str = "commit.manifest.target-size-bytes"; +const MANIFEST_TARGET_SIZE_BYTES_DEFAULT: u32 = 8 * 1024 * 1024; // 8 MB +/// Minimum number of manifests to merge. +pub const MANIFEST_MIN_MERGE_COUNT: &str = "commit.manifest.min-count-to-merge"; +const MANIFEST_MIN_MERGE_COUNT_DEFAULT: u32 = 100; +/// Whether allow to merge manifests. +pub const MANIFEST_MERGE_ENABLED: &str = "commit.manifest-merge.enabled"; +const MANIFEST_MERGE_ENABLED_DEFAULT: bool = false; + +impl MergeAppendAction { + pub(crate) fn new(table: &Table) -> Result<Self> { + let target_size_bytes: u32 = parse_property( + table.metadata().properties(), + MANIFEST_TARGET_SIZE_BYTES, + MANIFEST_TARGET_SIZE_BYTES_DEFAULT, + )?; + let min_count_to_merge: u32 = parse_property( + table.metadata().properties(), + MANIFEST_MIN_MERGE_COUNT, + MANIFEST_MIN_MERGE_COUNT_DEFAULT, + )?; + let merge_enabled = parse_property( + table.metadata().properties(), + MANIFEST_MERGE_ENABLED, + MANIFEST_MERGE_ENABLED_DEFAULT, + )?; + Ok(Self { + check_duplicate: true, + target_size_bytes, + min_count_to_merge, + merge_enabled, + commit_uuid: None, + key_metadata: None, + snapshot_properties: HashMap::default(), + added_data_files: vec![], + }) + } + + pub fn with_check_duplicate(mut self, v: bool) -> Self { + self.check_duplicate = v; + self + } + + pub fn set_commit_uuid(mut self, commit_uuid: Uuid) -> Self { + self.commit_uuid = Some(commit_uuid); + self + } + + pub fn set_key_metadata(mut self, key_metadata: Vec<u8>) -> Self { + self.key_metadata = Some(key_metadata); + self + } + + pub fn set_snapshot_properties(mut self, snapshot_properties: HashMap<String, String>) -> Self { + self.snapshot_properties = snapshot_properties; + self + } + + /// Add data files to the snapshot. + pub fn add_data_files( + &mut self, Review Comment: Thanks @jdcasale I don't noticed that actually. Both way looks good me, but I prefer &mut self because add_data_files actually more like `push_back`, if we consume mut self which means that the user always need to get self back after add_data_files like following: ``` let action = action.add_data_files(...) ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org