liurenjie1024 commented on code in PR #56:
URL: https://github.com/apache/iceberg-rust/pull/56#discussion_r1316957584


##########
crates/iceberg/src/spec/manifest_list.rs:
##########
@@ -0,0 +1,581 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! ManifestList for Iceberg.
+
+use apache_avro::{from_value, Reader};
+
+use crate::{spec::Literal, Error};
+
+use super::{FormatVersion, StructType};
+
+/// Snapshots are embedded in table metadata, but the list of manifests for a
+/// snapshot are stored in a separate manifest list file.
+///
+/// A new manifest list is written for each attempt to commit a snapshot
+/// because the list of manifests always changes to produce a new snapshot.
+/// When a manifest list is written, the (optimistic) sequence number of the
+/// snapshot is written for all new manifest files tracked by the list.
+///
+/// A manifest list includes summary metadata that can be used to avoid
+/// scanning all of the manifests in a snapshot when planning a table scan.
+/// This includes the number of added, existing, and deleted files, and a
+/// summary of values for each field of the partition spec used to write the
+/// manifest.
+#[derive(Debug, Clone)]
+pub struct ManifestList {
+    version: FormatVersion,
+    /// Entries in a manifest list.
+    entries: Vec<ManifestListEntry>,
+}
+
+impl ManifestList {
+    /// Parse manifest list from bytes.
+    ///
+    /// QUESTION: Will we have more than one manifest list in a single file?
+    pub fn parse(
+        bs: &[u8],
+        version: FormatVersion,
+        partition_type: &StructType,
+    ) -> Result<ManifestList, Error> {
+        let reader = Reader::new(bs)?;
+        let entries = match version {
+            FormatVersion::V2 => reader
+                .map(|v| {
+                    v.and_then(|value| 
from_value::<_serde::ManifestListEntryV2>(&value))
+                        .map_err(|e| {
+                            Error::new(
+                                crate::ErrorKind::DataInvalid,
+                                "Failed to parse manifest list entry",
+                            )
+                            .with_source(e)
+                        })
+                })
+                .map(|v| v.and_then(|v| v.try_into(partition_type)))
+                .collect::<Result<Vec<_>, Error>>()?,
+            FormatVersion::V1 => reader
+                .map(|v| {
+                    v.and_then(|value| 
from_value::<_serde::ManifestListEntryV1>(&value))
+                        .map_err(|e| {
+                            Error::new(
+                                crate::ErrorKind::DataInvalid,
+                                "Failed to parse manifest list entry",
+                            )
+                            .with_source(e)
+                        })
+                })
+                .map(|v| v.and_then(|v| v.try_into(partition_type)))
+                .collect::<Result<Vec<_>, Error>>()?,
+        };
+        Ok(ManifestList { version, entries })
+    }
+
+    /// Get the entries in the manifest list.
+    pub fn entries(&self) -> &[ManifestListEntry] {
+        &self.entries
+    }
+
+    /// Get the version of the manifest list.
+    pub fn version(&self) -> &FormatVersion {
+        &self.version
+    }
+}
+
+/// Entry in a manifest list.
+#[derive(Debug, PartialEq, Clone)]
+pub struct ManifestListEntry {
+    /// field: 500
+    ///
+    /// Location of the manifest file
+    manifest_path: String,
+    /// field: 501
+    ///
+    /// Length of the manifest file in bytes
+    manifest_length: i64,
+    /// field: 502
+    ///
+    /// ID of a partition spec used to write the manifest; must be listed
+    /// in table metadata partition-specs
+    partition_spec_id: i32,
+    /// field: 517
+    ///
+    /// The type of files tracked by the manifest, either data or delete
+    /// files; 0 for all v1 manifests
+    content: ManifestContentType,
+    /// field: 515
+    ///
+    /// The sequence number when the manifest was added to the table; use 0
+    /// when reading v1 manifest lists
+    sequence_number: i64,
+    /// field: 516
+    ///
+    /// The minimum data sequence number of all live data or delete files in
+    /// the manifest; use 0 when reading v1 manifest lists
+    min_sequence_number: i64,
+    /// field: 503
+    ///
+    /// ID of the snapshot where the manifest file was added
+    added_snapshot_id: i64,
+    /// field: 504
+    ///
+    /// Number of entries in the manifest that have status ADDED, when null
+    /// this is assumed to be non-zero
+    added_data_files_count: Option<i32>,
+    /// field: 505
+    ///
+    /// Number of entries in the manifest that have status EXISTING (0),
+    /// when null this is assumed to be non-zero
+    existing_data_files_count: Option<i32>,
+    /// field: 506
+    ///
+    /// Number of entries in the manifest that have status DELETED (2),
+    /// when null this is assumed to be non-zero
+    deleted_data_files_count: Option<i32>,
+    /// field: 512
+    ///
+    /// Number of rows in all of files in the manifest that have status
+    /// ADDED, when null this is assumed to be non-zero
+    added_rows_count: Option<i64>,
+    /// field: 513
+    ///
+    /// Number of rows in all of files in the manifest that have status
+    /// EXISTING, when null this is assumed to be non-zero
+    existing_rows_count: Option<i64>,
+    /// field: 514
+    ///
+    /// Number of rows in all of files in the manifest that have status
+    /// DELETED, when null this is assumed to be non-zero
+    deleted_rows_count: Option<i64>,
+    /// field: 507
+    /// element_field: 508
+    ///
+    /// A list of field summaries for each partition field in the spec. Each
+    /// field in the list corresponds to a field in the manifest file’s
+    /// partition spec.
+    partitions: Option<Vec<FieldSummary>>,

Review Comment:
   Good point. But since it's optional, we don't need to write `None` in 
serialized format? We just need to skip serialization.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to