zhjwpku commented on code in PR #91: URL: https://github.com/apache/iceberg-cpp/pull/91#discussion_r2083010628
########## src/iceberg/manifest_list.h: ########## @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/manifest_list.h + +#include <cstdint> +#include <optional> +#include <string> +#include <string_view> + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +/// \brief The type of files tracked by the manifest, either data or delete files; 0 for +/// all v1 manifests +enum class ManifestContent { + /// The manifest content is data. + kData = 0, + /// The manifest content is deletes. 
+ kDeletes = 1, +}; + +/// \brief Get the relative manifest content type name +ICEBERG_EXPORT constexpr std::string_view ManifestContentToString( + ManifestContent type) noexcept { + switch (type) { + case ManifestContent::kData: + return "data"; + case ManifestContent::kDeletes: + return "deletes"; + } +} + +/// \brief Get the relative manifest content type from name +ICEBERG_EXPORT constexpr Result<ManifestContent> ManifestContentFromString( + std::string_view str) noexcept { + if (str == "data") return ManifestContent::kData; + if (str == "deletes") return ManifestContent::kDeletes; + return InvalidArgument("Invalid manifest content type: {}", str); +} + +struct ICEBERG_EXPORT FieldSummary { + /// Field id: 509 + /// Whether the manifest contains at least one partition with a null value for the field + bool contains_null; + /// Field id: 518 + /// Whether the manifest contains at least one partition with a NaN value for the field + std::optional<bool> contains_nan; + /// Field id: 510 + /// Lower bound for the non-null, non-NaN values in the partition field, or null if all + /// values are null or NaN + std::optional<std::vector<uint8_t>> lower_bound; + /// Field id: 511 + /// Upper bound for the non-null, non-NaN values in the partition field, or null if all + /// values are null or NaN + std::optional<std::vector<uint8_t>> upper_bound; + + static const SchemaField CONTAINS_NULL; + static const SchemaField CONTAINS_NAN; + static const SchemaField LOWER_BOUND; + static const SchemaField UPPER_BOUND; + + static StructType GetType(); Review Comment: done with singleton. ########## src/iceberg/manifest_reader.h: ########## @@ -41,7 +43,7 @@ class ICEBERG_EXPORT ManifestReader { /// \brief Read manifest files from a manifest list file. 
class ICEBERG_EXPORT ManifestListReader { public: - virtual Result<std::span<std::unique_ptr<class ManifestFile>>> Files() const = 0; + virtual Result<std::span<std::unique_ptr<struct ManifestFile>>> Files() const = 0; Review Comment: done. ########## src/iceberg/manifest_list.cc: ########## @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/manifest_list.h" + +#include <vector> + +#include "iceberg/schema_field.h" +#include "iceberg/type.h" + +namespace iceberg { + +const SchemaField FieldSummary::CONTAINS_NULL = + SchemaField::MakeRequired(509, "contains_null", std::make_shared<BooleanType>()); +const SchemaField FieldSummary::CONTAINS_NAN = + SchemaField::MakeOptional(518, "contains_nan", std::make_shared<BooleanType>()); +const SchemaField FieldSummary::LOWER_BOUND = + SchemaField::MakeOptional(510, "lower_bound", std::make_shared<BinaryType>()); +const SchemaField FieldSummary::UPPER_BOUND = + SchemaField::MakeOptional(511, "upper_bound", std::make_shared<BinaryType>()); + +StructType FieldSummary::GetType() { + return StructType({ + CONTAINS_NULL, + CONTAINS_NAN, + LOWER_BOUND, + UPPER_BOUND, + }); +} + +const SchemaField ManifestFile::MANIFEST_PATH = + SchemaField::MakeRequired(500, "manifest_path", std::make_shared<StringType>()); +const SchemaField ManifestFile::MANIFEST_LENGTH = + SchemaField::MakeRequired(501, "manifest_length", std::make_shared<LongType>()); +const SchemaField ManifestFile::PARTITION_SPEC_ID = + SchemaField::MakeRequired(502, "partition_spec_id", std::make_shared<IntType>()); +const SchemaField ManifestFile::CONTENT = + SchemaField::MakeOptional(517, "content", std::make_shared<IntType>()); +const SchemaField ManifestFile::SEQUENCE_NUMBER = + SchemaField::MakeOptional(515, "sequence_number", std::make_shared<LongType>()); +const SchemaField ManifestFile::MIN_SEQUENCE_NUMBER = + SchemaField::MakeOptional(516, "min_sequence_number", std::make_shared<LongType>()); +const SchemaField ManifestFile::ADDED_SNAPSHOT_ID = + SchemaField::MakeRequired(503, "added_snapshot_id", std::make_shared<LongType>()); +const SchemaField ManifestFile::ADDED_FILES_COUNT = + SchemaField::MakeOptional(504, "added_files_count", std::make_shared<IntType>()); +const SchemaField ManifestFile::EXISTING_FILES_COUNT = + SchemaField::MakeOptional(505, "existing_files_count", 
std::make_shared<IntType>()); +const SchemaField ManifestFile::DELETED_FILES_COUNT = + SchemaField::MakeOptional(506, "deleted_files_count", std::make_shared<IntType>()); +const SchemaField ManifestFile::ADDED_ROWS_COUNT = + SchemaField::MakeOptional(512, "added_rows_count", std::make_shared<LongType>()); +const SchemaField ManifestFile::EXISTING_ROWS_COUNT = + SchemaField::MakeOptional(513, "existing_rows_count", std::make_shared<LongType>()); +const SchemaField ManifestFile::DELETED_ROWS_COUNT = + SchemaField::MakeOptional(514, "deleted_rows_count", std::make_shared<LongType>()); +const SchemaField ManifestFile::PARTITIONS = SchemaField::MakeOptional( + 507, "partitions", + std::make_shared<ListType>(SchemaField::MakeRequired( + 508, std::string(ListType::kElementName), + std::make_shared<StructType>(FieldSummary::GetType())))); +const SchemaField ManifestFile::KEY_METADATA = + SchemaField::MakeOptional(519, "key_metadata", std::make_shared<BinaryType>()); +const SchemaField ManifestFile::FIRST_ROW_ID = + SchemaField::MakeOptional(520, "first_row_id", std::make_shared<LongType>()); + +StructType ManifestFile::Schema() { + std::vector<SchemaField> fields; + fields.push_back(MANIFEST_PATH); + fields.push_back(MANIFEST_LENGTH); + fields.push_back(PARTITION_SPEC_ID); + fields.push_back(CONTENT); + fields.push_back(SEQUENCE_NUMBER); + fields.push_back(MIN_SEQUENCE_NUMBER); + fields.push_back(ADDED_SNAPSHOT_ID); + fields.push_back(ADDED_FILES_COUNT); + fields.push_back(EXISTING_FILES_COUNT); + fields.push_back(DELETED_FILES_COUNT); + fields.push_back(ADDED_ROWS_COUNT); + fields.push_back(EXISTING_ROWS_COUNT); + fields.push_back(DELETED_ROWS_COUNT); + fields.push_back(PARTITIONS); + fields.push_back(KEY_METADATA); + fields.push_back(FIRST_ROW_ID); + + return StructType(std::move(fields)); Review Comment: done. 
########## src/iceberg/manifest_list.cc: ########## @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/manifest_list.h" + +#include <vector> + +#include "iceberg/schema_field.h" +#include "iceberg/type.h" + +namespace iceberg { + +const SchemaField FieldSummary::CONTAINS_NULL = Review Comment: done. ########## src/iceberg/manifest_entry.h: ########## @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include <any> +#include <cstdint> +#include <map> +#include <optional> +#include <string> +#include <unordered_map> +#include <vector> + +#include "iceberg/file_format.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +enum class ManifestStatus { + kExisting = 0, + kAdded = 1, + kDeleted = 2, +}; + +/// \brief Get the relative manifest status type from int +ICEBERG_EXPORT constexpr Result<ManifestStatus> ManifestStatusFromInt( + int status) noexcept { + switch (status) { + case 0: + return ManifestStatus::kExisting; + case 1: + return ManifestStatus::kAdded; + case 2: + return ManifestStatus::kDeleted; + default: + return InvalidArgument("Invalid manifest status: {}", status); + } +} + +enum class DataFileContent { + kData = 0, + kPositionDeletes = 1, + kEqualityDeletes = 2, +}; + +/// \brief Get the relative data file content type from int +ICEBERG_EXPORT constexpr Result<DataFileContent> DataFileContentFromInt( + int content) noexcept { + switch (content) { + case 0: + return DataFileContent::kData; + case 1: + return DataFileContent::kPositionDeletes; + case 2: + return DataFileContent::kEqualityDeletes; + default: + return InvalidArgument("Invalid data file content: {}", content); + } +} + +/// \brief DataFile carries data file path, partition tuple, metrics, ... 
+struct ICEBERG_EXPORT DataFile { + /// Field id: 134 + /// Type of content stored by the data file: data, equality deletes, or position + /// deletes (all v1 files are data files) + DataFileContent content; + /// Field id: 100 + /// Full URI for the file with FS scheme + std::string file_path; + /// Field id: 101 + /// File format type, avro, orc, parquet, or puffin + FileFormatType file_format; + /// Field id: 102 + /// Partition data tuple, schema based on the partition spec output using partition + /// field ids for the struct field ids + /// TODO(zhjwpku): use StructLike to represent partition data tuple + std::map<std::string, std::any> partition; + /// Field id: 103 + /// Number of records in this file, or the cardinality of a deletion vector + int64_t record_count = 0; + /// Field id: 104 + /// Total file size in bytes + int64_t file_size_in_bytes = 0; + /// Field id: 108 + /// Key field id: 117 + /// Value field id: 118 + /// Map from column id to the total size on disk of all regions that store the column. + /// Does not include bytes necessary to read other columns, like footers. Leave null for + /// row-oriented formats (Avro) + std::unordered_map<int32_t, int64_t> column_sizes; + /// Field id: 109 + /// Key field id: 119 + /// Value field id: 120 + /// Map from column id to number of values in the column (including null and NaN values) + std::unordered_map<int32_t, int64_t> value_counts; + /// Field id: 110 + /// Key field id: 121 + /// Value field id: 122 + /// Map from column id to number of null values in the column + std::unordered_map<int32_t, int64_t> null_value_counts; + /// Field id: 137 + /// Key field id: 138 + /// Value field id: 139 + /// Map from column id to number of NaN values in the column + std::unordered_map<int32_t, int64_t> nan_value_counts; + /// Field id: 125 + /// Key field id: 126 + /// Value field id: 127 + /// Map from column id to lower bound in the column serialized as binary. 
+ /// Each value must be less than or equal to all non-null, non-NaN values in the column + /// for the file. + /// + /// Reference: + /// - [Binary single-value + /// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization) + std::unordered_map<int32_t, std::vector<uint8_t>> lower_bounds; + /// Field id: 128 + /// Key field id: 129 + /// Value field id: 130 + /// Map from column id to upper bound in the column serialized as binary. + /// Each value must be greater than or equal to all non-null, non-Nan values in the + /// column for the file. + /// + /// Reference: + /// - [Binary single-value + /// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization) + std::unordered_map<int32_t, std::vector<uint8_t>> upper_bounds; + /// Field id: 131 + /// Implementation-specific key metadata for encryption + std::optional<std::vector<uint8_t>> key_metadata; + /// Field id: 132 + /// Element Field id: 133 + /// Split offsets for the data file. For example, all row group offsets in a Parquet + /// file. Must be sorted ascending. + std::vector<int64_t> split_offsets; + /// Field id: 135 + /// Element Field id: 136 + /// Field ids used to determine row equality in equality delete files. Required when + /// content=2 and should be null otherwise. Fields with ids listed in this column must + /// be present in the delete file. + std::vector<int32_t> equality_ids; + /// Field id: 140 + /// ID representing sort order for this file + /// + /// If sort order ID is missing or unknown, then the order is assumed to be unsorted. + /// Only data files and equality delete files should be written with a non-null order + /// id. Position deletes are required to be sorted by file and position, not a table + /// order, and should set sort order id to null. Readers must ignore sort order id for + /// position delete files. 
+ std::optional<int32_t> sort_order_id; + /// This field is not included in spec, so it is not serialized into the manifest file. + /// It is just store in memory representation used in process. + int32_t partition_spec_id; + /// Field id: 142 + /// The _row_id for the first row in the data file. + /// + /// Reference: + /// - [First Row ID + /// Inheritance](https://github.com/apache/iceberg/blob/main/format/spec.md#first-row-id-inheritance) + std::optional<int64_t> first_row_id; + /// Field id: 143 + /// Fully qualified location (URI with FS scheme) of a data file that all deletes + /// reference. + /// + /// Position delete metadata can use referenced_data_file when all deletes tracked by + /// the entry are in a single data file. Setting the referenced file is required for + /// deletion vectors. + std::optional<std::string> referenced_data_file; + /// Field id: 144 + /// The offset in the file where the content starts. + /// + /// The content_offset and content_size_in_bytes fields are used to reference a specific + /// blob for direct access to a deletion vector. For deletion vectors, these values are + /// required and must exactly match the offset and length stored in the Puffin footer + /// for the deletion vector blob. 
+ std::optional<int64_t> content_offset; + /// Field id: 145 + /// The length of a referenced content stored in the file; required if content_offset is + /// present + std::optional<int64_t> content_size_in_bytes; + + static const SchemaField CONTENT; + static const SchemaField FILE_PATH; + static const SchemaField FILE_FORMAT; + static const SchemaField RECORD_COUNT; + static const SchemaField FILE_SIZE; + static const SchemaField COLUMN_SIZES; + static const SchemaField VALUE_COUNTS; + static const SchemaField NULL_VALUE_COUNTS; + static const SchemaField NAN_VALUE_COUNTS; + static const SchemaField LOWER_BOUNDS; + static const SchemaField UPPER_BOUNDS; + static const SchemaField KEY_METADATA; + static const SchemaField SPLIT_OFFSETS; + static const SchemaField EQUALITY_IDS; + static const SchemaField SORT_ORDER_ID; + static const SchemaField FIRST_ROW_ID; + static const SchemaField REFERENCED_DATA_FILE; + static const SchemaField CONTENT_OFFSET; + static const SchemaField CONTENT_SIZE; + + static StructType GetType(StructType partition_type); +}; + +/// \brief A manifest is an immutable Avro file that lists data files or delete files, +/// along with each file's partition data tuple, metrics, and tracking information. + +/// \brief The schema of a manifest file +struct ICEBERG_EXPORT ManifestEntry { + /// Field id: 0 + /// Used to track additions and deletions. Deletes are informational only and not used + /// in scans. + ManifestStatus status; + /// Field id: 1 + /// Snapshot id where the file was added, or deleted if status is 2. Inherited when + /// null. + std::optional<int64_t> snapshot_id; + /// Field id: 3 + /// Data sequence number of the file. Inherited when null and status is 1 (added). + std::optional<int64_t> sequence_number; + /// Field id: 4 + /// File sequence number indicating when the file was added. Inherited when null and + /// status is 1 (added). 
+ std::optional<int64_t> file_sequence_number; + /// Field id: 2 + /// File path, partition tuple, metrics, ... + DataFile data_file; Review Comment: The `DataFile` is part of the in-memory representation of `ManifestEntry`, so IMHO there's no need to wrap it in a smart pointer; embedding it directly in `ManifestEntry` should be more memory-efficient. ########## src/iceberg/manifest_reader.h: ########## @@ -26,13 +26,15 @@ #include <span> #include "iceberg/file_reader.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/type_fwd.h" namespace iceberg { /// \brief Read manifest entries from a manifest file. class ICEBERG_EXPORT ManifestReader { public: - virtual Result<std::span<std::unique_ptr<class ManifestEntry>>> Entries() const = 0; + virtual Result<std::span<std::unique_ptr<struct ManifestEntry>>> Entries() const = 0; Review Comment: done. ########## src/iceberg/manifest_list.h: ########## @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +/// \file iceberg/manifest_list.h + +#include <cstdint> +#include <optional> +#include <string> +#include <string_view> + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +/// \brief The type of files tracked by the manifest, either data or delete files; 0 for +/// all v1 manifests +enum class ManifestContent { + /// The manifest content is data. + kData = 0, + /// The manifest content is deletes. + kDeletes = 1, +}; + +/// \brief Get the relative manifest content type name +ICEBERG_EXPORT constexpr std::string_view ManifestContentToString( + ManifestContent type) noexcept { + switch (type) { + case ManifestContent::kData: + return "data"; + case ManifestContent::kDeletes: + return "deletes"; + } +} + +/// \brief Get the relative manifest content type from name +ICEBERG_EXPORT constexpr Result<ManifestContent> ManifestContentFromString( + std::string_view str) noexcept { + if (str == "data") return ManifestContent::kData; + if (str == "deletes") return ManifestContent::kDeletes; + return InvalidArgument("Invalid manifest content type: {}", str); +} + +struct ICEBERG_EXPORT FieldSummary { + /// Field id: 509 + /// Whether the manifest contains at least one partition with a null value for the field + bool contains_null; + /// Field id: 518 + /// Whether the manifest contains at least one partition with a NaN value for the field + std::optional<bool> contains_nan; + /// Field id: 510 + /// Lower bound for the non-null, non-NaN values in the partition field, or null if all + /// values are null or NaN + std::optional<std::vector<uint8_t>> lower_bound; + /// Field id: 511 + /// Upper bound for the non-null, non-NaN values in the partition field, or null if all + /// values are null or NaN + std::optional<std::vector<uint8_t>> upper_bound; + + static const SchemaField CONTAINS_NULL; + static const SchemaField CONTAINS_NAN; + static const SchemaField LOWER_BOUND; + static 
const SchemaField UPPER_BOUND; + + static StructType GetType(); +}; + +/// \brief Entry in a manifest list. +struct ICEBERG_EXPORT ManifestFile { Review Comment: I hadn't considered that; I will address these in later PRs. ########## src/iceberg/manifest_list.h: ########## @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/manifest_list.h + +#include <cstdint> +#include <optional> +#include <string> +#include <string_view> + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +/// \brief The type of files tracked by the manifest, either data or delete files; 0 for +/// all v1 manifests +enum class ManifestContent { + /// The manifest content is data. + kData = 0, + /// The manifest content is deletes. 
+ kDeletes = 1, +}; + +/// \brief Get the relative manifest content type name +ICEBERG_EXPORT constexpr std::string_view ManifestContentToString( + ManifestContent type) noexcept { + switch (type) { + case ManifestContent::kData: + return "data"; + case ManifestContent::kDeletes: + return "deletes"; + } +} + +/// \brief Get the relative manifest content type from name +ICEBERG_EXPORT constexpr Result<ManifestContent> ManifestContentFromString( + std::string_view str) noexcept { + if (str == "data") return ManifestContent::kData; + if (str == "deletes") return ManifestContent::kDeletes; + return InvalidArgument("Invalid manifest content type: {}", str); +} + +struct ICEBERG_EXPORT FieldSummary { + /// Field id: 509 + /// Whether the manifest contains at least one partition with a null value for the field + bool contains_null; + /// Field id: 518 + /// Whether the manifest contains at least one partition with a NaN value for the field + std::optional<bool> contains_nan; + /// Field id: 510 + /// Lower bound for the non-null, non-NaN values in the partition field, or null if all + /// values are null or NaN + std::optional<std::vector<uint8_t>> lower_bound; + /// Field id: 511 + /// Upper bound for the non-null, non-NaN values in the partition field, or null if all + /// values are null or NaN + std::optional<std::vector<uint8_t>> upper_bound; + + static const SchemaField CONTAINS_NULL; + static const SchemaField CONTAINS_NAN; + static const SchemaField LOWER_BOUND; + static const SchemaField UPPER_BOUND; + + static StructType GetType(); +}; + +/// \brief Entry in a manifest list. 
+struct ICEBERG_EXPORT ManifestFile { + /// Field id: 500 + /// Location of the manifest file + std::string manifest_path; + /// Field id: 501 + /// Length of the manifest file in bytes + int64_t manifest_length; + /// Field id: 502 + /// ID of a partition spec used to write the manifest; must be listed in table metadata + /// partition-specs + int32_t partition_spec_id; + /// Field id: 517 + /// The type of files tracked by the manifest, either data or delete files; 0 for all v1 + /// manifests + ManifestContent content; + /// Field id: 515 + /// The sequence number when the manifest was added to the table; use 0 when reading v1 + /// manifest lists + int64_t sequence_number; + /// Field id: 516 + /// The minimum data sequence number of all live data or delete files in the manifest; + /// use 0 when reading v1 manifest lists + int64_t min_sequence_number; + /// Field id: 503 + /// ID of the snapshot where the manifest file was added + int64_t added_snapshot_id; + /// Field id: 504 + /// Number of entries in the manifest that have status ADDED (1), when null this is + /// assumed to be non-zero + std::optional<int32_t> added_files_count; + /// Field id: 505 + /// Number of entries in the manifest that have status EXISTING (0), when null this is + /// assumed to be non-zero + std::optional<int32_t> existing_files_count; + /// Field id: 506 + /// Number of entries in the manifest that have status DELETED (2), when null this is + /// assumed to be non-zero + std::optional<int32_t> deleted_files_count; + /// Field id: 512 + /// Number of rows in all of files in the manifest that have status ADDED, when null + /// this is assumed to be non-zero + std::optional<int64_t> added_rows_count; + /// Field id: 513 + /// Number of rows in all of files in the manifest that have status EXISTING, when null + /// this is assumed to be non-zero + std::optional<int64_t> existing_rows_count; + /// Field id: 514 + /// Number of rows in all of files in the manifest that have status 
DELETED, when null + /// this is assumed to be non-zero + std::optional<int64_t> deleted_rows_count; + /// Field id: 507 + /// Element field id: 508 + /// A list of field summaries for each partition field in the spec. Each field in the + /// list corresponds to a field in the manifest file's partition spec. + std::vector<FieldSummary> partitions; + /// Field id: 519 + /// Implementation-specific key metadata for encryption + std::vector<uint8_t> key_metadata; + /// Field id: 520 + /// The starting _row_id to assign to rows added by ADDED data files + int64_t first_row_id; + + /// \brief Checks if this manifest file contains entries with ADDED status. + [[nodiscard]] bool has_added_files() const { + return added_files_count.has_value() && *added_files_count > 0; + } + + /// \brief Checks if this manifest file contains entries with EXISTING status. + [[nodiscard]] bool has_existing_files() const { + return existing_files_count.has_value() && *existing_files_count > 0; + } + + /// \brief Checks if this manifest file contains entries with DELETED status + [[nodiscard]] bool has_deleted_files() const { + return deleted_files_count.has_value() && *deleted_files_count > 0; + } + + static const SchemaField MANIFEST_PATH; + static const SchemaField MANIFEST_LENGTH; + static const SchemaField PARTITION_SPEC_ID; + static const SchemaField CONTENT; + static const SchemaField SEQUENCE_NUMBER; + static const SchemaField MIN_SEQUENCE_NUMBER; + static const SchemaField ADDED_SNAPSHOT_ID; + static const SchemaField ADDED_FILES_COUNT; + static const SchemaField EXISTING_FILES_COUNT; + static const SchemaField DELETED_FILES_COUNT; + static const SchemaField ADDED_ROWS_COUNT; + static const SchemaField EXISTING_ROWS_COUNT; + static const SchemaField DELETED_ROWS_COUNT; + static const SchemaField PARTITIONS; + static const SchemaField KEY_METADATA; + static const SchemaField FIRST_ROW_ID; + + static StructType Schema(); Review Comment: done. 
########## src/iceberg/manifest_list.h: ########## @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/manifest_list.h + +#include <cstdint> +#include <optional> +#include <string> +#include <string_view> + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +/// \brief The type of files tracked by the manifest, either data or delete files; 0 for +/// all v1 manifests +enum class ManifestContent { + /// The manifest content is data. + kData = 0, + /// The manifest content is deletes. 
+ kDeletes = 1, +}; + +/// \brief Get the relative manifest content type name +ICEBERG_EXPORT constexpr std::string_view ManifestContentToString( + ManifestContent type) noexcept { + switch (type) { + case ManifestContent::kData: + return "data"; + case ManifestContent::kDeletes: + return "deletes"; + } +} + +/// \brief Get the relative manifest content type from name +ICEBERG_EXPORT constexpr Result<ManifestContent> ManifestContentFromString( + std::string_view str) noexcept { + if (str == "data") return ManifestContent::kData; + if (str == "deletes") return ManifestContent::kDeletes; + return InvalidArgument("Invalid manifest content type: {}", str); +} + +struct ICEBERG_EXPORT FieldSummary { + /// Field id: 509 + /// Whether the manifest contains at least one partition with a null value for the field + bool contains_null; + /// Field id: 518 + /// Whether the manifest contains at least one partition with a NaN value for the field + std::optional<bool> contains_nan; + /// Field id: 510 + /// Lower bound for the non-null, non-NaN values in the partition field, or null if all + /// values are null or NaN + std::optional<std::vector<uint8_t>> lower_bound; + /// Field id: 511 + /// Upper bound for the non-null, non-NaN values in the partition field, or null if all + /// values are null or NaN + std::optional<std::vector<uint8_t>> upper_bound; + + static const SchemaField CONTAINS_NULL; + static const SchemaField CONTAINS_NAN; + static const SchemaField LOWER_BOUND; + static const SchemaField UPPER_BOUND; + + static StructType GetType(); +}; + +/// \brief Entry in a manifest list. 
+struct ICEBERG_EXPORT ManifestFile { + /// Field id: 500 + /// Location of the manifest file + std::string manifest_path; + /// Field id: 501 + /// Length of the manifest file in bytes + int64_t manifest_length; + /// Field id: 502 + /// ID of a partition spec used to write the manifest; must be listed in table metadata + /// partition-specs + int32_t partition_spec_id; + /// Field id: 517 + /// The type of files tracked by the manifest, either data or delete files; 0 for all v1 + /// manifests + ManifestContent content; + /// Field id: 515 + /// The sequence number when the manifest was added to the table; use 0 when reading v1 + /// manifest lists + int64_t sequence_number; + /// Field id: 516 + /// The minimum data sequence number of all live data or delete files in the manifest; + /// use 0 when reading v1 manifest lists + int64_t min_sequence_number; + /// Field id: 503 + /// ID of the snapshot where the manifest file was added + int64_t added_snapshot_id; + /// Field id: 504 + /// Number of entries in the manifest that have status ADDED (1), when null this is + /// assumed to be non-zero + std::optional<int32_t> added_files_count; + /// Field id: 505 + /// Number of entries in the manifest that have status EXISTING (0), when null this is + /// assumed to be non-zero + std::optional<int32_t> existing_files_count; + /// Field id: 506 + /// Number of entries in the manifest that have status DELETED (2), when null this is + /// assumed to be non-zero + std::optional<int32_t> deleted_files_count; + /// Field id: 512 + /// Number of rows in all of files in the manifest that have status ADDED, when null + /// this is assumed to be non-zero + std::optional<int64_t> added_rows_count; + /// Field id: 513 + /// Number of rows in all of files in the manifest that have status EXISTING, when null + /// this is assumed to be non-zero + std::optional<int64_t> existing_rows_count; + /// Field id: 514 + /// Number of rows in all of files in the manifest that have status 
DELETED, when null + /// this is assumed to be non-zero + std::optional<int64_t> deleted_rows_count; + /// Field id: 507 + /// Element field id: 508 + /// A list of field summaries for each partition field in the spec. Each field in the + /// list corresponds to a field in the manifest file's partition spec. + std::vector<FieldSummary> partitions; + /// Field id: 519 + /// Implementation-specific key metadata for encryption + std::vector<uint8_t> key_metadata; + /// Field id: 520 + /// The starting _row_id to assign to rows added by ADDED data files + int64_t first_row_id; + + /// \brief Checks if this manifest file contains entries with ADDED status. + [[nodiscard]] bool has_added_files() const { + return added_files_count.has_value() && *added_files_count > 0; Review Comment: done suggested. ########## src/iceberg/manifest_list.h: ########## @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +/// \file iceberg/manifest_list.h + +#include <cstdint> +#include <optional> +#include <string> +#include <string_view> + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +/// \brief The type of files tracked by the manifest, either data or delete files; 0 for +/// all v1 manifests +enum class ManifestContent { + /// The manifest content is data. + kData = 0, + /// The manifest content is deletes. + kDeletes = 1, +}; + +/// \brief Get the relative manifest content type name +ICEBERG_EXPORT constexpr std::string_view ManifestContentToString( + ManifestContent type) noexcept { + switch (type) { + case ManifestContent::kData: + return "data"; + case ManifestContent::kDeletes: + return "deletes"; + } +} + +/// \brief Get the relative manifest content type from name +ICEBERG_EXPORT constexpr Result<ManifestContent> ManifestContentFromString( + std::string_view str) noexcept { + if (str == "data") return ManifestContent::kData; + if (str == "deletes") return ManifestContent::kDeletes; + return InvalidArgument("Invalid manifest content type: {}", str); +} + +struct ICEBERG_EXPORT FieldSummary { + /// Field id: 509 + /// Whether the manifest contains at least one partition with a null value for the field + bool contains_null; + /// Field id: 518 + /// Whether the manifest contains at least one partition with a NaN value for the field + std::optional<bool> contains_nan; + /// Field id: 510 + /// Lower bound for the non-null, non-NaN values in the partition field, or null if all + /// values are null or NaN + std::optional<std::vector<uint8_t>> lower_bound; + /// Field id: 511 + /// Upper bound for the non-null, non-NaN values in the partition field, or null if all + /// values are null or NaN + std::optional<std::vector<uint8_t>> upper_bound; + + static const SchemaField CONTAINS_NULL; + static const SchemaField CONTAINS_NAN; + static const SchemaField LOWER_BOUND; + static 
const SchemaField UPPER_BOUND; + + static StructType GetType(); +}; + +/// \brief Entry in a manifest list. +struct ICEBERG_EXPORT ManifestFile { + /// Field id: 500 + /// Location of the manifest file + std::string manifest_path; + /// Field id: 501 + /// Length of the manifest file in bytes + int64_t manifest_length; + /// Field id: 502 + /// ID of a partition spec used to write the manifest; must be listed in table metadata + /// partition-specs + int32_t partition_spec_id; + /// Field id: 517 + /// The type of files tracked by the manifest, either data or delete files; 0 for all v1 + /// manifests + ManifestContent content; + /// Field id: 515 + /// The sequence number when the manifest was added to the table; use 0 when reading v1 + /// manifest lists + int64_t sequence_number; + /// Field id: 516 + /// The minimum data sequence number of all live data or delete files in the manifest; + /// use 0 when reading v1 manifest lists + int64_t min_sequence_number; + /// Field id: 503 + /// ID of the snapshot where the manifest file was added + int64_t added_snapshot_id; + /// Field id: 504 + /// Number of entries in the manifest that have status ADDED (1), when null this is + /// assumed to be non-zero + std::optional<int32_t> added_files_count; + /// Field id: 505 + /// Number of entries in the manifest that have status EXISTING (0), when null this is + /// assumed to be non-zero + std::optional<int32_t> existing_files_count; + /// Field id: 506 + /// Number of entries in the manifest that have status DELETED (2), when null this is + /// assumed to be non-zero + std::optional<int32_t> deleted_files_count; + /// Field id: 512 + /// Number of rows in all of files in the manifest that have status ADDED, when null + /// this is assumed to be non-zero + std::optional<int64_t> added_rows_count; + /// Field id: 513 + /// Number of rows in all of files in the manifest that have status EXISTING, when null + /// this is assumed to be non-zero + std::optional<int64_t> 
existing_rows_count; + /// Field id: 514 + /// Number of rows in all of files in the manifest that have status DELETED, when null + /// this is assumed to be non-zero + std::optional<int64_t> deleted_rows_count; + /// Field id: 507 + /// Element field id: 508 + /// A list of field summaries for each partition field in the spec. Each field in the + /// list corresponds to a field in the manifest file's partition spec. + std::vector<FieldSummary> partitions; + /// Field id: 519 + /// Implementation-specific key metadata for encryption + std::vector<uint8_t> key_metadata; + /// Field id: 520 + /// The starting _row_id to assign to rows added by ADDED data files + int64_t first_row_id; + + /// \brief Checks if this manifest file contains entries with ADDED status. + [[nodiscard]] bool has_added_files() const { + return added_files_count.has_value() && *added_files_count > 0; + } + + /// \brief Checks if this manifest file contains entries with EXISTING status. + [[nodiscard]] bool has_existing_files() const { + return existing_files_count.has_value() && *existing_files_count > 0; + } + + /// \brief Checks if this manifest file contains entries with DELETED status + [[nodiscard]] bool has_deleted_files() const { + return deleted_files_count.has_value() && *deleted_files_count > 0; + } + + static const SchemaField MANIFEST_PATH; + static const SchemaField MANIFEST_LENGTH; + static const SchemaField PARTITION_SPEC_ID; + static const SchemaField CONTENT; + static const SchemaField SEQUENCE_NUMBER; + static const SchemaField MIN_SEQUENCE_NUMBER; + static const SchemaField ADDED_SNAPSHOT_ID; + static const SchemaField ADDED_FILES_COUNT; + static const SchemaField EXISTING_FILES_COUNT; + static const SchemaField DELETED_FILES_COUNT; + static const SchemaField ADDED_ROWS_COUNT; + static const SchemaField EXISTING_ROWS_COUNT; + static const SchemaField DELETED_ROWS_COUNT; + static const SchemaField PARTITIONS; + static const SchemaField KEY_METADATA; + static const SchemaField 
FIRST_ROW_ID; + + static StructType Schema(); +}; + +/// Snapshots are embedded in table metadata, but the list of manifests for a snapshot are +/// stored in a separate manifest list file. +/// +/// A new manifest list is written for each attempt to commit a snapshot because the list +/// of manifests always changes to produce a new snapshot. When a manifest list is +/// written, the (optimistic) sequence number of the snapshot is written for all new +/// manifest files tracked by the list. +/// +/// A manifest list includes summary metadata that can be used to avoid scanning all of +/// the manifests in a snapshot when planning a table scan. This includes the number of +/// added, existing, and deleted files, and a summary of values for each field of the +/// partition spec used to write the manifest. +struct ManifestList { Review Comment: good catch, fixed. ########## src/iceberg/manifest_list.h: ########## @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +/// \file iceberg/manifest_list.h + +#include <cstdint> +#include <optional> +#include <string> +#include <string_view> + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +/// \brief The type of files tracked by the manifest, either data or delete files; 0 for +/// all v1 manifests +enum class ManifestContent { + /// The manifest content is data. + kData = 0, + /// The manifest content is deletes. + kDeletes = 1, +}; + +/// \brief Get the relative manifest content type name +ICEBERG_EXPORT constexpr std::string_view ManifestContentToString( + ManifestContent type) noexcept { + switch (type) { + case ManifestContent::kData: + return "data"; + case ManifestContent::kDeletes: + return "deletes"; + } +} + +/// \brief Get the relative manifest content type from name +ICEBERG_EXPORT constexpr Result<ManifestContent> ManifestContentFromString( + std::string_view str) noexcept { + if (str == "data") return ManifestContent::kData; + if (str == "deletes") return ManifestContent::kDeletes; + return InvalidArgument("Invalid manifest content type: {}", str); +} + +struct ICEBERG_EXPORT FieldSummary { + /// Field id: 509 + /// Whether the manifest contains at least one partition with a null value for the field + bool contains_null; + /// Field id: 518 + /// Whether the manifest contains at least one partition with a NaN value for the field + std::optional<bool> contains_nan; + /// Field id: 510 + /// Lower bound for the non-null, non-NaN values in the partition field, or null if all + /// values are null or NaN + std::optional<std::vector<uint8_t>> lower_bound; + /// Field id: 511 + /// Upper bound for the non-null, non-NaN values in the partition field, or null if all + /// values are null or NaN + std::optional<std::vector<uint8_t>> upper_bound; + + static const SchemaField CONTAINS_NULL; Review Comment: done suggested, both `k-prefix` and `inline static` ########## 
src/iceberg/manifest_entry.h: ########## @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <any> +#include <cstdint> +#include <map> +#include <optional> +#include <string> +#include <unordered_map> +#include <vector> + +#include "iceberg/file_format.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +enum class ManifestStatus { + kExisting = 0, + kAdded = 1, + kDeleted = 2, +}; + +/// \brief Get the relative manifest status type from int +ICEBERG_EXPORT constexpr Result<ManifestStatus> ManifestStatusFromInt( + int status) noexcept { + switch (status) { + case 0: + return ManifestStatus::kExisting; + case 1: + return ManifestStatus::kAdded; + case 2: + return ManifestStatus::kDeleted; + default: + return InvalidArgument("Invalid manifest status: {}", status); + } +} + +enum class DataFileContent { + kData = 0, + kPositionDeletes = 1, + kEqualityDeletes = 2, +}; + +/// \brief Get the relative data file content type from int +ICEBERG_EXPORT constexpr Result<DataFileContent> DataFileContentFromInt( + int content) noexcept { + switch (content) { + case 0: + return DataFileContent::kData; + case 
1: + return DataFileContent::kPositionDeletes; + case 2: + return DataFileContent::kEqualityDeletes; + default: + return InvalidArgument("Invalid data file content: {}", content); + } +} + +/// \brief DataFile carries data file path, partition tuple, metrics, ... +struct ICEBERG_EXPORT DataFile { + /// Field id: 134 + /// Type of content stored by the data file: data, equality deletes, or position + /// deletes (all v1 files are data files) + DataFileContent content; + /// Field id: 100 + /// Full URI for the file with FS scheme + std::string file_path; + /// Field id: 101 + /// File format type, avro, orc, parquet, or puffin + FileFormatType file_format; + /// Field id: 102 + /// Partition data tuple, schema based on the partition spec output using partition + /// field ids for the struct field ids + /// TODO(zhjwpku): use StructLike to represent partition data tuple + std::map<std::string, std::any> partition; + /// Field id: 103 + /// Number of records in this file, or the cardinality of a deletion vector + int64_t record_count = 0; + /// Field id: 104 + /// Total file size in bytes + int64_t file_size_in_bytes = 0; + /// Field id: 108 + /// Key field id: 117 + /// Value field id: 118 + /// Map from column id to the total size on disk of all regions that store the column. + /// Does not include bytes necessary to read other columns, like footers. 
Leave null for + /// row-oriented formats (Avro) + std::unordered_map<int32_t, int64_t> column_sizes; + /// Field id: 109 + /// Key field id: 119 + /// Value field id: 120 + /// Map from column id to number of values in the column (including null and NaN values) + std::unordered_map<int32_t, int64_t> value_counts; + /// Field id: 110 + /// Key field id: 121 + /// Value field id: 122 + /// Map from column id to number of null values in the column + std::unordered_map<int32_t, int64_t> null_value_counts; + /// Field id: 137 + /// Key field id: 138 + /// Value field id: 139 + /// Map from column id to number of NaN values in the column + std::unordered_map<int32_t, int64_t> nan_value_counts; + /// Field id: 125 + /// Key field id: 126 + /// Value field id: 127 + /// Map from column id to lower bound in the column serialized as binary. + /// Each value must be less than or equal to all non-null, non-NaN values in the column + /// for the file. + /// + /// Reference: + /// - [Binary single-value + /// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization) + std::unordered_map<int32_t, std::vector<uint8_t>> lower_bounds; + /// Field id: 128 + /// Key field id: 129 + /// Value field id: 130 + /// Map from column id to upper bound in the column serialized as binary. + /// Each value must be greater than or equal to all non-null, non-Nan values in the + /// column for the file. + /// + /// Reference: + /// - [Binary single-value + /// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization) + std::unordered_map<int32_t, std::vector<uint8_t>> upper_bounds; + /// Field id: 131 + /// Implementation-specific key metadata for encryption + std::optional<std::vector<uint8_t>> key_metadata; Review Comment: `key_metadata` is simply a binary blob used for encryption purposes? If so it seems that std::vector<uint8_t> is sufficient; I checked the Rust implementation, they did the same. 
########## src/iceberg/manifest_entry.h: ########## @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <any> +#include <cstdint> +#include <map> +#include <optional> +#include <string> +#include <unordered_map> +#include <vector> + +#include "iceberg/file_format.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +enum class ManifestStatus { + kExisting = 0, + kAdded = 1, + kDeleted = 2, +}; + +/// \brief Get the relative manifest status type from int +ICEBERG_EXPORT constexpr Result<ManifestStatus> ManifestStatusFromInt( + int status) noexcept { + switch (status) { + case 0: + return ManifestStatus::kExisting; + case 1: + return ManifestStatus::kAdded; + case 2: + return ManifestStatus::kDeleted; + default: + return InvalidArgument("Invalid manifest status: {}", status); + } +} + +enum class DataFileContent { + kData = 0, + kPositionDeletes = 1, + kEqualityDeletes = 2, +}; + +/// \brief Get the relative data file content type from int +ICEBERG_EXPORT constexpr Result<DataFileContent> DataFileContentFromInt( + int content) noexcept { + switch (content) { + case 0: + return 
DataFileContent::kData; + case 1: + return DataFileContent::kPositionDeletes; + case 2: + return DataFileContent::kEqualityDeletes; + default: + return InvalidArgument("Invalid data file content: {}", content); + } +} + +/// \brief DataFile carries data file path, partition tuple, metrics, ... +struct ICEBERG_EXPORT DataFile { + /// Field id: 134 + /// Type of content stored by the data file: data, equality deletes, or position + /// deletes (all v1 files are data files) + DataFileContent content; + /// Field id: 100 + /// Full URI for the file with FS scheme + std::string file_path; + /// Field id: 101 + /// File format type, avro, orc, parquet, or puffin + FileFormatType file_format; + /// Field id: 102 + /// Partition data tuple, schema based on the partition spec output using partition + /// field ids for the struct field ids + /// TODO(zhjwpku): use StructLike to represent partition data tuple + std::map<std::string, std::any> partition; + /// Field id: 103 + /// Number of records in this file, or the cardinality of a deletion vector + int64_t record_count = 0; + /// Field id: 104 + /// Total file size in bytes + int64_t file_size_in_bytes = 0; + /// Field id: 108 + /// Key field id: 117 + /// Value field id: 118 + /// Map from column id to the total size on disk of all regions that store the column. + /// Does not include bytes necessary to read other columns, like footers. 
Leave null for + /// row-oriented formats (Avro) + std::unordered_map<int32_t, int64_t> column_sizes; + /// Field id: 109 + /// Key field id: 119 + /// Value field id: 120 + /// Map from column id to number of values in the column (including null and NaN values) + std::unordered_map<int32_t, int64_t> value_counts; + /// Field id: 110 + /// Key field id: 121 + /// Value field id: 122 + /// Map from column id to number of null values in the column + std::unordered_map<int32_t, int64_t> null_value_counts; + /// Field id: 137 + /// Key field id: 138 + /// Value field id: 139 + /// Map from column id to number of NaN values in the column + std::unordered_map<int32_t, int64_t> nan_value_counts; + /// Field id: 125 + /// Key field id: 126 + /// Value field id: 127 + /// Map from column id to lower bound in the column serialized as binary. + /// Each value must be less than or equal to all non-null, non-NaN values in the column + /// for the file. + /// + /// Reference: + /// - [Binary single-value + /// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization) + std::unordered_map<int32_t, std::vector<uint8_t>> lower_bounds; + /// Field id: 128 + /// Key field id: 129 + /// Value field id: 130 + /// Map from column id to upper bound in the column serialized as binary. + /// Each value must be greater than or equal to all non-null, non-Nan values in the + /// column for the file. + /// + /// Reference: + /// - [Binary single-value + /// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization) + std::unordered_map<int32_t, std::vector<uint8_t>> upper_bounds; + /// Field id: 131 + /// Implementation-specific key metadata for encryption + std::optional<std::vector<uint8_t>> key_metadata; + /// Field id: 132 + /// Element Field id: 133 + /// Split offsets for the data file. For example, all row group offsets in a Parquet + /// file. Must be sorted ascending. 
+ std::vector<int64_t> split_offsets; + /// Field id: 135 + /// Element Field id: 136 + /// Field ids used to determine row equality in equality delete files. Required when + /// content=2 and should be null otherwise. Fields with ids listed in this column must + /// be present in the delete file. + std::vector<int32_t> equality_ids; + /// Field id: 140 + /// ID representing sort order for this file + /// + /// If sort order ID is missing or unknown, then the order is assumed to be unsorted. + /// Only data files and equality delete files should be written with a non-null order + /// id. Position deletes are required to be sorted by file and position, not a table + /// order, and should set sort order id to null. Readers must ignore sort order id for + /// position delete files. + std::optional<int32_t> sort_order_id; + /// This field is not included in spec, so it is not serialized into the manifest file. + /// It is just store in memory representation used in process. + int32_t partition_spec_id; + /// Field id: 142 + /// The _row_id for the first row in the data file. + /// + /// Reference: + /// - [First Row ID + /// Inheritance](https://github.com/apache/iceberg/blob/main/format/spec.md#first-row-id-inheritance) + std::optional<int64_t> first_row_id; + /// Field id: 143 + /// Fully qualified location (URI with FS scheme) of a data file that all deletes + /// reference. + /// + /// Position delete metadata can use referenced_data_file when all deletes tracked by + /// the entry are in a single data file. Setting the referenced file is required for + /// deletion vectors. + std::optional<std::string> referenced_data_file; + /// Field id: 144 + /// The offset in the file where the content starts. + /// + /// The content_offset and content_size_in_bytes fields are used to reference a specific + /// blob for direct access to a deletion vector. 
For deletion vectors, these values are + /// required and must exactly match the offset and length stored in the Puffin footer + /// for the deletion vector blob. + std::optional<int64_t> content_offset; + /// Field id: 145 + /// The length of a referenced content stored in the file; required if content_offset is + /// present + std::optional<int64_t> content_size_in_bytes; + + static const SchemaField CONTENT; + static const SchemaField FILE_PATH; + static const SchemaField FILE_FORMAT; + static const SchemaField RECORD_COUNT; + static const SchemaField FILE_SIZE; + static const SchemaField COLUMN_SIZES; + static const SchemaField VALUE_COUNTS; + static const SchemaField NULL_VALUE_COUNTS; + static const SchemaField NAN_VALUE_COUNTS; + static const SchemaField LOWER_BOUNDS; + static const SchemaField UPPER_BOUNDS; + static const SchemaField KEY_METADATA; + static const SchemaField SPLIT_OFFSETS; + static const SchemaField EQUALITY_IDS; + static const SchemaField SORT_ORDER_ID; + static const SchemaField FIRST_ROW_ID; + static const SchemaField REFERENCED_DATA_FILE; + static const SchemaField CONTENT_OFFSET; + static const SchemaField CONTENT_SIZE; + + static StructType GetType(StructType partition_type); +}; + +/// \brief A manifest is an immutable Avro file that lists data files or delete files, +/// along with each file's partition data tuple, metrics, and tracking information. + +/// \brief The schema of a manifest file +struct ICEBERG_EXPORT ManifestEntry { + /// Field id: 0 + /// Used to track additions and deletions. Deletes are informational only and not used + /// in scans. + ManifestStatus status; + /// Field id: 1 + /// Snapshot id where the file was added, or deleted if status is 2. Inherited when + /// null. + std::optional<int64_t> snapshot_id; + /// Field id: 3 + /// Data sequence number of the file. Inherited when null and status is 1 (added). 
+ std::optional<int64_t> sequence_number; + /// Field id: 4 + /// File sequence number indicating when the file was added. Inherited when null and + /// status is 1 (added). + std::optional<int64_t> file_sequence_number; + /// Field id: 2 + /// File path, partition tuple, metrics, ... + DataFile data_file; + + static const SchemaField STATUS; + static const SchemaField SNAPSHOT_ID; + static const SchemaField SEQUENCE_NUMBER; + static const SchemaField FILE_SEQUENCE_NUMBER; + + static StructType GetSchema(StructType partition_type); Review Comment: I've converted these to shared_ptr and updated the function names, I'm not sure if the new names are the best fit, feel free to suggest better. ########## src/iceberg/manifest_entry.h: ########## @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include <any> +#include <cstdint> +#include <map> +#include <optional> +#include <string> +#include <unordered_map> +#include <vector> + +#include "iceberg/file_format.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +enum class ManifestStatus { + kExisting = 0, + kAdded = 1, + kDeleted = 2, +}; + +/// \brief Get the relative manifest status type from int +ICEBERG_EXPORT constexpr Result<ManifestStatus> ManifestStatusFromInt( + int status) noexcept { + switch (status) { + case 0: + return ManifestStatus::kExisting; + case 1: + return ManifestStatus::kAdded; + case 2: + return ManifestStatus::kDeleted; + default: + return InvalidArgument("Invalid manifest status: {}", status); + } +} + +enum class DataFileContent { + kData = 0, + kPositionDeletes = 1, + kEqualityDeletes = 2, +}; + +/// \brief Get the relative data file content type from int +ICEBERG_EXPORT constexpr Result<DataFileContent> DataFileContentFromInt( + int content) noexcept { + switch (content) { + case 0: + return DataFileContent::kData; + case 1: + return DataFileContent::kPositionDeletes; + case 2: + return DataFileContent::kEqualityDeletes; + default: + return InvalidArgument("Invalid data file content: {}", content); + } +} + +/// \brief DataFile carries data file path, partition tuple, metrics, ... 
+struct ICEBERG_EXPORT DataFile { + /// Field id: 134 + /// Type of content stored by the data file: data, equality deletes, or position + /// deletes (all v1 files are data files) + DataFileContent content; + /// Field id: 100 + /// Full URI for the file with FS scheme + std::string file_path; + /// Field id: 101 + /// File format type, avro, orc, parquet, or puffin + FileFormatType file_format; + /// Field id: 102 + /// Partition data tuple, schema based on the partition spec output using partition + /// field ids for the struct field ids + /// TODO(zhjwpku): use StructLike to represent partition data tuple + std::map<std::string, std::any> partition; + /// Field id: 103 + /// Number of records in this file, or the cardinality of a deletion vector + int64_t record_count = 0; + /// Field id: 104 + /// Total file size in bytes + int64_t file_size_in_bytes = 0; + /// Field id: 108 + /// Key field id: 117 + /// Value field id: 118 + /// Map from column id to the total size on disk of all regions that store the column. + /// Does not include bytes necessary to read other columns, like footers. Leave null for + /// row-oriented formats (Avro) + std::unordered_map<int32_t, int64_t> column_sizes; + /// Field id: 109 + /// Key field id: 119 + /// Value field id: 120 + /// Map from column id to number of values in the column (including null and NaN values) + std::unordered_map<int32_t, int64_t> value_counts; + /// Field id: 110 + /// Key field id: 121 + /// Value field id: 122 + /// Map from column id to number of null values in the column + std::unordered_map<int32_t, int64_t> null_value_counts; + /// Field id: 137 + /// Key field id: 138 + /// Value field id: 139 + /// Map from column id to number of NaN values in the column + std::unordered_map<int32_t, int64_t> nan_value_counts; + /// Field id: 125 + /// Key field id: 126 + /// Value field id: 127 + /// Map from column id to lower bound in the column serialized as binary. 
+ /// Each value must be less than or equal to all non-null, non-NaN values in the column + /// for the file. + /// + /// Reference: + /// - [Binary single-value + /// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization) + std::unordered_map<int32_t, std::vector<uint8_t>> lower_bounds; + /// Field id: 128 + /// Key field id: 129 + /// Value field id: 130 + /// Map from column id to upper bound in the column serialized as binary. + /// Each value must be greater than or equal to all non-null, non-NaN values in the + /// column for the file. + /// + /// Reference: + /// - [Binary single-value + /// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization) + std::unordered_map<int32_t, std::vector<uint8_t>> upper_bounds; + /// Field id: 131 + /// Implementation-specific key metadata for encryption + std::optional<std::vector<uint8_t>> key_metadata; + /// Field id: 132 + /// Element Field id: 133 + /// Split offsets for the data file. For example, all row group offsets in a Parquet + /// file. Must be sorted ascending. + std::vector<int64_t> split_offsets; + /// Field id: 135 + /// Element Field id: 136 + /// Field ids used to determine row equality in equality delete files. Required when + /// content=2 and should be null otherwise. Fields with ids listed in this column must + /// be present in the delete file. + std::vector<int32_t> equality_ids; + /// Field id: 140 + /// ID representing sort order for this file + /// + /// If sort order ID is missing or unknown, then the order is assumed to be unsorted. + /// Only data files and equality delete files should be written with a non-null order + /// id. Position deletes are required to be sorted by file and position, not a table + /// order, and should set sort order id to null. Readers must ignore sort order id for + /// position delete files. 
+ std::optional<int32_t> sort_order_id; + /// This field is not included in the spec, so it is not serialized into the manifest file. + /// It is only stored in the in-memory representation used during processing. + int32_t partition_spec_id; + /// Field id: 142 + /// The _row_id for the first row in the data file. + /// + /// Reference: + /// - [First Row ID + /// Inheritance](https://github.com/apache/iceberg/blob/main/format/spec.md#first-row-id-inheritance) + std::optional<int64_t> first_row_id; + /// Field id: 143 + /// Fully qualified location (URI with FS scheme) of a data file that all deletes + /// reference. + /// + /// Position delete metadata can use referenced_data_file when all deletes tracked by + /// the entry are in a single data file. Setting the referenced file is required for + /// deletion vectors. + std::optional<std::string> referenced_data_file; + /// Field id: 144 + /// The offset in the file where the content starts. + /// + /// The content_offset and content_size_in_bytes fields are used to reference a specific + /// blob for direct access to a deletion vector. For deletion vectors, these values are + /// required and must exactly match the offset and length stored in the Puffin footer + /// for the deletion vector blob. 
+ std::optional<int64_t> content_offset; + /// Field id: 145 + /// The length of a referenced content stored in the file; required if content_offset is + /// present + std::optional<int64_t> content_size_in_bytes; + + static const SchemaField CONTENT; + static const SchemaField FILE_PATH; + static const SchemaField FILE_FORMAT; + static const SchemaField RECORD_COUNT; + static const SchemaField FILE_SIZE; + static const SchemaField COLUMN_SIZES; + static const SchemaField VALUE_COUNTS; + static const SchemaField NULL_VALUE_COUNTS; + static const SchemaField NAN_VALUE_COUNTS; + static const SchemaField LOWER_BOUNDS; + static const SchemaField UPPER_BOUNDS; + static const SchemaField KEY_METADATA; + static const SchemaField SPLIT_OFFSETS; + static const SchemaField EQUALITY_IDS; + static const SchemaField SORT_ORDER_ID; + static const SchemaField FIRST_ROW_ID; + static const SchemaField REFERENCED_DATA_FILE; + static const SchemaField CONTENT_OFFSET; + static const SchemaField CONTENT_SIZE; + + static StructType GetType(StructType partition_type); Review Comment: I will use shared_ptr for both input and return types. ########## src/iceberg/manifest_entry.h: ########## @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <any> +#include <cstdint> +#include <map> +#include <optional> +#include <string> +#include <unordered_map> +#include <vector> + +#include "iceberg/file_format.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +enum class ManifestStatus { + kExisting = 0, + kAdded = 1, + kDeleted = 2, +}; + +/// \brief Get the relative manifest status type from int +ICEBERG_EXPORT constexpr Result<ManifestStatus> ManifestStatusFromInt( + int status) noexcept { + switch (status) { + case 0: + return ManifestStatus::kExisting; + case 1: + return ManifestStatus::kAdded; + case 2: + return ManifestStatus::kDeleted; + default: + return InvalidArgument("Invalid manifest status: {}", status); + } +} + +enum class DataFileContent { + kData = 0, + kPositionDeletes = 1, + kEqualityDeletes = 2, +}; + +/// \brief Get the relative data file content type from int +ICEBERG_EXPORT constexpr Result<DataFileContent> DataFileContentFromInt( + int content) noexcept { + switch (content) { + case 0: + return DataFileContent::kData; + case 1: + return DataFileContent::kPositionDeletes; + case 2: + return DataFileContent::kEqualityDeletes; + default: + return InvalidArgument("Invalid data file content: {}", content); + } +} + +/// \brief DataFile carries data file path, partition tuple, metrics, ... 
+struct ICEBERG_EXPORT DataFile { + /// Field id: 134 + /// Type of content stored by the data file: data, equality deletes, or position + /// deletes (all v1 files are data files) + DataFileContent content; + /// Field id: 100 + /// Full URI for the file with FS scheme + std::string file_path; + /// Field id: 101 + /// File format type, avro, orc, parquet, or puffin + FileFormatType file_format; + /// Field id: 102 + /// Partition data tuple, schema based on the partition spec output using partition + /// field ids for the struct field ids + /// TODO(zhjwpku): use StructLike to represent partition data tuple + std::map<std::string, std::any> partition; + /// Field id: 103 + /// Number of records in this file, or the cardinality of a deletion vector + int64_t record_count = 0; + /// Field id: 104 + /// Total file size in bytes + int64_t file_size_in_bytes = 0; + /// Field id: 108 + /// Key field id: 117 + /// Value field id: 118 + /// Map from column id to the total size on disk of all regions that store the column. + /// Does not include bytes necessary to read other columns, like footers. Leave null for + /// row-oriented formats (Avro) + std::unordered_map<int32_t, int64_t> column_sizes; + /// Field id: 109 + /// Key field id: 119 + /// Value field id: 120 + /// Map from column id to number of values in the column (including null and NaN values) + std::unordered_map<int32_t, int64_t> value_counts; + /// Field id: 110 + /// Key field id: 121 + /// Value field id: 122 + /// Map from column id to number of null values in the column + std::unordered_map<int32_t, int64_t> null_value_counts; + /// Field id: 137 + /// Key field id: 138 + /// Value field id: 139 + /// Map from column id to number of NaN values in the column + std::unordered_map<int32_t, int64_t> nan_value_counts; + /// Field id: 125 + /// Key field id: 126 + /// Value field id: 127 + /// Map from column id to lower bound in the column serialized as binary. 
+ /// Each value must be less than or equal to all non-null, non-NaN values in the column + /// for the file. + /// + /// Reference: + /// - [Binary single-value + /// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization) + std::unordered_map<int32_t, std::vector<uint8_t>> lower_bounds; + /// Field id: 128 + /// Key field id: 129 + /// Value field id: 130 + /// Map from column id to upper bound in the column serialized as binary. + /// Each value must be greater than or equal to all non-null, non-NaN values in the + /// column for the file. + /// + /// Reference: + /// - [Binary single-value + /// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization) Review Comment: removed. ########## src/iceberg/manifest_entry.h: ########## @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include <any> +#include <cstdint> +#include <map> +#include <optional> +#include <string> +#include <unordered_map> +#include <vector> + +#include "iceberg/file_format.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +enum class ManifestStatus { + kExisting = 0, + kAdded = 1, + kDeleted = 2, +}; + +/// \brief Get the relative manifest status type from int +ICEBERG_EXPORT constexpr Result<ManifestStatus> ManifestStatusFromInt( + int status) noexcept { + switch (status) { + case 0: + return ManifestStatus::kExisting; + case 1: + return ManifestStatus::kAdded; + case 2: + return ManifestStatus::kDeleted; + default: + return InvalidArgument("Invalid manifest status: {}", status); + } +} + +enum class DataFileContent { + kData = 0, + kPositionDeletes = 1, + kEqualityDeletes = 2, +}; + +/// \brief Get the relative data file content type from int +ICEBERG_EXPORT constexpr Result<DataFileContent> DataFileContentFromInt( + int content) noexcept { + switch (content) { + case 0: + return DataFileContent::kData; + case 1: + return DataFileContent::kPositionDeletes; + case 2: + return DataFileContent::kEqualityDeletes; + default: + return InvalidArgument("Invalid data file content: {}", content); + } +} + +/// \brief DataFile carries data file path, partition tuple, metrics, ... 
+struct ICEBERG_EXPORT DataFile { + /// Field id: 134 + /// Type of content stored by the data file: data, equality deletes, or position + /// deletes (all v1 files are data files) + DataFileContent content; + /// Field id: 100 + /// Full URI for the file with FS scheme + std::string file_path; + /// Field id: 101 + /// File format type, avro, orc, parquet, or puffin + FileFormatType file_format; + /// Field id: 102 + /// Partition data tuple, schema based on the partition spec output using partition + /// field ids for the struct field ids + /// TODO(zhjwpku): use StructLike to represent partition data tuple + std::map<std::string, std::any> partition; + /// Field id: 103 + /// Number of records in this file, or the cardinality of a deletion vector + int64_t record_count = 0; + /// Field id: 104 + /// Total file size in bytes + int64_t file_size_in_bytes = 0; + /// Field id: 108 + /// Key field id: 117 + /// Value field id: 118 + /// Map from column id to the total size on disk of all regions that store the column. + /// Does not include bytes necessary to read other columns, like footers. Leave null for + /// row-oriented formats (Avro) + std::unordered_map<int32_t, int64_t> column_sizes; + /// Field id: 109 + /// Key field id: 119 + /// Value field id: 120 + /// Map from column id to number of values in the column (including null and NaN values) + std::unordered_map<int32_t, int64_t> value_counts; + /// Field id: 110 + /// Key field id: 121 + /// Value field id: 122 + /// Map from column id to number of null values in the column + std::unordered_map<int32_t, int64_t> null_value_counts; + /// Field id: 137 + /// Key field id: 138 + /// Value field id: 139 + /// Map from column id to number of NaN values in the column + std::unordered_map<int32_t, int64_t> nan_value_counts; + /// Field id: 125 + /// Key field id: 126 + /// Value field id: 127 + /// Map from column id to lower bound in the column serialized as binary. 
+ /// Each value must be less than or equal to all non-null, non-NaN values in the column + /// for the file. + /// + /// Reference: + /// - [Binary single-value + /// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization) + std::unordered_map<int32_t, std::vector<uint8_t>> lower_bounds; + /// Field id: 128 + /// Key field id: 129 + /// Value field id: 130 + /// Map from column id to upper bound in the column serialized as binary. + /// Each value must be greater than or equal to all non-null, non-NaN values in the Review Comment: I think the `greater than or equal` provides clearer semantics and helps users avoid confusion with C++'s upper_bound, so I'd prefer to keep it as is for now. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org