wgtmac commented on code in PR #91: URL: https://github.com/apache/iceberg-cpp/pull/91#discussion_r2083555468
########## src/iceberg/manifest_entry.h: ########## @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <any> +#include <cstdint> +#include <map> +#include <optional> +#include <string> +#include <unordered_map> +#include <vector> + +#include "iceberg/file_format.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +enum class ManifestStatus { + kExisting = 0, + kAdded = 1, + kDeleted = 2, +}; + +/// \brief Get the relative manifest status type from int +ICEBERG_EXPORT constexpr Result<ManifestStatus> ManifestStatusFromInt( + int status) noexcept { + switch (status) { + case 0: + return ManifestStatus::kExisting; + case 1: + return ManifestStatus::kAdded; + case 2: + return ManifestStatus::kDeleted; + default: + return InvalidArgument("Invalid manifest status: {}", status); + } +} + +enum class DataFileContent { + kData = 0, + kPositionDeletes = 1, + kEqualityDeletes = 2, +}; + +/// \brief Get the relative data file content type from int +ICEBERG_EXPORT constexpr Result<DataFileContent> DataFileContentFromInt( + int content) noexcept { + switch (content) { + case 0: + return 
DataFileContent::kData; + case 1: + return DataFileContent::kPositionDeletes; + case 2: + return DataFileContent::kEqualityDeletes; + default: + return InvalidArgument("Invalid data file content: {}", content); + } +} + +/// \brief DataFile carries data file path, partition tuple, metrics, ... +struct ICEBERG_EXPORT DataFile { + /// Field id: 134 + /// Type of content stored by the data file: data, equality deletes, or position + /// deletes (all v1 files are data files) + DataFileContent content; + /// Field id: 100 + /// Full URI for the file with FS scheme + std::string file_path; + /// Field id: 101 + /// File format type, avro, orc, parquet, or puffin + FileFormatType file_format; + /// Field id: 102 + /// Partition data tuple, schema based on the partition spec output using partition + /// field ids for the struct field ids + /// TODO(zhjwpku): use StructLike to represent partition data tuple + std::map<std::string, std::any> partition; + /// Field id: 103 + /// Number of records in this file, or the cardinality of a deletion vector + int64_t record_count = 0; + /// Field id: 104 + /// Total file size in bytes + int64_t file_size_in_bytes = 0; + /// Field id: 108 + /// Key field id: 117 + /// Value field id: 118 + /// Map from column id to the total size on disk of all regions that store the column. + /// Does not include bytes necessary to read other columns, like footers. 
Leave null for + /// row-oriented formats (Avro) + std::unordered_map<int32_t, int64_t> column_sizes; + /// Field id: 109 + /// Key field id: 119 + /// Value field id: 120 + /// Map from column id to number of values in the column (including null and NaN values) + std::unordered_map<int32_t, int64_t> value_counts; + /// Field id: 110 + /// Key field id: 121 + /// Value field id: 122 + /// Map from column id to number of null values in the column + std::unordered_map<int32_t, int64_t> null_value_counts; + /// Field id: 137 + /// Key field id: 138 + /// Value field id: 139 + /// Map from column id to number of NaN values in the column + std::unordered_map<int32_t, int64_t> nan_value_counts; + /// Field id: 125 + /// Key field id: 126 + /// Value field id: 127 + /// Map from column id to lower bound in the column serialized as binary. + /// Each value must be less than or equal to all non-null, non-NaN values in the column + /// for the file. + /// + /// Reference: + /// - [Binary single-value + /// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization) + std::unordered_map<int32_t, std::vector<uint8_t>> lower_bounds; + /// Field id: 128 + /// Key field id: 129 + /// Value field id: 130 + /// Map from column id to upper bound in the column serialized as binary. + /// Each value must be greater than or equal to all non-null, non-Nan values in the + /// column for the file. + /// + /// Reference: + /// - [Binary single-value + /// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization) + std::unordered_map<int32_t, std::vector<uint8_t>> upper_bounds; + /// Field id: 131 + /// Implementation-specific key metadata for encryption + std::optional<std::vector<uint8_t>> key_metadata; Review Comment: Yes, I believe an empty `key_metadata` is enough. 
########## src/iceberg/manifest_list.cc: ########## @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/manifest_list.h" + +#include <vector> + +#include "iceberg/type.h" + +namespace iceberg { + +const StructType& PartitionFieldSummary::Type() { + static const std::shared_ptr<StructType> instance{new StructType({ Review Comment: Can we use `std::make_unique`? As this is a const variable, we need to use `kType` or something similar. ########## src/iceberg/manifest_list.cc: ########## @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/manifest_list.h" + +#include <vector> + +#include "iceberg/type.h" + +namespace iceberg { + +const StructType& PartitionFieldSummary::Type() { + static const std::shared_ptr<StructType> instance{new StructType({ + PartitionFieldSummary::kConsTainsNull, + PartitionFieldSummary::kContainsNaN, + PartitionFieldSummary::kLowerBound, + PartitionFieldSummary::kUpperBound, + })}; + return *instance; +} + +const StructType& ManifestFile::Type() { + static const std::shared_ptr<StructType> instance{new StructType( Review Comment: ditto ########## src/iceberg/manifest_entry.h: ########## @@ -0,0 +1,286 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include <any> +#include <cstdint> +#include <map> +#include <memory> +#include <optional> +#include <string> +#include <vector> + +#include "iceberg/file_format.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/schema_field.h" +#include "iceberg/type.h" + +namespace iceberg { + +enum class ManifestStatus { + kExisting = 0, + kAdded = 1, + kDeleted = 2, +}; + +/// \brief Get the relative manifest status type from int +ICEBERG_EXPORT constexpr Result<ManifestStatus> ManifestStatusFromInt( + int status) noexcept { + switch (status) { + case 0: + return ManifestStatus::kExisting; + case 1: + return ManifestStatus::kAdded; + case 2: + return ManifestStatus::kDeleted; + default: + return InvalidArgument("Invalid manifest status: {}", status); + } +} + +enum class DataFileContent { + kData = 0, + kPositionDeletes = 1, + kEqualityDeletes = 2, +}; + +/// \brief Get the relative data file content type from int +ICEBERG_EXPORT constexpr Result<DataFileContent> DataFileContentFromInt( + int content) noexcept { + switch (content) { + case 0: + return DataFileContent::kData; + case 1: + return DataFileContent::kPositionDeletes; + case 2: + return DataFileContent::kEqualityDeletes; + default: + return InvalidArgument("Invalid data file content: {}", content); + } +} + +/// \brief DataFile carries data file path, partition tuple, metrics, ... 
+struct ICEBERG_EXPORT DataFile { + /// Field id: 134 + /// Type of content stored by the data file: data, equality deletes, or position + /// deletes (all v1 files are data files) + DataFileContent content; + /// Field id: 100 + /// Full URI for the file with FS scheme + std::string file_path; + /// Field id: 101 + /// File format type, avro, orc, parquet, or puffin + FileFormatType file_format; + /// Field id: 102 + /// Partition data tuple, schema based on the partition spec output using partition + /// field ids for the struct field ids + /// TODO(zhjwpku): use StructLike to represent partition data tuple + std::any partition; + /// Field id: 103 + /// Number of records in this file, or the cardinality of a deletion vector + int64_t record_count = 0; + /// Field id: 104 + /// Total file size in bytes + int64_t file_size_in_bytes = 0; + /// Field id: 108 + /// Key field id: 117 + /// Value field id: 118 + /// Map from column id to the total size on disk of all regions that store the column. + /// Does not include bytes necessary to read other columns, like footers. Leave null for + /// row-oriented formats (Avro) + std::map<int32_t, int64_t> column_sizes; + /// Field id: 109 + /// Key field id: 119 + /// Value field id: 120 + /// Map from column id to number of values in the column (including null and NaN values) + std::map<int32_t, int64_t> value_counts; + /// Field id: 110 + /// Key field id: 121 + /// Value field id: 122 + /// Map from column id to number of null values in the column + std::map<int32_t, int64_t> null_value_counts; + /// Field id: 137 + /// Key field id: 138 + /// Value field id: 139 + /// Map from column id to number of NaN values in the column + std::map<int32_t, int64_t> nan_value_counts; + /// Field id: 125 + /// Key field id: 126 + /// Value field id: 127 + /// Map from column id to lower bound in the column serialized as binary. + /// Each value must be less than or equal to all non-null, non-NaN values in the column + /// for the file. 
+ std::map<int32_t, std::vector<uint8_t>> lower_bounds; + /// Field id: 128 + /// Key field id: 129 + /// Value field id: 130 + /// Map from column id to upper bound in the column serialized as binary. + /// Each value must be greater than or equal to all non-null, non-NaN values in the + /// column for the file. + std::map<int32_t, std::vector<uint8_t>> upper_bounds; + /// Field id: 131 + /// Implementation-specific key metadata for encryption + std::optional<std::vector<uint8_t>> key_metadata; + /// Field id: 132 + /// Element Field id: 133 + /// Split offsets for the data file. For example, all row group offsets in a Parquet + /// file. Must be sorted ascending. + std::vector<int64_t> split_offsets; + /// Field id: 135 + /// Element Field id: 136 + /// Field ids used to determine row equality in equality delete files. Required when + /// content=2 and should be null otherwise. Fields with ids listed in this column must + /// be present in the delete file. + std::vector<int32_t> equality_ids; + /// Field id: 140 + /// ID representing sort order for this file + /// + /// If sort order ID is missing or unknown, then the order is assumed to be unsorted. + /// Only data files and equality delete files should be written with a non-null order + /// id. Position deletes are required to be sorted by file and position, not a table + /// order, and should set sort order id to null. Readers must ignore sort order id for + /// position delete files. + std::optional<int32_t> sort_order_id; + /// This field is not included in spec, so it is not serialized into the manifest file. + /// It is just store in memory representation used in process. + int32_t partition_spec_id; + /// Field id: 142 + /// The _row_id for the first row in the data file. 
+ /// + /// Reference: + /// - [First Row ID + /// Inheritance](https://github.com/apache/iceberg/blob/main/format/spec.md#first-row-id-inheritance) + std::optional<int64_t> first_row_id; + /// Field id: 143 + /// Fully qualified location (URI with FS scheme) of a data file that all deletes + /// reference. + /// + /// Position delete metadata can use referenced_data_file when all deletes tracked by + /// the entry are in a single data file. Setting the referenced file is required for + /// deletion vectors. + std::optional<std::string> referenced_data_file; + /// Field id: 144 + /// The offset in the file where the content starts. + /// + /// The content_offset and content_size_in_bytes fields are used to reference a specific + /// blob for direct access to a deletion vector. For deletion vectors, these values are + /// required and must exactly match the offset and length stored in the Puffin footer + /// for the deletion vector blob. + std::optional<int64_t> content_offset; + /// Field id: 145 + /// The length of a referenced content stored in the file; required if content_offset is + /// present + std::optional<int64_t> content_size_in_bytes; + + inline static const SchemaField kContent = MakeRequiredField<IntType>(134, "content"); Review Comment: Should we support adding `doc` to `SchemaField`? In the Java impl, the doc strings are serialized with the schema in the Avro file (manifest list file and manifest file). We can add it later though. Am I right? @Fokko ########## src/iceberg/schema_field.h: ########## @@ -86,4 +86,16 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { bool optional_; }; +template <typename T, typename... Args> +inline SchemaField MakeRequiredField(int id, std::string name, Args&&... args) { Review Comment: Perhaps we can add an extra `doc` parameter that is not actually used until `SchemaField` supports `doc`. That way, we can have the full definition of those fields in this PR.
########## src/iceberg/manifest_entry.cc: ########## @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/manifest_entry.h" + +#include <memory> +#include <vector> + +#include "iceberg/schema_field.h" +#include "iceberg/type.h" + +namespace iceberg { + +std::shared_ptr<StructType> DataFile::Type(std::shared_ptr<StructType> partition_type) { + return std::make_shared<StructType>(std::vector<SchemaField>{ + kContent, + kFilePath, + kFileFormat, + SchemaField::MakeRequired(102, "partition", std::move(partition_type)), + kRecordCount, + kFileSize, + kColumnSizes, + kValueCounts, + kNullValueCounts, + kNanValueCounts, + kLowerBounds, + kUpperBounds, + kKeyMetadata, + kSplitOffsets, + kEqualityIds, + kSortOrderId, + kFirstRowId, + kReferencedDataFile, + kContentOffset, + kContentSize}); +} + +std::shared_ptr<StructType> ManifestEntry::TypeFromPartitionType( + std::shared_ptr<StructType> partition_type) { + return TypeFromDataFileType(DataFile::Type(std::move(partition_type))); +} + +std::shared_ptr<StructType> ManifestEntry::TypeFromDataFileType( + std::shared_ptr<StructType> datafile_type) { + return std::make_shared<StructType>(std::vector<SchemaField>{ Review Comment: Not 
related to this PR: is it better to use `std::unique_ptr<Type>` in `SchemaField`? Or is it important to make `SchemaField` copyable? @lidavidm -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org