wgtmac commented on code in PR #147: URL: https://github.com/apache/iceberg-cpp/pull/147#discussion_r2214718141
########## src/iceberg/file_format.h: ########## @@ -36,6 +36,7 @@ enum class ICEBERG_EXPORT FileFormatType { kAvro, kOrc, kPuffin, + kUnknown = 99 Review Comment: I think we need to remove this. See my comment below. ########## src/iceberg/manifest_list.h: ########## @@ -40,7 +40,7 @@ namespace iceberg { struct ICEBERG_EXPORT PartitionFieldSummary { /// Field id: 509 /// Whether the manifest contains at least one partition with a null value for the field - bool contains_null; + bool contains_null = false; Review Comment: IIUC, the default value should be `true`. ########## src/iceberg/manifest_list.h: ########## @@ -83,26 +85,26 @@ struct ICEBERG_EXPORT ManifestFile { std::string manifest_path; /// Field id: 501 /// Length of the manifest file in bytes - int64_t manifest_length; + int64_t manifest_length = 0; /// Field id: 502 /// ID of a partition spec used to write the manifest; must be listed in table metadata /// partition-specs - int32_t partition_spec_id; + int32_t partition_spec_id = 0; /// Field id: 517 /// The type of files tracked by the manifest, either data or delete files; 0 for all v1 /// manifests - Content content; + Content content = Content::kData; /// Field id: 515 /// The sequence number when the manifest was added to the table; use 0 when reading v1 /// manifest lists - int64_t sequence_number; + int64_t sequence_number = 0; /// Field id: 516 /// The minimum data sequence number of all live data or delete files in the manifest; /// use 0 when reading v1 manifest lists - int64_t min_sequence_number; + int64_t min_sequence_number = 0; /// Field id: 503 /// ID of the snapshot where the manifest file was added - int64_t added_snapshot_id; + int64_t added_snapshot_id = 0; Review Comment: ```suggestion int64_t added_snapshot_id = Snapshot::kInvalidSnapshotId; ``` ########## src/iceberg/manifest_list.h: ########## @@ -137,7 +139,7 @@ struct ICEBERG_EXPORT ManifestFile { std::vector<uint8_t> key_metadata; /// Field id: 520 /// The starting _row_id to assign to rows added by ADDED data files - int64_t first_row_id; + int64_t first_row_id = 0; Review Comment: ```suggestion std::optional<int64_t> first_row_id; ``` Let's make it optional. It is v3 only. ########## src/iceberg/expression/literal.cc: ########## @@ -179,6 +179,10 @@ std::strong_ordering CompareFloat(T lhs, T rhs) { return lhs_is_negative <=> rhs_is_negative; } +bool Literal::operator==(const iceberg::Literal& other) const { Review Comment: Perhaps rename the PR title to `feat: support operator== for Literal/Manifest/ManifestList` ? ########## src/iceberg/manifest_list.h: ########## @@ -83,26 +85,26 @@ struct ICEBERG_EXPORT ManifestFile { std::string manifest_path; /// Field id: 501 /// Length of the manifest file in bytes - int64_t manifest_length; + int64_t manifest_length = 0; /// Field id: 502 /// ID of a partition spec used to write the manifest; must be listed in table metadata /// partition-specs - int32_t partition_spec_id; + int32_t partition_spec_id = 0; /// Field id: 517 /// The type of files tracked by the manifest, either data or delete files; 0 for all v1 /// manifests - Content content; + Content content = Content::kData; /// Field id: 515 /// The sequence number when the manifest was added to the table; use 0 when reading v1 /// manifest lists - int64_t sequence_number; + int64_t sequence_number = 0; Review Comment: ```suggestion int64_t sequence_number = TableMetadata::kInitialSequenceNumber; ``` ########## test/manifest_list_reader_test.cc: ########## @@ -55,74 +95,9 @@ TEST_F(ManifestListReaderTest, BasicTest) { auto read_result = manifest_reader->Files(); ASSERT_EQ(read_result.has_value(), true); ASSERT_EQ(read_result.value().size(), 4); - std::string test_dir_prefix = "/tmp/db/db/iceberg_test/metadata/"; - for (const auto& file : read_result.value()) { - auto manifest_path = file.manifest_path.substr(test_dir_prefix.size()); - if (manifest_path == "2bccd69e-d642-4816-bba0-261cd9bd0d93-m0.avro") { - ASSERT_EQ(file.added_snapshot_id, 7412193043800610213); - ASSERT_EQ(file.manifest_length, 7433); - ASSERT_EQ(file.sequence_number, 4); - ASSERT_EQ(file.min_sequence_number, 4); - ASSERT_EQ(file.partitions.size(), 1); - const auto& partition = file.partitions[0]; - ASSERT_EQ(partition.contains_null, false); - ASSERT_EQ(partition.contains_nan.value(), false); - ASSERT_EQ(partition.lower_bound.value(), - std::vector<uint8_t>({'x', ';', 0x07, 0x00})); - ASSERT_EQ(partition.upper_bound.value(), - std::vector<uint8_t>({'x', ';', 0x07, 0x00})); - } else if (manifest_path == "9b6ffacd-ef10-4abf-a89c-01c733696796-m0.avro") { - ASSERT_EQ(file.added_snapshot_id, 5485972788975780755); - ASSERT_EQ(file.manifest_length, 7431); - ASSERT_EQ(file.sequence_number, 3); - ASSERT_EQ(file.min_sequence_number, 3); - ASSERT_EQ(file.partitions.size(), 1); - const auto& partition = file.partitions[0]; - ASSERT_EQ(partition.contains_null, false); - ASSERT_EQ(partition.contains_nan.value(), false); - ASSERT_EQ(partition.lower_bound.value(), - std::vector<uint8_t>({'(', 0x19, 0x07, 0x00})); - ASSERT_EQ(partition.upper_bound.value(), - std::vector<uint8_t>({'(', 0x19, 0x07, 0x00})); - } else if (manifest_path == "2541e6b5-4923-4bd5-886d-72c6f7228400-m0.avro") { - ASSERT_EQ(file.added_snapshot_id, 1679468743751242972); - ASSERT_EQ(file.manifest_length, 7433); - ASSERT_EQ(file.sequence_number, 2); - ASSERT_EQ(file.min_sequence_number, 2); - ASSERT_EQ(file.partitions.size(), 1); - const auto& partition = file.partitions[0]; - ASSERT_EQ(partition.contains_null, false); - ASSERT_EQ(partition.contains_nan.value(), false); - ASSERT_EQ(partition.lower_bound.value(), - std::vector<uint8_t>({0xd0, 0xd4, 0x06, 0x00})); - ASSERT_EQ(partition.upper_bound.value(), - std::vector<uint8_t>({0xd0, 0xd4, 0x06, 0x00})); - } else if (manifest_path == "3118c801-d2e0-4df6-8c7a-7d4eaade32f8-m0.avro") { - ASSERT_EQ(file.added_snapshot_id, 1579605567338877265); - ASSERT_EQ(file.manifest_length, 7431); - ASSERT_EQ(file.sequence_number, 1); - ASSERT_EQ(file.min_sequence_number, 1); - ASSERT_EQ(file.partitions.size(), 1); - const auto& partition = file.partitions[0]; - ASSERT_EQ(partition.contains_null, false); - ASSERT_EQ(partition.contains_nan.value(), false); - ASSERT_EQ(partition.lower_bound.value(), - std::vector<uint8_t>({0xb8, 0xd4, 0x06, 0x00})); - ASSERT_EQ(partition.upper_bound.value(), - std::vector<uint8_t>({0xb8, 0xd4, 0x06, 0x00})); - } else { - ASSERT_TRUE(false) << "Unexpected manifest file: " << manifest_path; - } - ASSERT_EQ(file.partition_spec_id, 0); - ASSERT_EQ(file.content, ManifestFile::Content::kData); - ASSERT_EQ(file.added_files_count, 1); - ASSERT_EQ(file.existing_files_count, 0); - ASSERT_EQ(file.deleted_files_count, 0); - ASSERT_EQ(file.added_rows_count, 1); - ASSERT_EQ(file.existing_rows_count, 0); - ASSERT_EQ(file.deleted_rows_count, 0); - ASSERT_EQ(file.key_metadata.empty(), true); - } + + auto expected_manifest_list = prepare_test_manifest_list(); Review Comment: ```suggestion auto expected_manifest_list = PrepareTestManifestList(); ``` ########## src/iceberg/manifest_entry.h: ########## @@ -146,7 +146,7 @@ struct ICEBERG_EXPORT DataFile { std::optional<int32_t> sort_order_id; /// This field is not included in spec, so it is not serialized into the manifest file. /// It is just store in memory representation used in process. - int32_t partition_spec_id; + int32_t partition_spec_id = 0; Review Comment: ```suggestion int32_t partition_spec_id = PartitionSpec::kInitialSpecId; ``` ########## src/iceberg/manifest_entry.cc: ########## @@ -27,6 +27,14 @@ namespace iceberg { +bool ManifestEntry::operator==(const iceberg::ManifestEntry& other) const { Review Comment: ```suggestion bool ManifestEntry::operator==(const ManifestEntry& other) const { ``` ########## test/manifest_list_reader_test.cc: ########## @@ -42,6 +42,46 @@ class ManifestListReaderTest : public TempFileTestBase { file_io_ = std::make_shared<iceberg::arrow::ArrowFileSystemFileIO>(local_fs_); } + std::vector<ManifestFile> prepare_test_manifest_list() { Review Comment: ```suggestion std::vector<ManifestFile> PrepareTestManifestList() { ``` ########## src/iceberg/manifest_list.h: ########## @@ -83,26 +85,26 @@ struct ICEBERG_EXPORT ManifestFile { std::string manifest_path; /// Field id: 501 /// Length of the manifest file in bytes - int64_t manifest_length; + int64_t manifest_length = 0; /// Field id: 502 /// ID of a partition spec used to write the manifest; must be listed in table metadata /// partition-specs - int32_t partition_spec_id; + int32_t partition_spec_id = 0; Review Comment: ```suggestion int32_t partition_spec_id = PartitionSpec::kInitialSpecId; ``` ########## src/iceberg/expression/literal.cc: ########## @@ -179,6 +179,10 @@ std::strong_ordering CompareFloat(T lhs, T rhs) { return lhs_is_negative <=> rhs_is_negative; } +bool Literal::operator==(const iceberg::Literal& other) const { Review Comment: ```suggestion bool Literal::operator==(const Literal& other) const { ``` ########## src/iceberg/manifest_entry.h: ########## @@ -68,13 +68,13 @@ struct ICEBERG_EXPORT DataFile { /// Field id: 134 /// Type of content stored by the data file: data, equality deletes, or position /// deletes (all v1 files are data files) - Content content; + Content content = Content::kData; /// Field id: 100 /// Full URI for the file with FS scheme std::string file_path; /// Field id: 101 /// File format type, avro, orc, parquet, or puffin - FileFormatType file_format; + FileFormatType file_format = FileFormatType::kUnknown; Review Comment: Should we remove the definition of `kUnknown` and use `kParquet` as the default? ########## src/iceberg/manifest_list.h: ########## @@ -83,26 +85,26 @@ struct ICEBERG_EXPORT ManifestFile { std::string manifest_path; /// Field id: 501 /// Length of the manifest file in bytes - int64_t manifest_length; + int64_t manifest_length = 0; /// Field id: 502 /// ID of a partition spec used to write the manifest; must be listed in table metadata /// partition-specs - int32_t partition_spec_id; + int32_t partition_spec_id = 0; /// Field id: 517 /// The type of files tracked by the manifest, either data or delete files; 0 for all v1 /// manifests - Content content; + Content content = Content::kData; /// Field id: 515 /// The sequence number when the manifest was added to the table; use 0 when reading v1 /// manifest lists - int64_t sequence_number; + int64_t sequence_number = 0; /// Field id: 516 /// The minimum data sequence number of all live data or delete files in the manifest; /// use 0 when reading v1 manifest lists - int64_t min_sequence_number; + int64_t min_sequence_number = 0; Review Comment: ```suggestion int64_t min_sequence_number = TableMetadata::kInitialSequenceNumber; ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org