wgtmac commented on code in PR #147:
URL: https://github.com/apache/iceberg-cpp/pull/147#discussion_r2214718141


##########
src/iceberg/file_format.h:
##########
@@ -36,6 +36,7 @@ enum class ICEBERG_EXPORT FileFormatType {
   kAvro,
   kOrc,
   kPuffin,
+  kUnknown = 99

Review Comment:
   I think we need to remove this. See my comment below.



##########
src/iceberg/manifest_list.h:
##########
@@ -40,7 +40,7 @@ namespace iceberg {
 struct ICEBERG_EXPORT PartitionFieldSummary {
   /// Field id: 509
   /// Whether the manifest contains at least one partition with a null value 
for the field
-  bool contains_null;
+  bool contains_null = false;

Review Comment:
   IIUC, the default value should be `true`.



##########
src/iceberg/manifest_list.h:
##########
@@ -83,26 +85,26 @@ struct ICEBERG_EXPORT ManifestFile {
   std::string manifest_path;
   /// Field id: 501
   /// Length of the manifest file in bytes
-  int64_t manifest_length;
+  int64_t manifest_length = 0;
   /// Field id: 502
   /// ID of a partition spec used to write the manifest; must be listed in 
table metadata
   /// partition-specs
-  int32_t partition_spec_id;
+  int32_t partition_spec_id = 0;
   /// Field id: 517
   /// The type of files tracked by the manifest, either data or delete files; 
0 for all v1
   /// manifests
-  Content content;
+  Content content = Content::kData;
   /// Field id: 515
   /// The sequence number when the manifest was added to the table; use 0 when 
reading v1
   /// manifest lists
-  int64_t sequence_number;
+  int64_t sequence_number = 0;
   /// Field id: 516
   /// The minimum data sequence number of all live data or delete files in the 
manifest;
   /// use 0 when reading v1 manifest lists
-  int64_t min_sequence_number;
+  int64_t min_sequence_number = 0;
   /// Field id: 503
   /// ID of the snapshot where the manifest file was added
-  int64_t added_snapshot_id;
+  int64_t added_snapshot_id = 0;

Review Comment:
   ```suggestion
     int64_t added_snapshot_id = Snapshot::kInvalidSnapshotId;
   ```



##########
src/iceberg/manifest_list.h:
##########
@@ -137,7 +139,7 @@ struct ICEBERG_EXPORT ManifestFile {
   std::vector<uint8_t> key_metadata;
   /// Field id: 520
   /// The starting _row_id to assign to rows added by ADDED data files
-  int64_t first_row_id;
+  int64_t first_row_id = 0;

Review Comment:
   ```suggestion
     std::optional<int64_t> first_row_id;
   ```
   
   Let's make it optional. It is v3 only.



##########
src/iceberg/expression/literal.cc:
##########
@@ -179,6 +179,10 @@ std::strong_ordering CompareFloat(T lhs, T rhs) {
   return lhs_is_negative <=> rhs_is_negative;
 }
 
+bool Literal::operator==(const iceberg::Literal& other) const {

Review Comment:
   Perhaps rename the PR title to `feat: support operator== for 
Literal/Manifest/ManifestList` ?



##########
src/iceberg/manifest_list.h:
##########
@@ -83,26 +85,26 @@ struct ICEBERG_EXPORT ManifestFile {
   std::string manifest_path;
   /// Field id: 501
   /// Length of the manifest file in bytes
-  int64_t manifest_length;
+  int64_t manifest_length = 0;
   /// Field id: 502
   /// ID of a partition spec used to write the manifest; must be listed in 
table metadata
   /// partition-specs
-  int32_t partition_spec_id;
+  int32_t partition_spec_id = 0;
   /// Field id: 517
   /// The type of files tracked by the manifest, either data or delete files; 
0 for all v1
   /// manifests
-  Content content;
+  Content content = Content::kData;
   /// Field id: 515
   /// The sequence number when the manifest was added to the table; use 0 when 
reading v1
   /// manifest lists
-  int64_t sequence_number;
+  int64_t sequence_number = 0;

Review Comment:
   ```suggestion
     int64_t sequence_number = TableMetadata::kInitialSequenceNumber;
   ```



##########
test/manifest_list_reader_test.cc:
##########
@@ -55,74 +95,9 @@ TEST_F(ManifestListReaderTest, BasicTest) {
   auto read_result = manifest_reader->Files();
   ASSERT_EQ(read_result.has_value(), true);
   ASSERT_EQ(read_result.value().size(), 4);
-  std::string test_dir_prefix = "/tmp/db/db/iceberg_test/metadata/";
-  for (const auto& file : read_result.value()) {
-    auto manifest_path = file.manifest_path.substr(test_dir_prefix.size());
-    if (manifest_path == "2bccd69e-d642-4816-bba0-261cd9bd0d93-m0.avro") {
-      ASSERT_EQ(file.added_snapshot_id, 7412193043800610213);
-      ASSERT_EQ(file.manifest_length, 7433);
-      ASSERT_EQ(file.sequence_number, 4);
-      ASSERT_EQ(file.min_sequence_number, 4);
-      ASSERT_EQ(file.partitions.size(), 1);
-      const auto& partition = file.partitions[0];
-      ASSERT_EQ(partition.contains_null, false);
-      ASSERT_EQ(partition.contains_nan.value(), false);
-      ASSERT_EQ(partition.lower_bound.value(),
-                std::vector<uint8_t>({'x', ';', 0x07, 0x00}));
-      ASSERT_EQ(partition.upper_bound.value(),
-                std::vector<uint8_t>({'x', ';', 0x07, 0x00}));
-    } else if (manifest_path == 
"9b6ffacd-ef10-4abf-a89c-01c733696796-m0.avro") {
-      ASSERT_EQ(file.added_snapshot_id, 5485972788975780755);
-      ASSERT_EQ(file.manifest_length, 7431);
-      ASSERT_EQ(file.sequence_number, 3);
-      ASSERT_EQ(file.min_sequence_number, 3);
-      ASSERT_EQ(file.partitions.size(), 1);
-      const auto& partition = file.partitions[0];
-      ASSERT_EQ(partition.contains_null, false);
-      ASSERT_EQ(partition.contains_nan.value(), false);
-      ASSERT_EQ(partition.lower_bound.value(),
-                std::vector<uint8_t>({'(', 0x19, 0x07, 0x00}));
-      ASSERT_EQ(partition.upper_bound.value(),
-                std::vector<uint8_t>({'(', 0x19, 0x07, 0x00}));
-    } else if (manifest_path == 
"2541e6b5-4923-4bd5-886d-72c6f7228400-m0.avro") {
-      ASSERT_EQ(file.added_snapshot_id, 1679468743751242972);
-      ASSERT_EQ(file.manifest_length, 7433);
-      ASSERT_EQ(file.sequence_number, 2);
-      ASSERT_EQ(file.min_sequence_number, 2);
-      ASSERT_EQ(file.partitions.size(), 1);
-      const auto& partition = file.partitions[0];
-      ASSERT_EQ(partition.contains_null, false);
-      ASSERT_EQ(partition.contains_nan.value(), false);
-      ASSERT_EQ(partition.lower_bound.value(),
-                std::vector<uint8_t>({0xd0, 0xd4, 0x06, 0x00}));
-      ASSERT_EQ(partition.upper_bound.value(),
-                std::vector<uint8_t>({0xd0, 0xd4, 0x06, 0x00}));
-    } else if (manifest_path == 
"3118c801-d2e0-4df6-8c7a-7d4eaade32f8-m0.avro") {
-      ASSERT_EQ(file.added_snapshot_id, 1579605567338877265);
-      ASSERT_EQ(file.manifest_length, 7431);
-      ASSERT_EQ(file.sequence_number, 1);
-      ASSERT_EQ(file.min_sequence_number, 1);
-      ASSERT_EQ(file.partitions.size(), 1);
-      const auto& partition = file.partitions[0];
-      ASSERT_EQ(partition.contains_null, false);
-      ASSERT_EQ(partition.contains_nan.value(), false);
-      ASSERT_EQ(partition.lower_bound.value(),
-                std::vector<uint8_t>({0xb8, 0xd4, 0x06, 0x00}));
-      ASSERT_EQ(partition.upper_bound.value(),
-                std::vector<uint8_t>({0xb8, 0xd4, 0x06, 0x00}));
-    } else {
-      ASSERT_TRUE(false) << "Unexpected manifest file: " << manifest_path;
-    }
-    ASSERT_EQ(file.partition_spec_id, 0);
-    ASSERT_EQ(file.content, ManifestFile::Content::kData);
-    ASSERT_EQ(file.added_files_count, 1);
-    ASSERT_EQ(file.existing_files_count, 0);
-    ASSERT_EQ(file.deleted_files_count, 0);
-    ASSERT_EQ(file.added_rows_count, 1);
-    ASSERT_EQ(file.existing_rows_count, 0);
-    ASSERT_EQ(file.deleted_rows_count, 0);
-    ASSERT_EQ(file.key_metadata.empty(), true);
-  }
+
+  auto expected_manifest_list = prepare_test_manifest_list();

Review Comment:
   ```suggestion
     auto expected_manifest_list = PrepareTestManifestList();
   ```



##########
src/iceberg/manifest_entry.h:
##########
@@ -146,7 +146,7 @@ struct ICEBERG_EXPORT DataFile {
   std::optional<int32_t> sort_order_id;
   /// This field is not included in spec, so it is not serialized into the 
manifest file.
   /// It is just store in memory representation used in process.
-  int32_t partition_spec_id;
+  int32_t partition_spec_id = 0;

Review Comment:
   ```suggestion
     int32_t partition_spec_id = PartitionSpec::kInitialSpecId;
   ```



##########
src/iceberg/manifest_entry.cc:
##########
@@ -27,6 +27,14 @@
 
 namespace iceberg {
 
+bool ManifestEntry::operator==(const iceberg::ManifestEntry& other) const {

Review Comment:
   ```suggestion
   bool ManifestEntry::operator==(const ManifestEntry& other) const {
   ```



##########
test/manifest_list_reader_test.cc:
##########
@@ -42,6 +42,46 @@ class ManifestListReaderTest : public TempFileTestBase {
     file_io_ = 
std::make_shared<iceberg::arrow::ArrowFileSystemFileIO>(local_fs_);
   }
 
+  std::vector<ManifestFile> prepare_test_manifest_list() {

Review Comment:
   ```suggestion
     std::vector<ManifestFile> PrepareTestManifestList() {
   ```



##########
src/iceberg/manifest_list.h:
##########
@@ -83,26 +85,26 @@ struct ICEBERG_EXPORT ManifestFile {
   std::string manifest_path;
   /// Field id: 501
   /// Length of the manifest file in bytes
-  int64_t manifest_length;
+  int64_t manifest_length = 0;
   /// Field id: 502
   /// ID of a partition spec used to write the manifest; must be listed in 
table metadata
   /// partition-specs
-  int32_t partition_spec_id;
+  int32_t partition_spec_id = 0;

Review Comment:
   ```suggestion
     int32_t partition_spec_id = PartitionSpec::kInitialSpecId;
   ```



##########
src/iceberg/expression/literal.cc:
##########
@@ -179,6 +179,10 @@ std::strong_ordering CompareFloat(T lhs, T rhs) {
   return lhs_is_negative <=> rhs_is_negative;
 }
 
+bool Literal::operator==(const iceberg::Literal& other) const {

Review Comment:
   ```suggestion
   bool Literal::operator==(const Literal& other) const {
   ```



##########
src/iceberg/manifest_entry.h:
##########
@@ -68,13 +68,13 @@ struct ICEBERG_EXPORT DataFile {
   /// Field id: 134
   /// Type of content stored by the data file: data, equality deletes, or 
position
   /// deletes (all v1 files are data files)
-  Content content;
+  Content content = Content::kData;
   /// Field id: 100
   /// Full URI for the file with FS scheme
   std::string file_path;
   /// Field id: 101
   /// File format type, avro, orc, parquet, or puffin
-  FileFormatType file_format;
+  FileFormatType file_format = FileFormatType::kUnknown;

Review Comment:
   Should we remove the definition of `kUnknown` and use `kParquet` as the 
default?



##########
src/iceberg/manifest_list.h:
##########
@@ -83,26 +85,26 @@ struct ICEBERG_EXPORT ManifestFile {
   std::string manifest_path;
   /// Field id: 501
   /// Length of the manifest file in bytes
-  int64_t manifest_length;
+  int64_t manifest_length = 0;
   /// Field id: 502
   /// ID of a partition spec used to write the manifest; must be listed in 
table metadata
   /// partition-specs
-  int32_t partition_spec_id;
+  int32_t partition_spec_id = 0;
   /// Field id: 517
   /// The type of files tracked by the manifest, either data or delete files; 
0 for all v1
   /// manifests
-  Content content;
+  Content content = Content::kData;
   /// Field id: 515
   /// The sequence number when the manifest was added to the table; use 0 when 
reading v1
   /// manifest lists
-  int64_t sequence_number;
+  int64_t sequence_number = 0;
   /// Field id: 516
   /// The minimum data sequence number of all live data or delete files in the 
manifest;
   /// use 0 when reading v1 manifest lists
-  int64_t min_sequence_number;
+  int64_t min_sequence_number = 0;

Review Comment:
   ```suggestion
     int64_t min_sequence_number = TableMetadata::kInitialSequenceNumber;
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to