fallintoplace commented on code in PR #664:
URL: https://github.com/apache/iceberg-cpp/pull/664#discussion_r3292671378


##########
src/iceberg/test/manifest_group_test.cc:
##########
@@ -404,6 +405,123 @@ TEST_P(ManifestGroupTest, CustomManifestEntriesFilter) {
                                                              
"/path/to/data3.parquet"));
 }
 
+TEST_P(ManifestGroupTest, FilterFilesByRecordCount) {
+  auto version = GetParam();
+
+  constexpr int64_t kSnapshotId = 1000L;
+  const auto part_value = PartitionValues({Literal::Int(0)});
+
+  std::vector<ManifestEntry> data_entries{
+      MakeEntry(ManifestStatus::kAdded, kSnapshotId, /*sequence_number=*/1,
+                MakeDataFile("/path/to/small.parquet", part_value,
+                             partitioned_spec_->spec_id(), 
/*record_count=*/5)),
+      MakeEntry(ManifestStatus::kAdded, kSnapshotId, /*sequence_number=*/1,
+                MakeDataFile("/path/to/boundary.parquet", part_value,
+                             partitioned_spec_->spec_id(), 
/*record_count=*/10)),
+      MakeEntry(ManifestStatus::kAdded, kSnapshotId, /*sequence_number=*/1,
+                MakeDataFile("/path/to/large.parquet", part_value,
+                             partitioned_spec_->spec_id(), 
/*record_count=*/15))};
+  auto data_manifest =
+      WriteDataManifest(version, kSnapshotId, std::move(data_entries), 
partitioned_spec_);
+
+  std::vector<ManifestFile> manifests = {data_manifest};
+  ICEBERG_UNWRAP_OR_FAIL(
+      auto group,
+      ManifestGroup::Make(file_io_, schema_, GetSpecsById(), 
std::move(manifests)));
+  group->FilterFiles(Expressions::GreaterThanOrEqual("record_count", 
Literal::Long(10)));
+
+  ICEBERG_UNWRAP_OR_FAIL(auto entries, group->Entries());
+  EXPECT_THAT(GetEntryPaths(entries),
+              testing::UnorderedElementsAre("/path/to/boundary.parquet",
+                                            "/path/to/large.parquet"));
+}
+
+TEST_P(ManifestGroupTest, FilterFilesRejectsPartitionMetadata) {
+  auto version = GetParam();
+
+  constexpr int64_t kSnapshotId = 1000L;
+  const auto part_value = PartitionValues({Literal::Int(0)});
+
+  std::vector<ManifestEntry> data_entries{MakeEntry(
+      ManifestStatus::kAdded, kSnapshotId, /*sequence_number=*/1,
+      MakeDataFile("/path/to/data.parquet", part_value, 
partitioned_spec_->spec_id()))};
+  auto data_manifest =
+      WriteDataManifest(version, kSnapshotId, std::move(data_entries), 
partitioned_spec_);
+
+  std::vector<ManifestFile> manifests = {data_manifest};
+  ICEBERG_UNWRAP_OR_FAIL(
+      auto group,
+      ManifestGroup::Make(file_io_, schema_, GetSpecsById(), 
std::move(manifests)));
+  group->FilterFiles(Expressions::Equal("partition.data_bucket_16_2", 
Literal::Int(1)));
+
+  auto result = group->Entries();
+  EXPECT_THAT(result, IsError(ErrorKind::kInvalidExpression));
+  EXPECT_THAT(result, HasErrorMessage("Cannot find field 
'partition.data_bucket_16_2'"));

Review Comment:
   I checked the Java behavior before changing this. Java `ManifestGroup` 
builds the file evaluator with `DataFile.getType(EMPTY_STRUCT)`, and 
`Evaluator` binds the expression eagerly, so unknown fields fail binding rather 
than being treated as match-all.
   
   I kept the C++ behavior aligned with that. In this path, silently ignoring 
malformed or unknown `FilterFiles(...)` predicates would broaden the scan and 
hide caller bugs, which felt riskier than failing fast.
   
   For logical predicates on table columns, callers should use 
`FilterData(...)`, which can then be projected into the appropriate 
partition/manifest filters per spec.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to