bodduv commented on code in PR #14500:
URL: https://github.com/apache/iceberg/pull/14500#discussion_r2817005618
##########
api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java:
##########
@@ -684,4 +733,111 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats",
INT_MAX_VALUE))
new StrictMetricsEvaluator(SCHEMA,
notNull("struct.nested_col_with_stats")).eval(FILE);
assertThat(shouldRead).as("notNull nested column should not
match").isFalse();
}
+
+ // Tests for UUID with StrictMetricsEvaluator using RFC-compliant comparison
only
+
+ @Test
+ public void testStrictUuidGt() {
+ // Query: uuid > 0x00... (all UUIDs in file should be > this)
+ UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, greaterThan("uuid",
belowMin)).eval(UUID_FILE);
+ // With bounds [UUID_MIN, UUID_MAX], all values should be > belowMin
+ assertThat(allMatch).as("All UUIDs should be greater than
belowMin").isTrue();
+ }
+
+ @Test
+ public void testStrictUuidLt() {
+ // Query: uuid < 0xFF... (all UUIDs in file should be < this)
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, lessThan("uuid",
aboveMax)).eval(UUID_FILE);
+ // With bounds [UUID_MIN, UUID_MAX], all values should be < aboveMax
+ assertThat(allMatch).as("All UUIDs should be less than aboveMax").isTrue();
+ }
+
+ @Test
+ public void testStrictUuidEqNeverMatchesRange() {
+ // Strict eq should never match when there's a range of values
+ UUID middle = UUID.fromString("40000000-0000-0000-0000-000000000001");
+ boolean allMatch = new StrictMetricsEvaluator(SCHEMA, equal("uuid",
middle)).eval(UUID_FILE);
+ assertThat(allMatch).as("Strict eq should not match range").isFalse();
+ }
+
+ @Test
+ public void testStrictUuidInNeverMatchesRange() {
+ // Strict IN should never match when there's a range of values (lower !=
upper)
+ UUID middle1 = UUID.fromString("40000000-0000-0000-0000-000000000001");
+ UUID middle2 = UUID.fromString("50000000-0000-0000-0000-000000000001");
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, in("uuid", middle1,
middle2)).eval(UUID_FILE);
+ assertThat(allMatch).as("Strict IN should not match range").isFalse();
+ }
+
+ @Test
+ public void testStrictUuidInMatchesSingleValue() {
+ // Strict IN should match when lower == upper and the value is in the set
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, in("uuid",
SINGLE_UUID)).eval(SINGLE_UUID_FILE);
+ assertThat(allMatch).as("Strict IN should match single value in
set").isTrue();
+ }
+
+ @Test
+ public void testStrictUuidInDoesNotMatchWhenValueNotInSet() {
+ // Strict IN should not match when lower == upper but the value is not in
the set
+ UUID otherUuid = UUID.fromString("50000000-0000-0000-0000-000000000001");
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, in("uuid",
otherUuid)).eval(SINGLE_UUID_FILE);
+ assertThat(allMatch).as("Strict IN should not match when value not in
set").isFalse();
+ }
+
+ @Test
+ public void testStrictUuidNotInMatchesWhenAllValuesOutsideBounds() {
+ // Strict NOT IN should match when all values in the set are outside the
file's bounds
+ UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, notIn("uuid", belowMin,
aboveMax)).eval(UUID_FILE);
+ // All values in the set are outside [UUID_MIN, UUID_MAX], so all rows
match NOT IN
+ assertThat(allMatch).as("Strict NOT IN should match when all values
outside bounds").isTrue();
+ }
+
+ @Test
+ public void testStrictUuidNotInDoesNotMatchWhenValueInBounds() {
+ // Strict NOT IN should not match when a value in the set is within bounds
+ UUID middle = UUID.fromString("40000000-0000-0000-0000-000000000001");
+ boolean allMatch = new StrictMetricsEvaluator(SCHEMA, notIn("uuid",
middle)).eval(UUID_FILE);
+ // middle is within [UUID_MIN, UUID_MAX], so some rows might match the
value
+ assertThat(allMatch).as("Strict NOT IN should not match when value in
bounds").isFalse();
+ }
+
+ // Tests for file with inverted UUID bounds (as would be written by legacy
signed comparator)
+
+ @Test
+ public void testStrictUuidInWithLegacyInvertedBounds() {
+ // With inverted bounds [0x80..., 0x40...] where lower > upper in RFC
order,
Review Comment:
Correct, StrictMetricsEvaluator does not need dual-comparator evaluation.
All the following instances require that the new unsigned comparator is used
(as they are related to write operations):
- Usage at `BaseOverwriteFiles`: ensure that files being added during an
overwrite operation match the overwrite filter.
- Usage at `ManifestFilterManager`: avoid data loss by validating data files
marked for delete (throw validation error to make sure none of the data files
are deleted based on signed comparisons)
- Usage at `SparkTable`: avoid metadata-based data file removal -> force row
level deletions that use unsigned comparator.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]