Fokko commented on code in PR #6892:
URL: https://github.com/apache/iceberg/pull/6892#discussion_r1121804182
##########
python/pyiceberg/expressions/visitors.py:
##########
@@ -678,6 +705,59 @@ def visit_less_than_or_equal(self, term: BoundTerm[L],
literal: Literal[L]) -> b
return ROWS_MIGHT_MATCH
+ def visit_starts_with(self, term: BoundTerm[L], literal: Literal[L]) ->
bool:
+ pos = term.ref().accessor.position
+ field = self.partition_fields[pos]
+ prefix = str(literal.value)
+ len_prefix = len(prefix)
+
+ if not field.lower_bound:
+ return ROWS_CANNOT_MATCH
+
+ lower = _from_byte_buffer(term.ref().field.field_type,
field.lower_bound)
+ # truncate lower bound so that its length is not greater than the
length of prefix
+ if lower and lower[:len_prefix] > prefix:
+ return ROWS_CANNOT_MATCH
+
+ if not field.upper_bound:
+ return ROWS_CANNOT_MATCH
+
+ upper = _from_byte_buffer(term.ref().field.field_type,
field.upper_bound)
+ # truncate upper bound so that its length is not greater than the
length of prefix
+ if upper and upper[:len_prefix] < prefix:
+ return ROWS_CANNOT_MATCH
+
+ return ROWS_MIGHT_MATCH
+
+ def visit_not_starts_with(self, term: BoundTerm[L], literal: Literal[L])
-> bool:
+ pos = term.ref().accessor.position
+ field = self.partition_fields[pos]
+ prefix = str(literal.value)
+ len_prefix = len(prefix)
+
+ if field.contains_null or not field.lower_bound or not
field.upper_bound:
Review Comment:
```suggestion
if field.contains_null or field.lower_bound is None or
field.upper_bound is None:
```
##########
python/pyiceberg/expressions/visitors.py:
##########
@@ -1256,3 +1345,59 @@ def visit_not_in(self, term: BoundTerm[L], literals:
Set[L]) -> bool:
# because the bounds are not necessarily a min or max value, this
cannot be answered using
# them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a
value in col.
return ROWS_MIGHT_MATCH
+
+ def visit_starts_with(self, term: BoundTerm[L], literal: Literal[L]) ->
bool:
+ field = term.ref().field
+ field_id = field.field_id
+
+ if self._contains_nulls_only(field_id):
+ return ROWS_CANNOT_MATCH
+
+ prefix = str(literal.value)
+ len_prefix = len(prefix)
+
+ if self.lower_bounds and field_id in self.lower_bounds:
+ lower_bound = str(from_bytes(field.field_type,
self.lower_bounds.get(field_id))) # type: ignore
+
+ # truncate lower bound so that its length is not greater than the
length of prefix
+ if lower_bound and lower_bound[:len_prefix] > prefix:
+ return ROWS_CANNOT_MATCH
+
+ if self.upper_bounds and field_id in self.upper_bounds:
+ upper_bound = str(from_bytes(field.field_type,
self.upper_bounds.get(field_id))) # type: ignore
+
+ # truncate upper bound so that its length is not greater than the
length of prefix
+ if upper_bound and upper_bound[:len_prefix] < prefix:
Review Comment:
```suggestion
if upper_bound is not None and upper_bound[:len_prefix] < prefix:
```
##########
python/pyiceberg/expressions/visitors.py:
##########
@@ -1256,3 +1345,59 @@ def visit_not_in(self, term: BoundTerm[L], literals:
Set[L]) -> bool:
# because the bounds are not necessarily a min or max value, this
cannot be answered using
# them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a
value in col.
return ROWS_MIGHT_MATCH
+
+ def visit_starts_with(self, term: BoundTerm[L], literal: Literal[L]) ->
bool:
+ field = term.ref().field
+ field_id = field.field_id
+
+ if self._contains_nulls_only(field_id):
+ return ROWS_CANNOT_MATCH
+
+ prefix = str(literal.value)
+ len_prefix = len(prefix)
+
+ if self.lower_bounds and field_id in self.lower_bounds:
+ lower_bound = str(from_bytes(field.field_type,
self.lower_bounds.get(field_id))) # type: ignore
+
+ # truncate lower bound so that its length is not greater than the
length of prefix
+ if lower_bound and lower_bound[:len_prefix] > prefix:
+ return ROWS_CANNOT_MATCH
+
+ if self.upper_bounds and field_id in self.upper_bounds:
Review Comment:
This reduces the number of lookups in the dict:
```suggestion
if upper_bound_bytes := self.upper_bounds.get(field_id):
```
##########
python/pyiceberg/expressions/visitors.py:
##########
@@ -1256,3 +1345,59 @@ def visit_not_in(self, term: BoundTerm[L], literals:
Set[L]) -> bool:
# because the bounds are not necessarily a min or max value, this
cannot be answered using
# them. notIn(col, {X, ...}) with (X, Y) doesn't guarantee that X is a
value in col.
return ROWS_MIGHT_MATCH
+
+ def visit_starts_with(self, term: BoundTerm[L], literal: Literal[L]) ->
bool:
+ field = term.ref().field
+ field_id = field.field_id
+
+ if self._contains_nulls_only(field_id):
+ return ROWS_CANNOT_MATCH
+
+ prefix = str(literal.value)
+ len_prefix = len(prefix)
+
+ if self.lower_bounds and field_id in self.lower_bounds:
Review Comment:
This reduces the number of lookups in the dict:
```suggestion
if lower_bound_bytes := self.lower_bounds.get(field_id):
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]