[GitHub] [iceberg] Fokko commented on a diff in pull request #6714: Python: Filter on Datafile metrics

via GitHub Sun, 26 Feb 2023 10:51:45 -0800


Fokko commented on code in PR #6714:
URL: https://github.com/apache/iceberg/pull/6714#discussion_r1118134051



##########
python/pyiceberg/expressions/visitors.py:
##########
@@ -986,3 +989,263 @@ def expression_to_plain_format(
     # In the form of expr1 ∨ expr2 ∨ ... ∨ exprN
     visitor = ExpressionToPlainFormat(cast_int_to_datetime)
     return [visit(expression, visitor) for expression in expressions]
+
+
+class _InclusiveMetricsEvaluator(BoundBooleanExpressionVisitor[bool]):
+    struct: StructType
+    expr: BooleanExpression
+
+    value_counts: Dict[int, int]
+    null_counts: Dict[int, int]
+    nan_counts: Dict[int, int]
+    lower_bounds: Dict[int, bytes]
+    upper_bounds: Dict[int, bytes]
+
+    def __init__(self, schema: Schema, expr: BooleanExpression, 
case_sensitive: bool = True) -> None:
+        self.struct = schema.as_struct()
+        self.expr = bind(schema, rewrite_not(expr), case_sensitive)
+
+    def eval(self, file: DataFile) -> bool:
+        """Test whether the file may contain records that match the 
expression."""
+
+        if file.record_count == 0:
+            return ROWS_CANNOT_MATCH
+
+        if file.record_count < 0:
+            # Older versions don't correctly implement record count from Avro files and thus
+            # set record count to -1 when importing Avro tables to Iceberg tables. This should
+            # be updated once we have implemented and set the correct record count.
+            return ROWS_MIGHT_MATCH
+
+        self.value_counts = file.value_counts or EMPTY_DICT
+        self.null_counts = file.null_value_counts or EMPTY_DICT
+        self.nan_counts = file.nan_value_counts or EMPTY_DICT
+        self.lower_bounds = file.lower_bounds or EMPTY_DICT
+        self.upper_bounds = file.upper_bounds or EMPTY_DICT
+
+        return visit(self.expr, self)
+
+    def _contains_nulls_only(self, field_id: int) -> bool:
+        return (
+            self.value_counts is not None
+            and self.null_counts is not None
+            and field_id in self.value_counts
+            and field_id in self.null_counts
+            and self.value_counts[field_id] == self.null_counts[field_id]
+        )
+
+    def _contains_nans_only(self, field_id: int) -> bool:
+        return (
+            self.value_counts is not None
+            and self.nan_counts is not None
+            and field_id in self.nan_counts
+            and field_id in self.value_counts
+            and self.nan_counts[field_id] == self.value_counts[field_id]
+        )
+
+    def _is_nan(self, val: Any) -> bool:
+        try:
+            return math.isnan(val)
+        except TypeError:
+            # In the case of None or other non-numeric types
+            return False
+
+    def visit_true(self) -> bool:
+        # all rows match
+        return ROWS_MIGHT_MATCH
+
+    def visit_false(self) -> bool:
+        # all rows fail
+        return ROWS_CANNOT_MATCH
+
+    def visit_not(self, child_result: bool) -> bool:
+        raise ValueError(f"NOT should be rewritten: {child_result}")
+
+    def visit_and(self, left_result: bool, right_result: bool) -> bool:
+        return left_result and right_result
+
+    def visit_or(self, left_result: bool, right_result: bool) -> bool:
+        return left_result or right_result
+
+    def visit_is_null(self, term: BoundTerm[L]) -> bool:
+        field_id = term.ref().field.field_id
+
+        if self.null_counts is not None and field_id in self.null_counts and 
self.null_counts[field_id] == 0:
+            return ROWS_CANNOT_MATCH
+
+        return ROWS_MIGHT_MATCH
+
+    def visit_not_null(self, term: BoundTerm[L]) -> bool:
+        # no need to check whether the field is required because binding 
evaluates that case
+        # if the column has no non-null values, the expression cannot match
+        field_id = term.ref().field.field_id
+
+        if self._contains_nulls_only(field_id):
+            return ROWS_CANNOT_MATCH
+
+        return ROWS_MIGHT_MATCH
+
+    def visit_is_nan(self, term: BoundTerm[L]) -> bool:
+        field_id = term.ref().field.field_id
+
+        if field_id in self.nan_counts and self.nan_counts[field_id] == 0:
+            return ROWS_CANNOT_MATCH
+
+        # when there's no nanCounts information, but we already know the 
column only contains null,
+        # it's guaranteed that there's no NaN value
+        if self._contains_nulls_only(field_id):
+            return ROWS_CANNOT_MATCH
+
+        return ROWS_MIGHT_MATCH
+
+    def visit_not_nan(self, term: BoundTerm[L]) -> bool:
+        field_id = term.ref().field.field_id
+
+        if self._contains_nans_only(field_id):
+            return ROWS_CANNOT_MATCH
+
+        return ROWS_MIGHT_MATCH
+
+    def visit_less_than(self, term: BoundTerm[L], literal: Literal[L]) -> bool:
+        field = term.ref().field
+        field_id = field.field_id
+
+        if self._contains_nulls_only(field_id) or 
self._contains_nans_only(field_id):
+            return ROWS_CANNOT_MATCH
+
+        if not isinstance(field.field_type, PrimitiveType):
+            raise ValueError(f"Expected PrimitiveType: {field.field_type}")
+
+        if field_id in self.lower_bounds:
+            lower = from_bytes(field.field_type, self.lower_bounds[field_id])
+
+            if self._is_nan(lower):
+                # NaN indicates unreliable bounds. See the 
InclusiveMetricsEvaluator docs for more.
+                return ROWS_MIGHT_MATCH
+
+            if lower >= literal.value:
+                return ROWS_CANNOT_MATCH
+
+        return ROWS_MIGHT_MATCH
+
+    def visit_less_than_or_equal(self, term: BoundTerm[L], literal: 
Literal[L]) -> bool:
+        field = term.ref().field
+        field_id = field.field_id
+
+        if self._contains_nulls_only(field_id) or 
self._contains_nans_only(field_id):
+            return ROWS_CANNOT_MATCH
+
+        if not isinstance(field.field_type, PrimitiveType):
+            raise ValueError(f"Expected PrimitiveType: {field.field_type}")
+
+        if field_id in self.lower_bounds:
+            lower = from_bytes(field.field_type, self.lower_bounds[field_id])
+
+            if self._is_nan(lower):
+                # NaN indicates unreliable bounds. See the 
InclusiveMetricsEvaluator docs for more.
+                return ROWS_MIGHT_MATCH
+
+            if lower > literal.value:
+                return ROWS_CANNOT_MATCH
+
+        return ROWS_MIGHT_MATCH
+
+    def visit_greater_than(self, term: BoundTerm[L], literal: Literal[L]) -> 
bool:
+        field = term.ref().field
+        field_id = field.field_id
+
+        if self._contains_nulls_only(field_id) or 
self._contains_nans_only(field_id):
+            return ROWS_CANNOT_MATCH
+
+        if not isinstance(field.field_type, PrimitiveType):
+            raise ValueError(f"Expected PrimitiveType: {field.field_type}")
+
+        if field_id in self.upper_bounds and from_bytes(field.field_type, 
self.upper_bounds[field_id]) <= literal.value:
+            return ROWS_CANNOT_MATCH
+
+        return ROWS_MIGHT_MATCH
+
+    def visit_greater_than_or_equal(self, term: BoundTerm[L], literal: 
Literal[L]) -> bool:
+        field = term.ref().field
+        field_id = field.field_id
+
+        if self._contains_nulls_only(field_id) or 
self._contains_nans_only(field_id):
+            return ROWS_CANNOT_MATCH
+
+        if not isinstance(field.field_type, PrimitiveType):
+            raise ValueError(f"Expected PrimitiveType: {field.field_type}")
+
+        if field_id in self.upper_bounds and from_bytes(field.field_type, 
self.upper_bounds[field_id]) < literal.value:
+            return ROWS_CANNOT_MATCH
+
+        return ROWS_MIGHT_MATCH
+
+    def visit_equal(self, term: BoundTerm[L], literal: Literal[L]) -> bool:
+        field = term.ref().field
+        field_id = field.field_id
+
+        if self._contains_nulls_only(field_id) or 
self._contains_nans_only(field_id):
+            return ROWS_CANNOT_MATCH
+
+        if not isinstance(field.field_type, PrimitiveType):
+            raise ValueError(f"Expected PrimitiveType: {field.field_type}")
+
+        if field_id in self.lower_bounds:
+            lower = from_bytes(field.field_type, self.lower_bounds[field_id])
+
+            if self._is_nan(lower):
+                # NaN indicates unreliable bounds. See the 
InclusiveMetricsEvaluator docs for more.
+                return ROWS_MIGHT_MATCH
+
+            if lower > literal.value:
+                return ROWS_CANNOT_MATCH
+
+        if field_id in self.upper_bounds and from_bytes(field.field_type, 
self.upper_bounds[field_id]) < literal.value:
+            return ROWS_CANNOT_MATCH

Review Comment:
   Got it, I misread the other comment, thanks!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [iceberg] Fokko commented on a diff in pull request #6714: Python: Filter on Datafile metrics

Reply via email to