Fokko commented on code in PR #6714: URL: https://github.com/apache/iceberg/pull/6714#discussion_r1110430198
########## python/pyiceberg/expressions/visitors.py: ########## @@ -986,3 +989,245 @@ def expression_to_plain_format( # In the form of expr1 ∨ expr2 ∨ ... ∨ exprN visitor = ExpressionToPlainFormat(cast_int_to_datetime) return [visit(expression, visitor) for expression in expressions] + + +class _InclusiveMetricsEvaluator(BoundBooleanExpressionVisitor[bool]): + struct: StructType + expr: BooleanExpression + + value_counts: Dict[int, int] + null_counts: Dict[int, int] + nan_counts: Dict[int, int] + lower_bounds: Dict[int, bytes] + upper_bounds: Dict[int, bytes] + + def __init__(self, schema: Schema, expr: BooleanExpression, case_sensitive: bool = True) -> None: + self.struct = schema.as_struct() + self.expr = bind(schema, rewrite_not(expr), case_sensitive) + + def eval(self, file: DataFile) -> bool: + """Test whether the file may contain records that match the expression.""" + + if file.record_count == 0: + return ROWS_CANNOT_MATCH + + if file.record_count < 0: + # @TODO we haven't implemented parsing record count from avro file and thus set record count -1 + # when importing avro tables to iceberg tables. This should be updated once we implemented + # and set correct record count. + return ROWS_MIGHT_MATCH + + self.value_counts = file.value_counts or EMPTY_DICT + self.null_counts = file.null_value_counts or EMPTY_DICT + self.nan_counts = file.nan_value_counts or EMPTY_DICT + self.lower_bounds = file.lower_bounds or EMPTY_DICT + self.upper_bounds = file.upper_bounds or EMPTY_DICT + + return visit(self.expr, self) + + def _contains_nulls_only(self, idx: int) -> bool: + return idx in self.value_counts and idx in self.null_counts and self.value_counts[idx] - self.null_counts[idx] == 0 + + def _contains_nans_only(self, idx: int) -> bool: + return idx in self.nan_counts and idx in self.value_counts and self.nan_counts[idx] == self.value_counts[idx] + + def _is_nan(self, val: Any) -> bool: + return math.isnan(val) Review Comment: Good question. It throws an error: ``` >>> math.isnan(None) Traceback (most recent call last): File "<stdin>", line 1, in <module> TypeError: must be real number, not NoneType >>> math.isnan("vo") Traceback (most recent call last): File "<stdin>", line 1, in <module> TypeError: must be real number, not str ``` I think this should be fine because a non-float/double field can never be NaN. If you do `IsNan` expression on a string field, it will just skip the file. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org