Fokko commented on code in PR #518: URL: https://github.com/apache/iceberg-python/pull/518#discussion_r1525997387
########## pyiceberg/expressions/visitors.py: ########## @@ -1421,3 +1430,299 @@ def visit_not_starts_with(self, term: BoundTerm[L], literal: Literal[L]) -> bool return ROWS_CANNOT_MATCH return ROWS_MIGHT_MATCH + + +class _StrictMetricsEvaluator(_MetricsEvaluator): + struct: StructType + expr: BooleanExpression + + def __init__( + self, schema: Schema, expr: BooleanExpression, case_sensitive: bool = True, include_empty_files: bool = False + ) -> None: + self.struct = schema.as_struct() + self.include_empty_files = include_empty_files + self.expr = bind(schema, rewrite_not(expr), case_sensitive) + + def eval(self, file: DataFile) -> bool: + """Test whether all records within the file match the expression. + + Args: + file: A data file + + Returns: false if the file may contain any row that doesn't match + the expression, true otherwise. + """ + if file.record_count <= 0: + # Older version don't correctly implement record count from avro file and thus + # set record count -1 when importing avro tables to iceberg tables. This should + # be updated once we implemented and set correct record count. + return ROWS_MUST_MATCH + + self.value_counts = file.value_counts or EMPTY_DICT + self.null_counts = file.null_value_counts or EMPTY_DICT + self.nan_counts = file.nan_value_counts or EMPTY_DICT + self.lower_bounds = file.lower_bounds or EMPTY_DICT + self.upper_bounds = file.upper_bounds or EMPTY_DICT + + return visit(self.expr, self) + + def visit_is_null(self, term: BoundTerm[L]) -> bool: + # no need to check whether the field is required because binding evaluates that case + # if the column has any non-null values, the expression does not match + field_id = term.ref().field.field_id + field = self.struct.field(field_id=field_id) + if field is None: + raise ValueError(f"Cannot find field, might be nested or missing: {field_id}") Review Comment: That's a good suggestion, I've extracted it out into a separate function 👍 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org