HonahX commented on code in PR #955: URL: https://github.com/apache/iceberg-python/pull/955#discussion_r1687535523
########## pyiceberg/io/pyarrow.py: ########## @@ -638,10 +634,160 @@ def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> p return left_result | right_result +class _CollectNullNaNUnmentionedTermsFromExpression(BoundBooleanExpressionVisitor[Any]): + def __init__(self) -> None: + # BoundTerms which have either is_null or is_not_null appearing at least once in the boolean expr. + self.is_null_or_not_bound_terms: set[BoundTerm[Any]] = set() + # The remaining BoundTerms appearing in the boolean expr. + self.null_unmentioned_bound_terms: set[BoundTerm[Any]] = set() + + # BoundTerms which have either is_nan or is_not_nan appearing at least once in the boolean expr. + self.is_nan_or_not_bound_terms: set[BoundTerm[Any]] = set() + # The remaining BoundTerms appearing in the boolean expr. + self.nan_unmentioned_bound_terms: set[BoundTerm[Any]] = set() Review Comment: ```suggestion class _CollectNullNaNUnmentionedTermsFromExpression(BoundBooleanExpressionVisitor[Any]): # BoundTerms which have either is_null or is_not_null appearing at least once in the boolean expr. is_null_or_not_bound_terms: set[BoundTerm[Any]] # The remaining BoundTerms appearing in the boolean expr. null_unmentioned_bound_terms: set[BoundTerm[Any]] # BoundTerms which have either is_nan or is_not_nan appearing at least once in the boolean expr. is_nan_or_not_bound_terms: set[BoundTerm[Any]] # The remaining BoundTerms appearing in the boolean expr. nan_unmentioned_bound_terms: set[BoundTerm[Any]] def __init__(self) -> None: self.is_null_or_not_bound_terms = set() self.null_unmentioned_bound_terms = set() self.is_nan_or_not_bound_terms = set() self.nan_unmentioned_bound_terms = set() ``` How about defining type hints above `__init__` like what we normally do in other classes? ########## pyiceberg/io/pyarrow.py: ########## @@ -638,10 +634,160 @@ def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> p return left_result | right_result +class _CollectNullNaNUnmentionedTermsFromExpression(BoundBooleanExpressionVisitor[Any]): + def __init__(self) -> None: + # BoundTerms which have either is_null or is_not_null appearing at least once in the boolean expr. + self.is_null_or_not_bound_terms: set[BoundTerm[Any]] = set() + # The remaining BoundTerms appearing in the boolean expr. + self.null_unmentioned_bound_terms: set[BoundTerm[Any]] = set() + + # BoundTerms which have either is_nan or is_not_nan appearing at least once in the boolean expr. + self.is_nan_or_not_bound_terms: set[BoundTerm[Any]] = set() + # The remaining BoundTerms appearing in the boolean expr. + self.nan_unmentioned_bound_terms: set[BoundTerm[Any]] = set() + + super().__init__() + + def _handle_explicit_is_null_or_not(self, term: BoundTerm[Any]) -> None: + """Handle the predicate case where either is_null or is_not_null is included.""" + if term in self.null_unmentioned_bound_terms: + self.null_unmentioned_bound_terms.remove(term) + self.is_null_or_not_bound_terms.add(term) + + def _handle_null_unmentioned(self, term: BoundTerm[Any]) -> None: + """Handle the predicate case where neither is_null or is_not_null is included.""" + if term not in self.is_null_or_not_bound_terms: + self.null_unmentioned_bound_terms.add(term) + + def _handle_explicit_is_nan_or_not(self, term: BoundTerm[Any]) -> None: + """Handle the predicate case where either is_nan or is_not_nan is included.""" + if term in self.nan_unmentioned_bound_terms: + self.nan_unmentioned_bound_terms.remove(term) + self.is_nan_or_not_bound_terms.add(term) + + def _handle_nan_unmentioned(self, term: BoundTerm[Any]) -> None: + """Handle the predicate case where neither is_nan or is_not_nan is included.""" + if term not in self.is_nan_or_not_bound_terms: + self.nan_unmentioned_bound_terms.add(term) + + def visit_in(self, term: BoundTerm[pc.Expression], literals: Set[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_not_in(self, term: BoundTerm[pc.Expression], literals: Set[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_is_nan(self, term: BoundTerm[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_explicit_is_nan_or_not(term) + + def visit_not_nan(self, term: BoundTerm[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_explicit_is_nan_or_not(term) + + def visit_is_null(self, term: BoundTerm[Any]) -> None: + self._handle_explicit_is_null_or_not(term) + self._handle_nan_unmentioned(term) + + def visit_not_null(self, term: BoundTerm[Any]) -> None: + self._handle_explicit_is_null_or_not(term) + self._handle_nan_unmentioned(term) + + def visit_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_not_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_greater_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_greater_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_less_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_less_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_not_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_true(self) -> None: + return + + def visit_false(self) -> None: + return + + def visit_not(self, child_result: pc.Expression) -> None: + return + + def visit_and(self, left_result: pc.Expression, right_result: pc.Expression) -> None: + return + + def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> None: + return + + +def _get_null_nan_refs( + expr: BooleanExpression, +) -> tuple[Set[BoundReference[Any]], Set[BoundReference[Any]], Set[BoundReference[Any]], Set[BoundReference[Any]]]: Review Comment: How about we creating a new type like [UpdatesAndRequirement](https://github.com/kevinjqliu/iceberg-python/blob/d0bfb4a67bccd15c2d2e9baa481a5f33fe4c5220/pyiceberg/table/__init__.py#L1344)? We could also consider create a private class, for example `_NullNaNRefsCollector` to hold these sets. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org