jqin61 commented on code in PR #955: URL: https://github.com/apache/iceberg-python/pull/955#discussion_r1690301032
########## pyiceberg/io/pyarrow.py: ########## @@ -638,10 +634,152 @@ def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> p return left_result | right_result +class _NullNaNUnmentionedTermsCollector(BoundBooleanExpressionVisitor[Any]): + # BoundTerms which have either is_null or is_not_null appearing at least once in the boolean expr. + is_null_or_not_bound_terms: set[BoundTerm[Any]] + # The remaining BoundTerms appearing in the boolean expr. + null_unmentioned_bound_terms: set[BoundTerm[Any]] + # BoundTerms which have either is_nan or is_not_nan appearing at least once in the boolean expr. + is_nan_or_not_bound_terms: set[BoundTerm[Any]] + # The remaining BoundTerms appearing in the boolean expr. + nan_unmentioned_bound_terms: set[BoundTerm[Any]] + + def __init__(self) -> None: + self.is_null_or_not_bound_terms = set() + self.null_unmentioned_bound_terms = set() + self.is_nan_or_not_bound_terms = set() + self.nan_unmentioned_bound_terms = set() + super().__init__() + + def _handle_explicit_is_null_or_not(self, term: BoundTerm[Any]) -> None: + """Handle the predicate case where either is_null or is_not_null is included.""" + if term in self.null_unmentioned_bound_terms: + self.null_unmentioned_bound_terms.remove(term) + self.is_null_or_not_bound_terms.add(term) + + def _handle_null_unmentioned(self, term: BoundTerm[Any]) -> None: + """Handle the predicate case where neither is_null or is_not_null is included.""" + if term not in self.is_null_or_not_bound_terms: + self.null_unmentioned_bound_terms.add(term) + + def _handle_explicit_is_nan_or_not(self, term: BoundTerm[Any]) -> None: + """Handle the predicate case where either is_nan or is_not_nan is included.""" + if term in self.nan_unmentioned_bound_terms: + self.nan_unmentioned_bound_terms.remove(term) + self.is_nan_or_not_bound_terms.add(term) + + def _handle_nan_unmentioned(self, term: BoundTerm[Any]) -> None: + """Handle the predicate case where neither is_nan or is_not_nan is included.""" + if term not in self.is_nan_or_not_bound_terms: + self.nan_unmentioned_bound_terms.add(term) + + def visit_in(self, term: BoundTerm[pc.Expression], literals: Set[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_not_in(self, term: BoundTerm[pc.Expression], literals: Set[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_is_nan(self, term: BoundTerm[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_explicit_is_nan_or_not(term) + + def visit_not_nan(self, term: BoundTerm[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_explicit_is_nan_or_not(term) + + def visit_is_null(self, term: BoundTerm[Any]) -> None: + self._handle_explicit_is_null_or_not(term) + self._handle_nan_unmentioned(term) + + def visit_not_null(self, term: BoundTerm[Any]) -> None: + self._handle_explicit_is_null_or_not(term) + self._handle_nan_unmentioned(term) + + def visit_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_not_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_greater_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_greater_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_less_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_less_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_not_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_true(self) -> None: + return + + def visit_false(self) -> None: + return + + def visit_not(self, child_result: pc.Expression) -> None: + return + + def visit_and(self, left_result: pc.Expression, right_result: pc.Expression) -> None: + return + + def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> None: Review Comment: thanks! this is such a good catch! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org