syun64 commented on code in PR #955: URL: https://github.com/apache/iceberg-python/pull/955#discussion_r1688971266
########## pyiceberg/io/pyarrow.py: ########## @@ -638,10 +634,165 @@ def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> p return left_result | right_result +class _NullNaNUnmentionedTermsCollector(BoundBooleanExpressionVisitor[Any]): + # BoundTerms which have either is_null or is_not_null appearing at least once in the boolean expr. + is_null_or_not_bound_terms: set[BoundTerm[Any]] + # The remaining BoundTerms appearing in the boolean expr. + null_unmentioned_bound_terms: set[BoundTerm[Any]] + # BoundTerms which have either is_nan or is_not_nan appearing at least once in the boolean expr. + is_nan_or_not_bound_terms: set[BoundTerm[Any]] + # The remaining BoundTerms appearing in the boolean expr. + nan_unmentioned_bound_terms: set[BoundTerm[Any]] + + def __init__(self) -> None: + self.is_null_or_not_bound_terms = set() + self.null_unmentioned_bound_terms = set() + self.is_nan_or_not_bound_terms = set() + self.nan_unmentioned_bound_terms = set() + super().__init__() + + def _handle_explicit_is_null_or_not(self, term: BoundTerm[Any]) -> None: + """Handle the predicate case where either is_null or is_not_null is included.""" + if term in self.null_unmentioned_bound_terms: + self.null_unmentioned_bound_terms.remove(term) + self.is_null_or_not_bound_terms.add(term) + + def _handle_null_unmentioned(self, term: BoundTerm[Any]) -> None: + """Handle the predicate case where neither is_null or is_not_null is included.""" + if term not in self.is_null_or_not_bound_terms: + self.null_unmentioned_bound_terms.add(term) + + def _handle_explicit_is_nan_or_not(self, term: BoundTerm[Any]) -> None: + """Handle the predicate case where either is_nan or is_not_nan is included.""" + if term in self.nan_unmentioned_bound_terms: + self.nan_unmentioned_bound_terms.remove(term) + self.is_nan_or_not_bound_terms.add(term) + + def _handle_nan_unmentioned(self, term: BoundTerm[Any]) -> None: + """Handle the predicate case where neither is_nan or is_not_nan is included.""" + if term not in self.is_nan_or_not_bound_terms: + self.nan_unmentioned_bound_terms.add(term) + + def visit_in(self, term: BoundTerm[pc.Expression], literals: Set[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_not_in(self, term: BoundTerm[pc.Expression], literals: Set[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_is_nan(self, term: BoundTerm[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_explicit_is_nan_or_not(term) + + def visit_not_nan(self, term: BoundTerm[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_explicit_is_nan_or_not(term) + + def visit_is_null(self, term: BoundTerm[Any]) -> None: + self._handle_explicit_is_null_or_not(term) + self._handle_nan_unmentioned(term) + + def visit_not_null(self, term: BoundTerm[Any]) -> None: + self._handle_explicit_is_null_or_not(term) + self._handle_nan_unmentioned(term) + + def visit_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_not_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_greater_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_greater_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_less_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_less_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_not_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_null_unmentioned(term) + self._handle_nan_unmentioned(term) + + def visit_true(self) -> None: + return + + def visit_false(self) -> None: + return + + def visit_not(self, child_result: pc.Expression) -> None: + return + + def visit_and(self, left_result: pc.Expression, right_result: pc.Expression) -> None: + return + + def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> None: + return + + def collect( + self, + expr: BooleanExpression, + ) -> None: + """Collect the bound references categorized by having at least one is_null or is_not_null in the expr and the remaining.""" + boolean_expression_visit(expr, self) + + def expression_to_pyarrow(expr: BooleanExpression) -> pc.Expression: return boolean_expression_visit(expr, _ConvertToArrowExpression()) +def _expression_to_complementary_pyarrow(expr: BooleanExpression) -> pc.Expression: + """Complementary filter conversion function of expression_to_pyarrow. + + Could not use expression_to_pyarrow(Not(expr)) to achieve this complementary effect because ~ in pyarrow.compute.Expression does not handle null. + """ + collector = _NullNaNUnmentionedTermsCollector() + collector.collect(expr) + + def _downcast_term_to_reference(bound_terms: Set[BoundTerm[Any]]) -> Set[BoundReference[Any]]: Review Comment: Hi Adrian, I don't have much knowledge regarding the difference between a Reference and a Term and why we organized the classes this way, but I took a stab at resolving this issue without requiring a new Visitor. I've created a PR to your branch - please have a look and let me know if this works for you! https://github.com/jqin61/iceberg-python/pull/5 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org