syun64 commented on code in PR #955: URL: https://github.com/apache/iceberg-python/pull/955#discussion_r1687162697
########## pyiceberg/io/pyarrow.py: ########## @@ -638,10 +634,124 @@ def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> p return left_result | right_result +class _CollectNullUnmentionedTermsFromExpression(BoundBooleanExpressionVisitor[Any]): + def __init__(self) -> None: + # BoundTerms which have either is_null or is_not_null appearing at least once in the boolean expr. + self.is_valid_or_not_bound_terms: set[BoundTerm[Any]] = set() + # The remaining BoundTerms appearing in the boolean expr. + self.null_unmentioned_bound_terms: set[BoundTerm[Any]] = set() + super().__init__() + + def _handle_explicit_is_null_or_not(self, term: BoundTerm[Any]) -> None: + """Handle the predicate case where either is_null or is_not_null is included.""" + if term in self.null_unmentioned_bound_terms: + self.null_unmentioned_bound_terms.remove(term) + self.is_valid_or_not_bound_terms.add(term) + + def _handle_skipped(self, term: BoundTerm[Any]) -> None: + """Handle the predicate case where neither is_null or is_not_null is included.""" + if term not in self.is_valid_or_not_bound_terms: + self.null_unmentioned_bound_terms.add(term) + + def visit_in(self, term: BoundTerm[pc.Expression], literals: Set[Any]) -> None: + self._handle_skipped(term) + + def visit_not_in(self, term: BoundTerm[pc.Expression], literals: Set[Any]) -> None: + self._handle_skipped(term) + + # todo: do I have to modify this as well + def visit_is_nan(self, term: BoundTerm[Any]) -> None: + self._handle_skipped(term) + + # todo: do I have to modify this as well, might need 2 self.xx sets for mentioned_nan and none-mentioned-nan + def visit_not_nan(self, term: BoundTerm[Any]) -> None: + self._handle_skipped(term) + + def visit_is_null(self, term: BoundTerm[Any]) -> None: + self._handle_explicit_is_null_or_not(term) + + def visit_not_null(self, term: BoundTerm[Any]) -> None: + self._handle_explicit_is_null_or_not(term) + + def visit_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_skipped(term) + + def visit_not_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_skipped(term) + + def visit_greater_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_skipped(term) + + def visit_greater_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_skipped(term) + + def visit_less_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_skipped(term) + + def visit_less_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_skipped(term) + + def visit_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_skipped(term) + + def visit_not_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_skipped(term) + + def visit_true(self) -> None: + return + + def visit_false(self) -> None: + return + + def visit_not(self, child_result: pc.Expression) -> None: + return + + def visit_and(self, left_result: pc.Expression, right_result: pc.Expression) -> None: + return + + def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> None: + return + + +def _get_is_valid_or_not_bound_refs(expr: BooleanExpression) -> tuple[Set[BoundReference[Any]], Set[BoundReference[Any]]]: + """Collect the bound terms catogorized by having at least one is_null or is_not_null in the expr and the remaining.""" + collector = _CollectNullUnmentionedTermsFromExpression() + boolean_expression_visit(expr, collector) + null_unmentioned_bound_terms = collector.null_unmentioned_bound_terms + is_valid_or_not_bound_terms = collector.is_valid_or_not_bound_terms + + null_unmentioned_bound_refs: Set[BoundReference[Any]] = set() + is_valid_or_not_bound_refs: Set[BoundReference[Any]] = set() + for t in null_unmentioned_bound_terms: + if not isinstance(t, BoundReference): + raise ValueError("Collected Bound Term that is not reference.") + else: + null_unmentioned_bound_refs.add(t) + for t in is_valid_or_not_bound_terms: + if not isinstance(t, BoundReference): + raise ValueError("Collected Bound Term that is not reference.") + else: + is_valid_or_not_bound_refs.add(t) + return null_unmentioned_bound_refs, is_valid_or_not_bound_refs + + def expression_to_pyarrow(expr: BooleanExpression) -> pc.Expression: return boolean_expression_visit(expr, _ConvertToArrowExpression()) +def expression_to_reverted_pyarrow(expr: BooleanExpression) -> pc.Expression: + """Complimentary filter convertion function of expression_to_pyarrow. Review Comment: Could we mention that this function is used specifically for reverting an expression to pyarrow expression for the delete operation? As we discussed offline, negating a pc.Expression has the expected outcome that ignores Nulls, and only becomes a problem when we use it for deletion, because we want to avoid deleting Nulls unless it is specified in the delete_filter explicitly. On that note, I feel that it may be good to make this a private function as well. WDYT? nit: typo ```suggestion """Complimentary filter conversion function of expression_to_pyarrow. ``` ########## pyiceberg/io/pyarrow.py: ########## @@ -638,10 +634,124 @@ def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> p return left_result | right_result +class _CollectNullUnmentionedTermsFromExpression(BoundBooleanExpressionVisitor[Any]): + def __init__(self) -> None: + # BoundTerms which have either is_null or is_not_null appearing at least once in the boolean expr. + self.is_valid_or_not_bound_terms: set[BoundTerm[Any]] = set() + # The remaining BoundTerms appearing in the boolean expr. + self.null_unmentioned_bound_terms: set[BoundTerm[Any]] = set() + super().__init__() + + def _handle_explicit_is_null_or_not(self, term: BoundTerm[Any]) -> None: + """Handle the predicate case where either is_null or is_not_null is included.""" + if term in self.null_unmentioned_bound_terms: + self.null_unmentioned_bound_terms.remove(term) + self.is_valid_or_not_bound_terms.add(term) + + def _handle_skipped(self, term: BoundTerm[Any]) -> None: + """Handle the predicate case where neither is_null or is_not_null is included.""" + if term not in self.is_valid_or_not_bound_terms: + self.null_unmentioned_bound_terms.add(term) + + def visit_in(self, term: BoundTerm[pc.Expression], literals: Set[Any]) -> None: + self._handle_skipped(term) + + def visit_not_in(self, term: BoundTerm[pc.Expression], literals: Set[Any]) -> None: + self._handle_skipped(term) + + # todo: do I have to modify this as well + def visit_is_nan(self, term: BoundTerm[Any]) -> None: + self._handle_skipped(term) + + # todo: do I have to modify this as well, might need 2 self.xx sets for mentioned_nan and none-mentioned-nan + def visit_not_nan(self, term: BoundTerm[Any]) -> None: + self._handle_skipped(term) + + def visit_is_null(self, term: BoundTerm[Any]) -> None: + self._handle_explicit_is_null_or_not(term) + + def visit_not_null(self, term: BoundTerm[Any]) -> None: + self._handle_explicit_is_null_or_not(term) + + def visit_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_skipped(term) + + def visit_not_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_skipped(term) + + def visit_greater_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_skipped(term) + + def visit_greater_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_skipped(term) + + def visit_less_than(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_skipped(term) + + def visit_less_than_or_equal(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_skipped(term) + + def visit_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_skipped(term) + + def visit_not_starts_with(self, term: BoundTerm[Any], literal: Literal[Any]) -> None: + self._handle_skipped(term) + + def visit_true(self) -> None: + return + + def visit_false(self) -> None: + return + + def visit_not(self, child_result: pc.Expression) -> None: + return + + def visit_and(self, left_result: pc.Expression, right_result: pc.Expression) -> None: + return + + def visit_or(self, left_result: pc.Expression, right_result: pc.Expression) -> None: + return + + +def _get_is_valid_or_not_bound_refs(expr: BooleanExpression) -> tuple[Set[BoundReference[Any]], Set[BoundReference[Any]]]: + """Collect the bound terms catogorized by having at least one is_null or is_not_null in the expr and the remaining.""" Review Comment: nit: typo ```suggestion """Collect the bound terms categorized by having at least one is_null or is_not_null in the expr and the remaining.""" ``` ########## pyiceberg/expressions/__init__.py: ########## @@ -135,6 +135,10 @@ def __repr__(self) -> str: def ref(self) -> BoundReference[L]: return self + def __hash__(self) -> int: + """Return hash value of the Record class.""" Review Comment: nit: ```suggestion """Return hash value of the BoundReference class.""" ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org