kevinjqliu commented on code in PR #1865: URL: https://github.com/apache/iceberg-python/pull/1865#discussion_r2021452599
########## tests/io/test_pyarrow.py: ########## @@ -2317,3 +2321,66 @@ def test_pyarrow_io_multi_fs() -> None: # Same PyArrowFileIO instance resolves local file input to LocalFileSystem assert isinstance(pyarrow_file_io.new_input("file:///path/to/file")._filesystem, LocalFileSystem) + + +def test_scan_nulls(catalog: InMemoryCatalog, arrow_table_with_null: pa.Table) -> None: + import pyarrow.compute as pc + + catalog.create_namespace("default") + table = catalog.create_table( + "default.test_scan_nulls", + schema=arrow_table_with_null.schema, + ) + table.append(arrow_table_with_null) + + # "string": ["a", None, "z"] + assert len(table.scan(row_filter="string is null").to_arrow()) == 1 + assert len(table.scan(row_filter=IsNull("string")).to_arrow()) == 1 + assert len(table.scan().to_arrow().filter(pc.field("string").is_null())) == 1 + + assert len(table.scan(row_filter="string is not null").to_arrow()) == 2 + assert len(table.scan(row_filter=NotNull("string")).to_arrow()) == 2 + assert len(table.scan().to_arrow().filter(pc.field("string").is_valid())) == 2 + + assert len(table.scan(row_filter="string == 'a'").to_arrow()) == 1 + assert len(table.scan(row_filter=EqualTo(term="string", literal=("a"))).to_arrow()) == 1 + assert len(table.scan().to_arrow().filter(pc.field("string") == "a")) == 1 + + # this should be 2 + assert len(table.scan(row_filter="string != 'a'").to_arrow()) == 1 + assert len(table.scan(row_filter=NotEqualTo(term="string", literal=("a"))).to_arrow()) == 1 + assert len(table.scan(row_filter=Not(EqualTo(term="string", literal=("a")))).to_arrow()) == 1 + assert len(table.scan().to_arrow().filter(pc.field("string") != "a")) == 1 + + +def test_scan_kleene(catalog: InMemoryCatalog, arrow_table_with_null: pa.Table) -> None: + catalog.create_namespace("default") + table = catalog.create_table( + "default.test_scan_nulls", + schema=arrow_table_with_null.schema, + ) + table.append(arrow_table_with_null) + + # "string": ["a", None, "z"] + assert 
len(table.scan(row_filter="string is null OR string = 'a'").to_arrow()) == 2 # {null, a} + assert len(table.scan(row_filter="string is null AND string = 'a'").to_arrow()) == 0 # {} + assert len(table.scan(row_filter="string is not null OR string = 'a'").to_arrow()) == 2 # {a, z} + assert len(table.scan(row_filter="string is not null AND string = 'a'").to_arrow()) == 1 # {a} + + +def test_scan_complements(catalog: InMemoryCatalog, arrow_table_with_null: pa.Table) -> None: + from pyiceberg.expressions.visitors import bind + from pyiceberg.io.pyarrow import _expression_to_complementary_pyarrow Review Comment: `_expression_to_complementary_pyarrow` explicitly calls out null handling; see https://github.com/apache/iceberg-python/blob/d69a19113ea537d16c34b60ab6e69c4285f933c0/pyiceberg/io/pyarrow.py#L883-L886 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org