This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch branch-2.1-lakehouse in repository https://gitbox.apache.org/repos/asf/doris.git
commit 8b8e129c92093b37815b0070b239cff6bc5bbe2e Author: Socrates <[email protected]> AuthorDate: Sat Feb 8 19:56:13 2025 +0800 [fix](orc) dont't push down null aware predicate (#47625) ### What problem does this PR solve? Related PR: #43255 Problem Summary: Example: ```sql CREATE TABLE table_a ( id INT, age INT ) STORED AS ORC; INSERT INTO table_a VALUES (1, null), (2, 18), (3, null), (4, 25); CREATE TABLE table_b ( id INT, age INT ) STORED AS ORC; INSERT INTO table_b VALUES (1, null), (2, null), (3, 1000000), (4, 100); ``` run sql ``` select * from table_a inner join table_b on table_a.age <=> table_b.age and table_b.id in (1,3); ``` When executing this SQL, the backend generates a runtime filter on the table_a side during the join operation, resulting in a condition like WHERE table_a.age IN (NULL, 1000000). It’s important to note that since <=> is a null-aware comparison operator, the IN predicate must also be null-aware. However, the ORC predicate pushdown API does not support null-aware IN predicates. As a result, our current approach ignores null values, leading to an empty result set for this query. To fix this bug, we’ve adjusted the logic so that predicates with null-aware comparisons are not pushed down, ensuring the correct result as follows: ```text +------+------+------+------+ | id | age | id | age | +------+------+------+------+ | 1 | NULL | 1 | NULL | | 3 | NULL | 1 | NULL | +------+------+------+------+ ``` --- be/src/vec/exec/format/orc/vorc_reader.cpp | 5 ++++- .../orc_predicate/orc_predicate_table.hql | 22 +++++++++++++++++++++ .../hive/test_hive_orc_predicate.out | Bin 463 -> 585 bytes .../hive/test_hive_orc_predicate.groovy | 2 ++ 4 files changed, 28 insertions(+), 1 deletion(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 104fd4b9744..9e0bbf4af9d 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -712,7 +712,10 @@ bool OrcReader::_check_expr_can_push_down(const VExprSPtr& expr) { case TExprOpcode::NE: case TExprOpcode::FILTER_IN: case TExprOpcode::FILTER_NOT_IN: - return _check_slot_can_push_down(expr) && _check_rest_children_can_push_down(expr); + // can't push down if expr is null aware predicate + return expr->node_type() != TExprNodeType::NULL_AWARE_BINARY_PRED && + expr->node_type() != TExprNodeType::NULL_AWARE_IN_PRED && + _check_slot_can_push_down(expr) && _check_rest_children_can_push_down(expr); case TExprOpcode::INVALID_OPCODE: if (expr->node_type() == TExprNodeType::FUNCTION_CALL) { diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql index a946b25ff1a..6a1c9dce521 100644 --- a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql +++ b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql @@ -14,3 +14,25 @@ create table type_changed_table ( ) stored as orc; insert into type_changed_table values (1, 'Alice'), (2, 'Bob'), (3, 'Charlie'); ALTER TABLE type_changed_table CHANGE COLUMN id id STRING; + +CREATE TABLE table_a ( + id INT, + age INT +) STORED AS ORC; + +INSERT INTO table_a VALUES +(1, null), +(2, 18), +(3, null), +(4, 25); + +CREATE TABLE table_b ( + id INT, + age INT +) STORED AS ORC; + +INSERT INTO table_b VALUES +(1, null), +(2, null), +(3, 1000000), +(4, 100); diff --git a/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out b/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out index f42bb629550..d943f3f80c6 100644 Binary files a/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out and b/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out differ diff --git a/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy b/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy index 2dd647aa2c1..726354e1408 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy @@ -43,6 +43,8 @@ suite("test_hive_orc_predicate", "p0,external,hive,external_docker,external_dock qt_predicate_changed_type2 """ select * from type_changed_table where id = '2';""" qt_predicate_changed_type3 """ select * from type_changed_table where id = '3';""" + qt_predicate_null_aware_equal_in_rt """select * from table_a inner join table_b on table_a.age <=> table_b.age and table_b.id in (1,3);""" + sql """drop catalog if exists ${catalog_name}""" } finally { } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
