This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-2.1-lakehouse
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 8b8e129c92093b37815b0070b239cff6bc5bbe2e
Author: Socrates <[email protected]>
AuthorDate: Sat Feb 8 19:56:13 2025 +0800

    [fix](orc) dont't push down null aware predicate (#47625)
    
    ### What problem does this PR solve?
    
    Related PR: #43255
    
    Problem Summary:
    Example:
    ```sql
    CREATE TABLE table_a (
        id INT,
        age INT
    ) STORED AS ORC;
    
    INSERT INTO table_a VALUES
    (1, null),
    (2, 18),
    (3, null),
    (4, 25);
    
    CREATE TABLE table_b (
        id INT,
        age INT
    ) STORED AS ORC;
    
    INSERT INTO table_b VALUES
    (1, null),
    (2, null),
    (3, 1000000),
    (4, 100);
    ```
    run sql
    ```
    select * from table_a inner join table_b on table_a.age <=> table_b.age and 
table_b.id in (1,3);
    ```
    When executing this SQL, the backend generates a runtime filter on the
    table_a side during the join operation, resulting in a condition like
    WHERE table_a.age IN (NULL, 1000000). It’s important to note that since
    <=> is a null-aware comparison operator, the IN predicate must also be
    null-aware. However, the ORC predicate pushdown API does not support
    null-aware IN predicates. As a result, our current approach ignores null
    values, leading to an empty result set for this query.
    
    To fix this bug, we’ve adjusted the logic so that predicates with
    null-aware comparisons are not pushed down, ensuring the correct result
    as follows:
    ```text
    +------+------+------+------+
    | id   | age  | id   | age  |
    +------+------+------+------+
    |    1 | NULL |    1 | NULL |
    |    3 | NULL |    1 | NULL |
    +------+------+------+------+
    ```
---
 be/src/vec/exec/format/orc/vorc_reader.cpp         |   5 ++++-
 .../orc_predicate/orc_predicate_table.hql          |  22 +++++++++++++++++++++
 .../hive/test_hive_orc_predicate.out               | Bin 463 -> 585 bytes
 .../hive/test_hive_orc_predicate.groovy            |   2 ++
 4 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp 
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 104fd4b9744..9e0bbf4af9d 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -712,7 +712,10 @@ bool OrcReader::_check_expr_can_push_down(const VExprSPtr& 
expr) {
     case TExprOpcode::NE:
     case TExprOpcode::FILTER_IN:
     case TExprOpcode::FILTER_NOT_IN:
-        return _check_slot_can_push_down(expr) && 
_check_rest_children_can_push_down(expr);
+        // can't push down if expr is null aware predicate
+        return expr->node_type() != TExprNodeType::NULL_AWARE_BINARY_PRED &&
+               expr->node_type() != TExprNodeType::NULL_AWARE_IN_PRED &&
+               _check_slot_can_push_down(expr) && 
_check_rest_children_can_push_down(expr);
 
     case TExprOpcode::INVALID_OPCODE:
         if (expr->node_type() == TExprNodeType::FUNCTION_CALL) {
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql
 
b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql
index a946b25ff1a..6a1c9dce521 100644
--- 
a/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/data/multi_catalog/orc_predicate/orc_predicate_table.hql
@@ -14,3 +14,25 @@ create table type_changed_table (
 ) stored as orc;
 insert into type_changed_table values (1, 'Alice'), (2, 'Bob'), (3, 'Charlie');
 ALTER TABLE type_changed_table CHANGE COLUMN id id STRING;
+
+CREATE TABLE table_a (
+    id INT,
+    age INT
+) STORED AS ORC;
+
+INSERT INTO table_a VALUES
+(1, null),
+(2, 18),
+(3, null),
+(4, 25);
+
+CREATE TABLE table_b (
+    id INT,
+    age INT
+) STORED AS ORC;
+
+INSERT INTO table_b VALUES
+(1, null),
+(2, null),
+(3, 1000000),
+(4, 100);
diff --git 
a/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out 
b/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out
index f42bb629550..d943f3f80c6 100644
Binary files 
a/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out and 
b/regression-test/data/external_table_p0/hive/test_hive_orc_predicate.out differ
diff --git 
a/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy 
b/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy
index 2dd647aa2c1..726354e1408 100644
--- 
a/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy
+++ 
b/regression-test/suites/external_table_p0/hive/test_hive_orc_predicate.groovy
@@ -43,6 +43,8 @@ suite("test_hive_orc_predicate", 
"p0,external,hive,external_docker,external_dock
             qt_predicate_changed_type2 """ select * from type_changed_table 
where id = '2';"""
             qt_predicate_changed_type3 """ select * from type_changed_table 
where id = '3';"""
 
+            qt_predicate_null_aware_equal_in_rt """select * from table_a inner 
join table_b on table_a.age <=> table_b.age and table_b.id in (1,3);"""
+
             sql """drop catalog if exists ${catalog_name}"""
         } finally {
         }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to