This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 58f1df28540 [fix](orc) ignore null values when the literals of 
in_predicate contains (#45104)
58f1df28540 is described below

commit 58f1df285408163787bb9d52772064c18e5d9560
Author: Socrates <suyit...@selectdb.com>
AuthorDate: Wed Dec 18 15:32:17 2024 +0800

    [fix](orc) ignore null values when the literals of in_predicate contains 
(#45104)
    
    ### What problem does this PR solve?
    
    Related PR: #43255
    
    Problem Summary:
    Should ignore null values when the literals of in_predicate contains
    null value, like `in (1, null)`
    For example, init table in hive:
    ```sql
    CREATE TABLE sample_orc_table (
        id INT,
        name STRING,
        age INT
    )
    STORED AS ORC;
    INSERT INTO TABLE sample_orc_table VALUES
        (1, 'Alice', 25),
        (2, NULL, NULL);
    ```
    select result in Doris should be:
    ```sql
    mysql> select * from sample_orc_table where age in (null,25);
    +------+-------+------+
    | id   | name  | age  |
    +------+-------+------+
    |    1 | Alice |   25 |
    +------+-------+------+
    1 row in set (0.30 sec)
    
    mysql> select * from sample_orc_table where age in (25);
    +------+-------+------+
    | id   | name  | age  |
    +------+-------+------+
    |    1 | Alice |   25 |
    +------+-------+------+
    1 row in set (0.27 sec)
    
    mysql> select * from sample_orc_table where age in (null);
    Empty set (0.01 sec)
    
    mysql> select * from sample_orc_table where age is null;
    +------+------+------+
    | id   | name | age  |
    +------+------+------+
    |    2 | NULL | NULL |
    +------+------+------+
    1 row in set (0.11 sec)
    ```
---
 be/src/apache-orc                                  |   2 +-
 be/src/vec/exec/format/orc/vorc_reader.cpp         |  24 +++--
 be/test/vec/exec/orc_reader_test.cpp               |   3 +
 .../data/external_table_p0/hive/test_hive_orc.out  | 102 +++++++++++++++++++++
 .../external_table_p0/hive/test_hive_orc.groovy    |  22 +++++
 5 files changed, 145 insertions(+), 8 deletions(-)

diff --git a/be/src/apache-orc b/be/src/apache-orc
index db01184f765..2f937bdc764 160000
--- a/be/src/apache-orc
+++ b/be/src/apache-orc
@@ -1 +1 @@
-Subproject commit db01184f765c03496e4107bd3ac37c077ac4bc5f
+Subproject commit 2f937bdc76406f150b484b6e57629aa8a03d48b6
diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp 
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index df8ec52efc2..a1ecb1ae0dc 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -586,6 +586,10 @@ std::tuple<bool, orc::Literal, orc::PredicateDataType> 
OrcReader::_make_orc_lite
         // only get the predicate_type
         return std::make_tuple(true, orc::Literal(true), predicate_type);
     }
+    // this only happens when the literals of in_predicate contains null 
value, like in (1, null)
+    if (literal->get_column_ptr()->is_null_at(0)) {
+        return std::make_tuple(false, orc::Literal(false), predicate_type);
+    }
     auto literal_data = literal->get_column_ptr()->get_data_at(0);
     auto* slot = _tuple_descriptor->slots()[slot_ref->column_id()];
     auto slot_type = slot->type();
@@ -666,12 +670,13 @@ bool OrcReader::_check_rest_children_can_push_down(const 
VExprSPtr& expr) {
         return false;
     }
 
+    bool at_least_one_child_can_push_down = false;
     for (size_t i = 1; i < expr->children().size(); ++i) {
-        if (!_check_literal_can_push_down(expr, i)) {
-            return false;
+        if (_check_literal_can_push_down(expr, i)) {
+            at_least_one_child_can_push_down = true;
         }
     }
-    return true;
+    return at_least_one_child_can_push_down;
 }
 
 // check if the expr can be pushed down to orc reader
@@ -774,12 +779,17 @@ void OrcReader::_build_filter_in(const VExprSPtr& expr,
     for (size_t i = 1; i < expr->children().size(); ++i) {
         DCHECK(expr->children()[i]->is_literal());
         const auto* literal = static_cast<const 
VLiteral*>(expr->children()[i].get());
-        DCHECK(_vliteral_to_orc_literal.contains(literal));
-        auto orc_literal = _vliteral_to_orc_literal.find(literal)->second;
-        literals.emplace_back(orc_literal);
+        if (_vliteral_to_orc_literal.contains(literal)) {
+            auto orc_literal = _vliteral_to_orc_literal.find(literal)->second;
+            literals.emplace_back(orc_literal);
+        }
     }
     DCHECK(!literals.empty());
-    builder->in(slot_ref->expr_name(), predicate_type, literals);
+    if (literals.size() == 1) {
+        builder->equals(slot_ref->expr_name(), predicate_type, literals[0]);
+    } else {
+        builder->in(slot_ref->expr_name(), predicate_type, literals);
+    }
 }
 
 void OrcReader::_build_is_null(const VExprSPtr& expr,
diff --git a/be/test/vec/exec/orc_reader_test.cpp 
b/be/test/vec/exec/orc_reader_test.cpp
index f2bba434368..ff7452ae625 100644
--- a/be/test/vec/exec/orc_reader_test.cpp
+++ b/be/test/vec/exec/orc_reader_test.cpp
@@ -111,6 +111,8 @@ TEST_F(OrcReaderTest, test_build_search_argument) {
                             
R"|({"1":{"lst":["rec",11,{"1":{"i32":6},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":3},"4":{"i32":2},"20":{"i32":-1},"29":{"tf":1}},{"1":{"i32":2},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":11},"4":{"i32":2},"20":{"i32":-1},"26":{"rec":{"1":{"rec":{"2":{"str":"lt"}}},"2":{"i32":0},"3":{"lst":["rec",2,{"1":{"lst":["rec",1,{"1":{"i32":0},
 [...]
                             // select count(o_orderkey) from tpch1_orc.orders 
where o_orderkey < 1 + 1;
                             
R"|({"1":{"lst":["rec",3,{"1":{"i32":2},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":11},"4":{"i32":2},"20":{"i32":-1},"26":{"rec":{"1":{"rec":{"2":{"str":"lt"}}},"2":{"i32":0},"3":{"lst":["rec",2,{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":5}}}}]},"3":{"i64":-1}},{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":5}}}}]},"3":{"i64":-1}}]},"4":{"rec":{"1":{"lst":["rec",1,{"1":{
 [...]
+                            // select count(o_orderkey) from tpch1_orc.orders 
where o_orderkey in (null, 25);
+                            
R"|({"1":{"lst":["rec",4,{"1":{"i32":11},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":5},"4":{"i32":3},"11":{"rec":{"1":{"tf":0}}},"20":{"i32":-1},"29":{"tf":1}},{"1":{"i32":16},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":5}}}}]},"3":{"i64":-1}}},"4":{"i32":0},"15":{"rec":{"1":{"i32":0},"2":{"i32":0},"3":{"i32":-1}}},"20":{"i32":-1},"29":{"tf":1},"36":{"str":"o_orderkey"}},{
 [...]
                             // SELECT count(o_orderkey) FROM tpch1_orc.orders 
WHERE o_comment LIKE '%delayed%' OR o_orderpriority = '1-URGENT';
                             
R"|({"1":{"lst":["rec",7,{"1":{"i32":6},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"3":{"i32":3},"4":{"i32":2},"20":{"i32":-1},"29":{"tf":1}},{"1":{"i32":20},"2":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":2}}}}]},"3":{"i64":-1}}},"4":{"i32":2},"20":{"i32":-1},"26":{"rec":{"1":{"rec":{"2":{"str":"like"}}},"2":{"i32":0},"3":{"lst":["rec",2,{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"
 [...]
                             // select count(o_orderkey) from tpch1_orc.orders 
where o_orderkey between 1 and 100 or random() > 0.5;
@@ -143,6 +145,7 @@ TEST_F(OrcReaderTest, test_build_search_argument) {
             "leaf-0 = (o_orderkey < 2), leaf-1 = (o_custkey < 36901), expr = 
(or leaf-0 (not "
             "leaf-1))",
             "leaf-0 = (o_orderkey < 2), expr = leaf-0",
+            "leaf-0 = (o_orderkey = 25), expr = leaf-0",
             CANNOT_PUSH_DOWN_ERROR,
             CANNOT_PUSH_DOWN_ERROR,
             CANNOT_PUSH_DOWN_ERROR,
diff --git a/regression-test/data/external_table_p0/hive/test_hive_orc.out 
b/regression-test/data/external_table_p0/hive/test_hive_orc.out
index b34f276020c..03942dbe9fb 100644
--- a/regression-test/data/external_table_p0/hive/test_hive_orc.out
+++ b/regression-test/data/external_table_p0/hive/test_hive_orc.out
@@ -134,6 +134,57 @@ tablets    tinyint_col     179     182     182     187     
183     181     177     183     177     187     183     202     202     186     
528
 -- !predicate_pushdown8 --
 1533
 
+-- !predicate_pushdown_in1 --
+0
+
+-- !predicate_pushdown_in2 --
+1634
+
+-- !predicate_pushdown_in3 --
+1597
+
+-- !orc_all_types_tinyint_col_is_null --
+366
+
+-- !orc_all_types_smallint_col_is_null --
+359
+
+-- !orc_all_types_int_col_is_null --
+361
+
+-- !orc_all_types_bigint_col_is_null --
+377
+
+-- !orc_all_types_boolean_col_is_null --
+369
+
+-- !orc_all_types_float_col_is_null --
+340
+
+-- !orc_all_types_double_col_is_null --
+395
+
+-- !orc_all_types_string_col_is_null --
+347
+
+-- !orc_all_types_binary_col_is_null --
+362
+
+-- !orc_all_types_timestamp_col_is_null --
+339
+
+-- !orc_all_types_decimal_col_is_null --
+367
+
+-- !orc_all_types_char_col_is_null --
+0
+
+-- !orc_all_types_varchar_col_is_null --
+0
+
+-- !orc_all_types_date_col_is_null --
+378
+
 -- !select_top50 --
 4      55      999742610       400899305488827731      false   6.5976813E8     
7.8723304616937395E17   \N      base tennis pit vertical friday 
2022-08-19T07:29:58     \N      tablets smallint_col    2019-02-07      
[7.53124931825377e+17]  ["NbSSBtwzpxNSkkwga"]   tablets smallint_col
 2      49      999613702       105493714032727452      \N      6.3322381E8     
9.8642324410240179E17   Unveil bright recruit participate. Suspect impression 
camera mathematical revelation. Fault live2 elbow debt west hydrogen current.   
  how literary    2022-09-03T17:20:21     481707.1065     tablets boolean_col   
  2020-01-12      []      ["HoMrAnn", "wteEFvIwoZsVpVQdscMb", null, "zcGFmv", 
"kGEBBckbMtX", "hrEtCGFdPWZK"]      tablets boolean_col
@@ -321,6 +372,57 @@ tablets    tinyint_col     179     182     182     187     
183     181     177     183     177     187     183     202     202     186     
528
 -- !predicate_pushdown8 --
 1533
 
+-- !predicate_pushdown_in1 --
+0
+
+-- !predicate_pushdown_in2 --
+1634
+
+-- !predicate_pushdown_in3 --
+1597
+
+-- !orc_all_types_tinyint_col_is_null --
+366
+
+-- !orc_all_types_smallint_col_is_null --
+359
+
+-- !orc_all_types_int_col_is_null --
+361
+
+-- !orc_all_types_bigint_col_is_null --
+377
+
+-- !orc_all_types_boolean_col_is_null --
+369
+
+-- !orc_all_types_float_col_is_null --
+340
+
+-- !orc_all_types_double_col_is_null --
+395
+
+-- !orc_all_types_string_col_is_null --
+347
+
+-- !orc_all_types_binary_col_is_null --
+362
+
+-- !orc_all_types_timestamp_col_is_null --
+339
+
+-- !orc_all_types_decimal_col_is_null --
+367
+
+-- !orc_all_types_char_col_is_null --
+0
+
+-- !orc_all_types_varchar_col_is_null --
+0
+
+-- !orc_all_types_date_col_is_null --
+378
+
 -- !select_top50 --
 4      55      999742610       400899305488827731      false   6.5976813E8     
7.8723304616937395E17   \N      base tennis pit vertical friday 
2022-08-19T07:29:58     \N      tablets smallint_col    2019-02-07      
[7.53124931825377e+17]  ["NbSSBtwzpxNSkkwga"]   tablets smallint_col
 2      49      999613702       105493714032727452      \N      6.3322381E8     
9.8642324410240179E17   Unveil bright recruit participate. Suspect impression 
camera mathematical revelation. Fault live2 elbow debt west hydrogen current.   
  how literary    2022-09-03T17:20:21     481707.1065     tablets boolean_col   
  2020-01-12      []      ["HoMrAnn", "wteEFvIwoZsVpVQdscMb", null, "zcGFmv", 
"kGEBBckbMtX", "hrEtCGFdPWZK"]      tablets boolean_col
diff --git a/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy 
b/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy
index 6457d2b3edd..8d85feaa77a 100644
--- a/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy
+++ b/regression-test/suites/external_table_p0/hive/test_hive_orc.groovy
@@ -90,6 +90,28 @@ suite("test_hive_orc", 
"all_types,p0,external,hive,external_docker,external_dock
         qt_predicate_pushdown6 """ SELECT count(o_orderkey) FROM 
tpch1_orc.orders WHERE o_orderstatus <> 'F' AND o_custkey < 54321; """
         qt_predicate_pushdown7 """ SELECT count(o_orderkey) FROM 
tpch1_orc.orders WHERE o_comment LIKE '%delayed%' OR o_orderpriority = 
'1-URGENT'; """
         qt_predicate_pushdown8 """ SELECT count(o_orderkey) FROM 
tpch1_orc.orders WHERE o_orderkey IN (1000000, 2000000, 3000000) OR o_clerk = 
'Clerk#000000470'; """
+
+        qt_predicate_pushdown_in1 """ select count(*)  from orc_all_types 
where boolean_col in (null); """
+        qt_predicate_pushdown_in2 """ select count(*)  from orc_all_types 
where boolean_col in (null, 0); """
+        qt_predicate_pushdown_in3 """ select count(*)  from orc_all_types 
where boolean_col in (null, 1); """
+
+        def test_col_is_null = { String col ->
+            "qt_orc_all_types_${col}_is_null" """ select count(*)  from 
orc_all_types where ${col} is null; """
+        }
+        test_col_is_null("tinyint_col")
+        test_col_is_null("smallint_col")
+        test_col_is_null("int_col")
+        test_col_is_null("bigint_col")
+        test_col_is_null("boolean_col")
+        test_col_is_null("float_col")
+        test_col_is_null("double_col")
+        test_col_is_null("string_col")
+        test_col_is_null("binary_col")
+        test_col_is_null("timestamp_col")
+        test_col_is_null("decimal_col")
+        test_col_is_null("char_col")
+        test_col_is_null("varchar_col")
+        test_col_is_null("date_col")
     }
 
     String enabled = context.config.otherConfigs.get("enableHiveTest")


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to