This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 3593a82c6f4 [opt](inverted index) unified optimization judgment to 
prevent omissions (#38027)
3593a82c6f4 is described below

commit 3593a82c6f4d42eae703469b867ef22ff93f6827
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Thu Jul 18 11:47:03 2024 +0800

    [opt](inverted index) unified optimization judgment to prevent omissions 
(#38027)
    
    1. optimize inverted index, strengthen logical judgment.
---
 be/src/olap/rowset/segment_v2/segment_iterator.cpp | 62 +++++++++-------------
 be/src/olap/rowset/segment_v2/segment_iterator.h   |  5 +-
 be/src/vec/exprs/vexpr.cpp                         | 17 +++++-
 .../test_all_index_hit_fault_injection.out         | 24 +++++++++
 .../test_topn_fault_injection.out                  | 30 +++++++++++
 .../test_all_index_hit_fault_injection.groovy      | 11 +++-
 .../test_topn_fault_injection.groovy               | 32 ++++++++---
 7 files changed, 132 insertions(+), 49 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp 
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index db1d2e9e676..61be47cced7 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -1345,19 +1345,15 @@ Status SegmentIterator::_apply_inverted_index() {
     return Status::OK();
 }
 
-bool 
SegmentIterator::_check_all_predicates_passed_inverted_index_for_column(ColumnId
 cid) {
+bool 
SegmentIterator::_check_all_predicates_passed_inverted_index_for_column(ColumnId
 cid,
+                                                                             
bool default_return) {
     auto it = _column_predicate_inverted_index_status.find(cid);
     if (it != _column_predicate_inverted_index_status.end()) {
         const auto& pred_map = it->second;
-
-        bool all_true = std::all_of(pred_map.begin(), pred_map.end(),
-                                    [](const auto& pred_entry) { return 
pred_entry.second; });
-
-        if (all_true) {
-            return true;
-        }
+        return std::all_of(pred_map.begin(), pred_map.end(),
+                           [](const auto& pred_entry) { return 
pred_entry.second; });
     }
-    return false;
+    return default_return;
 }
 
 Status SegmentIterator::_init_return_column_iterators() {
@@ -2404,9 +2400,9 @@ Status 
SegmentIterator::_next_batch_internal(vectorized::Block* block) {
         nrows_read_limit = std::min(static_cast<uint32_t>(_opts.topn_limit), 
nrows_read_limit);
     }
 
-    DBUG_EXECUTE_IF("segment_iterator.topn_opt", {
+    DBUG_EXECUTE_IF("segment_iterator.topn_opt_1", {
         if (nrows_read_limit != 1) {
-            return Status::Error<ErrorCode::INTERNAL_ERROR>("topn opt execute 
failed: {}",
+            return Status::Error<ErrorCode::INTERNAL_ERROR>("topn opt 1 
execute failed: {}",
                                                             nrows_read_limit);
         }
     })
@@ -2887,19 +2883,7 @@ bool SegmentIterator::_no_need_read_key_data(ColumnId 
cid, vectorized::MutableCo
         return false;
     }
 
-    std::set<uint32_t> cids;
-    for (auto* pred : _col_predicates) {
-        cids.insert(pred->column_id());
-    }
-    for (auto* pred : _col_preds_except_leafnode_of_andnode) {
-        cids.insert(pred->column_id());
-    }
-
-    // If the key is present in expr, data needs to be read.
-    if (cids.contains(cid)) {
-        return false;
-    }
-    if 
(_column_pred_in_remaining_vconjunct.contains(_opts.tablet_schema->column(cid).name()))
 {
+    if (!_check_all_predicates_passed_inverted_index_for_column(cid, true)) {
         return false;
     }
 
@@ -2920,7 +2904,7 @@ bool SegmentIterator::_has_delete_predicate(ColumnId cid) 
{
     return delete_columns_set.contains(cid);
 }
 
-bool SegmentIterator::_can_opt_topn_reads() const {
+bool SegmentIterator::_can_opt_topn_reads() {
     if (_opts.topn_limit <= 0) {
         return false;
     }
@@ -2929,20 +2913,24 @@ bool SegmentIterator::_can_opt_topn_reads() const {
         return false;
     }
 
-    std::set<uint32_t> cids;
-    for (auto* pred : _col_predicates) {
-        cids.insert(pred->column_id());
-    }
-    for (auto* pred : _col_preds_except_leafnode_of_andnode) {
-        cids.insert(pred->column_id());
-    }
-
-    uint32_t delete_sign_idx = _opts.tablet_schema->delete_sign_idx();
-    bool result = std::ranges::all_of(cids.begin(), cids.end(), 
[delete_sign_idx](auto cid) {
-        return cid == delete_sign_idx;
+    bool all_true = std::ranges::all_of(_schema->column_ids(), [this](auto 
cid) {
+        if (cid == _opts.tablet_schema->delete_sign_idx() ||
+            _opts.tablet_schema->column(cid).is_key()) {
+            return true;
+        }
+        if (_check_all_predicates_passed_inverted_index_for_column(cid, true)) 
{
+            return true;
+        }
+        return false;
     });
 
-    return result;
+    DBUG_EXECUTE_IF("segment_iterator.topn_opt_2", {
+        if (all_true) {
+            return Status::Error<ErrorCode::INTERNAL_ERROR>("topn opt 2 
execute failed");
+        }
+    })
+
+    return all_true;
 }
 
 } // namespace segment_v2
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h 
b/be/src/olap/rowset/segment_v2/segment_iterator.h
index cb904f21c6a..c9284e592e4 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.h
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.h
@@ -387,10 +387,11 @@ private:
 
     bool _has_delete_predicate(ColumnId cid);
 
-    bool _can_opt_topn_reads() const;
+    bool _can_opt_topn_reads();
 
     void _initialize_predicate_results();
-    bool _check_all_predicates_passed_inverted_index_for_column(ColumnId cid);
+    bool _check_all_predicates_passed_inverted_index_for_column(ColumnId cid,
+                                                                bool 
default_return = false);
 
     class BitmapRangeIterator;
     class BackwardBitmapRangeIterator;
diff --git a/be/src/vec/exprs/vexpr.cpp b/be/src/vec/exprs/vexpr.cpp
index bb6e48f6084..31a8e04cad1 100644
--- a/be/src/vec/exprs/vexpr.cpp
+++ b/be/src/vec/exprs/vexpr.cpp
@@ -23,6 +23,7 @@
 #include <thrift/protocol/TDebugProtocol.h>
 
 #include <algorithm>
+#include <boost/algorithm/string/split.hpp>
 #include <boost/iterator/iterator_facade.hpp>
 #include <memory>
 #include <stack>
@@ -604,8 +605,20 @@ bool VExpr::fast_execute(Block& block, const 
ColumnNumbers& arguments, size_t re
                          size_t input_rows_count, const std::string& 
function_name) {
     std::string result_column_name = gen_predicate_result_sign(block, 
arguments, function_name);
     if (!block.has(result_column_name)) {
-        DBUG_EXECUTE_IF("segment_iterator.fast_execute",
-                        { return 
Status::Error<ErrorCode::INTERNAL_ERROR>("fast_execute failed"); })
+        DBUG_EXECUTE_IF("segment_iterator.fast_execute", {
+            auto debug_col_name = 
DebugPoints::instance()->get_debug_param_or_default<std::string>(
+                    "segment_iterator._read_columns_by_index", "column_name", 
"");
+
+            std::vector<std::string> column_names;
+            boost::split(column_names, debug_col_name, 
boost::algorithm::is_any_of(","));
+
+            std::string column_name = block.get_by_position(arguments[0]).name;
+            auto it = std::find(column_names.begin(), column_names.end(), 
column_name);
+            if (it == column_names.end()) {
+                return Status::Error<ErrorCode::INTERNAL_ERROR>("fast_execute 
failed: {}",
+                                                                
result_column_name);
+            }
+        })
         return false;
     }
 
diff --git 
a/regression-test/data/fault_injection_p0/test_all_index_hit_fault_injection.out
 
b/regression-test/data/fault_injection_p0/test_all_index_hit_fault_injection.out
index ea2b79e0c9b..205effb8dbf 100644
--- 
a/regression-test/data/fault_injection_p0/test_all_index_hit_fault_injection.out
+++ 
b/regression-test/data/fault_injection_p0/test_all_index_hit_fault_injection.out
@@ -14,6 +14,18 @@
 -- !sql --
 14
 
+-- !sql --
+999
+
+-- !sql --
+209
+
+-- !sql --
+209
+
+-- !sql --
+334
+
 -- !sql --
 120
 
@@ -29,3 +41,15 @@
 -- !sql --
 11
 
+-- !sql --
+279
+
+-- !sql --
+119
+
+-- !sql --
+119
+
+-- !sql --
+154
+
diff --git 
a/regression-test/data/fault_injection_p0/test_topn_fault_injection.out 
b/regression-test/data/fault_injection_p0/test_topn_fault_injection.out
index 9cc3f4146b5..fe5034df477 100644
--- a/regression-test/data/fault_injection_p0/test_topn_fault_injection.out
+++ b/regression-test/data/fault_injection_p0/test_topn_fault_injection.out
@@ -17,9 +17,39 @@
 -- !sql --
 893964672      26.1.0.0        GET /images/hm_bg.jpg HTTP/1.0  200     24736
 
+-- !sql --
+893964617      40.135.0.0      GET /images/hm_bg.jpg HTTP/1.0  200     24736
+
+-- !sql --
+893964672      26.1.0.0        GET /images/hm_bg.jpg HTTP/1.0  200     24736
+
+-- !sql --
+893964672      26.1.0.0        GET /images/hm_bg.jpg HTTP/1.0  200     24736
+
+-- !sql --
+893964672      26.1.0.0        GET /images/hm_bg.jpg HTTP/1.0  200     24736
+
+-- !sql --
+893964617      40.135.0.0      GET /images/hm_bg.jpg HTTP/1.0  200     24736
+
 -- !sql --
 893964672      26.1.0.0        GET /images/hm_bg.jpg HTTP/1.0  200     24736
 
+-- !sql --
+893964617      40.135.0.0      GET /images/hm_bg.jpg HTTP/1.0  200     24736
+
+-- !sql --
+893964617      40.135.0.0      GET /images/hm_bg.jpg HTTP/1.0  200     24736
+
+-- !sql --
+893964672      26.1.0.0        GET /images/hm_bg.jpg HTTP/1.0  200     24736
+
+-- !sql --
+893964617      40.135.0.0      GET /images/hm_bg.jpg HTTP/1.0  200     24736
+
+-- !sql --
+893964617      40.135.0.0      GET /images/hm_bg.jpg HTTP/1.0  200     24736
+
 -- !sql --
 893964672      26.1.0.0        GET /images/hm_bg.jpg HTTP/1.0  200     24736
 
diff --git 
a/regression-test/suites/fault_injection_p0/test_all_index_hit_fault_injection.groovy
 
b/regression-test/suites/fault_injection_p0/test_all_index_hit_fault_injection.groovy
index 3bd884a5d87..d1a8e7c7642 100644
--- 
a/regression-test/suites/fault_injection_p0/test_all_index_hit_fault_injection.groovy
+++ 
b/regression-test/suites/fault_injection_p0/test_all_index_hit_fault_injection.groovy
@@ -99,19 +99,28 @@ suite("test_all_index_hit_fault_injection", 
"nonConcurrent") {
 
       try {
         
GetDebugPoint().enableDebugPointForAllBEs("segment_iterator._read_columns_by_index",
 [column_name: "clientip,request"])
-        
GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.fast_execute")
+        
GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.fast_execute", 
[column_name: "status,size"])
 
+        
         qt_sql """ select count() from ${indexTbName1} where (request 
match_phrase 'hm'); """
         qt_sql """ select count() from ${indexTbName1} where (request 
match_phrase 'hm' and clientip = '126.1.0.0'); """
         qt_sql """ select count() from ${indexTbName1} where (request 
match_phrase 'hm' and clientip = '126.1.0.0') or (request match_phrase 'bg' and 
clientip = '201.0.0.0'); """
         qt_sql """ select count() from ${indexTbName1} where (request 
match_phrase 'hm' and clientip = '126.1.0.0' or clientip = '247.37.0.0') or 
(request match_phrase 'bg' and clientip = '201.0.0.0' or clientip = 
'232.0.0.0'); """
         qt_sql """ select count() from ${indexTbName1} where (request 
match_phrase 'hm' and clientip in ('126.1.0.0', '247.37.0.0')) or (request 
match_phrase 'bg' and clientip in ('201.0.0.0', '232.0.0.0')); """
+        qt_sql """ select count() from ${indexTbName1} where (`@timestamp` >= 
893964617 and `@timestamp` < 893966455); """
+        qt_sql """ select count() from ${indexTbName1} where (`@timestamp` >= 
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm'); """
+        qt_sql """ select count() from ${indexTbName1} where (`@timestamp` >= 
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' or 
request match_phrase 'ag'); """
+        qt_sql """ select count() from ${indexTbName1} where (`@timestamp` >= 
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' or 
request match_phrase 'ag' or status = 304); """
 
         qt_sql """ select count() from ${indexTbName2} where (request 
match_phrase 'hm'); """
         qt_sql """ select count() from ${indexTbName2} where (request 
match_phrase 'hm' and clientip = '126.1.0.0'); """
         qt_sql """ select count() from ${indexTbName2} where (request 
match_phrase 'hm' and clientip = '126.1.0.0') or (request match_phrase 'bg' and 
clientip = '201.0.0.0'); """
         qt_sql """ select count() from ${indexTbName2} where (request 
match_phrase 'hm' and clientip = '126.1.0.0' or clientip = '247.37.0.0') or 
(request match_phrase 'bg' and clientip = '201.0.0.0' or clientip = 
'232.0.0.0'); """
         qt_sql """ select count() from ${indexTbName2} where (request 
match_phrase 'hm' and clientip in ('126.1.0.0', '247.37.0.0')) or (request 
match_phrase 'bg' and clientip in ('201.0.0.0', '232.0.0.0')); """
+        qt_sql """ select count() from ${indexTbName2} where (`@timestamp` >= 
893964617 and `@timestamp` < 893966455); """
+        qt_sql """ select count() from ${indexTbName2} where (`@timestamp` >= 
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm'); """
+        qt_sql """ select count() from ${indexTbName2} where (`@timestamp` >= 
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' or 
request match_phrase 'ag'); """
+        qt_sql """ select count() from ${indexTbName2} where (`@timestamp` >= 
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' or 
request match_phrase 'ag' or status = 304); """
 
       } finally {
         
GetDebugPoint().disableDebugPointForAllBEs("segment_iterator._read_columns_by_index")
diff --git 
a/regression-test/suites/fault_injection_p0/test_topn_fault_injection.groovy 
b/regression-test/suites/fault_injection_p0/test_topn_fault_injection.groovy
index 08a1ef0164d..37315a49525 100644
--- a/regression-test/suites/fault_injection_p0/test_topn_fault_injection.groovy
+++ b/regression-test/suites/fault_injection_p0/test_topn_fault_injection.groovy
@@ -31,12 +31,11 @@ suite("test_topn_fault_injection", "nonConcurrent") {
         INDEX clientip_idx (`clientip`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT '',
         INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT ''
       ) ENGINE=OLAP
-      UNIQUE KEY(`@timestamp`)
+      DUPLICATE KEY(`@timestamp`)
       COMMENT "OLAP"
-      DISTRIBUTED BY HASH(`@timestamp`) BUCKETS 1
+      DISTRIBUTED BY RANDOM BUCKETS 1
       PROPERTIES (
         "replication_allocation" = "tag.location.default: 1",
-        "enable_unique_key_merge_on_write" = "true",
         "disable_auto_compaction" = "true"
       );
     """
@@ -52,11 +51,12 @@ suite("test_topn_fault_injection", "nonConcurrent") {
         INDEX clientip_idx (`clientip`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT '',
         INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT ''
       ) ENGINE=OLAP
-      DUPLICATE KEY(`@timestamp`)
+      UNIQUE KEY(`@timestamp`)
       COMMENT "OLAP"
-      DISTRIBUTED BY RANDOM BUCKETS 1
+      DISTRIBUTED BY HASH(`@timestamp`) BUCKETS 1
       PROPERTIES (
         "replication_allocation" = "tag.location.default: 1",
+        "enable_unique_key_merge_on_write" = "true",
         "disable_auto_compaction" = "true"
       );
     """
@@ -98,19 +98,37 @@ suite("test_topn_fault_injection", "nonConcurrent") {
       sql "sync"
 
       try {
-        GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.topn_opt")
+        
GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.topn_opt_1")
 
         qt_sql """ select * from ${indexTbName1} where (request match_phrase 
'hm') order by `@timestamp` limit 1; """
         qt_sql """ select * from ${indexTbName1} where (request match_phrase 
'hm' and clientip match_phrase '1') order by `@timestamp` limit 1; """
         qt_sql """ select * from ${indexTbName1} where (request match_phrase 
'hm' and clientip match_phrase '1') or (request match_phrase 'bg' and clientip 
match_phrase '2') order by `@timestamp` limit 1; """
         qt_sql """ select * from ${indexTbName1} where (request match_phrase 
'hm' and clientip match_phrase '1' or clientip match_phrase '3') or (request 
match_phrase 'bg' and clientip match_phrase '2' or clientip match_phrase '4') 
order by `@timestamp` limit 1; """
+        qt_sql """ select * from ${indexTbName1} where (`@timestamp` >= 
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm') order 
by `@timestamp` limit 1; """
+        qt_sql """ select * from ${indexTbName1} where (`@timestamp` >= 
893964617 and `@timestamp` < 893966455) and (clientip match_phrase '1' or 
clientip match_phrase '3') order by `@timestamp` limit 1; """
 
         qt_sql """ select * from ${indexTbName2} where (request match_phrase 
'hm') order by `@timestamp` limit 1; """
         qt_sql """ select * from ${indexTbName2} where (request match_phrase 
'hm' and clientip match_phrase '1') order by `@timestamp` limit 1; """
         qt_sql """ select * from ${indexTbName2} where (request match_phrase 
'hm' and clientip match_phrase '1') or (request match_phrase 'bg' and clientip 
match_phrase '2') order by `@timestamp` limit 1; """
         qt_sql """ select * from ${indexTbName2} where (request match_phrase 
'hm' and clientip match_phrase '1' or clientip match_phrase '3') or (request 
match_phrase 'bg' and clientip match_phrase '2' or clientip match_phrase '4') 
order by `@timestamp` limit 1; """
+        qt_sql """ select * from ${indexTbName2} where (`@timestamp` >= 
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm') order 
by `@timestamp` limit 1; """
+        qt_sql """ select * from ${indexTbName2} where (`@timestamp` >= 
893964617 and `@timestamp` < 893966455) and (clientip match_phrase '1' or 
clientip match_phrase '3') order by `@timestamp` limit 1; """
+      } finally {
+        
GetDebugPoint().disableDebugPointForAllBEs("segment_iterator.topn_opt_1")
+      }
+
+      try {
+        
GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.topn_opt_2")
+
+        qt_sql """ select * from ${indexTbName1} where (`@timestamp` >= 
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' and 
request like '%ag%') order by `@timestamp` limit 1; """
+        qt_sql """ select * from ${indexTbName1} where (`@timestamp` >= 
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' and 
clientip like '%1%') order by `@timestamp` limit 1; """
+        qt_sql """ select * from ${indexTbName1} where (`@timestamp` >= 
893964617 and `@timestamp` < 893966455) and (clientip match_phrase '1' or 
clientip match_phrase '3' and request like '%ag%') order by `@timestamp` limit 
1; """
+
+        qt_sql """ select * from ${indexTbName2} where (`@timestamp` >= 
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' and 
request like '%ag%') order by `@timestamp` limit 1; """
+        qt_sql """ select * from ${indexTbName2} where (`@timestamp` >= 
893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' and 
clientip like '%1%') order by `@timestamp` limit 1; """
+        qt_sql """ select * from ${indexTbName2} where (`@timestamp` >= 
893964617 and `@timestamp` < 893966455) and (clientip match_phrase '1' or 
clientip match_phrase '3' and request like '%ag%') order by `@timestamp` limit 
1; """
       } finally {
-        GetDebugPoint().disableDebugPointForAllBEs("segment_iterator.topn_opt")
+        
GetDebugPoint().disableDebugPointForAllBEs("segment_iterator.topn_opt_2")
       }
     } finally {
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to