This is an automated email from the ASF dual-hosted git repository. jianliangqi pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 3593a82c6f4 [opt](inverted index) unified optimization judgment to prevent omissions (#38027) 3593a82c6f4 is described below commit 3593a82c6f4d42eae703469b867ef22ff93f6827 Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com> AuthorDate: Thu Jul 18 11:47:03 2024 +0800 [opt](inverted index) unified optimization judgment to prevent omissions (#38027) 1. optimize inverted index, strengthen logical judgment. --- be/src/olap/rowset/segment_v2/segment_iterator.cpp | 62 +++++++++------------- be/src/olap/rowset/segment_v2/segment_iterator.h | 5 +- be/src/vec/exprs/vexpr.cpp | 17 +++++- .../test_all_index_hit_fault_injection.out | 24 +++++++++ .../test_topn_fault_injection.out | 30 +++++++++++ .../test_all_index_hit_fault_injection.groovy | 11 +++- .../test_topn_fault_injection.groovy | 32 ++++++++--- 7 files changed, 132 insertions(+), 49 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index db1d2e9e676..61be47cced7 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -1345,19 +1345,15 @@ Status SegmentIterator::_apply_inverted_index() { return Status::OK(); } -bool SegmentIterator::_check_all_predicates_passed_inverted_index_for_column(ColumnId cid) { +bool SegmentIterator::_check_all_predicates_passed_inverted_index_for_column(ColumnId cid, + bool default_return) { auto it = _column_predicate_inverted_index_status.find(cid); if (it != _column_predicate_inverted_index_status.end()) { const auto& pred_map = it->second; - - bool all_true = std::all_of(pred_map.begin(), pred_map.end(), - [](const auto& pred_entry) { return pred_entry.second; }); - - if (all_true) { - return true; - } + return std::all_of(pred_map.begin(), pred_map.end(), + [](const auto& pred_entry) { return pred_entry.second; }); } - return false; + return default_return; } Status SegmentIterator::_init_return_column_iterators() { @@ -2404,9 +2400,9 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { nrows_read_limit = std::min(static_cast<uint32_t>(_opts.topn_limit), nrows_read_limit); } - DBUG_EXECUTE_IF("segment_iterator.topn_opt", { + DBUG_EXECUTE_IF("segment_iterator.topn_opt_1", { if (nrows_read_limit != 1) { - return Status::Error<ErrorCode::INTERNAL_ERROR>("topn opt execute failed: {}", + return Status::Error<ErrorCode::INTERNAL_ERROR>("topn opt 1 execute failed: {}", nrows_read_limit); } }) @@ -2887,19 +2883,7 @@ bool SegmentIterator::_no_need_read_key_data(ColumnId cid, vectorized::MutableCo return false; } - std::set<uint32_t> cids; - for (auto* pred : _col_predicates) { - cids.insert(pred->column_id()); - } - for (auto* pred : _col_preds_except_leafnode_of_andnode) { - cids.insert(pred->column_id()); - } - - // If the key is present in expr, data needs to be read. - if (cids.contains(cid)) { - return false; - } - if (_column_pred_in_remaining_vconjunct.contains(_opts.tablet_schema->column(cid).name())) { + if (!_check_all_predicates_passed_inverted_index_for_column(cid, true)) { return false; } @@ -2920,7 +2904,7 @@ bool SegmentIterator::_has_delete_predicate(ColumnId cid) { return delete_columns_set.contains(cid); } -bool SegmentIterator::_can_opt_topn_reads() const { +bool SegmentIterator::_can_opt_topn_reads() { if (_opts.topn_limit <= 0) { return false; } @@ -2929,20 +2913,24 @@ bool SegmentIterator::_can_opt_topn_reads() const { return false; } - std::set<uint32_t> cids; - for (auto* pred : _col_predicates) { - cids.insert(pred->column_id()); - } - for (auto* pred : _col_preds_except_leafnode_of_andnode) { - cids.insert(pred->column_id()); - } - - uint32_t delete_sign_idx = _opts.tablet_schema->delete_sign_idx(); - bool result = std::ranges::all_of(cids.begin(), cids.end(), [delete_sign_idx](auto cid) { - return cid == delete_sign_idx; + bool all_true = std::ranges::all_of(_schema->column_ids(), [this](auto cid) { + if (cid == _opts.tablet_schema->delete_sign_idx() || + _opts.tablet_schema->column(cid).is_key()) { + return true; + } + if (_check_all_predicates_passed_inverted_index_for_column(cid, true)) { + return true; + } + return false; }); - return result; + DBUG_EXECUTE_IF("segment_iterator.topn_opt_2", { + if (all_true) { + return Status::Error<ErrorCode::INTERNAL_ERROR>("topn opt 2 execute failed"); + } + }) + + return all_true; } } // namespace segment_v2 diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index cb904f21c6a..c9284e592e4 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -387,10 +387,11 @@ private: bool _has_delete_predicate(ColumnId cid); - bool _can_opt_topn_reads() const; + bool _can_opt_topn_reads(); void _initialize_predicate_results(); - bool _check_all_predicates_passed_inverted_index_for_column(ColumnId cid); + bool _check_all_predicates_passed_inverted_index_for_column(ColumnId cid, + bool default_return = false); class BitmapRangeIterator; class BackwardBitmapRangeIterator; diff --git a/be/src/vec/exprs/vexpr.cpp b/be/src/vec/exprs/vexpr.cpp index bb6e48f6084..31a8e04cad1 100644 --- a/be/src/vec/exprs/vexpr.cpp +++ b/be/src/vec/exprs/vexpr.cpp @@ -23,6 +23,7 @@ #include <thrift/protocol/TDebugProtocol.h> #include <algorithm> +#include <boost/algorithm/string/split.hpp> #include <boost/iterator/iterator_facade.hpp> #include <memory> #include <stack> @@ -604,8 +605,20 @@ bool VExpr::fast_execute(Block& block, const ColumnNumbers& arguments, size_t re size_t input_rows_count, const std::string& function_name) { std::string result_column_name = gen_predicate_result_sign(block, arguments, function_name); if (!block.has(result_column_name)) { - DBUG_EXECUTE_IF("segment_iterator.fast_execute", - { return Status::Error<ErrorCode::INTERNAL_ERROR>("fast_execute failed"); }) + DBUG_EXECUTE_IF("segment_iterator.fast_execute", { + auto debug_col_name = DebugPoints::instance()->get_debug_param_or_default<std::string>( + "segment_iterator._read_columns_by_index", "column_name", ""); + + std::vector<std::string> column_names; + boost::split(column_names, debug_col_name, boost::algorithm::is_any_of(",")); + + std::string column_name = block.get_by_position(arguments[0]).name; + auto it = std::find(column_names.begin(), column_names.end(), column_name); + if (it == column_names.end()) { + return Status::Error<ErrorCode::INTERNAL_ERROR>("fast_execute failed: {}", + result_column_name); + } + }) return false; } diff --git a/regression-test/data/fault_injection_p0/test_all_index_hit_fault_injection.out b/regression-test/data/fault_injection_p0/test_all_index_hit_fault_injection.out index ea2b79e0c9b..205effb8dbf 100644 --- a/regression-test/data/fault_injection_p0/test_all_index_hit_fault_injection.out +++ b/regression-test/data/fault_injection_p0/test_all_index_hit_fault_injection.out @@ -14,6 +14,18 @@ -- !sql -- 14 +-- !sql -- +999 + +-- !sql -- +209 + +-- !sql -- +209 + +-- !sql -- +334 + -- !sql -- 120 @@ -29,3 +41,15 @@ -- !sql -- 11 +-- !sql -- +279 + +-- !sql -- +119 + +-- !sql -- +119 + +-- !sql -- +154 + diff --git a/regression-test/data/fault_injection_p0/test_topn_fault_injection.out b/regression-test/data/fault_injection_p0/test_topn_fault_injection.out index 9cc3f4146b5..fe5034df477 100644 --- a/regression-test/data/fault_injection_p0/test_topn_fault_injection.out +++ b/regression-test/data/fault_injection_p0/test_topn_fault_injection.out @@ -17,9 +17,39 @@ -- !sql -- 893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736 +-- !sql -- +893964617 40.135.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736 + +-- !sql -- +893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736 + +-- !sql -- +893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736 + +-- !sql -- +893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736 + +-- !sql -- +893964617 40.135.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736 + -- !sql -- 893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736 +-- !sql -- +893964617 40.135.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736 + +-- !sql -- +893964617 40.135.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736 + +-- !sql -- +893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736 + +-- !sql -- +893964617 40.135.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736 + +-- !sql -- +893964617 40.135.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736 + -- !sql -- 893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736 diff --git a/regression-test/suites/fault_injection_p0/test_all_index_hit_fault_injection.groovy b/regression-test/suites/fault_injection_p0/test_all_index_hit_fault_injection.groovy index 3bd884a5d87..d1a8e7c7642 100644 --- a/regression-test/suites/fault_injection_p0/test_all_index_hit_fault_injection.groovy +++ b/regression-test/suites/fault_injection_p0/test_all_index_hit_fault_injection.groovy @@ -99,19 +99,28 @@ suite("test_all_index_hit_fault_injection", "nonConcurrent") { try { GetDebugPoint().enableDebugPointForAllBEs("segment_iterator._read_columns_by_index", [column_name: "clientip,request"]) - GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.fast_execute") + GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.fast_execute", [column_name: "status,size"]) + qt_sql """ select count() from ${indexTbName1} where (request match_phrase 'hm'); """ qt_sql """ select count() from ${indexTbName1} where (request match_phrase 'hm' and clientip = '126.1.0.0'); """ qt_sql """ select count() from ${indexTbName1} where (request match_phrase 'hm' and clientip = '126.1.0.0') or (request match_phrase 'bg' and clientip = '201.0.0.0'); """ qt_sql """ select count() from ${indexTbName1} where (request match_phrase 'hm' and clientip = '126.1.0.0' or clientip = '247.37.0.0') or (request match_phrase 'bg' and clientip = '201.0.0.0' or clientip = '232.0.0.0'); """ qt_sql """ select count() from ${indexTbName1} where (request match_phrase 'hm' and clientip in ('126.1.0.0', '247.37.0.0')) or (request match_phrase 'bg' and clientip in ('201.0.0.0', '232.0.0.0')); """ + qt_sql """ select count() from ${indexTbName1} where (`@timestamp` >= 893964617 and `@timestamp` < 893966455); """ + qt_sql """ select count() from ${indexTbName1} where (`@timestamp` >= 893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm'); """ + qt_sql """ select count() from ${indexTbName1} where (`@timestamp` >= 893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' or request match_phrase 'ag'); """ + qt_sql """ select count() from ${indexTbName1} where (`@timestamp` >= 893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' or request match_phrase 'ag' or status = 304); """ qt_sql """ select count() from ${indexTbName2} where (request match_phrase 'hm'); """ qt_sql """ select count() from ${indexTbName2} where (request match_phrase 'hm' and clientip = '126.1.0.0'); """ qt_sql """ select count() from ${indexTbName2} where (request match_phrase 'hm' and clientip = '126.1.0.0') or (request match_phrase 'bg' and clientip = '201.0.0.0'); """ qt_sql """ select count() from ${indexTbName2} where (request match_phrase 'hm' and clientip = '126.1.0.0' or clientip = '247.37.0.0') or (request match_phrase 'bg' and clientip = '201.0.0.0' or clientip = '232.0.0.0'); """ qt_sql """ select count() from ${indexTbName2} where (request match_phrase 'hm' and clientip in ('126.1.0.0', '247.37.0.0')) or (request match_phrase 'bg' and clientip in ('201.0.0.0', '232.0.0.0')); """ + qt_sql """ select count() from ${indexTbName2} where (`@timestamp` >= 893964617 and `@timestamp` < 893966455); """ + qt_sql """ select count() from ${indexTbName2} where (`@timestamp` >= 893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm'); """ + qt_sql """ select count() from ${indexTbName2} where (`@timestamp` >= 893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' or request match_phrase 'ag'); """ + qt_sql """ select count() from ${indexTbName2} where (`@timestamp` >= 893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' or request match_phrase 'ag' or status = 304); """ } finally { GetDebugPoint().disableDebugPointForAllBEs("segment_iterator._read_columns_by_index") diff --git a/regression-test/suites/fault_injection_p0/test_topn_fault_injection.groovy b/regression-test/suites/fault_injection_p0/test_topn_fault_injection.groovy index 08a1ef0164d..37315a49525 100644 --- a/regression-test/suites/fault_injection_p0/test_topn_fault_injection.groovy +++ b/regression-test/suites/fault_injection_p0/test_topn_fault_injection.groovy @@ -31,12 +31,11 @@ suite("test_topn_fault_injection", "nonConcurrent") { INDEX clientip_idx (`clientip`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '', INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '' ) ENGINE=OLAP - UNIQUE KEY(`@timestamp`) + DUPLICATE KEY(`@timestamp`) COMMENT "OLAP" - DISTRIBUTED BY HASH(`@timestamp`) BUCKETS 1 + DISTRIBUTED BY RANDOM BUCKETS 1 PROPERTIES ( "replication_allocation" = "tag.location.default: 1", - "enable_unique_key_merge_on_write" = "true", "disable_auto_compaction" = "true" ); """ @@ -52,11 +51,12 @@ suite("test_topn_fault_injection", "nonConcurrent") { INDEX clientip_idx (`clientip`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '', INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '' ) ENGINE=OLAP - DUPLICATE KEY(`@timestamp`) + UNIQUE KEY(`@timestamp`) COMMENT "OLAP" - DISTRIBUTED BY RANDOM BUCKETS 1 + DISTRIBUTED BY HASH(`@timestamp`) BUCKETS 1 PROPERTIES ( "replication_allocation" = "tag.location.default: 1", + "enable_unique_key_merge_on_write" = "true", "disable_auto_compaction" = "true" ); """ @@ -98,19 +98,37 @@ suite("test_topn_fault_injection", "nonConcurrent") { sql "sync" try { - GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.topn_opt") + GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.topn_opt_1") qt_sql """ select * from ${indexTbName1} where (request match_phrase 'hm') order by `@timestamp` limit 1; """ qt_sql """ select * from ${indexTbName1} where (request match_phrase 'hm' and clientip match_phrase '1') order by `@timestamp` limit 1; """ qt_sql """ select * from ${indexTbName1} where (request match_phrase 'hm' and clientip match_phrase '1') or (request match_phrase 'bg' and clientip match_phrase '2') order by `@timestamp` limit 1; """ qt_sql """ select * from ${indexTbName1} where (request match_phrase 'hm' and clientip match_phrase '1' or clientip match_phrase '3') or (request match_phrase 'bg' and clientip match_phrase '2' or clientip match_phrase '4') order by `@timestamp` limit 1; """ + qt_sql """ select * from ${indexTbName1} where (`@timestamp` >= 893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm') order by `@timestamp` limit 1; """ + qt_sql """ select * from ${indexTbName1} where (`@timestamp` >= 893964617 and `@timestamp` < 893966455) and (clientip match_phrase '1' or clientip match_phrase '3') order by `@timestamp` limit 1; """ qt_sql """ select * from ${indexTbName2} where (request match_phrase 'hm') order by `@timestamp` limit 1; """ qt_sql """ select * from ${indexTbName2} where (request match_phrase 'hm' and clientip match_phrase '1') order by `@timestamp` limit 1; """ qt_sql """ select * from ${indexTbName2} where (request match_phrase 'hm' and clientip match_phrase '1') or (request match_phrase 'bg' and clientip match_phrase '2') order by `@timestamp` limit 1; """ qt_sql """ select * from ${indexTbName2} where (request match_phrase 'hm' and clientip match_phrase '1' or clientip match_phrase '3') or (request match_phrase 'bg' and clientip match_phrase '2' or clientip match_phrase '4') order by `@timestamp` limit 1; """ + qt_sql """ select * from ${indexTbName2} where (`@timestamp` >= 893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm') order by `@timestamp` limit 1; """ + qt_sql """ select * from ${indexTbName2} where (`@timestamp` >= 893964617 and `@timestamp` < 893966455) and (clientip match_phrase '1' or clientip match_phrase '3') order by `@timestamp` limit 1; """ + } finally { + GetDebugPoint().disableDebugPointForAllBEs("segment_iterator.topn_opt_1") + } + + try { + GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.topn_opt_2") + + qt_sql """ select * from ${indexTbName1} where (`@timestamp` >= 893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' and request like '%ag%') order by `@timestamp` limit 1; """ + qt_sql """ select * from ${indexTbName1} where (`@timestamp` >= 893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' and clientip like '%1%') order by `@timestamp` limit 1; """ + qt_sql """ select * from ${indexTbName1} where (`@timestamp` >= 893964617 and `@timestamp` < 893966455) and (clientip match_phrase '1' or clientip match_phrase '3' and request like '%ag%') order by `@timestamp` limit 1; """ + + qt_sql """ select * from ${indexTbName2} where (`@timestamp` >= 893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' and request like '%ag%') order by `@timestamp` limit 1; """ + qt_sql """ select * from ${indexTbName2} where (`@timestamp` >= 893964617 and `@timestamp` < 893966455) and (request match_phrase 'hm' and clientip like '%1%') order by `@timestamp` limit 1; """ + qt_sql """ select * from ${indexTbName2} where (`@timestamp` >= 893964617 and `@timestamp` < 893966455) and (clientip match_phrase '1' or clientip match_phrase '3' and request like '%ag%') order by `@timestamp` limit 1; """ } finally { - GetDebugPoint().disableDebugPointForAllBEs("segment_iterator.topn_opt") + GetDebugPoint().disableDebugPointForAllBEs("segment_iterator.topn_opt_2") } } finally { } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org