This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new dabd27edd26 [opt](inverted index) performance optimization for need_read_data in compound #35346 #36292 (#36404) dabd27edd26 is described below commit dabd27edd26f7bb857b03dad9e0521b27fd82267 Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com> AuthorDate: Thu Jun 20 08:43:16 2024 +0800 [opt](inverted index) performance optimization for need_read_data in compound #35346 #36292 (#36404) pick from master https://github.com/apache/doris/pull/35346 https://github.com/apache/doris/pull/36292 --- be/src/olap/rowset/segment_v2/segment_iterator.cpp | 54 ++++++++++--- .../test_need_read_data_fault_injection.out | 22 +++++ .../data/inverted_index_p0/test_need_read_data.out | 15 ++++ .../test_need_read_data_fault_injection.groovy | 94 ++++++++++++++++++++++ .../inverted_index_p0/test_need_read_data.groovy | 82 +++++++++++++++++++ 5 files changed, 254 insertions(+), 13 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index c31ac3c659a..f93d6264058 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -359,6 +359,34 @@ Status SegmentIterator::_init_impl(const StorageReadOptions& opts) { _storage_name_and_type[i] = std::make_pair(field_name, storage_type); } } + + // find columns that definitely require reading data, such as functions that are not pushed down. + { + std::set<std::string> push_down_preds; + for (auto* pred : _col_predicates) { + if (!_check_apply_by_inverted_index(pred)) { + continue; + } + push_down_preds.insert(_gen_predicate_result_sign(pred)); + } + for (auto* pred : _col_preds_except_leafnode_of_andnode) { + if (!_check_apply_by_inverted_index(pred)) { + continue; + } + push_down_preds.insert(_gen_predicate_result_sign(pred)); + } + for (auto& preds_in_remaining_vconjuct : _column_pred_in_remaining_vconjunct) { + const auto& column_name = preds_in_remaining_vconjuct.first; + for (auto& pred_info : preds_in_remaining_vconjuct.second) { + auto column_sign = _gen_predicate_result_sign(&pred_info); + if (!push_down_preds.contains(column_sign)) { + auto cid = _opts.tablet_schema->field_index(column_name); + _need_read_data_indices[cid] = true; + } + } + } + } + return Status::OK(); } @@ -891,6 +919,7 @@ Status SegmentIterator::_apply_inverted_index_except_leafnode_of_andnode( Status SegmentIterator::_apply_index_except_leafnode_of_andnode() { for (auto* pred : _col_preds_except_leafnode_of_andnode) { + auto column_id = pred->column_id(); auto pred_type = pred->type(); bool is_support = pred_type == PredicateType::EQ || pred_type == PredicateType::NE || pred_type == PredicateType::LT || pred_type == PredicateType::LE || @@ -899,6 +928,7 @@ Status SegmentIterator::_apply_index_except_leafnode_of_andnode() { pred_type == PredicateType::IN_LIST || pred_type == PredicateType::NOT_IN_LIST; if (!is_support) { + _need_read_data_indices[column_id] = true; continue; } @@ -908,16 +938,17 @@ Status SegmentIterator::_apply_index_except_leafnode_of_andnode() { if (can_apply_by_inverted_index) { res = _apply_inverted_index_except_leafnode_of_andnode(pred, &bitmap); } else { + _need_read_data_indices[column_id] = true; continue; } - bool need_remaining_after_evaluate = _column_has_fulltext_index(pred->column_id()) && + bool need_remaining_after_evaluate = _column_has_fulltext_index(column_id) && PredicateTypeTraits::is_equal_or_list(pred_type); if (!res.ok()) { if (_downgrade_without_index(res, need_remaining_after_evaluate)) { // downgrade without index query - _not_apply_index_pred.insert(pred->column_id()); - _need_read_data_indices[pred->column_id()] = true; + _not_apply_index_pred.insert(column_id); + _need_read_data_indices[column_id] = true; continue; } LOG(WARNING) << "failed to evaluate index" @@ -928,17 +959,10 @@ Status SegmentIterator::_apply_index_except_leafnode_of_andnode() { std::string pred_result_sign = _gen_predicate_result_sign(pred); _rowid_result_for_index.emplace(pred_result_sign, std::make_pair(true, std::move(bitmap))); - } - for (auto* pred : _col_preds_except_leafnode_of_andnode) { - auto column_name = _schema->column(pred->column_id())->name(); - if (!_remaining_conjunct_roots.empty() && - _check_column_pred_all_push_down(column_name, true, - pred->type() == PredicateType::MATCH) && - !pred->predicate_params()->marked_by_runtime_filter) { - // if column's need_read_data already set true, we can not set it to false now. - if (_need_read_data_indices.find(pred->column_id()) == _need_read_data_indices.end()) { - _need_read_data_indices[pred->column_id()] = false; + if (!pred->predicate_params()->marked_by_runtime_filter) { + if (!_need_read_data_indices.contains(column_id)) { + _need_read_data_indices[column_id] = false; } } } @@ -1928,6 +1952,10 @@ Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32 continue; } + DBUG_EXECUTE_IF("segment_iterator._read_columns_by_index", { + return Status::Error<ErrorCode::INTERNAL_ERROR>("{} does not need to read data"); + }) + if (is_continuous) { size_t rows_read = nrows_read; _opts.stats->block_first_read_seek_num += 1; diff --git a/regression-test/data/fault_injection_p0/test_need_read_data_fault_injection.out b/regression-test/data/fault_injection_p0/test_need_read_data_fault_injection.out new file mode 100644 index 00000000000..37885e404d3 --- /dev/null +++ b/regression-test/data/fault_injection_p0/test_need_read_data_fault_injection.out @@ -0,0 +1,22 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +863 + +-- !sql -- +210 + +-- !sql -- +0 + +-- !sql -- +819 + +-- !sql -- +199 + +-- !sql -- +713 + +-- !sql -- +18 + diff --git a/regression-test/data/inverted_index_p0/test_need_read_data.out b/regression-test/data/inverted_index_p0/test_need_read_data.out new file mode 100644 index 00000000000..d4ea5870e3e --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_need_read_data.out @@ -0,0 +1,15 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +8 \N +4 -10 +13 -4 +2 1 +3 2 +3 3 +5 4 +5 5 +1 6 +1 7 +4 8 +1 9 + diff --git a/regression-test/suites/fault_injection_p0/test_need_read_data_fault_injection.groovy b/regression-test/suites/fault_injection_p0/test_need_read_data_fault_injection.groovy new file mode 100644 index 00000000000..d7a92f8e7e9 --- /dev/null +++ b/regression-test/suites/fault_injection_p0/test_need_read_data_fault_injection.groovy @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_need_read_data_fault_injection", "nonConcurrent") { + // define a sql table + def indexTbName = "test_need_read_data_fault_injection" + + sql "DROP TABLE IF EXISTS ${indexTbName}" + sql """ + CREATE TABLE ${indexTbName} ( + `@timestamp` int(11) NULL COMMENT "", + `clientip` varchar(20) NULL COMMENT "", + `request` text NULL COMMENT "", + `status` int(11) NULL COMMENT "", + `size` int(11) NULL COMMENT "", + INDEX clientip_idx (`clientip`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '', + INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`@timestamp`) + COMMENT "OLAP" + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "disable_auto_compaction" = "true" + ); + """ + + def load_httplogs_data = {table_name, label, read_flag, format_flag, file_name, ignore_failure=false, + expected_succ_rows = -1, load_to_single_tablet = 'true' -> + + // load the json data + streamLoad { + table "${table_name}" + + // set http request header params + set 'label', label + "_" + UUID.randomUUID().toString() + set 'read_json_by_line', read_flag + set 'format', format_flag + file file_name // import json file + time 10000 // limit inflight 10s + if (expected_succ_rows >= 0) { + set 'max_filter_ratio', '1' + } + + // if declared a check callback, the default check condition will ignore. + // So you must check all condition + check { result, exception, startTime, endTime -> + if (ignore_failure && expected_succ_rows < 0) { return } + if (exception != null) { + throw exception + } + log.info("Stream load result: ${result}".toString()) + def json = parseJson(result) + } + } + } + + try { + load_httplogs_data.call(indexTbName, 'test_need_read_data_fault_injection', 'true', 'json', 'documents-1000.json') + + sql "sync" + + try { + GetDebugPoint().enableDebugPointForAllBEs("segment_iterator._read_columns_by_index") + + qt_sql """ select count() from ${indexTbName} where (request match_phrase 'hm' or request match_phrase 'jpg' or request match_phrase 'gif'); """ + qt_sql """ select count() from ${indexTbName} where (request match_phrase 'hm' or request match_phrase 'jpg' and request match_phrase 'gif'); """ + qt_sql """ select count() from ${indexTbName} where (request match_phrase 'hm' and request match_phrase 'jpg' and request match_phrase 'gif'); """ + qt_sql """ select count() from ${indexTbName} where (request match_phrase 'hm' and request match_phrase 'jpg' or request match_phrase 'gif'); """ + + qt_sql """ select count() from ${indexTbName} where (clientip match '1' or request match 'jpg' or clientip match '2'); """ + qt_sql """ select count() from ${indexTbName} where (clientip match '3' or request match 'gif' or clientip match '4'); """ + qt_sql """ select count() from ${indexTbName} where (clientip match 'images' or clientip match '5' or clientip match 'english'); """ + + } finally { + GetDebugPoint().disableDebugPointForAllBEs("segment_iterator._read_columns_by_index") + } + } finally { + } +} \ No newline at end of file diff --git a/regression-test/suites/inverted_index_p0/test_need_read_data.groovy b/regression-test/suites/inverted_index_p0/test_need_read_data.groovy new file mode 100644 index 00000000000..86993d81e03 --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_need_read_data.groovy @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +suite("test_need_read_data", "p0"){ + def indexTbName1 = "test_need_read_data" + + sql "DROP TABLE IF EXISTS ${indexTbName1}" + + sql """ + create table ${indexTbName1} ( + col_int_undef_signed_not_null int not null , + col_bigint_undef_signed_not_null_index_inverted bigint not null , + col_bigint_undef_signed_not_null bigint not null , + col_int_undef_signed int null , + col_int_undef_signed_index_inverted int null , + col_int_undef_signed_not_null_index_inverted int not null , + col_bigint_undef_signed bigint null , + col_bigint_undef_signed_index_inverted bigint null , + col_date_undef_signed date null , + col_date_undef_signed_index_inverted date null , + col_date_undef_signed_not_null date not null , + col_date_undef_signed_not_null_index_inverted date not null , + col_varchar_10__undef_signed varchar(10) null , + col_varchar_10__undef_signed_index_inverted varchar(10) null , + col_varchar_10__undef_signed_not_null varchar(10) not null , + col_varchar_10__undef_signed_not_null_index_inverted varchar(10) not null , + col_varchar_1024__undef_signed varchar(1024) null , + col_varchar_1024__undef_signed_index_inverted varchar(1024) null , + col_varchar_1024__undef_signed_not_null varchar(1024) not null , + col_varchar_1024__undef_signed_not_null_index_inverted varchar(1024) not null , + pk int, + INDEX col_int_undef_signed_index_inverted_idx (`col_int_undef_signed_index_inverted`) USING INVERTED, + INDEX col_int_undef_signed_not_null_index_inverted_idx (`col_int_undef_signed_not_null_index_inverted`) USING INVERTED, + INDEX col_bigint_undef_signed_index_inverted_idx (`col_bigint_undef_signed_index_inverted`) USING INVERTED, + INDEX col_bigint_undef_signed_not_null_index_inverted_idx (`col_bigint_undef_signed_not_null_index_inverted`) USING INVERTED, + INDEX col_date_undef_signed_index_inverted_idx (`col_date_undef_signed_index_inverted`) USING INVERTED, + INDEX col_date_undef_signed_not_null_index_inverted_idx (`col_date_undef_signed_not_null_index_inverted`) USING INVERTED, + INDEX col_varchar_10__undef_signed_index_inverted_idx (`col_varchar_10__undef_signed_index_inverted`) USING INVERTED, + INDEX col_varchar_10__undef_signed_not_null_index_inverted_idx (`col_varchar_10__undef_signed_not_null_index_inverted`) USING INVERTED, + INDEX col_varchar_1024__undef_signed_index_inverted_idx (`col_varchar_1024__undef_signed_index_inverted`) USING INVERTED, + INDEX col_varchar_1024__undef_signed_not_null_index_inverted_idx (`col_varchar_1024__undef_signed_not_null_index_inverted`) USING INVERTED + ) engine=olap + UNIQUE KEY(col_int_undef_signed_not_null, col_bigint_undef_signed_not_null_index_inverted, col_bigint_undef_signed_not_null) + PARTITION BY RANGE(col_int_undef_signed_not_null) ( + PARTITION p0 VALUES LESS THAN ('4'), + PARTITION p1 VALUES LESS THAN ('6'), + PARTITION p2 VALUES LESS THAN ('7'), + PARTITION p3 VALUES LESS THAN ('8'), + PARTITION p4 VALUES LESS THAN ('10'), + PARTITION p5 VALUES LESS THAN ('1147483647'), + PARTITION p100 VALUES LESS THAN ('2147483647') + ) + distributed by hash(col_bigint_undef_signed_not_null) + properties("enable_unique_key_merge_on_write" = "true", "replication_num" = "1"); + """ + + try { + sql """ insert into ${indexTbName1}(pk,col_int_undef_signed,col_int_undef_signed_index_inverted,col_int_undef_signed_not_null,col_int_undef_signed_not_null_index_inverted,col_bigint_undef_signed,col_bigint_undef_signed_index_inverted,col_bigint_undef_signed_not_null,col_bigint_undef_signed_not_null_index_inverted,col_date_undef_signed,col_date_undef_signed_index_inverted,col_date_undef_signed_not_null,col_date_undef_signed_not_null_index_inverted,col_varchar_10__undef_signed,col_ [...] + + sql "sync" + + qt_sql """ SELECT COUNT( *) AS field1, col_int_undef_signed AS field2 FROM ${indexTbName1} WHERE( col_date_undef_signed_not_null_index_inverted == '2024-01-01' OR day( col_date_undef_signed_not_null_index_inverted ) !=0 ) GROUP BY field2 ORDER BY field2; """ + + } finally { + //try_sql("DROP TABLE IF EXISTS ${testTable}") + } +} \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org