This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0-beta in repository https://gitbox.apache.org/repos/asf/doris.git
commit 4a33b956e620c969abbfccced21cf063ba3780e7 Author: YueW <45946325+tany...@users.noreply.github.com> AuthorDate: Fri Jun 9 21:54:48 2023 +0800 [enhancement](index) Nereids support no need to read raw data for index column that only in filter conditions (#20605) --- be/src/olap/rowset/segment_v2/segment_iterator.cpp | 19 ++- be/src/olap/rowset/segment_v2/segment_iterator.h | 1 + be/src/vec/exec/scan/new_olap_scan_node.cpp | 3 - .../org/apache/doris/planner/OriginalPlanner.java | 44 +------ .../test_index_no_need_read_data.out | 129 +++++++++++++++++++++ .../test_index_no_need_read_data.groovy | 88 ++++++++++++++ 6 files changed, 241 insertions(+), 43 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 92717945b1..53d89e6395 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -250,6 +250,10 @@ Status SegmentIterator::init(const StorageReadOptions& opts) { if (_char_type_idx.empty() && _char_type_idx_no_0.empty()) { _vec_init_char_column_id(); } + + if (opts.output_columns != nullptr) { + _output_columns = *(opts.output_columns); + } return Status::OK(); } @@ -917,7 +921,20 @@ Status SegmentIterator::_apply_inverted_index_on_block_column_predicate( } bool SegmentIterator::_need_read_data(ColumnId cid) { - // TODO(xk) impl right logic + if (_output_columns.count(-1)) { + // if _output_columns contains -1, it means that the light + // weight schema change may not be enabled or other reasons + // caused the column unique_id not be set, to prevent errors + // occurring, return true here that column data needs to be read + return true; + } + int32_t unique_id = _opts.tablet_schema->column(cid).unique_id(); + if (_need_read_data_indices.count(unique_id) > 0 && !_need_read_data_indices[unique_id] && + _output_columns.count(unique_id) < 1) { + VLOG_DEBUG << "SegmentIterator no need read data for column: " + << _opts.tablet_schema->column_by_uid(unique_id).name(); + return false; + } return true; } diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index 899ee46e3a..93b8b398e7 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -431,6 +431,7 @@ private: std::vector<ColumnPredicate*> _filter_info_id; bool _record_rowids = false; int32_t _tablet_id = 0; + std::set<int32_t> _output_columns; }; } // namespace segment_v2 diff --git a/be/src/vec/exec/scan/new_olap_scan_node.cpp b/be/src/vec/exec/scan/new_olap_scan_node.cpp index ac916bfcd1..740a57e793 100644 --- a/be/src/vec/exec/scan/new_olap_scan_node.cpp +++ b/be/src/vec/exec/scan/new_olap_scan_node.cpp @@ -440,9 +440,6 @@ Status NewOlapScanNode::_init_scanners(std::list<VScannerSPtr>* scanners) { if (!_olap_scan_node.output_column_unique_ids.empty()) { for (auto uid : _olap_scan_node.output_column_unique_ids) { - if (uid < 0) { - continue; - } _maybe_read_column_ids.emplace(uid); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/OriginalPlanner.java b/fe/fe-core/src/main/java/org/apache/doris/planner/OriginalPlanner.java index 024ea0647b..1fbd140d45 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/OriginalPlanner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/OriginalPlanner.java @@ -572,50 +572,16 @@ public class OriginalPlanner extends Planner { * column unique id for `A` and `B` will put into outputColumnUniqueIds. * */ + // this opt will only work with nereidsPlanner private void pushOutColumnUniqueIdsToOlapScan(PlanFragment rootFragment, Analyzer analyzer) { Set<Integer> outputColumnUniqueIds = new HashSet<>(); - ArrayList<Expr> outputExprs = rootFragment.getOutputExprs(); - for (Expr expr : outputExprs) { - if (expr instanceof SlotRef) { - if (((SlotRef) expr).getColumn() != null) { - outputColumnUniqueIds.add(((SlotRef) expr).getColumn().getUniqueId()); - } - } - } + // add '-1' to avoid the optimization incorrect work with OriginalPlanner, + // because in the storage layer will skip this optimization if outputColumnUniqueIds contains '-1', + // to ensure the optimization only correct work with nereidsPlanner + outputColumnUniqueIds.add(-1); for (PlanFragment fragment : fragments) { PlanNode node = fragment.getPlanRoot(); - PlanNode parent = null; - while (node.getChildren().size() != 0) { - for (PlanNode childNode : node.getChildren()) { - List<SlotId> outputSlotIds = childNode.getOutputSlotIds(); - if (outputSlotIds != null) { - for (SlotId sid : outputSlotIds) { - SlotDescriptor slotDesc = analyzer.getSlotDesc(sid); - outputColumnUniqueIds.add(slotDesc.getUniqueId()); - } - } - } - // OlapScanNode is the last node. - // So, just get the two node and check if they are SortNode and OlapScan. - parent = node; - node = node.getChildren().get(0); - } - - if (parent instanceof SortNode) { - SortNode sortNode = (SortNode) parent; - List<Expr> orderingExprs = sortNode.getSortInfo().getOrigOrderingExprs(); - if (orderingExprs != null) { - for (Expr expr : orderingExprs) { - if (expr instanceof SlotRef) { - if (((SlotRef) expr).getColumn() != null) { - outputColumnUniqueIds.add(((SlotRef) expr).getColumn().getUniqueId()); - } - } - } - } - } - if (!(node instanceof OlapScanNode)) { continue; } diff --git a/regression-test/data/inverted_index_p0/test_index_no_need_read_data.out b/regression-test/data/inverted_index_p0/test_index_no_need_read_data.out new file mode 100644 index 0000000000..01a08f324d --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_index_no_need_read_data.out @@ -0,0 +1,129 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_nereids_0 -- +1 \N addr qie3 yy lj 100 +2 \N hehe \N lala 200 +3 beijing addr xuanwu wugui \N 300 +4 beijing addr fengtai fengtai1 fengtai2 \N +5 beijing addr chaoyang wangjing donghuqu 500 +6 shanghai hehe \N haha \N +7 tengxun qie addr gg lj \N +8 tengxun2 qie \N lj 800 + +-- !select_nereids_1 -- +4 + +-- !select_nereids_2 -- +3 + +-- !select_nereids_3 -- +3 + +-- !select_nereids_4 -- +3 beijing addr xuanwu wugui \N 300 +4 beijing addr fengtai fengtai1 fengtai2 \N +5 beijing addr chaoyang wangjing donghuqu 500 + +-- !select_nereids_5 -- +beijing addr xuanwu wugui +beijing addr fengtai fengtai1 +beijing addr chaoyang wangjing + +-- !select_nereids_6 -- +hehe \N +qie addr gg +qie \N + +-- !select_nereids_7 -- +hehe \N +qie addr gg +qie \N + +-- !select_nereids_8 -- +SHANGHAI \N +TENGXUN addr gg +TENGXUN2 \N + +-- !select_nereids_9 -- +4 \N +3 addr gg +3 \N + +-- !select_nereids_10 -- +hehe \N +qie addr gg +qie \N + +-- !select_nereids_11 -- +hehe \N SHANGHAI +qie addr gg TENGXUN +qie \N TENGXUN2 + +-- !select_nereids_12 -- +300 +\N +500 + +-- !select_0 -- +1 \N addr qie3 yy lj 100 +2 \N hehe \N lala 200 +3 beijing addr xuanwu wugui \N 300 +4 beijing addr fengtai fengtai1 fengtai2 \N +5 beijing addr chaoyang wangjing donghuqu 500 +6 shanghai hehe \N haha \N +7 tengxun qie addr gg lj \N +8 tengxun2 qie \N lj 800 + +-- !select_1 -- +4 + +-- !select_2 -- +3 + +-- !select_3 -- +3 + +-- !select_4 -- +3 beijing addr xuanwu wugui \N 300 +4 beijing addr fengtai fengtai1 fengtai2 \N +5 beijing addr chaoyang wangjing donghuqu 500 + +-- !select_5 -- +beijing addr xuanwu wugui +beijing addr fengtai fengtai1 +beijing addr chaoyang wangjing + +-- !select_6 -- +hehe \N +qie addr gg +qie \N + +-- !select_7 -- +hehe \N +qie addr gg +qie \N + +-- !select_8 -- +SHANGHAI \N +TENGXUN addr gg +TENGXUN2 \N + +-- !select_9 -- +4 \N +3 addr gg +3 \N + +-- !select_10 -- +hehe \N +qie addr gg +qie \N + +-- !select_11 -- +hehe \N SHANGHAI +qie addr gg TENGXUN +qie \N TENGXUN2 + +-- !select_12 -- +300 +\N +500 + diff --git a/regression-test/suites/inverted_index_p0/test_index_no_need_read_data.groovy b/regression-test/suites/inverted_index_p0/test_index_no_need_read_data.groovy new file mode 100644 index 0000000000..e5bc37e2fd --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_index_no_need_read_data.groovy @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +suite("test_index_no_need_read_data", "inverted_index_select"){ + def table1 = "test_index_no_need_read_data" + + sql "drop table if exists ${table1}" + + sql """ + CREATE TABLE IF NOT EXISTS `${table1}` ( + `id` int NULL COMMENT "", + `city` varchar(20) NULL COMMENT "", + `addr` varchar(20) NULL COMMENT "", + `name` varchar(20) NULL COMMENT "", + `compy` varchar(20) NULL COMMENT "", + `n` int NULL COMMENT "", + INDEX idx_city(city) USING INVERTED, + INDEX idx_addr(addr) USING INVERTED PROPERTIES("parser"="english"), + INDEX idx_n(n) USING INVERTED + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT "OLAP" + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "in_memory" = "false", + "storage_format" = "V2" + ) + """ + + sql """insert into ${table1} values + (1,null,'addr qie3','yy','lj',100), + (2,null,'hehe',null,'lala',200), + (3,'beijing','addr xuanwu','wugui',null,300), + (4,'beijing','addr fengtai','fengtai1','fengtai2',null), + (5,'beijing','addr chaoyang','wangjing','donghuqu',500), + (6,'shanghai','hehe',null,'haha',null), + (7,'tengxun','qie','addr gg','lj',null), + (8,'tengxun2','qie',null,'lj',800) + """ + + // case1: enable nereids planner + sql "set enable_nereids_planner = true" + + qt_select_nereids_0 "SELECT * FROM ${table1} ORDER BY id" + qt_select_nereids_1 "SELECT count() FROM ${table1} WHERE n > 100" + qt_select_nereids_2 "SELECT count() FROM ${table1} WHERE city = 'beijing'" + qt_select_nereids_3 "SELECT count(*) FROM ${table1} WHERE city = 'beijing'" + qt_select_nereids_4 "SELECT * FROM ${table1} WHERE city = 'beijing' ORDER BY id" + qt_select_nereids_5 "SELECT city, addr, name FROM ${table1} WHERE city = 'beijing' ORDER BY id" + qt_select_nereids_6 "SELECT addr, name FROM ${table1} WHERE city > 'beijing' ORDER BY city" + qt_select_nereids_7 "SELECT addr, name FROM ${table1} WHERE city > 'beijing' ORDER BY id" + qt_select_nereids_8 "SELECT upper(city), name FROM ${table1} WHERE city != 'beijing' ORDER BY id" + qt_select_nereids_9 "SELECT length(addr), name FROM ${table1} WHERE city != 'beijing' ORDER BY id" + qt_select_nereids_10 "SELECT addr, name FROM ( SELECT * from ${table1} WHERE city != 'beijing' ORDER BY id) t" + qt_select_nereids_11 "SELECT addr, name, upper(city) FROM ( SELECT * from ${table1} WHERE city != 'beijing' ORDER BY id) t" + qt_select_nereids_12 "SELECT sum(n) FROM ${table1} WHERE city = 'beijing' group by id ORDER BY id" + + // case2: disable nereids planner + sql "set enable_nereids_planner = false" + + qt_select_0 "SELECT * FROM ${table1} ORDER BY id" + qt_select_1 "SELECT count() FROM ${table1} WHERE n > 100" + qt_select_2 "SELECT count() FROM ${table1} WHERE city = 'beijing'" + qt_select_3 "SELECT count(*) FROM ${table1} WHERE city = 'beijing'" + qt_select_4 "SELECT * FROM ${table1} WHERE city = 'beijing' ORDER BY id" + qt_select_5 "SELECT city, addr, name FROM ${table1} WHERE city = 'beijing' ORDER BY id" + qt_select_6 "SELECT addr, name FROM ${table1} WHERE city > 'beijing' ORDER BY city" + qt_select_7 "SELECT addr, name FROM ${table1} WHERE city > 'beijing' ORDER BY id" + qt_select_8 "SELECT upper(city), name FROM ${table1} WHERE city != 'beijing' ORDER BY id" + qt_select_9 "SELECT length(addr), name FROM ${table1} WHERE city != 'beijing' ORDER BY id" + qt_select_10 "SELECT addr, name FROM ( SELECT * from ${table1} WHERE city != 'beijing' ORDER BY id) t" + qt_select_11 "SELECT addr, name, upper(city) FROM ( SELECT * from ${table1} WHERE city != 'beijing' ORDER BY id) t" + qt_select_12 "SELECT sum(n) FROM ${table1} WHERE city = 'beijing' group by id ORDER BY id" +} \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org