This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 283bd59ebaa [improvement](scanner) Remove the predicate that is always true for the segment (#25366) 283bd59ebaa is described below commit 283bd59ebaa58284204d877e7b2e7486acc5c0f8 Author: Jerry Hu <mrh...@gmail.com> AuthorDate: Fri Oct 13 02:25:38 2023 -0500 [improvement](scanner) Remove the predicate that is always true for the segment (#25366) By utilizing the zonemap index of the segment, we can ascertain if a predicate is always true. For example, if the segment’s maximum value is 100 and the predicate is col < 101, then this predicate is always true for this segment. --- be/src/common/config.cpp | 2 + be/src/common/config.h | 3 + be/src/olap/column_predicate.h | 4 ++ be/src/olap/comparison_predicate.h | 23 ++++++++ be/src/olap/rowset/segment_v2/column_reader.cpp | 26 +++++++++ be/src/olap/rowset/segment_v2/column_reader.h | 3 + be/src/olap/rowset/segment_v2/segment.cpp | 20 ++++++- .../query_p0/test_select_with_predicate_prune.out | 25 ++++++++ .../test_select_with_predicate_prune.groovy | 67 ++++++++++++++++++++++ 9 files changed, 172 insertions(+), 1 deletion(-) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 93708312705..b53ed53c3e0 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1109,6 +1109,8 @@ DEFINE_Bool(exit_on_exception, "false"); DEFINE_String(doris_cgroup_cpu_path, ""); DEFINE_Bool(enable_cpu_hard_limit, "false"); +DEFINE_Bool(ignore_always_true_predicate_for_segment, "true"); + // clang-format off #ifdef BE_TEST // test s3 diff --git a/be/src/common/config.h b/be/src/common/config.h index 466f9919f09..4207b354410 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1179,6 +1179,9 @@ DECLARE_mBool(exit_on_exception); DECLARE_String(doris_cgroup_cpu_path); DECLARE_Bool(enable_cpu_hard_limit); +// Remove predicate that is always true for a segment. +DECLARE_Bool(ignore_always_true_predicate_for_segment); + #ifdef BE_TEST // test s3 DECLARE_String(test_s3_resource); diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h index b98156f5fb8..05e84999a83 100644 --- a/be/src/olap/column_predicate.h +++ b/be/src/olap/column_predicate.h @@ -173,6 +173,10 @@ public: return true; } + virtual bool is_always_true(const std::pair<WrapperField*, WrapperField*>& statistic) const { + return false; + } + virtual bool evaluate_del(const std::pair<WrapperField*, WrapperField*>& statistic) const { return false; } diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h index b6ac7217fcd..de966833f9d 100644 --- a/be/src/olap/comparison_predicate.h +++ b/be/src/olap/comparison_predicate.h @@ -174,6 +174,29 @@ public: } } + bool is_always_true(const std::pair<WrapperField*, WrapperField*>& statistic) const override { + if (statistic.first->is_null() || statistic.second->is_null()) { + return false; + } + + T tmp_min_value {}; + T tmp_max_value {}; + memcpy((char*)(&tmp_min_value), statistic.first->cell_ptr(), sizeof(WarpperFieldType)); + memcpy((char*)(&tmp_max_value), statistic.second->cell_ptr(), sizeof(WarpperFieldType)); + + if constexpr (PT == PredicateType::LT) { + return _value > tmp_max_value; + } else if constexpr (PT == PredicateType::LE) { + return _value >= tmp_max_value; + } else if constexpr (PT == PredicateType::GT) { + return _value < tmp_min_value; + } else if constexpr (PT == PredicateType::GE) { + return _value <= tmp_min_value; + } + + return false; + } + bool evaluate_del(const std::pair<WrapperField*, WrapperField*>& statistic) const override { if (statistic.first->is_null() || statistic.second->is_null()) { return false; diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index 82e1e145393..73cd7730fd7 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -31,6 +31,7 @@ #include "io/fs/file_reader.h" #include "olap/block_column_predicate.h" #include "olap/column_predicate.h" +#include "olap/comparison_predicate.h" #include "olap/decimal12.h" #include "olap/inverted_index_parser.h" #include "olap/iterators.h" @@ -338,6 +339,31 @@ bool ColumnReader::match_condition(const AndBlockColumnPredicate* col_predicates col_predicates); } +bool ColumnReader::prune_predicates_by_zone_map(std::vector<ColumnPredicate*>& predicates, + const int column_id) const { + if (_zone_map_index == nullptr) { + return false; + } + + FieldType type = _type_info->type(); + std::unique_ptr<WrapperField> min_value(WrapperField::create_by_type(type, _meta_length)); + std::unique_ptr<WrapperField> max_value(WrapperField::create_by_type(type, _meta_length)); + _parse_zone_map(*_segment_zone_map, min_value.get(), max_value.get()); + + auto pruned = false; + for (auto it = predicates.begin(); it != predicates.end();) { + auto predicate = *it; + if (predicate->column_id() == column_id && + predicate->is_always_true({min_value.get(), max_value.get()})) { + pruned = true; + it = predicates.erase(it); + } else { + ++it; + } + } + return pruned; +} + void ColumnReader::_parse_zone_map(const ZoneMapPB& zone_map, WrapperField* min_value_container, WrapperField* max_value_container) const { // min value and max value are valid if has_not_null is true diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index f287ba5b611..3a792eb762c 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -162,6 +162,9 @@ public: bool is_empty() const { return _num_rows == 0; } + bool prune_predicates_by_zone_map(std::vector<ColumnPredicate*>& predicates, + const int column_id) const; + CompressionTypePB get_compression() const { return _meta_compression; } uint64_t num_rows() const { return _num_rows; } diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 30ef514ffc7..5399e3ee656 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -129,7 +129,6 @@ Status Segment::new_iterator(SchemaSPtr schema, const StorageReadOptions& read_o return Status::OK(); } } - if (read_options.use_topn_opt) { auto query_ctx = read_options.runtime_state->get_query_ctx(); auto runtime_predicate = query_ctx->get_runtime_predicate().get_predictate(); @@ -157,6 +156,25 @@ Status Segment::new_iterator(SchemaSPtr schema, const StorageReadOptions& read_o iter->reset(new SegmentIterator(this->shared_from_this(), schema)); } + if (config::ignore_always_true_predicate_for_segment && + read_options.io_ctx.reader_type == ReaderType::READER_QUERY && + !read_options.column_predicates.empty()) { + auto pruned_predicates = read_options.column_predicates; + auto pruned = false; + for (auto& it : _column_readers) { + if (it.second->prune_predicates_by_zone_map(pruned_predicates, it.first)) { + pruned = true; + } + } + + if (pruned) { + auto options_with_pruned_predicates = read_options; + options_with_pruned_predicates.column_predicates = pruned_predicates; + LOG(INFO) << "column_predicates pruned from " << read_options.column_predicates.size() + << " to " << pruned_predicates.size(); + return iter->get()->init(options_with_pruned_predicates); + } + } return iter->get()->init(read_options); } diff --git a/regression-test/data/query_p0/test_select_with_predicate_prune.out b/regression-test/data/query_p0/test_select_with_predicate_prune.out new file mode 100644 index 00000000000..2e1fad87499 --- /dev/null +++ b/regression-test/data/query_p0/test_select_with_predicate_prune.out @@ -0,0 +1,25 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select1 -- +1 jerry 2020-10-01 +2 tom 2020-10-02 +3 jack 2020-10-01 +4 tony 2020-10-02 + +-- !select2 -- +1 jerry 2020-10-01 +3 jack 2020-10-01 + +-- !select3 -- + +-- !select4 -- +1 jerry 2020-10-01 +2 tom 2020-10-02 +3 jack 2020-10-01 +4 tony 2020-10-02 + +-- !select5 -- +2 tom 2020-10-02 +4 tony 2020-10-02 + +-- !select6 -- + diff --git a/regression-test/suites/query_p0/test_select_with_predicate_prune.groovy b/regression-test/suites/query_p0/test_select_with_predicate_prune.groovy new file mode 100644 index 00000000000..768e04b4c32 --- /dev/null +++ b/regression-test/suites/query_p0/test_select_with_predicate_prune.groovy @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +suite("test_select_with_predicate_prune") { + sql """ + drop table if exists `test_select_with_predicate_prune`; + """ + sql """ + CREATE TABLE IF NOT EXISTS `test_select_with_predicate_prune` ( + id int, + name string, + birthday date not null + ) + duplicate key(`id`) + AUTO PARTITION BY LIST (`birthday`)() + DISTRIBUTED BY HASH(`id`) buckets 1 + PROPERTIES + ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ + insert into test_select_with_predicate_prune values (1, 'jerry', '2020-10-01'), (2, 'tom', '2020-10-02'); + """ + sql """ + insert into test_select_with_predicate_prune values (3, 'jack', '2020-10-01'), (4, 'tony', '2020-10-02'); + """ + + qt_select1 """ + select * from test_select_with_predicate_prune where birthday < '2020-10-03' order by id; + """ + + qt_select2 """ + select * from test_select_with_predicate_prune where birthday < '2020-10-02' order by id; + """ + + qt_select3 """ + select * from test_select_with_predicate_prune where birthday < '2020-10-01' order by id; + """ + + + qt_select4 """ + select * from test_select_with_predicate_prune where birthday > '2020-09-30' order by id; + """ + + qt_select5 """ + select * from test_select_with_predicate_prune where birthday > '2020-10-01' order by id; + """ + + qt_select6 """ + select * from test_select_with_predicate_prune where birthday > '2020-10-02' order by id; + """ +} \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org