This is an automated email from the ASF dual-hosted git repository. panxiaolei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 45c451ef2ba [Feature](runtime filter) normalize ignore runtime filter (#30152) 45c451ef2ba is described below commit 45c451ef2ba17a7c62599681476151d8d37aed95 Author: Pxl <pxl...@qq.com> AuthorDate: Fri Feb 2 15:30:29 2024 +0800 [Feature](runtime filter) normalize ignore runtime filter (#30152) normalize ignore runtime filter --- be/src/common/config.cpp | 2 +- be/src/common/config.h | 2 +- be/src/exprs/create_predicate_function.h | 2 +- be/src/exprs/minmax_predicate.h | 117 ++++++++------------- be/src/exprs/runtime_filter.h | 4 +- be/src/olap/accept_null_predicate.h | 90 ++++++++-------- be/src/olap/bitmap_filter_predicate.h | 14 +-- be/src/olap/bloom_filter_predicate.h | 38 ++----- be/src/olap/column_predicate.h | 43 +++++++- be/src/olap/comparison_predicate.h | 72 ++++++++----- be/src/olap/in_list_predicate.h | 61 ++++++----- be/src/olap/like_column_predicate.cpp | 4 +- be/src/olap/like_column_predicate.h | 6 +- be/src/olap/match_predicate.h | 9 +- be/src/olap/null_predicate.cpp | 4 +- be/src/olap/null_predicate.h | 6 +- be/src/olap/rowset/segment_v2/segment_iterator.cpp | 28 +++-- be/src/vec/exprs/vruntimefilter_wrapper.cpp | 3 +- be/src/vec/exprs/vruntimefilter_wrapper.h | 8 +- .../olap/bitmap_filter_column_predicate_test.cpp | 4 +- .../join-optimization/runtime-filter.md | 2 +- .../join-optimization/runtime-filter.md | 2 +- .../java/org/apache/doris/qe/SessionVariable.java | 2 +- .../data/variable_p0/set_and_unset_variable.out | 20 ++-- 24 files changed, 289 insertions(+), 254 deletions(-) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index e4711ce59a8..5ec35009cb6 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -889,7 +889,7 @@ DEFINE_mInt64(small_column_size_buffer, "100"); // When the rows number reached this limit, will check the filter rate the of bloomfilter // if it is lower than a specific threshold, the predicate will be disabled. -DEFINE_mInt32(bloom_filter_predicate_check_row_num, "204800"); +DEFINE_mInt32(rf_predicate_check_row_num, "204800"); // cooldown task configs DEFINE_Int32(cooldown_thread_num, "5"); diff --git a/be/src/common/config.h b/be/src/common/config.h index 25b426f6adc..d931c0205a2 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -941,7 +941,7 @@ DECLARE_mInt64(small_column_size_buffer); // When the rows number reached this limit, will check the filter rate the of bloomfilter // if it is lower than a specific threshold, the predicate will be disabled. -DECLARE_mInt32(bloom_filter_predicate_check_row_num); +DECLARE_mInt32(rf_predicate_check_row_num); // cooldown task configs DECLARE_Int32(cooldown_thread_num); diff --git a/be/src/exprs/create_predicate_function.h b/be/src/exprs/create_predicate_function.h index 0e792563acf..11889ff2ec3 100644 --- a/be/src/exprs/create_predicate_function.h +++ b/be/src/exprs/create_predicate_function.h @@ -232,7 +232,7 @@ ColumnPredicate* create_olap_column_predicate(uint32_t column_id, std::shared_ptr<BloomFilterFuncBase> filter_olap; filter_olap.reset(create_bloom_filter(PT)); filter_olap->light_copy(filter.get()); - return new BloomFilterColumnPredicate<PT>(column_id, filter, be_exec_version); + return new BloomFilterColumnPredicate<PT>(column_id, filter); } template <PrimitiveType PT> diff --git a/be/src/exprs/minmax_predicate.h b/be/src/exprs/minmax_predicate.h index fcf2ef44a19..b9ee56a8dc1 100644 --- a/be/src/exprs/minmax_predicate.h +++ b/be/src/exprs/minmax_predicate.h @@ -30,9 +30,7 @@ namespace doris { // only used in Runtime Filter class MinMaxFuncBase { public: - virtual void insert(const void* data) = 0; virtual void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) = 0; - virtual bool find(void* data) = 0; virtual void* get_max() = 0; virtual void* get_min() = 0; // assign minmax data @@ -48,66 +46,54 @@ public: MinMaxNumFunc() = default; ~MinMaxNumFunc() override = default; - void insert(const void* data) override { - if (data == nullptr) { - return; - } - - T val_data = *reinterpret_cast<const T*>(data); - - if constexpr (NeedMin) { - if (val_data < _min) { - _min = val_data; - } - } - - if constexpr (NeedMax) { - if (val_data > _max) { - _max = val_data; - } - } - } - void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) override { if (column->empty()) { return; } if (column->is_nullable()) { const auto* nullable = assert_cast<const vectorized::ColumnNullable*>(column.get()); - const auto& col = nullable->get_nested_column(); - const auto& nullmap = - assert_cast<const vectorized::ColumnUInt8&>(nullable->get_null_map_column()) - .get_data(); - - if constexpr (std::is_same_v<T, StringRef>) { - const auto& column_string = assert_cast<const vectorized::ColumnString&>(col); - for (size_t i = start; i < column->size(); i++) { - if (!nullmap[i]) { - if constexpr (NeedMin) { - _min = std::min(_min, column_string.get_data_at(i)); - } - if constexpr (NeedMax) { - _max = std::max(_max, column_string.get_data_at(i)); - } - } - } + const auto& col = nullable->get_nested_column_ptr(); + const auto& nullmap = nullable->get_null_map_data(); + if (nullable->has_null()) { + update_batch(col, nullmap, start); } else { - const T* data = (T*)col.get_raw_data().data; - for (size_t i = start; i < column->size(); i++) { - if (!nullmap[i]) { - if constexpr (NeedMin) { - _min = std::min(_min, *(data + i)); - } - if constexpr (NeedMax) { - _max = std::max(_max, *(data + i)); - } - } + update_batch(col, start); + } + } else { + update_batch(column, start); + } + } + + void update_batch(const vectorized::ColumnPtr& column, size_t start) { + if constexpr (std::is_same_v<T, StringRef>) { + const auto& column_string = assert_cast<const vectorized::ColumnString&>(*column); + for (size_t i = start; i < column->size(); i++) { + if constexpr (NeedMin) { + _min = std::min(_min, column_string.get_data_at(i)); + } + if constexpr (NeedMax) { + _max = std::max(_max, column_string.get_data_at(i)); } } } else { - if constexpr (std::is_same_v<T, StringRef>) { - const auto& column_string = assert_cast<const vectorized::ColumnString&>(*column); - for (size_t i = start; i < column->size(); i++) { + const T* data = (T*)column->get_raw_data().data; + for (size_t i = start; i < column->size(); i++) { + if constexpr (NeedMin) { + _min = std::min(_min, *(data + i)); + } + if constexpr (NeedMax) { + _max = std::max(_max, *(data + i)); + } + } + } + } + + void update_batch(const vectorized::ColumnPtr& column, const vectorized::NullMap& nullmap, + size_t start) { + if constexpr (std::is_same_v<T, StringRef>) { + const auto& column_string = assert_cast<const vectorized::ColumnString&>(*column); + for (size_t i = start; i < column->size(); i++) { + if (!nullmap[i]) { if constexpr (NeedMin) { _min = std::min(_min, column_string.get_data_at(i)); } @@ -115,9 +101,11 @@ public: _max = std::max(_max, column_string.get_data_at(i)); } } - } else { - const T* data = (T*)column->get_raw_data().data; - for (size_t i = start; i < column->size(); i++) { + } + } else { + const T* data = (T*)column->get_raw_data().data; + for (size_t i = start; i < column->size(); i++) { + if (!nullmap[i]) { if constexpr (NeedMin) { _min = std::min(_min, *(data + i)); } @@ -129,25 +117,6 @@ public: } } - bool find(void* data) override { - if (data == nullptr) { - return false; - } - - T val_data = *reinterpret_cast<T*>(data); - if constexpr (NeedMin) { - if (val_data < _min) { - return false; - } - } - if constexpr (NeedMax) { - if (val_data > _max) { - return false; - } - } - return true; - } - Status merge(MinMaxFuncBase* minmax_func, ObjectPool* pool) override { if constexpr (std::is_same_v<T, StringRef>) { auto* other_minmax = static_cast<MinMaxNumFunc<T>*>(minmax_func); diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h index bc487bfe9c9..9a7c1a2ae3c 100644 --- a/be/src/exprs/runtime_filter.h +++ b/be/src/exprs/runtime_filter.h @@ -140,7 +140,9 @@ public: } } - [[nodiscard]] int get_filter_id() const { return _filter_id; } + int get_filter_id() const { return _filter_id; } + + bool is_runtime_filter() const { return _filter_id != -1; } private: int _filter_id = -1; diff --git a/be/src/olap/accept_null_predicate.h b/be/src/olap/accept_null_predicate.h index 90cff5cc70b..3d6103e81cd 100644 --- a/be/src/olap/accept_null_predicate.h +++ b/be/src/olap/accept_null_predicate.h @@ -59,55 +59,12 @@ public: return _nested->can_do_apply_safely(input_type, is_null); } - uint16_t evaluate(const vectorized::IColumn& column, uint16_t* sel, - uint16_t size) const override { - if (column.has_null()) { - // create selected_flags - uint16_t max_idx = *std::max_element(sel, sel + size); - auto selected_flags_ptr = std::make_unique<bool[]>(max_idx + 1); - auto selected_flags = selected_flags_ptr.get(); - // init to 0 / false - memset(selected_flags, 0, (max_idx + 1) * sizeof(bool)); - for (uint16_t i = 0; i < size; ++i) { - uint16_t row_idx = sel[i]; - if (column.is_null_at(row_idx)) { - // set selected flag true for NULL value - selected_flags[row_idx] = true; - } - } - - // call nested predicate evaluate - uint16_t new_size = _nested->evaluate(column, sel, size); - - // process NULL values - if (new_size < size) { - // add rows selected by _nested->evaluate - for (uint16_t i = 0; i < new_size; ++i) { - uint16_t row_idx = sel[i]; - selected_flags[row_idx] = true; - } - - // recaculate new_size and sel array - new_size = 0; - for (uint16_t row_idx = 0; row_idx < max_idx + 1; ++row_idx) { - if (selected_flags[row_idx]) { - sel[new_size++] = row_idx; - } - } - } - - return new_size; - } else { - return _nested->evaluate(column, sel, size); - } - } - void evaluate_and(const vectorized::IColumn& column, const uint16_t* sel, uint16_t size, bool* flags) const override { if (column.has_null()) { // copy original flags auto original_flags_buf = std::make_unique<bool[]>(size); - auto original_flags = original_flags_buf.get(); + auto* original_flags = original_flags_buf.get(); memcpy(original_flags, flags, size * sizeof(bool)); // call evaluate_and and restore true for NULL rows @@ -175,7 +132,7 @@ public: if (column.has_null()) { // copy original flags auto original_flags_buf = std::make_unique<bool[]>(size); - auto original_flags = original_flags_buf.get(); + auto* original_flags = original_flags_buf.get(); memcpy(original_flags, flags, size * sizeof(bool)); // call evaluate_and_vec and restore true for NULL rows @@ -208,6 +165,49 @@ public: } private: + uint16_t _evaluate_inner(const vectorized::IColumn& column, uint16_t* sel, + uint16_t size) const override { + if (column.has_null()) { + // create selected_flags + uint16_t max_idx = *std::max_element(sel, sel + size); + auto selected_flags_ptr = std::make_unique<bool[]>(max_idx + 1); + auto* selected_flags = selected_flags_ptr.get(); + // init to 0 / false + memset(selected_flags, 0, (max_idx + 1) * sizeof(bool)); + for (uint16_t i = 0; i < size; ++i) { + uint16_t row_idx = sel[i]; + if (column.is_null_at(row_idx)) { + // set selected flag true for NULL value + selected_flags[row_idx] = true; + } + } + + // call nested predicate evaluate + uint16_t new_size = _nested->evaluate(column, sel, size); + + // process NULL values + if (new_size < size) { + // add rows selected by _nested->evaluate + for (uint16_t i = 0; i < new_size; ++i) { + uint16_t row_idx = sel[i]; + selected_flags[row_idx] = true; + } + + // recaculate new_size and sel array + new_size = 0; + for (uint16_t row_idx = 0; row_idx < max_idx + 1; ++row_idx) { + if (selected_flags[row_idx]) { + sel[new_size++] = row_idx; + } + } + } + + return new_size; + } else { + return _nested->evaluate(column, sel, size); + } + } + std::string _debug_string() const override { return "passnull predicate for " + _nested->debug_string(); } diff --git a/be/src/olap/bitmap_filter_predicate.h b/be/src/olap/bitmap_filter_predicate.h index 7420356a33a..a540346990a 100644 --- a/be/src/olap/bitmap_filter_predicate.h +++ b/be/src/olap/bitmap_filter_predicate.h @@ -76,10 +76,10 @@ public: return Status::OK(); } - uint16_t evaluate(const vectorized::IColumn& column, uint16_t* sel, - uint16_t size) const override; - private: + uint16_t _evaluate_inner(const vectorized::IColumn& column, uint16_t* sel, + uint16_t size) const override; + template <bool is_nullable> uint16_t evaluate(const vectorized::IColumn& column, const uint8_t* null_map, uint16_t* sel, uint16_t size) const { @@ -109,12 +109,12 @@ private: }; template <PrimitiveType T> -uint16_t BitmapFilterColumnPredicate<T>::evaluate(const vectorized::IColumn& column, uint16_t* sel, - uint16_t size) const { +uint16_t BitmapFilterColumnPredicate<T>::_evaluate_inner(const vectorized::IColumn& column, + uint16_t* sel, uint16_t size) const { uint16_t new_size = 0; if (column.is_nullable()) { - auto* nullable_col = reinterpret_cast<const vectorized::ColumnNullable*>(&column); - auto& null_map_data = nullable_col->get_null_map_column().get_data(); + const auto* nullable_col = reinterpret_cast<const vectorized::ColumnNullable*>(&column); + const auto& null_map_data = nullable_col->get_null_map_column().get_data(); new_size = evaluate<true>(nullable_col->get_nested_column(), null_map_data.data(), sel, size); } else { diff --git a/be/src/olap/bloom_filter_predicate.h b/be/src/olap/bloom_filter_predicate.h index bacf61b8362..9cc95d7152a 100644 --- a/be/src/olap/bloom_filter_predicate.h +++ b/be/src/olap/bloom_filter_predicate.h @@ -37,12 +37,10 @@ public: using SpecificFilter = BloomFilterFunc<T>; BloomFilterColumnPredicate(uint32_t column_id, - const std::shared_ptr<BloomFilterFuncBase>& filter, - int be_exec_version) + const std::shared_ptr<BloomFilterFuncBase>& filter) : ColumnPredicate(column_id), _filter(filter), - _specific_filter(reinterpret_cast<SpecificFilter*>(_filter.get())), - _be_exec_version(be_exec_version) {} + _specific_filter(reinterpret_cast<SpecificFilter*>(_filter.get())) {} ~BloomFilterColumnPredicate() override = default; PredicateType type() const override { return PredicateType::BF; } @@ -52,14 +50,16 @@ public: return Status::OK(); } - uint16_t evaluate(const vectorized::IColumn& column, uint16_t* sel, - uint16_t size) const override; - bool can_do_apply_safely(PrimitiveType input_type, bool is_null) const override { return input_type == T || (is_string_type(input_type) && is_string_type(T)); } private: + bool _can_ignore() const override { return _filter->is_runtime_filter(); } + + uint16_t _evaluate_inner(const vectorized::IColumn& column, uint16_t* sel, + uint16_t size) const override; + template <bool is_nullable> uint16_t evaluate(const vectorized::IColumn& column, const uint8_t* null_map, uint16_t* sel, uint16_t size) const { @@ -97,34 +97,18 @@ private: std::shared_ptr<BloomFilterFuncBase> _filter; SpecificFilter* _specific_filter; // owned by _filter - mutable bool _always_true = false; - mutable bool _has_calculate_filter = false; - int _be_exec_version; }; template <PrimitiveType T> -uint16_t BloomFilterColumnPredicate<T>::evaluate(const vectorized::IColumn& column, uint16_t* sel, - uint16_t size) const { - uint16_t new_size = 0; - if (_always_true) { - return size; - } +uint16_t BloomFilterColumnPredicate<T>::_evaluate_inner(const vectorized::IColumn& column, + uint16_t* sel, uint16_t size) const { if (column.is_nullable()) { const auto* nullable_col = reinterpret_cast<const vectorized::ColumnNullable*>(&column); const auto& null_map_data = nullable_col->get_null_map_column().get_data(); - new_size = - evaluate<true>(nullable_col->get_nested_column(), null_map_data.data(), sel, size); + return evaluate<true>(nullable_col->get_nested_column(), null_map_data.data(), sel, size); } else { - new_size = evaluate<false>(column, nullptr, sel, size); + return evaluate<false>(column, nullptr, sel, size); } - // If the pass rate is very high, for example > 50%, then the bloomfilter is useless. - // Some bloomfilter is useless, for example ssb 4.3, it consumes a lot of cpu but it is - // useless. - _evaluated_rows += size; - _passed_rows += new_size; - vectorized::VRuntimeFilterWrapper::calculate_filter( - _evaluated_rows - _passed_rows, _evaluated_rows, _has_calculate_filter, _always_true); - return new_size; } } //namespace doris diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h index 5c3ad135062..adbb9a695ed 100644 --- a/be/src/olap/column_predicate.h +++ b/be/src/olap/column_predicate.h @@ -19,6 +19,7 @@ #include <roaring/roaring.hh> +#include "common/exception.h" #include "olap/rowset/segment_v2/bitmap_index_reader.h" #include "olap/rowset/segment_v2/bloom_filter.h" #include "olap/rowset/segment_v2/inverted_index_reader.h" @@ -26,6 +27,7 @@ #include "olap/selection_vector.h" #include "runtime/define_primitive_type.h" #include "vec/columns/column.h" +#include "vec/exprs/vruntimefilter_wrapper.h" using namespace doris::segment_v2; @@ -181,11 +183,29 @@ public: "Not Implemented evaluate with inverted index, please check the predicate"); } + virtual double get_ignore_threshold() const { + return vectorized::VRuntimeFilterWrapper::EXPECTED_FILTER_RATE; + } + // evaluate predicate on IColumn // a short circuit eval way - virtual uint16_t evaluate(const vectorized::IColumn& column, uint16_t* sel, - uint16_t size) const { - return size; + uint16_t evaluate(const vectorized::IColumn& column, uint16_t* sel, uint16_t size) const { + if (_always_true) { + return size; + } + + uint16_t new_size = _evaluate_inner(column, sel, size); + _evaluated_rows += size; + _passed_rows += new_size; + if (_can_ignore()) { + // If the pass rate is very high, for example > 50%, then the filter is useless. + // Some filter is useless, for example ssb 4.3, it consumes a lot of cpu but it is + // useless. + vectorized::VRuntimeFilterWrapper::calculate_filter( + get_ignore_threshold(), _evaluated_rows - _passed_rows, _evaluated_rows, + _has_calculate_filter, _always_true); + } + return new_size; } virtual void evaluate_and(const vectorized::IColumn& column, const uint16_t* sel, uint16_t size, bool* flags) const {} @@ -262,7 +282,7 @@ public: std::shared_ptr<PredicateParams> predicate_params() { return _predicate_params; } - const std::string pred_type_string(PredicateType type) { + static std::string pred_type_string(PredicateType type) { switch (type) { case PredicateType::EQ: return "eq"; @@ -293,8 +313,21 @@ public: } } + bool always_true() const { return _always_true; } + protected: virtual std::string _debug_string() const = 0; + virtual bool _can_ignore() const { + if (_predicate_params) { + // minmax filter will set marked_by_runtime_filter to true + return _predicate_params->marked_by_runtime_filter; + } + return false; + } + virtual uint16_t _evaluate_inner(const vectorized::IColumn& column, uint16_t* sel, + uint16_t size) const { + throw Exception(INTERNAL_ERROR, "Not Implemented _evaluate_inner"); + } uint32_t _column_id; // TODO: the value is only in delete condition, better be template value @@ -302,6 +335,8 @@ protected: std::shared_ptr<PredicateParams> _predicate_params; mutable uint64_t _evaluated_rows = 1; mutable uint64_t _passed_rows = 0; + mutable bool _always_true = false; + mutable bool _has_calculate_filter = false; }; } //namespace doris diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h index 35157808d75..17b334d7b8d 100644 --- a/be/src/olap/comparison_predicate.h +++ b/be/src/olap/comparison_predicate.h @@ -139,22 +139,6 @@ public: return Status::OK(); } - uint16_t evaluate(const vectorized::IColumn& column, uint16_t* sel, - uint16_t size) const override { - if (column.is_nullable()) { - auto* nullable_column_ptr = - vectorized::check_and_get_column<vectorized::ColumnNullable>(column); - auto& nested_column = nullable_column_ptr->get_nested_column(); - auto& null_map = reinterpret_cast<const vectorized::ColumnUInt8&>( - nullable_column_ptr->get_null_map_column()) - .get_data(); - - return _base_evaluate<true>(&nested_column, null_map.data(), sel, size); - } else { - return _base_evaluate<false>(&column, nullptr, sel, size); - } - } - void evaluate_and(const vectorized::IColumn& column, const uint16_t* sel, uint16_t size, bool* flags) const override { _evaluate_bit<true>(column, sel, size, flags); @@ -273,17 +257,27 @@ public: template <bool is_and> __attribute__((flatten)) void _evaluate_vec_internal(const vectorized::IColumn& column, uint16_t size, bool* flags) const { + if (_can_ignore() && !_has_calculate_filter) { + if (is_and) { + for (uint16_t i = 0; i < size; i++) { + _evaluated_rows += flags[i]; + } + } else { + _evaluated_rows += size; + } + } + if (column.is_nullable()) { - auto* nullable_column_ptr = + const auto* nullable_column_ptr = vectorized::check_and_get_column<vectorized::ColumnNullable>(column); - auto& nested_column = nullable_column_ptr->get_nested_column(); - auto& null_map = reinterpret_cast<const vectorized::ColumnUInt8&>( - nullable_column_ptr->get_null_map_column()) - .get_data(); + const auto& nested_column = nullable_column_ptr->get_nested_column(); + const auto& null_map = reinterpret_cast<const vectorized::ColumnUInt8&>( + nullable_column_ptr->get_null_map_column()) + .get_data(); if (nested_column.is_column_dictionary()) { if constexpr (std::is_same_v<T, StringRef>) { - auto* dict_column_ptr = + const auto* dict_column_ptr = vectorized::check_and_get_column<vectorized::ColumnDictI32>( nested_column); @@ -295,7 +289,7 @@ public: break; } } - auto* data_array = dict_column_ptr->get_data().data(); + const auto* data_array = dict_column_ptr->get_data().data(); _base_loop_vec<true, is_and>(size, flags, null_map.data(), data_array, dict_code); @@ -316,7 +310,7 @@ public: } else { if (column.is_column_dictionary()) { if constexpr (std::is_same_v<T, StringRef>) { - auto* dict_column_ptr = + const auto* dict_column_ptr = vectorized::check_and_get_column<vectorized::ColumnDictI32>(column); auto dict_code = _find_code_from_dictionary_column(*dict_column_ptr); do { @@ -326,7 +320,7 @@ public: break; } } - auto* data_array = dict_column_ptr->get_data().data(); + const auto* data_array = dict_column_ptr->get_data().data(); _base_loop_vec<false, is_and>(size, flags, nullptr, data_array, dict_code); } while (false); @@ -350,6 +344,15 @@ public: flags[i] = !flags[i]; } } + + if (_can_ignore() && !_has_calculate_filter) { + for (uint16_t i = 0; i < size; i++) { + _passed_rows += flags[i]; + } + vectorized::VRuntimeFilterWrapper::calculate_filter( + get_ignore_threshold(), _evaluated_rows - _passed_rows, _evaluated_rows, + _has_calculate_filter, _always_true); + } } void evaluate_vec(const vectorized::IColumn& column, uint16_t size, @@ -362,7 +365,26 @@ public: _evaluate_vec_internal<true>(column, size, flags); } + // todo: It may be necessary to set a more reasonable threshold + double get_ignore_threshold() const override { return 0.1; } + private: + uint16_t _evaluate_inner(const vectorized::IColumn& column, uint16_t* sel, + uint16_t size) const override { + if (column.is_nullable()) { + const auto* nullable_column_ptr = + vectorized::check_and_get_column<vectorized::ColumnNullable>(column); + const auto& nested_column = nullable_column_ptr->get_nested_column(); + const auto& null_map = reinterpret_cast<const vectorized::ColumnUInt8&>( + nullable_column_ptr->get_null_map_column()) + .get_data(); + + return _base_evaluate<true>(&nested_column, null_map.data(), sel, size); + } else { + return _base_evaluate<false>(&column, nullptr, sel, size); + } + } + template <typename LeftT, typename RightT> bool _operator(const LeftT& lhs, const RightT& rhs) const { if constexpr (PT == PredicateType::EQ) { diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h index 7d3a9d1d3a7..5d7fb783239 100644 --- a/be/src/olap/in_list_predicate.h +++ b/be/src/olap/in_list_predicate.h @@ -228,34 +228,6 @@ public: return Status::OK(); } - uint16_t evaluate(const vectorized::IColumn& column, uint16_t* sel, - uint16_t size) const override { - int64_t new_size = 0; - - if (column.is_nullable()) { - auto* nullable_col = - vectorized::check_and_get_column<vectorized::ColumnNullable>(column); - auto& null_map = reinterpret_cast<const vectorized::ColumnUInt8&>( - nullable_col->get_null_map_column()) - .get_data(); - auto& nested_col = nullable_col->get_nested_column(); - - if (_opposite) { - new_size = _base_evaluate<true, true>(&nested_col, &null_map, sel, size); - } else { - new_size = _base_evaluate<true, false>(&nested_col, &null_map, sel, size); - } - } else { - if (_opposite) { - new_size = _base_evaluate<false, true>(&column, nullptr, sel, size); - } else { - new_size = _base_evaluate<false, false>(&column, nullptr, sel, size); - } - } - _evaluated_rows += size; - _passed_rows += new_size; - return new_size; - } int get_filter_id() const override { return _values->get_filter_id(); } bool is_filter() const override { return true; } @@ -372,7 +344,40 @@ public: return PT == PredicateType::IN_LIST && !ngram; } + double get_ignore_threshold() const override { return std::log2(_values->size() + 1) / 64; } + private: + bool _can_ignore() const override { return _values->is_runtime_filter(); } + + uint16_t _evaluate_inner(const vectorized::IColumn& column, uint16_t* sel, + uint16_t size) const override { + int64_t new_size = 0; + + if (column.is_nullable()) { + const auto* nullable_col = + vectorized::check_and_get_column<vectorized::ColumnNullable>(column); + const auto& null_map = reinterpret_cast<const vectorized::ColumnUInt8&>( + nullable_col->get_null_map_column()) + .get_data(); + const auto& nested_col = nullable_col->get_nested_column(); + + if (_opposite) { + new_size = _base_evaluate<true, true>(&nested_col, &null_map, sel, size); + } else { + new_size = _base_evaluate<true, false>(&nested_col, &null_map, sel, size); + } + } else { + if (_opposite) { + new_size = _base_evaluate<false, true>(&column, nullptr, sel, size); + } else { + new_size = _base_evaluate<false, false>(&column, nullptr, sel, size); + } + } + _evaluated_rows += size; + _passed_rows += new_size; + return new_size; + } + template <typename LeftT, typename RightT> bool _operator(const LeftT& lhs, const RightT& rhs) const { if constexpr (PT == PredicateType::IN_LIST) { diff --git a/be/src/olap/like_column_predicate.cpp b/be/src/olap/like_column_predicate.cpp index 94b65c0eca2..b441e982606 100644 --- a/be/src/olap/like_column_predicate.cpp +++ b/be/src/olap/like_column_predicate.cpp @@ -51,8 +51,8 @@ void LikeColumnPredicate<T>::evaluate_and_vec(const vectorized::IColumn& column, } template <PrimitiveType T> -uint16_t LikeColumnPredicate<T>::evaluate(const vectorized::IColumn& column, uint16_t* sel, - uint16_t size) const { +uint16_t LikeColumnPredicate<T>::_evaluate_inner(const vectorized::IColumn& column, uint16_t* sel, + uint16_t size) const { uint16_t new_size = 0; if (column.is_nullable()) { auto* nullable_col = vectorized::check_and_get_column<vectorized::ColumnNullable>(column); diff --git a/be/src/olap/like_column_predicate.h b/be/src/olap/like_column_predicate.h index 3a918d7605d..eda033095a0 100644 --- a/be/src/olap/like_column_predicate.h +++ b/be/src/olap/like_column_predicate.h @@ -66,9 +66,6 @@ public: return input_type == T || (is_string_type(input_type) && is_string_type(T)); } - uint16_t evaluate(const vectorized::IColumn& column, uint16_t* sel, - uint16_t size) const override; - void evaluate_and_vec(const vectorized::IColumn& column, uint16_t size, bool* flags) const override; @@ -91,6 +88,9 @@ public: bool can_do_bloom_filter(bool ngram) const override { return ngram; } private: + uint16_t _evaluate_inner(const vectorized::IColumn& column, uint16_t* sel, + uint16_t size) const override; + template <bool is_and> void _evaluate_vec(const vectorized::IColumn& column, uint16_t size, bool* flags) const { if (column.is_nullable()) { diff --git a/be/src/olap/match_predicate.h b/be/src/olap/match_predicate.h index 776c9c4c258..5ec8a8bf9cb 100644 --- a/be/src/olap/match_predicate.h +++ b/be/src/olap/match_predicate.h @@ -53,8 +53,8 @@ public: bool support_zonemap() const override { return false; } //evaluate predicate on Bitmap - virtual Status evaluate(BitmapIndexIterator* iterator, uint32_t num_rows, - roaring::Roaring* roaring) const override { + Status evaluate(BitmapIndexIterator* iterator, uint32_t num_rows, + roaring::Roaring* roaring) const override { LOG(FATAL) << "Not Implemented MatchPredicate::evaluate"; } @@ -68,6 +68,11 @@ public: } private: + uint16_t _evaluate_inner(const vectorized::IColumn& column, uint16_t* sel, + uint16_t size) const override { + return size; + } + InvertedIndexQueryType _to_inverted_index_query_type(MatchType match_type) const; std::string _debug_string() const override { std::string info = "MatchPredicate"; diff --git a/be/src/olap/null_predicate.cpp b/be/src/olap/null_predicate.cpp index 7aec65cc7ea..0b184707d8f 100644 --- a/be/src/olap/null_predicate.cpp +++ b/be/src/olap/null_predicate.cpp @@ -77,8 +77,8 @@ Status NullPredicate::evaluate(const vectorized::NameAndTypePair& name_with_type return Status::OK(); } -uint16_t NullPredicate::evaluate(const vectorized::IColumn& column, uint16_t* sel, - uint16_t size) const { +uint16_t NullPredicate::_evaluate_inner(const vectorized::IColumn& column, uint16_t* sel, + uint16_t size) const { uint16_t new_size = 0; if (auto* nullable = check_and_get_column<ColumnNullable>(column)) { if (!nullable->has_null()) { diff --git a/be/src/olap/null_predicate.h b/be/src/olap/null_predicate.h index 388d0803178..ccca5c51027 100644 --- a/be/src/olap/null_predicate.h +++ b/be/src/olap/null_predicate.h @@ -56,9 +56,6 @@ public: InvertedIndexIterator* iterator, uint32_t num_rows, roaring::Roaring* bitmap) const override; - uint16_t evaluate(const vectorized::IColumn& column, uint16_t* sel, - uint16_t size) const override; - void evaluate_or(const vectorized::IColumn& column, const uint16_t* sel, uint16_t size, bool* flags) const override; @@ -105,6 +102,9 @@ public: void evaluate_vec(const vectorized::IColumn& column, uint16_t size, bool* flags) const override; private: + uint16_t _evaluate_inner(const vectorized::IColumn& column, uint16_t* sel, + uint16_t size) const override; + std::string _debug_string() const override { std::string info = "NullPredicate(" + std::string(_is_null ? "is_null" : "not_null") + ")"; return info; diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 3c7de0adb7b..68e67d28830 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -1960,6 +1960,12 @@ void SegmentIterator::_replace_version_col(size_t num_rows) { uint16_t SegmentIterator::_evaluate_vectorization_predicate(uint16_t* sel_rowid_idx, uint16_t selected_size) { SCOPED_RAW_TIMER(&_opts.stats->vec_cond_ns); + if (_is_need_vec_eval) { + _is_need_vec_eval = false; + for (const auto& pred : _pre_eval_block_predicate) { + _is_need_vec_eval |= (!pred->always_true()); + } + } if (!_is_need_vec_eval) { for (uint32_t i = 0; i < selected_size; ++i) { sel_rowid_idx[i] = i; @@ -1969,14 +1975,20 @@ uint16_t SegmentIterator::_evaluate_vectorization_predicate(uint16_t* sel_rowid_ uint16_t original_size = selected_size; bool ret_flags[original_size]; - DCHECK(_pre_eval_block_predicate.size() > 0); - auto column_id = _pre_eval_block_predicate[0]->column_id(); - auto& column = _current_return_columns[column_id]; - _pre_eval_block_predicate[0]->evaluate_vec(*column, original_size, ret_flags); - for (int i = 1; i < _pre_eval_block_predicate.size(); i++) { - auto column_id2 = _pre_eval_block_predicate[i]->column_id(); - auto& column2 = _current_return_columns[column_id2]; - _pre_eval_block_predicate[i]->evaluate_and_vec(*column2, original_size, ret_flags); + DCHECK(!_pre_eval_block_predicate.empty()); + bool is_first = true; + for (int i = 0; i < _pre_eval_block_predicate.size(); i++) { + if (_pre_eval_block_predicate[i]->always_true()) { + continue; + } + auto column_id = _pre_eval_block_predicate[i]->column_id(); + auto& column = _current_return_columns[column_id]; + if (is_first) { + _pre_eval_block_predicate[i]->evaluate_vec(*column, original_size, ret_flags); + is_first = false; + } else { + _pre_eval_block_predicate[i]->evaluate_and_vec(*column, original_size, ret_flags); + } } uint16_t new_size = 0; diff --git a/be/src/vec/exprs/vruntimefilter_wrapper.cpp b/be/src/vec/exprs/vruntimefilter_wrapper.cpp index c623355d673..075e5194866 100644 --- a/be/src/vec/exprs/vruntimefilter_wrapper.cpp +++ b/be/src/vec/exprs/vruntimefilter_wrapper.cpp @@ -121,7 +121,8 @@ Status VRuntimeFilterWrapper::execute(VExprContext* context, Block* block, int* _expr_name, _data_type->get_name(), *result_column_id, block->dump_structure()); } - calculate_filter(_filtered_rows, _scan_rows, _has_calculate_filter, _always_true); + calculate_filter(VRuntimeFilterWrapper::EXPECTED_FILTER_RATE, _filtered_rows, _scan_rows, + _has_calculate_filter, _always_true); return Status::OK(); } } diff --git a/be/src/vec/exprs/vruntimefilter_wrapper.h b/be/src/vec/exprs/vruntimefilter_wrapper.h index 642e669288f..6d50b914beb 100644 --- a/be/src/vec/exprs/vruntimefilter_wrapper.h +++ b/be/src/vec/exprs/vruntimefilter_wrapper.h @@ -60,10 +60,10 @@ public: // if filter rate less than this, bloom filter will set always true constexpr static double EXPECTED_FILTER_RATE = 0.4; - static void calculate_filter(int64_t filter_rows, int64_t scan_rows, bool& has_calculate, - bool& always_true) { - if ((!has_calculate) && (scan_rows > config::bloom_filter_predicate_check_row_num)) { - if (filter_rows / (scan_rows * 1.0) < VRuntimeFilterWrapper::EXPECTED_FILTER_RATE) { + static void calculate_filter(double ignore_threshold, int64_t filter_rows, int64_t scan_rows, + bool& has_calculate, bool& always_true) { + if ((!has_calculate) && (scan_rows > config::rf_predicate_check_row_num)) { + if (filter_rows / (scan_rows * 1.0) < ignore_threshold) { always_true = true; } has_calculate = true; diff --git a/be/test/olap/bitmap_filter_column_predicate_test.cpp b/be/test/olap/bitmap_filter_column_predicate_test.cpp index da7d92d546d..2f97704de50 100644 --- a/be/test/olap/bitmap_filter_column_predicate_test.cpp +++ b/be/test/olap/bitmap_filter_column_predicate_test.cpp @@ -136,7 +136,7 @@ TEST_F(BitmapFilterColumnPredicateTest, evaluate_column) { sel[i] = i; } uint16_t size = column->size(); - size = predicate.evaluate(*column, sel, size); + size = predicate.ColumnPredicate::evaluate(*column, sel, size); EXPECT_EQ(size, 8); EXPECT_EQ(sel[0], 0); EXPECT_EQ(sel[1], 1); @@ -178,7 +178,7 @@ TEST_F(BitmapFilterColumnPredicateTest, evaluate_column_nullable) { } uint16_t size = column_nullable->size(); - size = predicate.evaluate(*column_nullable, sel, size); + size = predicate.ColumnPredicate::evaluate(*column_nullable, sel, size); EXPECT_EQ(size, 6); EXPECT_EQ(sel[0], 1); diff --git a/docs/en/docs/query-acceleration/join-optimization/runtime-filter.md b/docs/en/docs/query-acceleration/join-optimization/runtime-filter.md index a70522b0215..88a4e6dd169 100644 --- a/docs/en/docs/query-acceleration/join-optimization/runtime-filter.md +++ b/docs/en/docs/query-acceleration/join-optimization/runtime-filter.md @@ -114,7 +114,7 @@ The query options are further explained below. #### 1.runtime_filter_type Type of Runtime Filter used. -**Type**: Number (1, 2, 4, 8, 16) or the corresponding mnemonic string (IN, BLOOM_FILTER, MIN_MAX, IN_OR_BLOOM_FILTER, BITMAP_FILTER), the default is 8 (IN_OR_BLOOM FILTER), use multiple commas to separate, pay attention to the need to add quotation marks , Or add any number of types, for example: +**Type**: Number (1, 2, 4, 8, 16) or the corresponding mnemonic string (IN, BLOOM_FILTER, MIN_MAX, IN_OR_BLOOM_FILTER, BITMAP_FILTER), the default is 12 (MIN_MAX,IN_OR_BLOOM_FILTER), use multiple commas to separate, pay attention to the need to add quotation marks , Or add any number of types, for example: ``` set runtime_filter_type="BLOOM_FILTER,IN,MIN_MAX"; ``` diff --git a/docs/zh-CN/docs/query-acceleration/join-optimization/runtime-filter.md b/docs/zh-CN/docs/query-acceleration/join-optimization/runtime-filter.md index 8ed2876235a..bcb8fcf9de1 100644 --- a/docs/zh-CN/docs/query-acceleration/join-optimization/runtime-filter.md +++ b/docs/zh-CN/docs/query-acceleration/join-optimization/runtime-filter.md @@ -111,7 +111,7 @@ Runtime Filter主要用于大表join小表的优化,如果左表的数据量 使用的Runtime Filter类型。 -**类型**: 数字(1, 2, 4, 8, 16)或者相对应的助记符字符串(IN, BLOOM_FILTER, MIN_MAX, `IN_OR_BLOOM_FILTER`, BITMAP_FILTER),默认8(`IN_OR_BLOOM_FILTER`),使用多个时用逗号分隔,注意需要加引号,或者将任意多个类型的数字相加,例如: +**类型**: 数字(1, 2, 4, 8, 16)或者相对应的助记符字符串(IN, BLOOM_FILTER, MIN_MAX, IN_OR_BLOOM_FILTER, BITMAP_FILTER),默认12(MIN_MAX,IN_OR_BLOOM_FILTER),使用多个时用逗号分隔,注意需要加引号,或者将任意多个类型的数字相加,例如: ```sql set runtime_filter_type="BLOOM_FILTER,IN,MIN_MAX"; diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 5ee90f40a46..84c4289271a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -902,7 +902,7 @@ public class SessionVariable implements Serializable, Writable { // Set runtimeFilterType to IN_OR_BLOOM filter @VariableMgr.VarAttr(name = RUNTIME_FILTER_TYPE, fuzzy = true, needForward = true) - private int runtimeFilterType = 8; + private int runtimeFilterType = 12; @VariableMgr.VarAttr(name = RUNTIME_FILTER_MAX_IN_NUM, needForward = true) private int runtimeFilterMaxInNum = 1024; diff --git a/regression-test/data/variable_p0/set_and_unset_variable.out b/regression-test/data/variable_p0/set_and_unset_variable.out index 7f6aeed2efe..e7f0dc2fdf1 100644 --- a/regression-test/data/variable_p0/set_and_unset_variable.out +++ b/regression-test/data/variable_p0/set_and_unset_variable.out @@ -21,37 +21,37 @@ wait_timeout 28800 28800 0 0 -- !cmd -- -runtime_filter_type BLOOM_FILTER IN_OR_BLOOM_FILTER 1 +runtime_filter_type BLOOM_FILTER IN_OR_BLOOM_FILTER,MIN_MAX 1 -- !cmd -- -runtime_filter_type IN_OR_BLOOM_FILTER IN_OR_BLOOM_FILTER 0 +runtime_filter_type IN_OR_BLOOM_FILTER,MIN_MAX IN_OR_BLOOM_FILTER,MIN_MAX 0 -- !cmd -- 0 -- !cmd -- -runtime_filter_type IN_OR_BLOOM_FILTER IN_OR_BLOOM_FILTER 0 +runtime_filter_type IN_OR_BLOOM_FILTER,MIN_MAX IN_OR_BLOOM_FILTER,MIN_MAX 0 -- !cmd -- -runtime_filter_type IN_OR_BLOOM_FILTER IN_OR_BLOOM_FILTER 0 +runtime_filter_type IN_OR_BLOOM_FILTER,MIN_MAX IN_OR_BLOOM_FILTER,MIN_MAX 0 -- !cmd -- 0 -- !cmd -- -runtime_filter_type BLOOM_FILTER IN_OR_BLOOM_FILTER 1 +runtime_filter_type BLOOM_FILTER IN_OR_BLOOM_FILTER,MIN_MAX 1 -- !cmd -- -runtime_filter_type BLOOM_FILTER IN_OR_BLOOM_FILTER 1 +runtime_filter_type BLOOM_FILTER IN_OR_BLOOM_FILTER,MIN_MAX 1 -- !cmd -- 0 -- !cmd -- -runtime_filter_type IN_OR_BLOOM_FILTER IN_OR_BLOOM_FILTER 0 +runtime_filter_type IN_OR_BLOOM_FILTER,MIN_MAX IN_OR_BLOOM_FILTER,MIN_MAX 0 -- !cmd -- -runtime_filter_type IN_OR_BLOOM_FILTER IN_OR_BLOOM_FILTER 0 +runtime_filter_type IN_OR_BLOOM_FILTER,MIN_MAX IN_OR_BLOOM_FILTER,MIN_MAX 0 -- !cmd -- 0 @@ -123,7 +123,7 @@ deprecated_enable_local_exchange true true 0 0 -- !cmd -- -runtime_filter_type IN_OR_BLOOM_FILTER IN_OR_BLOOM_FILTER 0 +runtime_filter_type IN_OR_BLOOM_FILTER,MIN_MAX IN_OR_BLOOM_FILTER,MIN_MAX 0 -- !cmd -- experimental_enable_agg_state false false 0 @@ -150,7 +150,7 @@ show_hidden_columns false false 0 0 -- !cmd -- -runtime_filter_type IN_OR_BLOOM_FILTER IN_OR_BLOOM_FILTER 0 +runtime_filter_type IN_OR_BLOOM_FILTER,MIN_MAX IN_OR_BLOOM_FILTER,MIN_MAX 0 -- !cmd -- experimental_enable_agg_state false false 0 --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org