This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch dev-1.0.1 in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
commit dfc51faa3974b4c89ccd835dd2d98af8dcb00ff9 Author: ZenoYang <cookie...@qq.com> AuthorDate: Tue Mar 29 19:11:54 2022 +0800 [refactor][optimize](storage) Code optimization and refactoring for low-cardinality columns in storage layer (#8627) * Optimize predicate calculation and refactor --- be/src/olap/bloom_filter_predicate.h | 26 +-- be/src/olap/column_predicate.h | 23 ++- be/src/olap/comparison_predicate.cpp | 190 ++++++++++++------- be/src/olap/comparison_predicate.h | 26 +-- be/src/olap/in_list_predicate.cpp | 111 +++++++---- be/src/olap/in_list_predicate.h | 17 +- be/src/olap/null_predicate.cpp | 4 + be/src/olap/null_predicate.h | 2 + be/src/olap/rowset/segment_v2/binary_dict_page.cpp | 13 +- be/src/olap/rowset/segment_v2/segment_iterator.cpp | 27 ++- be/src/runtime/string_value.h | 6 + be/src/vec/columns/column.h | 8 +- be/src/vec/columns/column_dictionary.h | 211 ++++++++++----------- be/src/vec/columns/column_nullable.h | 11 ++ 14 files changed, 388 insertions(+), 287 deletions(-) diff --git a/be/src/olap/bloom_filter_predicate.h b/be/src/olap/bloom_filter_predicate.h index 3b49cb0..fa65293 100644 --- a/be/src/olap/bloom_filter_predicate.h +++ b/be/src/olap/bloom_filter_predicate.h @@ -37,10 +37,10 @@ namespace doris { class VectorizedRowBatch; // only use in runtime filter and segment v2 -template <PrimitiveType type> +template <PrimitiveType T> class BloomFilterColumnPredicate : public ColumnPredicate { public: - using SpecificFilter = BloomFilterFunc<type, CurrentBloomFilterAdaptor>; + using SpecificFilter = BloomFilterFunc<T, CurrentBloomFilterAdaptor>; BloomFilterColumnPredicate(uint32_t column_id, const std::shared_ptr<IBloomFilterFuncBase>& filter) @@ -49,6 +49,8 @@ public: _specific_filter(static_cast<SpecificFilter*>(_filter.get())) {} ~BloomFilterColumnPredicate() override = default; + PredicateType type() const override { return PredicateType::BF; } + void evaluate(VectorizedRowBatch* batch) const override; void evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size) const override; @@ -65,16 +67,14 @@ public: void evaluate(vectorized::IColumn& column, uint16_t* sel, uint16_t* size) const override; - bool is_bloom_filter_predicate() override { return true; } - private: std::shared_ptr<IBloomFilterFuncBase> _filter; SpecificFilter* _specific_filter; // owned by _filter }; // bloom filter column predicate do not support in segment v1 -template <PrimitiveType type> -void BloomFilterColumnPredicate<type>::evaluate(VectorizedRowBatch* batch) const { +template <PrimitiveType T> +void BloomFilterColumnPredicate<T>::evaluate(VectorizedRowBatch* batch) const { uint16_t n = batch->size(); uint16_t* sel = batch->selected(); if (!batch->selected_in_use()) { @@ -84,8 +84,8 @@ void BloomFilterColumnPredicate<type>::evaluate(VectorizedRowBatch* batch) const } } -template <PrimitiveType type> -void BloomFilterColumnPredicate<type>::evaluate(ColumnBlock* block, uint16_t* sel, +template <PrimitiveType T> +void BloomFilterColumnPredicate<T>::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size) const { uint16_t new_size = 0; if (block->is_nullable()) { @@ -107,16 +107,16 @@ void BloomFilterColumnPredicate<type>::evaluate(ColumnBlock* block, uint16_t* se *size = new_size; } -template <PrimitiveType type> -void BloomFilterColumnPredicate<type>::evaluate(vectorized::IColumn& column, uint16_t* sel, +template <PrimitiveType T> +void BloomFilterColumnPredicate<T>::evaluate(vectorized::IColumn& column, uint16_t* sel, uint16_t* size) const { uint16_t new_size = 0; - using T = typename PredicatePrimitiveTypeTraits<type>::PredicateFieldType; + using FT = typename PredicatePrimitiveTypeTraits<T>::PredicateFieldType; if (column.is_nullable()) { auto* nullable_col = vectorized::check_and_get_column<vectorized::ColumnNullable>(column); auto& null_map_data = nullable_col->get_null_map_column().get_data(); - auto* pred_col = vectorized::check_and_get_column<vectorized::PredicateColumnType<T>>( + auto* pred_col = vectorized::check_and_get_column<vectorized::PredicateColumnType<FT>>( nullable_col->get_nested_column()); auto& pred_col_data = pred_col->get_data(); for (uint16_t i = 0; i < *size; i++) { @@ -127,7 +127,7 @@ void BloomFilterColumnPredicate<type>::evaluate(vectorized::IColumn& column, uin } } else { auto* pred_col = - vectorized::check_and_get_column<vectorized::PredicateColumnType<T>>(column); + vectorized::check_and_get_column<vectorized::PredicateColumnType<FT>>(column); auto& pred_col_data = pred_col->get_data(); for (uint16_t i = 0; i < *size; i++) { uint16_t idx = sel[i]; diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h index 342d8fc..45f8fcf 100644 --- a/be/src/olap/column_predicate.h +++ b/be/src/olap/column_predicate.h @@ -33,6 +33,21 @@ class VectorizedRowBatch; class Schema; class RowBlockV2; +enum class PredicateType { + UNKNOWN = 0, + EQ = 1, + NE = 2, + LT = 3, + LE = 4, + GT = 5, + GE = 6, + IN_LIST = 7, + NO_IN_LIST = 8, + IS_NULL = 9, + NOT_IS_NULL = 10, + BF = 11, // BloomFilter +}; + class ColumnPredicate { public: explicit ColumnPredicate(uint32_t column_id, bool opposite = false) @@ -40,6 +55,8 @@ public: virtual ~ColumnPredicate() = default; + virtual PredicateType type() const = 0; + //evaluate predicate on VectorizedRowBatch virtual void evaluate(VectorizedRowBatch* batch) const = 0; @@ -69,11 +86,7 @@ public: virtual void evaluate_vec(vectorized::IColumn& column, uint16_t size, bool* flags) const {}; uint32_t column_id() const { return _column_id; } - virtual bool is_in_predicate() { return false; } - - virtual bool is_bloom_filter_predicate() { return false; } - - virtual bool is_range_comparison_predicate() { return false; } + virtual void set_dict_code_if_necessary(vectorized::IColumn& column) { } protected: uint32_t _column_id; diff --git a/be/src/olap/comparison_predicate.cpp b/be/src/olap/comparison_predicate.cpp index d74dd10..ef6ee3a 100644 --- a/be/src/olap/comparison_predicate.cpp +++ b/be/src/olap/comparison_predicate.cpp @@ -28,9 +28,9 @@ namespace doris { -#define COMPARISON_PRED_CONSTRUCTOR(CLASS) \ - template <class type> \ - CLASS<type>::CLASS(uint32_t column_id, const type& value, bool opposite) \ +#define COMPARISON_PRED_CONSTRUCTOR(CLASS) \ + template <class T> \ + CLASS<T>::CLASS(uint32_t column_id, const T& value, bool opposite) \ : ColumnPredicate(column_id, opposite), _value(value) {} COMPARISON_PRED_CONSTRUCTOR(EqualPredicate) @@ -56,15 +56,15 @@ COMPARISON_PRED_CONSTRUCTOR_STRING(GreaterPredicate) COMPARISON_PRED_CONSTRUCTOR_STRING(GreaterEqualPredicate) #define COMPARISON_PRED_EVALUATE(CLASS, OP) \ - template <class type> \ - void CLASS<type>::evaluate(VectorizedRowBatch* batch) const { \ + template <class T> \ + void CLASS<T>::evaluate(VectorizedRowBatch* batch) const { \ uint16_t n = batch->size(); \ if (n == 0) { \ return; \ } \ uint16_t* sel = batch->selected(); \ - const type* col_vector = \ - reinterpret_cast<const type*>(batch->column(_column_id)->col_data()); \ + const T* col_vector = \ + reinterpret_cast<const T*>(batch->column(_column_id)->col_data()); \ uint16_t new_size = 0; \ if (batch->column(_column_id)->no_nulls()) { \ if (batch->selected_in_use()) { \ @@ -114,15 +114,15 @@ COMPARISON_PRED_EVALUATE(GreaterPredicate, >) COMPARISON_PRED_EVALUATE(GreaterEqualPredicate, >=) #define COMPARISON_PRED_COLUMN_BLOCK_EVALUATE(CLASS, OP) \ - template <class type> \ - void CLASS<type>::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size) const { \ + template <class T> \ + void CLASS<T>::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size) const { \ uint16_t new_size = 0; \ if (block->is_nullable()) { \ for (uint16_t i = 0; i < *size; ++i) { \ uint16_t idx = sel[i]; \ sel[new_size] = idx; \ - const type* cell_value = \ - reinterpret_cast<const type*>(block->cell(idx).cell_ptr()); \ + const T* cell_value = \ + reinterpret_cast<const T*>(block->cell(idx).cell_ptr()); \ auto result = (!block->cell(idx).is_null() && (*cell_value OP _value)); \ new_size += _opposite ? !result : result; \ } \ @@ -130,8 +130,8 @@ COMPARISON_PRED_EVALUATE(GreaterEqualPredicate, >=) for (uint16_t i = 0; i < *size; ++i) { \ uint16_t idx = sel[i]; \ sel[new_size] = idx; \ - const type* cell_value = \ - reinterpret_cast<const type*>(block->cell(idx).cell_ptr()); \ + const T* cell_value = \ + reinterpret_cast<const T*>(block->cell(idx).cell_ptr()); \ auto result = (*cell_value OP _value); \ new_size += _opposite ? !result : result; \ } \ @@ -147,9 +147,9 @@ COMPARISON_PRED_COLUMN_BLOCK_EVALUATE(GreaterPredicate, >) COMPARISON_PRED_COLUMN_BLOCK_EVALUATE(GreaterEqualPredicate, >=) // todo(zeno) define interface in IColumn to simplify code -#define COMPARISON_PRED_COLUMN_EVALUATE(CLASS, OP, IS_RANGE) \ - template <class type> \ - void CLASS<type>::evaluate(vectorized::IColumn& column, uint16_t* sel, uint16_t* size) const { \ +#define COMPARISON_PRED_COLUMN_EVALUATE(CLASS, OP) \ + template <class T> \ + void CLASS<T>::evaluate(vectorized::IColumn& column, uint16_t* sel, uint16_t* size) const { \ uint16_t new_size = 0; \ if (column.is_nullable()) { \ auto* nullable_col = \ @@ -159,60 +159,54 @@ COMPARISON_PRED_COLUMN_BLOCK_EVALUATE(GreaterEqualPredicate, >=) .get_data(); \ auto& nested_col = nullable_col->get_nested_column(); \ if (nested_col.is_column_dictionary()) { \ - if constexpr (std::is_same_v<type, StringValue>) { \ + if constexpr (std::is_same_v<T, StringValue>) { \ auto* nested_col_ptr = vectorized::check_and_get_column< \ vectorized::ColumnDictionary<vectorized::Int32>>(nested_col); \ - auto code = IS_RANGE ? nested_col_ptr->find_bound_code(_value, 0 OP 1, 1 OP 1) \ - : nested_col_ptr->find_code(_value); \ auto& data_array = nested_col_ptr->get_data(); \ for (uint16_t i = 0; i < *size; i++) { \ uint16_t idx = sel[i]; \ sel[new_size] = idx; \ - const auto& cell_value = \ - reinterpret_cast<const vectorized::Int32&>(data_array[idx]); \ - bool ret = !null_bitmap[idx] && (cell_value OP code); \ + const auto& cell_value = data_array[idx]; \ + bool ret = !null_bitmap[idx] && (cell_value OP _dict_code); \ new_size += _opposite ? !ret : ret; \ } \ } \ } else { \ auto* nested_col_ptr = \ - vectorized::check_and_get_column<vectorized::PredicateColumnType<type>>( \ + vectorized::check_and_get_column<vectorized::PredicateColumnType<T>>( \ nested_col); \ auto& data_array = nested_col_ptr->get_data(); \ for (uint16_t i = 0; i < *size; i++) { \ uint16_t idx = sel[i]; \ sel[new_size] = idx; \ - const type& cell_value = reinterpret_cast<const type&>(data_array[idx]); \ + const auto& cell_value = reinterpret_cast<const T&>(data_array[idx]); \ bool ret = !null_bitmap[idx] && (cell_value OP _value); \ new_size += _opposite ? !ret : ret; \ } \ } \ *size = new_size; \ } else if (column.is_column_dictionary()) { \ - if constexpr (std::is_same_v<type, StringValue>) { \ + if constexpr (std::is_same_v<T, StringValue>) { \ auto& dict_col = \ reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>(column);\ auto& data_array = dict_col.get_data(); \ - auto code = IS_RANGE ? dict_col.find_bound_code(_value, 0 OP 1, 1 OP 1) \ - : dict_col.find_code(_value); \ for (uint16_t i = 0; i < *size; ++i) { \ uint16_t idx = sel[i]; \ sel[new_size] = idx; \ - const auto& cell_value = \ - reinterpret_cast<const vectorized::Int32&>(data_array[idx]); \ - bool ret = cell_value OP code; \ + const auto& cell_value = data_array[idx]; \ + bool ret = cell_value OP _dict_code; \ new_size += _opposite ? !ret : ret; \ } \ *size = new_size; \ } \ } else { \ auto& pred_column_ref = \ - reinterpret_cast<vectorized::PredicateColumnType<type>&>(column); \ + reinterpret_cast<vectorized::PredicateColumnType<T>&>(column); \ auto& data_array = pred_column_ref.get_data(); \ for (uint16_t i = 0; i < *size; i++) { \ uint16_t idx = sel[i]; \ sel[new_size] = idx; \ - const type& cell_value = reinterpret_cast<const type&>(data_array[idx]); \ + const auto& cell_value = reinterpret_cast<const T&>(data_array[idx]); \ auto ret = cell_value OP _value; \ new_size += _opposite ? !ret : ret; \ } \ @@ -221,21 +215,21 @@ COMPARISON_PRED_COLUMN_BLOCK_EVALUATE(GreaterEqualPredicate, >=) } -COMPARISON_PRED_COLUMN_EVALUATE(EqualPredicate, ==, false) -COMPARISON_PRED_COLUMN_EVALUATE(NotEqualPredicate, !=, false) -COMPARISON_PRED_COLUMN_EVALUATE(LessPredicate, <, true) -COMPARISON_PRED_COLUMN_EVALUATE(LessEqualPredicate, <=, true) -COMPARISON_PRED_COLUMN_EVALUATE(GreaterPredicate, >, true) -COMPARISON_PRED_COLUMN_EVALUATE(GreaterEqualPredicate, >=, true) +COMPARISON_PRED_COLUMN_EVALUATE(EqualPredicate, ==) +COMPARISON_PRED_COLUMN_EVALUATE(NotEqualPredicate, !=) +COMPARISON_PRED_COLUMN_EVALUATE(LessPredicate, <) +COMPARISON_PRED_COLUMN_EVALUATE(LessEqualPredicate, <=) +COMPARISON_PRED_COLUMN_EVALUATE(GreaterPredicate, >) +COMPARISON_PRED_COLUMN_EVALUATE(GreaterEqualPredicate, >=) #define COMPARISON_PRED_COLUMN_EVALUATE_VEC(CLASS, OP) \ - template <class type> \ - void CLASS<type>::evaluate_vec(vectorized::IColumn& column, uint16_t size, bool* flags) \ + template <class T> \ + void CLASS<T>::evaluate_vec(vectorized::IColumn& column, uint16_t size, bool* flags) \ const { \ if (column.is_nullable()) { \ auto* nullable_column = \ vectorized::check_and_get_column<vectorized::ColumnNullable>(column); \ - auto& data_array = reinterpret_cast<const vectorized::PredicateColumnType<type>&>( \ + auto& data_array = reinterpret_cast<const vectorized::PredicateColumnType<T>&>( \ nullable_column->get_nested_column()) \ .get_data(); \ auto& null_bitmap = reinterpret_cast<const vectorized::ColumnVector<uint8_t>&>( \ @@ -246,7 +240,7 @@ COMPARISON_PRED_COLUMN_EVALUATE(GreaterEqualPredicate, >=, true) } \ } else { \ auto& predicate_column = \ - reinterpret_cast<vectorized::PredicateColumnType<type>&>(column); \ + reinterpret_cast<vectorized::PredicateColumnType<T>&>(column); \ auto& data_array = predicate_column.get_data(); \ for (uint16_t i = 0; i < size; i++) { \ flags[i] = data_array[i] OP _value; \ @@ -267,15 +261,15 @@ COMPARISON_PRED_COLUMN_EVALUATE_VEC(GreaterPredicate, >) COMPARISON_PRED_COLUMN_EVALUATE_VEC(GreaterEqualPredicate, >=) #define COMPARISON_PRED_COLUMN_BLOCK_EVALUATE_OR(CLASS, OP) \ - template <class type> \ - void CLASS<type>::evaluate_or(ColumnBlock* block, uint16_t* sel, uint16_t size, bool* flags) \ + template <class T> \ + void CLASS<T>::evaluate_or(ColumnBlock* block, uint16_t* sel, uint16_t size, bool* flags) \ const { \ if (block->is_nullable()) { \ for (uint16_t i = 0; i < size; ++i) { \ if (flags[i]) continue; \ uint16_t idx = sel[i]; \ - const type* cell_value = \ - reinterpret_cast<const type*>(block->cell(idx).cell_ptr()); \ + const T* cell_value = \ + reinterpret_cast<const T*>(block->cell(idx).cell_ptr()); \ auto result = (!block->cell(idx).is_null() && (*cell_value OP _value)); \ flags[i] |= _opposite ? !result : result; \ } \ @@ -283,8 +277,8 @@ COMPARISON_PRED_COLUMN_EVALUATE_VEC(GreaterEqualPredicate, >=) for (uint16_t i = 0; i < size; ++i) { \ if (flags[i]) continue; \ uint16_t idx = sel[i]; \ - const type* cell_value = \ - reinterpret_cast<const type*>(block->cell(idx).cell_ptr()); \ + const T* cell_value = \ + reinterpret_cast<const T*>(block->cell(idx).cell_ptr()); \ auto result = (*cell_value OP _value); \ flags[i] |= _opposite ? !result : result; \ } \ @@ -299,11 +293,11 @@ COMPARISON_PRED_COLUMN_BLOCK_EVALUATE_OR(GreaterPredicate, >) COMPARISON_PRED_COLUMN_BLOCK_EVALUATE_OR(GreaterEqualPredicate, >=) #define COMPARISON_PRED_COLUMN_EVALUATE_OR(CLASS, OP) \ - template <class type> \ - void CLASS<type>::evaluate_or(vectorized::IColumn& column, uint16_t* sel, uint16_t size, bool* flags) const { \ + template <class T> \ + void CLASS<T>::evaluate_or(vectorized::IColumn& column, uint16_t* sel, uint16_t size, bool* flags) const { \ if (column.is_nullable()) { \ auto* nullable_column = vectorized::check_and_get_column<vectorized::ColumnNullable>(column); \ - auto& data_array = reinterpret_cast<const vectorized::PredicateColumnType<type>&>(nullable_column->get_nested_column()).get_data(); \ + auto& data_array = reinterpret_cast<const vectorized::PredicateColumnType<T>&>(nullable_column->get_nested_column()).get_data(); \ auto& null_bitmap = reinterpret_cast<const vectorized::ColumnVector<uint8_t>&>(*(nullable_column->get_null_map_column_ptr())).get_data();\ for (uint16_t i = 0; i < size; i++) { \ if (flags[i]) continue; \ @@ -312,7 +306,7 @@ COMPARISON_PRED_COLUMN_BLOCK_EVALUATE_OR(GreaterEqualPredicate, >=) flags[i] |= _opposite ? !ret : ret; \ } \ } else { \ - auto& predicate_column = reinterpret_cast<vectorized::PredicateColumnType<type>&>(column); \ + auto& predicate_column = reinterpret_cast<vectorized::PredicateColumnType<T>&>(column); \ auto& data_array = predicate_column.get_data(); \ for (uint16_t i = 0; i < size; ++i) { \ if (flags[i]) continue; \ @@ -331,15 +325,15 @@ COMPARISON_PRED_COLUMN_EVALUATE_OR(GreaterPredicate, >) COMPARISON_PRED_COLUMN_EVALUATE_OR(GreaterEqualPredicate, >=) #define COMPARISON_PRED_COLUMN_BLOCK_EVALUATE_AND(CLASS, OP) \ - template <class type> \ - void CLASS<type>::evaluate_and(ColumnBlock* block, uint16_t* sel, uint16_t size, bool* flags) \ + template <class T> \ + void CLASS<T>::evaluate_and(ColumnBlock* block, uint16_t* sel, uint16_t size, bool* flags) \ const { \ if (block->is_nullable()) { \ for (uint16_t i = 0; i < size; ++i) { \ if (!flags[i]) continue; \ uint16_t idx = sel[i]; \ - const type* cell_value = \ - reinterpret_cast<const type*>(block->cell(idx).cell_ptr()); \ + const T* cell_value = \ + reinterpret_cast<const T*>(block->cell(idx).cell_ptr()); \ auto result = (!block->cell(idx).is_null() && (*cell_value OP _value)); \ flags[i] &= _opposite ? !result : result; \ } \ @@ -347,8 +341,8 @@ COMPARISON_PRED_COLUMN_EVALUATE_OR(GreaterEqualPredicate, >=) for (uint16_t i = 0; i < size; ++i) { \ if (!flags[i]) continue; \ uint16_t idx = sel[i]; \ - const type* cell_value = \ - reinterpret_cast<const type*>(block->cell(idx).cell_ptr()); \ + const T* cell_value = \ + reinterpret_cast<const T*>(block->cell(idx).cell_ptr()); \ auto result = (*cell_value OP _value); \ flags[i] &= _opposite ? !result : result; \ } \ @@ -363,11 +357,11 @@ COMPARISON_PRED_COLUMN_BLOCK_EVALUATE_AND(GreaterPredicate, >) COMPARISON_PRED_COLUMN_BLOCK_EVALUATE_AND(GreaterEqualPredicate, >=) #define COMPARISON_PRED_COLUMN_EVALUATE_AND(CLASS, OP) \ - template <class type> \ - void CLASS<type>::evaluate_and(vectorized::IColumn& column, uint16_t* sel, uint16_t size, bool* flags) const { \ + template <class T> \ + void CLASS<T>::evaluate_and(vectorized::IColumn& column, uint16_t* sel, uint16_t size, bool* flags) const { \ if (column.is_nullable()) { \ auto* nullable_column = vectorized::check_and_get_column<vectorized::ColumnNullable>(column); \ - auto& data_array = reinterpret_cast<const vectorized::PredicateColumnType<type>&>(nullable_column->get_nested_column()).get_data(); \ + auto& data_array = reinterpret_cast<const vectorized::PredicateColumnType<T>&>(nullable_column->get_nested_column()).get_data(); \ auto& null_bitmap = reinterpret_cast<const vectorized::ColumnVector<uint8_t>&>(*(nullable_column->get_null_map_column_ptr())).get_data();\ for (uint16_t i = 0; i < size; i++) { \ if (!flags[i]) continue; \ @@ -376,7 +370,7 @@ COMPARISON_PRED_COLUMN_BLOCK_EVALUATE_AND(GreaterEqualPredicate, >=) flags[i] &= _opposite ? !ret : ret; \ } \ } else { \ - auto& predicate_column = reinterpret_cast<vectorized::PredicateColumnType<type>&>(column); \ + auto& predicate_column = reinterpret_cast<vectorized::PredicateColumnType<T>&>(column); \ auto& data_array = predicate_column.get_data(); \ for (uint16_t i = 0; i < size; ++i) { \ if (!flags[i]) continue; \ @@ -477,8 +471,8 @@ COMPARISON_PRED_COLUMN_EVALUATE_AND(GreaterEqualPredicate, >=) BITMAP_COMPARE_##CLASS(s, exact_match, seeked_ordinal, iterator, bitmap, roaring) #define COMPARISON_PRED_BITMAP_EVALUATE(CLASS, OP) \ - template <class type> \ - Status CLASS<type>::evaluate(const Schema& schema, \ + template <class T> \ + Status CLASS<T>::evaluate(const Schema& schema, \ const std::vector<BitmapIndexIterator*>& iterators, \ uint32_t num_rows, roaring::Roaring* bitmap) const { \ BitmapIndexIterator* iterator = iterators[_column_id]; \ @@ -508,6 +502,64 @@ COMPARISON_PRED_BITMAP_EVALUATE(LessEqualPredicate, <=) COMPARISON_PRED_BITMAP_EVALUATE(GreaterPredicate, >) COMPARISON_PRED_BITMAP_EVALUATE(GreaterEqualPredicate, >=) + +#define COMPARISON_PRED_SET_DICT_CODE(CLASS) \ + template <class T> \ + void CLASS<T>::set_dict_code_if_necessary(vectorized::IColumn& column) { \ + if (_dict_code_inited) { \ + return; \ + } \ + if constexpr (std::is_same_v<T, StringValue>) { \ + auto* col_ptr = column.get_ptr().get(); \ + if (column.is_nullable()) { \ + auto nullable_col = \ + reinterpret_cast<vectorized::ColumnNullable*>(col_ptr); \ + col_ptr = nullable_col->get_nested_column_ptr().get(); \ + } \ + if (col_ptr->is_column_dictionary()) { \ + auto& dict_col = \ + reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>( \ + *col_ptr); \ + auto code = dict_col.find_code(_value); \ + _dict_code = code; \ + _dict_code_inited = true; \ + } \ + } \ + } + +COMPARISON_PRED_SET_DICT_CODE(EqualPredicate) +COMPARISON_PRED_SET_DICT_CODE(NotEqualPredicate) + +#define RAMGE_COMPARISON_PRED_SET_DICT_CODE(CLASS, OP) \ + template <class T> \ + void CLASS<T>::set_dict_code_if_necessary(vectorized::IColumn& column) { \ + if (_dict_code_inited) { \ + return; \ + } \ + if constexpr (std::is_same_v<T, StringValue>) { \ + auto* col_ptr = column.get_ptr().get(); \ + if (column.is_nullable()) { \ + auto nullable_col = \ + reinterpret_cast<vectorized::ColumnNullable*>(col_ptr); \ + col_ptr = nullable_col->get_nested_column_ptr().get(); \ + } \ + \ + if (col_ptr->is_column_dictionary()) { \ + auto& dict_col = \ + reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>( \ + *col_ptr); \ + auto code = dict_col.find_code_by_bound(_value, 0 OP 1, 1 OP 1); \ + _dict_code = code; \ + _dict_code_inited = true; \ + } \ + } \ + } + +RAMGE_COMPARISON_PRED_SET_DICT_CODE(LessPredicate, <) +RAMGE_COMPARISON_PRED_SET_DICT_CODE(LessEqualPredicate, <=) +RAMGE_COMPARISON_PRED_SET_DICT_CODE(GreaterPredicate, >) +RAMGE_COMPARISON_PRED_SET_DICT_CODE(GreaterEqualPredicate, >=) + #define COMPARISON_PRED_CONSTRUCTOR_DECLARATION(CLASS) \ template CLASS<int8_t>::CLASS(uint32_t column_id, const int8_t& value, bool opposite); \ template CLASS<int16_t>::CLASS(uint32_t column_id, const int16_t& value, bool opposite); \ @@ -692,4 +744,14 @@ COMPARISON_PRED_COLUMN_EVALUATE_VEC_DECLARATION(LessEqualPredicate) COMPARISON_PRED_COLUMN_EVALUATE_VEC_DECLARATION(GreaterPredicate) COMPARISON_PRED_COLUMN_EVALUATE_VEC_DECLARATION(GreaterEqualPredicate) +#define COMPARISON_PRED_SET_DICT_CODE_DECLARATION(CLASS) \ +template void CLASS<StringValue>::set_dict_code_if_necessary(vectorized::IColumn& column); + +COMPARISON_PRED_SET_DICT_CODE_DECLARATION(EqualPredicate) +COMPARISON_PRED_SET_DICT_CODE_DECLARATION(NotEqualPredicate) +COMPARISON_PRED_SET_DICT_CODE_DECLARATION(LessPredicate) +COMPARISON_PRED_SET_DICT_CODE_DECLARATION(LessEqualPredicate) +COMPARISON_PRED_SET_DICT_CODE_DECLARATION(GreaterPredicate) +COMPARISON_PRED_SET_DICT_CODE_DECLARATION(GreaterEqualPredicate) + } //namespace doris diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h index e363675..3df31c3 100644 --- a/be/src/olap/comparison_predicate.h +++ b/be/src/olap/comparison_predicate.h @@ -26,11 +26,12 @@ namespace doris { class VectorizedRowBatch; -#define COMPARISON_PRED_CLASS_DEFINE(CLASS, IS_RANGE) \ - template <class type> \ +#define COMPARISON_PRED_CLASS_DEFINE(CLASS, PT) \ + template <class T> \ class CLASS : public ColumnPredicate { \ public: \ - CLASS(uint32_t column_id, const type& value, bool opposite = false); \ + CLASS(uint32_t column_id, const T& value, bool opposite = false); \ + PredicateType type() const override { return PredicateType::PT; } \ virtual void evaluate(VectorizedRowBatch* batch) const override; \ void evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size) const override; \ void evaluate_or(ColumnBlock* block, uint16_t* sel, uint16_t size, \ @@ -46,18 +47,19 @@ class VectorizedRowBatch; void evaluate_or(vectorized::IColumn& column, uint16_t* sel, uint16_t size, \ bool* flags) const override; \ void evaluate_vec(vectorized::IColumn& column, uint16_t size, bool* flags) const override; \ - bool is_range_comparison_predicate() override { return IS_RANGE; } \ - \ + void set_dict_code_if_necessary(vectorized::IColumn& column) override; \ private: \ - type _value; \ + T _value; \ + bool _dict_code_inited = false; \ + int32_t _dict_code; \ }; -COMPARISON_PRED_CLASS_DEFINE(EqualPredicate, false) -COMPARISON_PRED_CLASS_DEFINE(NotEqualPredicate, false) -COMPARISON_PRED_CLASS_DEFINE(LessPredicate, true) -COMPARISON_PRED_CLASS_DEFINE(LessEqualPredicate, true) -COMPARISON_PRED_CLASS_DEFINE(GreaterPredicate, true) -COMPARISON_PRED_CLASS_DEFINE(GreaterEqualPredicate, true) +COMPARISON_PRED_CLASS_DEFINE(EqualPredicate, EQ) +COMPARISON_PRED_CLASS_DEFINE(NotEqualPredicate, NE) +COMPARISON_PRED_CLASS_DEFINE(LessPredicate, LT) +COMPARISON_PRED_CLASS_DEFINE(LessEqualPredicate, LE) +COMPARISON_PRED_CLASS_DEFINE(GreaterPredicate, GT) +COMPARISON_PRED_CLASS_DEFINE(GreaterEqualPredicate, GE) } //namespace doris diff --git a/be/src/olap/in_list_predicate.cpp b/be/src/olap/in_list_predicate.cpp index 3fdac7d..21214f2 100644 --- a/be/src/olap/in_list_predicate.cpp +++ b/be/src/olap/in_list_predicate.cpp @@ -27,23 +27,23 @@ namespace doris { #define IN_LIST_PRED_CONSTRUCTOR(CLASS) \ - template <class type> \ - CLASS<type>::CLASS(uint32_t column_id, phmap::flat_hash_set<type>&& values, bool opposite) \ + template <class T> \ + CLASS<T>::CLASS(uint32_t column_id, phmap::flat_hash_set<T>&& values, bool opposite) \ : ColumnPredicate(column_id, opposite), _values(std::move(values)) {} IN_LIST_PRED_CONSTRUCTOR(InListPredicate) IN_LIST_PRED_CONSTRUCTOR(NotInListPredicate) #define IN_LIST_PRED_EVALUATE(CLASS, OP) \ - template <class type> \ - void CLASS<type>::evaluate(VectorizedRowBatch* batch) const { \ + template <class T> \ + void CLASS<T>::evaluate(VectorizedRowBatch* batch) const { \ uint16_t n = batch->size(); \ if (n == 0) { \ return; \ } \ uint16_t* sel = batch->selected(); \ - const type* col_vector = \ - reinterpret_cast<const type*>(batch->column(_column_id)->col_data()); \ + const T* col_vector = \ + reinterpret_cast<const T*>(batch->column(_column_id)->col_data()); \ uint16_t new_size = 0; \ if (batch->column(_column_id)->no_nulls()) { \ if (batch->selected_in_use()) { \ @@ -89,15 +89,15 @@ IN_LIST_PRED_EVALUATE(InListPredicate, !=) IN_LIST_PRED_EVALUATE(NotInListPredicate, ==) #define IN_LIST_PRED_COLUMN_BLOCK_EVALUATE(CLASS, OP) \ - template <class type> \ - void CLASS<type>::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size) const { \ + template <class T> \ + void CLASS<T>::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size) const { \ uint16_t new_size = 0; \ if (block->is_nullable()) { \ for (uint16_t i = 0; i < *size; ++i) { \ uint16_t idx = sel[i]; \ sel[new_size] = idx; \ - const type* cell_value = \ - reinterpret_cast<const type*>(block->cell(idx).cell_ptr()); \ + const T* cell_value = \ + reinterpret_cast<const T*>(block->cell(idx).cell_ptr()); \ auto result = (!block->cell(idx).is_null() && _values.find(*cell_value) \ OP _values.end()); \ new_size += _opposite ? !result : result; \ @@ -106,8 +106,8 @@ IN_LIST_PRED_EVALUATE(NotInListPredicate, ==) for (uint16_t i = 0; i < *size; ++i) { \ uint16_t idx = sel[i]; \ sel[new_size] = idx; \ - const type* cell_value = \ - reinterpret_cast<const type*>(block->cell(idx).cell_ptr()); \ + const T* cell_value = \ + reinterpret_cast<const T*>(block->cell(idx).cell_ptr()); \ auto result = (_values.find(*cell_value) OP _values.end()); \ new_size += _opposite ? !result : result; \ } \ @@ -120,8 +120,8 @@ IN_LIST_PRED_COLUMN_BLOCK_EVALUATE(NotInListPredicate, ==) // todo(zeno) define interface in IColumn to simplify code #define IN_LIST_PRED_COLUMN_EVALUATE(CLASS, OP) \ - template <class type> \ - void CLASS<type>::evaluate(vectorized::IColumn& column, uint16_t* sel, uint16_t* size) const { \ + template <class T> \ + void CLASS<T>::evaluate(vectorized::IColumn& column, uint16_t* sel, uint16_t* size) const { \ uint16_t new_size = 0; \ if (column.is_nullable()) { \ auto* nullable_col = \ @@ -130,57 +130,53 @@ IN_LIST_PRED_COLUMN_BLOCK_EVALUATE(NotInListPredicate, ==) nullable_col->get_null_map_column()).get_data(); \ auto& nested_col = nullable_col->get_nested_column(); \ if (nested_col.is_column_dictionary()) { \ - if constexpr (std::is_same_v<type, StringValue>) { \ + if constexpr (std::is_same_v<T, StringValue>) { \ auto* nested_col_ptr = vectorized::check_and_get_column< \ vectorized::ColumnDictionary<vectorized::Int32>>(nested_col); \ - auto code_set = nested_col_ptr->find_codes(_values); \ auto& data_array = nested_col_ptr->get_data(); \ for (uint16_t i = 0; i < *size; i++) { \ uint16_t idx = sel[i]; \ sel[new_size] = idx; \ - const auto& cell_value = \ - reinterpret_cast<const vectorized::Int32&>(data_array[idx]); \ + const auto& cell_value = data_array[idx]; \ bool ret = !null_bitmap[idx] \ - && (code_set.find(cell_value) OP code_set.end()); \ + && (_dict_codes.find(cell_value) OP _dict_codes.end()); \ new_size += _opposite ? !ret : ret; \ } \ } \ } else { \ auto* nested_col_ptr = vectorized::check_and_get_column< \ - vectorized::PredicateColumnType<type>>(nested_col); \ + vectorized::PredicateColumnType<T>>(nested_col); \ auto& data_array = nested_col_ptr->get_data(); \ for (uint16_t i = 0; i < *size; i++) { \ uint16_t idx = sel[i]; \ sel[new_size] = idx; \ - const type& cell_value = reinterpret_cast<const type&>(data_array[idx]); \ + const auto& cell_value = reinterpret_cast<const T&>(data_array[idx]); \ bool ret = !null_bitmap[idx] && (_values.find(cell_value) OP _values.end()); \ new_size += _opposite ? !ret : ret; \ } \ } \ *size = new_size; \ } else if (column.is_column_dictionary()) { \ - if constexpr (std::is_same_v<type, StringValue>) { \ + if constexpr (std::is_same_v<T, StringValue>) { \ auto& dict_col = \ reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>( \ column); \ auto& data_array = dict_col.get_data(); \ - auto code_set = dict_col.find_codes(_values); \ for (uint16_t i = 0; i < *size; i++) { \ uint16_t idx = sel[i]; \ sel[new_size] = idx; \ - const auto& cell_value = \ - reinterpret_cast<const vectorized::Int32&>(data_array[idx]); \ - auto result = (code_set.find(cell_value) OP code_set.end()); \ + const auto& cell_value = data_array[idx]; \ + auto result = (_dict_codes.find(cell_value) OP _dict_codes.end()); \ new_size += _opposite ? !result : result; \ } \ } \ } else { \ - auto& number_column = reinterpret_cast<vectorized::PredicateColumnType<type>&>(column);\ + auto& number_column = reinterpret_cast<vectorized::PredicateColumnType<T>&>(column); \ auto& data_array = number_column.get_data(); \ for (uint16_t i = 0; i < *size; i++) { \ uint16_t idx = sel[i]; \ sel[new_size] = idx; \ - const type& cell_value = reinterpret_cast<const type&>(data_array[idx]); \ + const auto& cell_value = reinterpret_cast<const T&>(data_array[idx]); \ auto result = (_values.find(cell_value) OP _values.end()); \ new_size += _opposite ? !result : result; \ } \ @@ -192,15 +188,15 @@ IN_LIST_PRED_COLUMN_EVALUATE(InListPredicate, !=) IN_LIST_PRED_COLUMN_EVALUATE(NotInListPredicate, ==) #define IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_OR(CLASS, OP) \ - template <class type> \ - void CLASS<type>::evaluate_or(ColumnBlock* block, uint16_t* sel, uint16_t size, bool* flags) \ + template <class T> \ + void CLASS<T>::evaluate_or(ColumnBlock* block, uint16_t* sel, uint16_t size, bool* flags) \ const { \ if (block->is_nullable()) { \ for (uint16_t i = 0; i < size; ++i) { \ if (flags[i]) continue; \ uint16_t idx = sel[i]; \ - const type* cell_value = \ - reinterpret_cast<const type*>(block->cell(idx).cell_ptr()); \ + const T* cell_value = \ + reinterpret_cast<const T*>(block->cell(idx).cell_ptr()); \ auto result = (!block->cell(idx).is_null() && _values.find(*cell_value) \ OP _values.end()); \ flags[i] |= _opposite ? !result : result; \ @@ -209,8 +205,8 @@ IN_LIST_PRED_COLUMN_EVALUATE(NotInListPredicate, ==) for (uint16_t i = 0; i < size; ++i) { \ if (flags[i]) continue; \ uint16_t idx = sel[i]; \ - const type* cell_value = \ - reinterpret_cast<const type*>(block->cell(idx).cell_ptr()); \ + const T* cell_value = \ + reinterpret_cast<const T*>(block->cell(idx).cell_ptr()); \ auto result = (_values.find(*cell_value) OP _values.end()); \ flags[i] |= _opposite ? !result : result; \ } \ @@ -221,15 +217,15 @@ IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_OR(InListPredicate, !=) IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_OR(NotInListPredicate, ==) #define IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_AND(CLASS, OP) \ - template <class type> \ - void CLASS<type>::evaluate_and(ColumnBlock* block, uint16_t* sel, uint16_t size, bool* flags) \ + template <class T> \ + void CLASS<T>::evaluate_and(ColumnBlock* block, uint16_t* sel, uint16_t size, bool* flags) \ const { \ if (block->is_nullable()) { \ for (uint16_t i = 0; i < size; ++i) { \ if (!flags[i]) continue; \ uint16_t idx = sel[i]; \ - const type* cell_value = \ - reinterpret_cast<const type*>(block->cell(idx).cell_ptr()); \ + const T* cell_value = \ + reinterpret_cast<const T*>(block->cell(idx).cell_ptr()); \ auto result = (!block->cell(idx).is_null() && _values.find(*cell_value) \ OP _values.end()); \ flags[i] &= _opposite ? !result : result; \ @@ -238,8 +234,8 @@ IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_OR(NotInListPredicate, ==) for (uint16_t i = 0; i < size; ++i) { \ if (!flags[i]) continue; \ uint16_t idx = sel[i]; \ - const type* cell_value = \ - reinterpret_cast<const type*>(block->cell(idx).cell_ptr()); \ + const T* cell_value = \ + reinterpret_cast<const T*>(block->cell(idx).cell_ptr()); \ auto result = (_values.find(*cell_value) OP _values.end()); \ flags[i] &= _opposite ? !result : result; \ } \ @@ -250,8 +246,8 @@ IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_AND(InListPredicate, !=) IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_AND(NotInListPredicate, ==) #define IN_LIST_PRED_BITMAP_EVALUATE(CLASS, OP) \ - template <class type> \ - Status CLASS<type>::evaluate(const Schema& schema, \ + template <class T> \ + Status CLASS<T>::evaluate(const Schema& schema, \ const std::vector<BitmapIndexIterator*>& iterators, \ uint32_t num_rows, roaring::Roaring* result) const { \ BitmapIndexIterator* iterator = iterators[_column_id]; \ @@ -286,6 +282,33 @@ IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_AND(NotInListPredicate, ==) IN_LIST_PRED_BITMAP_EVALUATE(InListPredicate, &=) IN_LIST_PRED_BITMAP_EVALUATE(NotInListPredicate, -=) +#define IN_LIST_PRED_SET_DICT_CODE(CLASS) \ + template <class T> \ + void CLASS<T>::set_dict_code_if_necessary(vectorized::IColumn& column) { \ + if (_dict_code_inited) { \ + return; \ + } \ + if constexpr (std::is_same_v<T, StringValue>) { \ + auto* col_ptr = column.get_ptr().get(); \ + if (column.is_nullable()) { \ + auto nullable_col = \ + reinterpret_cast<vectorized::ColumnNullable*>(col_ptr); \ + col_ptr = nullable_col->get_nested_column_ptr().get(); \ + } \ + if (col_ptr->is_column_dictionary()) { \ + auto& dict_col = \ + reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>( \ + *col_ptr); \ + auto code_set = dict_col.find_codes(_values); \ + _dict_codes = std::move(code_set); \ + _dict_code_inited = true; \ + } \ + } \ + } + +IN_LIST_PRED_SET_DICT_CODE(InListPredicate) +IN_LIST_PRED_SET_DICT_CODE(NotInListPredicate) + #define IN_LIST_PRED_CONSTRUCTOR_DECLARATION(CLASS) \ template CLASS<int8_t>::CLASS(uint32_t column_id, phmap::flat_hash_set<int8_t>&& values, \ bool opposite); \ @@ -393,4 +416,8 @@ IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_DECLARATION(NotInListPredicate) IN_LIST_PRED_BITMAP_EVALUATE_DECLARATION(InListPredicate) IN_LIST_PRED_BITMAP_EVALUATE_DECLARATION(NotInListPredicate) +template void InListPredicate<StringValue>::set_dict_code_if_necessary(vectorized::IColumn& column); +template void NotInListPredicate<StringValue>::set_dict_code_if_necessary( + vectorized::IColumn& column); + } //namespace doris diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h index 7cd237b..089ee84 100644 --- a/be/src/olap/in_list_predicate.h +++ b/be/src/olap/in_list_predicate.h @@ -78,11 +78,12 @@ class VectorizedRowBatch; // todo(wb) support evaluate_and,evaluate_or -#define IN_LIST_PRED_CLASS_DEFINE(CLASS) \ - template <class type> \ +#define IN_LIST_PRED_CLASS_DEFINE(CLASS, PT) \ + template <class T> \ class CLASS : public ColumnPredicate { \ public: \ - CLASS(uint32_t column_id, phmap::flat_hash_set<type>&& values, bool is_opposite = false); \ + CLASS(uint32_t column_id, phmap::flat_hash_set<T>&& values, bool is_opposite = false); \ + PredicateType type() const override { return PredicateType::PT; } \ virtual void evaluate(VectorizedRowBatch* batch) const override; \ void evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size) const override; \ void evaluate_or(ColumnBlock* block, uint16_t* sel, uint16_t size, \ @@ -95,13 +96,15 @@ class VectorizedRowBatch; void evaluate(vectorized::IColumn& column, uint16_t* sel, uint16_t* size) const override; \ void evaluate_and(vectorized::IColumn& column, uint16_t* sel, uint16_t size, bool* flags) const override {} \ void evaluate_or(vectorized::IColumn& column, uint16_t* sel, uint16_t size, bool* flags) const override {} \ - bool is_in_predicate() override { return true; } \ + void set_dict_code_if_necessary(vectorized::IColumn& column) override; \ private: \ - phmap::flat_hash_set<type> _values; \ + phmap::flat_hash_set<T> _values; \ + bool _dict_code_inited = false; \ + phmap::flat_hash_set<int32_t> _dict_codes; \ }; -IN_LIST_PRED_CLASS_DEFINE(InListPredicate) -IN_LIST_PRED_CLASS_DEFINE(NotInListPredicate) +IN_LIST_PRED_CLASS_DEFINE(InListPredicate, IN_LIST) +IN_LIST_PRED_CLASS_DEFINE(NotInListPredicate, NO_IN_LIST) } //namespace doris diff --git a/be/src/olap/null_predicate.cpp b/be/src/olap/null_predicate.cpp index da3eb29..43cfbca 100644 --- a/be/src/olap/null_predicate.cpp +++ b/be/src/olap/null_predicate.cpp @@ -29,6 +29,10 @@ namespace doris { NullPredicate::NullPredicate(uint32_t column_id, bool is_null, bool opposite) : ColumnPredicate(column_id), _is_null(opposite != is_null) {} +PredicateType NullPredicate::type() const { + return _is_null ? PredicateType::IS_NULL : PredicateType::NOT_IS_NULL; +} + void NullPredicate::evaluate(VectorizedRowBatch* batch) const { uint16_t n = batch->size(); if (n == 0) { diff --git a/be/src/olap/null_predicate.h b/be/src/olap/null_predicate.h index 681e60b..7b90ffb 100644 --- a/be/src/olap/null_predicate.h +++ b/be/src/olap/null_predicate.h @@ -32,6 +32,8 @@ class NullPredicate : public ColumnPredicate { public: NullPredicate(uint32_t column_id, bool is_null, bool opposite = false); + virtual PredicateType type() const override; + virtual void evaluate(VectorizedRowBatch* batch) const override; void evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size) const override; diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp index f199000..3c6e623 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp @@ -241,18 +241,7 @@ void BinaryDictPageDecoder::set_dict_decoder(PageDecoder* dict_decoder, StringRe Status BinaryDictPageDecoder::next_batch(size_t* n, vectorized::MutableColumnPtr &dst) { if (_encoding_type == PLAIN_ENCODING) { - // todo(zeno) Handle convert in ColumnDictionary, - // add interface like convert_to_predicate_column_if_necessary - auto* col_ptr = dst.get(); - if (dst->is_nullable()) { - auto nullable_col = reinterpret_cast<vectorized::ColumnNullable*>(dst.get()); - col_ptr = nullable_col->get_nested_column_ptr().get(); - } - - if (col_ptr->is_column_dictionary()) { - auto* dict_col_ptr = reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>*>(col_ptr); - col_ptr = (*std::move(dict_col_ptr->convert_to_predicate_column())).assume_mutable(); - } + dst = (*(std::move(dst->convert_to_predicate_column_if_dictionary()))).assume_mutable(); return _data_page_decoder->next_batch(n, dst); } // dictionary encoding diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index e57730a..549977d 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -616,8 +616,9 @@ void SegmentIterator::_vec_init_lazy_materialization() { pred_column_ids.insert(cid); if (type == OLAP_FIELD_TYPE_VARCHAR || type == OLAP_FIELD_TYPE_CHAR || - type == OLAP_FIELD_TYPE_STRING || predicate->is_in_predicate() || - predicate->is_bloom_filter_predicate()) { + type == OLAP_FIELD_TYPE_STRING || predicate->type() == PredicateType::BF || + predicate->type() == PredicateType::IN_LIST || + predicate->type() == PredicateType::NO_IN_LIST) { short_cir_pred_col_id_set.insert(cid); _short_cir_eval_predicate.push_back(predicate); _is_all_column_basic_type = false; @@ -859,23 +860,17 @@ void SegmentIterator::_evaluate_short_circuit_predicate(uint16_t* vec_sel_rowid_ return; } - for (auto column_predicate : _short_cir_eval_predicate) { - auto column_id = column_predicate->column_id(); + for (auto predicate : _short_cir_eval_predicate) { + auto column_id = predicate->column_id(); auto& short_cir_column = _current_return_columns[column_id]; auto* col_ptr = short_cir_column.get(); - // todo(zeno) define convert_dict_codes_if_dictionary interface in IColumn - if (short_cir_column->is_nullable()) { - auto nullable_col = - reinterpret_cast<vectorized::ColumnNullable*>(short_cir_column.get()); - col_ptr = nullable_col->get_nested_column_ptr().get(); + // range comparison predicate needs to sort the dict and convert the encoding + if (predicate->type() == PredicateType::LT || predicate->type() == PredicateType::LE || + predicate->type() == PredicateType::GT || predicate->type() == PredicateType::GE) { + col_ptr->convert_dict_codes_if_necessary(); } - - if (col_ptr->is_column_dictionary() && column_predicate->is_range_comparison_predicate()) { - auto& dict_col = - reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>(*col_ptr); - dict_col.convert_dict_codes(); - } - column_predicate->evaluate(*short_cir_column, vec_sel_rowid_idx, selected_size_ptr); + predicate->set_dict_code_if_necessary(*short_cir_column); + predicate->evaluate(*short_cir_column, vec_sel_rowid_idx, selected_size_ptr); } // evaluate delete condition diff --git a/be/src/runtime/string_value.h b/be/src/runtime/string_value.h index 860fced..cdf33ab 100644 --- a/be/src/runtime/string_value.h +++ b/be/src/runtime/string_value.h @@ -188,6 +188,12 @@ struct StringValue { return a.compare(b) < 0; } }; + + struct HashOfStringValue { + size_t operator()(const StringValue& v) const { + return HashUtil::hash(v.ptr, v.len, 0); + } + }; }; // This function must be called 'hash_value' to be picked up by boost. diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h index 0927f34..7e717bb 100644 --- a/be/src/vec/columns/column.h +++ b/be/src/vec/columns/column.h @@ -64,6 +64,13 @@ public: /// If column is ColumnLowCardinality, transforms is to full column. virtual Ptr convert_to_full_column_if_low_cardinality() const { return get_ptr(); } + /// If column isn't ColumnDictionary, return itself. + /// If column is ColumnDictionary, transforms is to predicate column. + virtual Ptr convert_to_predicate_column_if_dictionary() { return get_ptr(); } + + /// If column is ColumnDictionary, and is a range comparison predicate, convert dict encoding + virtual void convert_dict_codes_if_necessary() {} + /// Creates empty column with the same type. virtual MutablePtr clone_empty() const { return clone_resized(0); } @@ -518,7 +525,6 @@ bool is_column_const(const IColumn& column); /// True if column's an ColumnNullable instance. It's just a syntax sugar for type check. bool is_column_nullable(const IColumn& column); - } // namespace doris::vectorized // Wrap `ColumnPtr` because `ColumnPtr` can't be used in forward declaration. diff --git a/be/src/vec/columns/column_dictionary.h b/be/src/vec/columns/column_dictionary.h index 3aa5bd7..eb33f55 100644 --- a/be/src/vec/columns/column_dictionary.h +++ b/be/src/vec/columns/column_dictionary.h @@ -32,6 +32,10 @@ #include "vec/columns/column_vector.h" #include "vec/columns/predicate_column.h" #include "vec/core/types.h" +#include "vec/common/typeid_cast.h" +#include "olap/column_predicate.h" +#include "olap/comparison_predicate.h" +#include "olap/in_list_predicate.h" namespace doris::vectorized { @@ -49,12 +53,13 @@ namespace doris::vectorized { */ template <typename T> class ColumnDictionary final : public COWHelper<IColumn, ColumnDictionary<T>> { + static_assert(IsNumber<T>); private: friend class COWHelper<IColumn, ColumnDictionary>; ColumnDictionary() {} - ColumnDictionary(const size_t n) : codes(n) {} - ColumnDictionary(const ColumnDictionary& src) : codes(src.codes.begin(), src.codes.end()) {} + ColumnDictionary(const size_t n) : _codes(n) {} + ColumnDictionary(const ColumnDictionary& src) : _codes(src._codes.begin(), src._codes.end()) {} public: using Self = ColumnDictionary; @@ -62,13 +67,9 @@ public: using Container = PaddedPODArray<value_type>; using DictContainer = PaddedPODArray<StringValue>; - bool is_numeric() const override { return false; } - - bool is_predicate_column() const override { return false; } - bool is_column_dictionary() const override { return true; } - size_t size() const override { return codes.size(); } + size_t size() const override { return _codes.size(); } [[noreturn]] StringRef get_data_at(size_t n) const override { LOG(FATAL) << "get_data_at not supported in ColumnDictionary"; @@ -94,17 +95,20 @@ public: } void insert_data(const char* pos, size_t /*length*/) override { - codes.push_back(unaligned_load<T>(pos)); + _codes.push_back(unaligned_load<T>(pos)); } - void insert_data(const T value) { codes.push_back(value); } + void insert_data(const T value) { _codes.push_back(value); } - void insert_default() override { codes.push_back(T()); } + void insert_default() override { _codes.push_back(T()); } - void clear() override { codes.clear(); } + void clear() override { + _codes.clear(); + _dict_code_converted = false; + } // TODO: Make dict memory usage more precise - size_t byte_size() const override { return codes.size() * sizeof(codes[0]); } + size_t byte_size() const override { return _codes.size() * sizeof(_codes[0]); } size_t allocated_bytes() const override { return byte_size(); } @@ -115,11 +119,9 @@ public: LOG(FATAL) << "get_permutation not supported in ColumnDictionary"; } - void reserve(size_t n) override { codes.reserve(n); } + void reserve(size_t n) override { _codes.reserve(n); } - [[noreturn]] const char* get_family_name() const override { - LOG(FATAL) << "get_family_name not supported in ColumnDictionary"; - } + const char* get_family_name() const override { return "ColumnDictionary"; } [[noreturn]] MutableColumnPtr clone_resized(size_t size) const override { LOG(FATAL) << "clone_resized not supported in ColumnDictionary"; @@ -129,43 +131,13 @@ public: LOG(FATAL) << "insert not supported in ColumnDictionary"; } - Field operator[](size_t n) const override { return codes[n]; } + Field operator[](size_t n) const override { return _codes[n]; } void get(size_t n, Field& res) const override { res = (*this)[n]; } - [[noreturn]] UInt64 get64(size_t n) const override { - LOG(FATAL) << "get field not supported in ColumnDictionary"; - } - - [[noreturn]] Float64 get_float64(size_t n) const override { - LOG(FATAL) << "get field not supported in ColumnDictionary"; - } - - [[noreturn]] UInt64 get_uint(size_t n) const override { - LOG(FATAL) << "get field not supported in ColumnDictionary"; - } - - [[noreturn]] bool get_bool(size_t n) const override { - LOG(FATAL) << "get field not supported in ColumnDictionary"; - } - - [[noreturn]] Int64 get_int(size_t n) const override { - LOG(FATAL) << "get field not supported in ColumnDictionary"; - } + Container& get_data() { return _codes; } - Container& get_data() { return codes; } - - const Container& get_data() const { return codes; } - - T find_code(const StringValue& value) const { return dict.find_code(value); } - - T find_bound_code(const StringValue& value, bool lower, bool eq) const { - return dict.find_bound_code(value, lower, eq); - } - - phmap::flat_hash_set<T> find_codes(const phmap::flat_hash_set<StringValue>& values) const { - return dict.find_codes(values); - } + const Container& get_data() const { return _codes; } // it's impossable to use ComplexType as key , so we don't have to implemnt them [[noreturn]] StringRef serialize_value_into_arena(size_t n, Arena& arena, @@ -222,8 +194,8 @@ public: auto* res_col = reinterpret_cast<vectorized::ColumnString*>(col_ptr); for (size_t i = 0; i < sel_size; i++) { uint16_t n = sel[i]; - auto& code = reinterpret_cast<T&>(codes[n]); - auto value = dict.get_value(code); + auto& code = reinterpret_cast<T&>(_codes[n]); + auto value = _dict.get_value(code); res_col->insert_data(value.ptr, value.len); } return Status::OK(); @@ -241,18 +213,43 @@ public: const StringRef* dict_array, size_t data_num, uint32_t dict_num) override { if (!is_dict_inited()) { - dict.reserve(dict_num); + _dict.reserve(dict_num); for (uint32_t i = 0; i < dict_num; ++i) { auto value = StringValue(dict_array[i].data, dict_array[i].size); - dict.insert_value(value); + _dict.insert_value(value); } _dict_inited = true; } - char* end_ptr = (char*)codes.get_end_ptr(); + char* end_ptr = (char*)_codes.get_end_ptr(); memcpy(end_ptr, data_array + start_index, data_num * sizeof(T)); end_ptr += data_num * sizeof(T); - codes.set_end_ptr(end_ptr); + _codes.set_end_ptr(end_ptr); + } + + void convert_dict_codes_if_necessary() override { + if (!is_dict_sorted()) { + _dict.sort(); + _dict_sorted = true; + } + + if (!is_dict_code_converted()) { + for (size_t i = 0; i < size(); ++i) { + _codes[i] = _dict.convert_code(_codes[i]); + } + _dict_code_converted = true; + } + } + + int32_t find_code(const StringValue& value) const { return _dict.find_code(value); } + + int32_t find_code_by_bound(const StringValue& value, bool lower, bool eq) const { + return _dict.find_code_by_bound(value, lower, eq); + } + + phmap::flat_hash_set<int32_t> find_codes( + const phmap::flat_hash_set<StringValue>& values) const { + return _dict.find_codes(values); } bool is_dict_inited() const { return _dict_inited; } @@ -261,126 +258,110 @@ public: bool is_dict_code_converted() const { return _dict_code_converted; } - ColumnPtr convert_to_predicate_column() { + ColumnPtr convert_to_predicate_column_if_dictionary() override { auto res = vectorized::PredicateColumnType<StringValue>::create(); - size_t size = codes.size(); + size_t size = _codes.size(); res->reserve(size); for (size_t i = 0; i < size; ++i) { - auto& code = reinterpret_cast<T&>(codes[i]); - auto value = dict.get_value(code); + auto& code = reinterpret_cast<T&>(_codes[i]); + auto value = _dict.get_value(code); res->insert_data(value.ptr, value.len); } - dict.clear(); + _dict.clear(); return res; } - void convert_dict_codes() { - if (!is_dict_sorted()) { - sort_dict(); - } - - if (!is_dict_code_converted()) { - for (size_t i = 0; i < size(); ++i) { - codes[i] = dict.convert_code(codes[i]); - } - _dict_code_converted = true; - } - } - - void sort_dict() { - dict.sort(); - _dict_sorted = true; - } - class Dictionary { public: Dictionary() = default; void reserve(size_t n) { - dict_data.reserve(n); - inverted_index.reserve(n); + _dict_data.reserve(n); + _inverted_index.reserve(n); } inline void insert_value(StringValue& value) { - dict_data.push_back_without_reserve(value); - inverted_index[value] = inverted_index.size(); + _dict_data.push_back_without_reserve(value); + _inverted_index[value] = _inverted_index.size(); } - inline T find_code(const StringValue& value) const { - auto it = inverted_index.find(value); - if (it != inverted_index.end()) { + inline int32_t find_code(const StringValue& value) const { + auto it = _inverted_index.find(value); + if (it != _inverted_index.end()) { return it->second; } return -1; } - inline T find_bound_code(const StringValue& value, bool lower, bool eq) const { + inline int32_t find_code_by_bound(const StringValue& value, bool lower, bool eq) const { auto code = find_code(value); if (code >= 0) { return code; } if (lower) { - return std::lower_bound(dict_data.begin(), dict_data.end(), value) - dict_data.begin() - eq; + return std::lower_bound(_dict_data.begin(), _dict_data.end(), value) - + _dict_data.begin() - eq; } else { - return std::upper_bound(dict_data.begin(), dict_data.end(), value) - dict_data.begin() + eq; + return std::upper_bound(_dict_data.begin(), _dict_data.end(), value) - + _dict_data.begin() + eq; } } - inline phmap::flat_hash_set<T> find_codes(const phmap::flat_hash_set<StringValue>& values) const { - phmap::flat_hash_set<T> code_set; + inline phmap::flat_hash_set<int32_t> find_codes( + const phmap::flat_hash_set<StringValue>& values) const { + phmap::flat_hash_set<int32_t> code_set; for (const auto& value : values) { - auto it = inverted_index.find(value); - if (it != inverted_index.end()) { + auto it = _inverted_index.find(value); + if (it != _inverted_index.end()) { code_set.insert(it->second); } } return code_set; } - inline StringValue& get_value(T code) { return dict_data[code]; } + inline StringValue& get_value(T code) { return _dict_data[code]; } void clear() { - dict_data.clear(); - inverted_index.clear(); - code_convert_map.clear(); + _dict_data.clear(); + _inverted_index.clear(); + _code_convert_map.clear(); } void sort() { - size_t dict_size = dict_data.size(); - std::sort(dict_data.begin(), dict_data.end(), comparator); + size_t dict_size = _dict_data.size(); + std::sort(_dict_data.begin(), _dict_data.end(), _comparator); for (size_t i = 0; i < dict_size; ++i) { - code_convert_map[inverted_index.find(dict_data[i])->second] = (T)i; - inverted_index[dict_data[i]] = (T)i; + _code_convert_map[_inverted_index.find(_dict_data[i])->second] = (T)i; + _inverted_index[_dict_data[i]] = (T)i; } } - inline T convert_code(const T& code) const { return code_convert_map.find(code)->second; } + inline T convert_code(const T& code) const { return _code_convert_map.find(code)->second; } - size_t byte_size() { return dict_data.size() * sizeof(dict_data[0]); } + size_t byte_size() { return _dict_data.size() * sizeof(_dict_data[0]); } private: - struct HashOfStringValue { - size_t operator()(const StringValue& value) const { - return HashStringThoroughly(value.ptr, value.len); - } - }; - - StringValue::Comparator comparator; + StringValue::Comparator _comparator; // dict code -> dict value - DictContainer dict_data; + DictContainer _dict_data; // dict value -> dict code - phmap::flat_hash_map<StringValue, T, HashOfStringValue> inverted_index; + phmap::flat_hash_map<StringValue, T, StringValue::HashOfStringValue> _inverted_index; // data page code -> sorted dict code, only used for range comparison predicate - phmap::flat_hash_map<T, T> code_convert_map; + phmap::flat_hash_map<T, T> _code_convert_map; }; private: bool _dict_inited = false; bool _dict_sorted = false; bool _dict_code_converted = false; - Dictionary dict; - Container codes; + Dictionary _dict; + Container _codes; }; -} // namespace doris::vectorized \ No newline at end of file +template class ColumnDictionary<uint8_t>; +template class ColumnDictionary<uint16_t>; +template class ColumnDictionary<uint32_t>; +template class ColumnDictionary<int32_t>; + +} // namespace doris::vectorized diff --git a/be/src/vec/columns/column_nullable.h b/be/src/vec/columns/column_nullable.h index 8badf6e..aa0df2d 100644 --- a/be/src/vec/columns/column_nullable.h +++ b/be/src/vec/columns/column_nullable.h @@ -267,6 +267,17 @@ public: LOG(FATAL) << "should not call the method in column nullable"; } + ColumnPtr convert_to_predicate_column_if_dictionary() override { + IColumn* nested_ptr = get_nested_column_ptr().get(); + nested_ptr = (*(std::move(nested_ptr->convert_to_predicate_column_if_dictionary() + ))).assume_mutable(); + return get_ptr(); + } + + void convert_dict_codes_if_necessary() override { + get_nested_column().convert_dict_codes_if_necessary(); + } + private: WrappedPtr nested_column; WrappedPtr null_map; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org