This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch opt_perf in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/opt_perf by this push: new eb68ee6560 [Improvement](dict) optimize dictionary column (#12880) eb68ee6560 is described below commit eb68ee6560b94a6844f4605398f997f878646936 Author: Gabriel <gabrielleeb...@gmail.com> AuthorDate: Thu Sep 22 19:46:38 2022 +0800 [Improvement](dict) optimize dictionary column (#12880) --- be/src/vec/columns/column_dictionary.h | 87 ++++++++++++++++++---------------- be/src/vec/columns/column_string.h | 11 +++++ 2 files changed, 58 insertions(+), 40 deletions(-) diff --git a/be/src/vec/columns/column_dictionary.h b/be/src/vec/columns/column_dictionary.h index d56265b757..93fbcb9a3e 100644 --- a/be/src/vec/columns/column_dictionary.h +++ b/be/src/vec/columns/column_dictionary.h @@ -192,11 +192,13 @@ public: Status filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* col_ptr) override { auto* res_col = reinterpret_cast<vectorized::ColumnString*>(col_ptr); + res_col->get_offsets().reserve(sel_size); + res_col->get_chars().reserve(_dict.avg_str_len() * sel_size); for (size_t i = 0; i < sel_size; i++) { uint16_t n = sel[i]; auto& code = reinterpret_cast<T&>(_codes[n]); auto value = _dict.get_value(code); - res_col->insert_data(value.ptr, value.len); + res_col->insert_data_without_reserve(value.ptr, value.len); } return Status::OK(); } @@ -281,42 +283,36 @@ public: class Dictionary { public: - Dictionary() = default; + Dictionary() : _dict_data(new DictContainer()), _total_str_len(0) {}; - void reserve(size_t n) { - _dict_data.reserve(n); - _inverted_index.reserve(n); - } + void reserve(size_t n) { _dict_data->reserve(n); } void insert_value(StringValue& value) { - _dict_data.push_back_without_reserve(value); - _inverted_index[value] = _inverted_index.size(); + _dict_data->push_back_without_reserve(value); + _total_str_len += value.len; } int32_t find_code(const StringValue& value) const { - auto it = _inverted_index.find(value); - if (it != _inverted_index.end()) { - return it->second; + for (size_t i = 0; i < _dict_data->size(); i++) { + if ((*_dict_data)[i] == value) { + return i; + } } return -2; // -1 is null code } T get_null_code() const { return -1; } - inline StringValue& get_value(T code) { - return code >= _dict_data.size() ? _null_value : _dict_data[code]; - } + inline StringValue& get_value(T code) { return (*_dict_data)[code]; } - inline const StringValue& get_value(T code) const { - return code >= _dict_data.size() ? _null_value : _dict_data[code]; - } + inline const StringValue& get_value(T code) const { return (*_dict_data)[code]; } // The function is only used in the runtime filter feature inline void generate_hash_values_for_runtime_filter(FieldType type) { if (_hash_values.empty()) { - _hash_values.resize(_dict_data.size()); - for (size_t i = 0; i < _dict_data.size(); i++) { - auto& sv = _dict_data[i]; + _hash_values.resize(_dict_data->size()); + for (size_t i = 0; i < _dict_data->size(); i++) { + auto& sv = (*_dict_data)[i]; // The char data is stored in the disk with the schema length, // and zeros are filled if the length is insufficient @@ -360,40 +356,50 @@ public: if (code >= 0) { return code; } - auto bound = std::upper_bound(_dict_data.begin(), _dict_data.end(), value) - - _dict_data.begin(); + auto bound = std::upper_bound(_dict_data->begin(), _dict_data->end(), value) - + _dict_data->begin(); return greater ? bound - greater + eq : bound - eq; } void find_codes(const phmap::flat_hash_set<StringValue>& values, std::vector<vectorized::UInt8>& selected) const { - size_t dict_word_num = _dict_data.size(); + size_t dict_word_num = _dict_data->size(); selected.resize(dict_word_num); selected.assign(dict_word_num, false); - for (const auto& value : values) { - if (auto it = _inverted_index.find(value); it != _inverted_index.end()) { - selected[it->second] = true; + for (size_t i = 0; i < _dict_data->size(); i++) { + if (values.find((*_dict_data)[i]) != values.end()) { + selected[i] = true; } } } void clear() { - _dict_data.clear(); - _inverted_index.clear(); - _code_convert_table.clear(); + _dict_data->clear(); _hash_values.clear(); } void clear_hash_values() { _hash_values.clear(); } void sort() { - size_t dict_size = _dict_data.size(); - _code_convert_table.reserve(dict_size); - std::sort(_dict_data.begin(), _dict_data.end(), _comparator); + size_t dict_size = _dict_data->size(); + + _perm.resize(dict_size); for (size_t i = 0; i < dict_size; ++i) { - _code_convert_table[_inverted_index.find(_dict_data[i])->second] = (T)i; - _inverted_index[_dict_data[i]] = (T)i; + _perm[i] = i; } + + std::sort(_perm.begin(), _perm.end(), + [&dict_data = *_dict_data, &comparator = _comparator](const size_t a, + const size_t b) { + return comparator(dict_data[a], dict_data[b]); + }); + + auto new_dict_data = new DictContainer(dict_size); + for (size_t i = 0; i < dict_size; ++i) { + _code_convert_table[_perm[i]] = (T)i; + (*new_dict_data)[i] = (*_dict_data)[_perm[i]]; + } + _dict_data.reset(new_dict_data); } T convert_code(const T& code) const { @@ -403,18 +409,17 @@ public: return _code_convert_table[code]; } - size_t byte_size() { return _dict_data.size() * sizeof(_dict_data[0]); } + size_t byte_size() { return _dict_data->size() * sizeof((*_dict_data)[0]); } + + bool empty() { return _dict_data->empty(); } - bool empty() { return _dict_data.empty(); } + size_t avg_str_len() { return _total_str_len / _dict_data->size(); } private: StringValue _null_value = StringValue(); StringValue::Comparator _comparator; // dict code -> dict value - DictContainer _dict_data; - // dict value -> dict code - phmap::flat_hash_map<StringValue, T, StringValue::HashOfStringValue> _inverted_index; - // only used for range comparison predicate. _code_convert_table[i] = j, where i is dataPageCode, and j is sortedDictCode + std::unique_ptr<DictContainer> _dict_data; std::vector<T> _code_convert_table; // hash value of origin string , used for bloom filter // It's a trade-off of space for performance @@ -422,6 +427,8 @@ public: // This may because the magnitude of the data is not large enough(in q60, only about 80k rows data is filtered for largest table) // So we may need more test here. HashValueContainer _hash_values; + IColumn::Permutation _perm; + size_t _total_str_len; }; private: diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index 827ec896a7..469bbdc6df 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -153,6 +153,17 @@ public: offsets.push_back(new_size); } + void insert_data_without_reserve(const char* pos, size_t length) { + const size_t old_size = chars.size(); + const size_t new_size = old_size + length; + + if (length) { + chars.resize(new_size); + memcpy(chars.data() + old_size, pos, length); + } + offsets.push_back_without_reserve(new_size); + } + void insert_many_binary_data(char* data_array, uint32_t* len_array, uint32_t* start_offset_array, size_t num) override { size_t new_size = 0; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org