This is an automated email from the ASF dual-hosted git repository. panxiaolei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 17df7d5f435 [Chore](hash-table) remove some unused code of PartitionedHashMap (#43215) 17df7d5f435 is described below commit 17df7d5f4355a810bcc79c437955d94e676514df Author: Pxl <pxl...@qq.com> AuthorDate: Tue Nov 5 11:34:40 2024 +0800 [Chore](hash-table) remove some unused code of PartitionedHashMap (#43215) remove some unused code of PartitionedHashMap --- be/src/vec/common/hash_table/hash_map.h | 3 - be/src/vec/common/hash_table/hash_map_context.h | 1 - be/src/vec/common/hash_table/hash_table.h | 45 +- .../vec/common/hash_table/partitioned_hash_map.h | 60 --- .../vec/common/hash_table/partitioned_hash_table.h | 550 --------------------- be/src/vec/common/hash_table/ph_hash_map.h | 48 +- 6 files changed, 9 insertions(+), 698 deletions(-) diff --git a/be/src/vec/common/hash_table/hash_map.h b/be/src/vec/common/hash_table/hash_map.h index 018d134d875..448ddd5b7c5 100644 --- a/be/src/vec/common/hash_table/hash_map.h +++ b/be/src/vec/common/hash_table/hash_map.h @@ -198,9 +198,6 @@ template <typename Key, typename Mapped, typename Hash = DefaultHash<Key>, typename Grower = HashTableGrower<>, typename Allocator = HashTableAllocator> using HashMap = HashMapTable<Key, HashMapCell<Key, Mapped, Hash>, Hash, Grower, Allocator>; -template <typename Key, typename Mapped, typename Hash = DefaultHash<Key>> -using NormalHashMap = HashMapTable<Key, HashMapCell<Key, Mapped, Hash>, Hash>; - template <typename Key, typename Hash = DefaultHash<Key>> using JoinHashMap = JoinHashTable<Key, Hash>; diff --git a/be/src/vec/common/hash_table/hash_map_context.h b/be/src/vec/common/hash_table/hash_map_context.h index 5354155c529..16a793d7500 100644 --- a/be/src/vec/common/hash_table/hash_map_context.h +++ b/be/src/vec/common/hash_table/hash_map_context.h @@ -27,7 +27,6 @@ #include "vec/common/arena.h" #include "vec/common/assert_cast.h" #include "vec/common/columns_hashing.h" -#include "vec/common/hash_table/partitioned_hash_map.h" #include "vec/common/hash_table/string_hash_map.h" #include "vec/common/string_ref.h" #include "vec/common/typeid_cast.h" diff --git a/be/src/vec/common/hash_table/hash_table.h b/be/src/vec/common/hash_table/hash_table.h index 490cd501692..809868e2bee 100644 --- a/be/src/vec/common/hash_table/hash_table.h +++ b/be/src/vec/common/hash_table/hash_table.h @@ -419,28 +419,12 @@ protected: Cell* buf = nullptr; /// A piece of memory for all elements except the element with zero key. Grower grower; int64_t _resize_timer_ns; - // the bucket count threshold above which it's converted to partioned hash table - // > 0: enable convert dynamically - // 0: convert is disabled - int _partitioned_threshold = 0; - // if need resize and bucket count after resize will be >= _partitioned_threshold, - // this flag is set to true, and resize does not actually happen, - // PartitionedHashTable will convert this hash table to partitioned hash table - bool _need_partition = false; //factor that will trigger growing the hash table on insert. static constexpr float MAX_BUCKET_OCCUPANCY_FRACTION = 0.5f; mutable size_t collisions = 0; - void set_partitioned_threshold(int threshold) { _partitioned_threshold = threshold; } - - bool check_if_need_partition(size_t bucket_count) { - return _partitioned_threshold > 0 && bucket_count >= _partitioned_threshold; - } - - bool need_partition() { return _need_partition; } - /// Find a cell with the same key or an empty cell, starting from the specified position and further along the collision resolution chain. size_t ALWAYS_INLINE find_cell(const Key& x, size_t hash_value, size_t place_value) const { while (!buf[place_value].is_zero(*this) && @@ -609,8 +593,6 @@ public: std::swap(buf, rhs.buf); std::swap(m_size, rhs.m_size); std::swap(grower, rhs.grower); - std::swap(_need_partition, rhs._need_partition); - std::swap(_partitioned_threshold, rhs._partitioned_threshold); Hash::operator=(std::move(rhs)); // NOLINT(bugprone-use-after-move) Allocator::operator=(std::move(rhs)); // NOLINT(bugprone-use-after-move) @@ -740,12 +722,10 @@ protected: throw; } - if (LIKELY(!_need_partition)) { - // The hash table was rehashed, so we have to re-find the key. - size_t new_place = find_cell(key, hash_value, grower.place(hash_value)); - assert(!buf[new_place].is_zero(*this)); - it = &buf[new_place]; - } + // The hash table was rehashed, so we have to re-find the key. + size_t new_place = find_cell(key, hash_value, grower.place(hash_value)); + assert(!buf[new_place].is_zero(*this)); + it = &buf[new_place]; } } @@ -776,12 +756,10 @@ protected: throw; } - if (LIKELY(!_need_partition)) { - // The hash table was rehashed, so we have to re-find the key. - size_t new_place = find_cell(key, hash_value, grower.place(hash_value)); - assert(!buf[new_place].is_zero(*this)); - it = &buf[new_place]; - } + // The hash table was rehashed, so we have to re-find the key. + size_t new_place = find_cell(key, hash_value, grower.place(hash_value)); + assert(!buf[new_place].is_zero(*this)); + it = &buf[new_place]; } } @@ -1060,13 +1038,6 @@ private: } else new_grower.increase_size(); - // new bucket count exceed partitioned hash table bucket count threshold, - // don't resize and set need partition flag - if (check_if_need_partition(new_grower.buf_size())) { - _need_partition = true; - return; - } - /// Expand the space. buf = reinterpret_cast<Cell*>(Allocator::realloc(buf, get_buffer_size_in_bytes(), new_grower.buf_size() * sizeof(Cell))); diff --git a/be/src/vec/common/hash_table/partitioned_hash_map.h b/be/src/vec/common/hash_table/partitioned_hash_map.h deleted file mode 100644 index a2db6fece35..00000000000 --- a/be/src/vec/common/hash_table/partitioned_hash_map.h +++ /dev/null @@ -1,60 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/HashTable/TwoLevelHashMap.h -// and modified by Doris -#pragma once - -#include "vec/common/hash_table/hash_map.h" -#include "vec/common/hash_table/partitioned_hash_table.h" -#include "vec/common/hash_table/ph_hash_map.h" -namespace doris { -template <typename ImplTable> -class PartitionedHashMapTable : public PartitionedHashTable<ImplTable> { -public: - using Impl = ImplTable; - using Base = PartitionedHashTable<ImplTable>; - using Key = typename ImplTable::key_type; - using LookupResult = typename Impl::LookupResult; - - auto& ALWAYS_INLINE operator[](const Key& x) { - LookupResult it; - bool inserted = false; - this->emplace(x, it, inserted); - - if (inserted) { - new (lookup_result_get_mapped(it)) Base::mapped_type(); - } - - return *lookup_result_get_mapped(it); - } - - template <typename Func> - void for_each_mapped(Func&& func) { - for (auto& v : *this) { - func(v.get_second()); - } - } -}; - -template <typename Key, typename Mapped, typename Hash = DefaultHash<Key>> -using PartitionedHashMap = - PartitionedHashMapTable<HashMap<Key, Mapped, Hash, PartitionedHashTableGrower<>>>; - -template <typename Key, typename Mapped, typename Hash = DefaultHash<Key>> -using PHNormalHashMap = PHHashMap<Key, Mapped, Hash, false>; -} // namespace doris \ No newline at end of file diff --git a/be/src/vec/common/hash_table/partitioned_hash_table.h b/be/src/vec/common/hash_table/partitioned_hash_table.h deleted file mode 100644 index c6a19b36d3a..00000000000 --- a/be/src/vec/common/hash_table/partitioned_hash_table.h +++ /dev/null @@ -1,550 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -// This file is copied from -// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/HashTable/TwoLevelHashTable.h -// and modified by Doris -#pragma once - -#include "vec/common/hash_table/hash_table.h" - -/** Partitioned hash table. - * Represents 16 (or 1ULL << BITS_FOR_SUB_TABLE) small hash tables (sub table count of the first level). - * To determine which one to use, one of the bytes of the hash function is taken. - * - * Usually works a little slower than a simple hash table. - * However, it has advantages in some cases: - * - if you need to merge two hash tables together, then you can easily parallelize it by sub tables; - * - delay during resizes is amortized, since the small hash tables will be resized separately; - * - in theory, resizes are cache-local in a larger range of sizes. - */ - -template <size_t initial_size_degree = 8> -struct PartitionedHashTableGrower : public HashTableGrowerWithPrecalculation<initial_size_degree> { - /// Increase the size of the hash table. - void increase_size() { this->increase_size_degree(this->size_degree() >= 15 ? 1 : 2); } -}; - -template <typename Impl, size_t BITS_FOR_SUB_TABLE = 4> -class PartitionedHashTable : private boost::noncopyable, Impl::Hash { -public: - using key_type = typename Impl::key_type; - using mapped_type = typename Impl::mapped_type; - using value_type = typename Impl::value_type; - using cell_type = typename Impl::cell_type; - using Key = typename Impl::key_type; - - using LookupResult = typename Impl::LookupResult; - using ConstLookupResult = typename Impl::ConstLookupResult; - -protected: - using Self = PartitionedHashTable; - -private: - static constexpr size_t NUM_LEVEL1_SUB_TABLES = 1ULL << BITS_FOR_SUB_TABLE; - static constexpr size_t MAX_SUB_TABLE = NUM_LEVEL1_SUB_TABLES - 1; - - //factor that will trigger growing the hash table on insert. - static constexpr float MAX_SUB_TABLE_OCCUPANCY_FRACTION = 0.5f; - - Impl level0_sub_table; - Impl level1_sub_tables[NUM_LEVEL1_SUB_TABLES]; - - bool _is_partitioned = false; - - int64_t _convert_timer_ns = 0; - -public: - PartitionedHashTable() = default; - - PartitionedHashTable(PartitionedHashTable&& rhs) { *this = std::move(rhs); } - - PartitionedHashTable& operator=(PartitionedHashTable&& rhs) { - std::swap(_is_partitioned, rhs._is_partitioned); - - level0_sub_table = std::move(rhs.level0_sub_table); - for (size_t i = 0; i < NUM_LEVEL1_SUB_TABLES; ++i) { - level1_sub_tables[i] = std::move(rhs.level1_sub_tables[i]); - } - - return *this; - } - - size_t hash(const Key& x) const { return level0_sub_table.hash(x); } - - float get_factor() const { return MAX_SUB_TABLE_OCCUPANCY_FRACTION; } - - int64_t get_convert_timer_value() const { return _convert_timer_ns; } - - bool should_be_shrink(int64_t valid_row) const { - if (_is_partitioned) { - return false; - } else { - return level0_sub_table.should_be_shrink(valid_row); - } - } - - size_t size() { - size_t count = 0; - if (_is_partitioned) { - for (auto i = 0u; i < this->NUM_LEVEL1_SUB_TABLES; ++i) { - count += this->level1_sub_tables[i].size(); - } - } else { - count = level0_sub_table.size(); - } - return count; - } - - void init_buf_size(size_t reserve_for_num_elements) { - if (_is_partitioned) { - for (auto& impl : level1_sub_tables) { - impl.init_buf_size(reserve_for_num_elements / NUM_LEVEL1_SUB_TABLES); - } - } else { - if (level0_sub_table.check_if_need_partition(reserve_for_num_elements)) { - level0_sub_table.clear_and_shrink(); - _is_partitioned = true; - - for (size_t i = 0; i < NUM_LEVEL1_SUB_TABLES; ++i) { - level1_sub_tables[i].init_buf_size(reserve_for_num_elements / - NUM_LEVEL1_SUB_TABLES); - } - } else { - level0_sub_table.init_buf_size(reserve_for_num_elements); - } - } - } - - void delete_zero_key(Key key) { - if (_is_partitioned) { - const auto key_hash = hash(key); - size_t sub_table_idx = get_sub_table_from_hash(key_hash); - level1_sub_tables[sub_table_idx].delete_zero_key(key); - } else { - level0_sub_table.delete_zero_key(key); - } - } - - int64_t get_collisions() const { - size_t collisions = level0_sub_table.get_collisions(); - for (size_t i = 0; i < NUM_LEVEL1_SUB_TABLES; i++) { - collisions += level1_sub_tables[i].get_collisions(); - } - return collisions; - } - - size_t get_buffer_size_in_bytes() const { - if (_is_partitioned) { - size_t buff_size = 0; - for (const auto& impl : level1_sub_tables) buff_size += impl.get_buffer_size_in_bytes(); - return buff_size; - } else { - return level0_sub_table.get_buffer_size_in_bytes(); - } - } - - size_t get_buffer_size_in_cells() const { - if (_is_partitioned) { - size_t buff_size = 0; - for (const auto& impl : level1_sub_tables) buff_size += impl.get_buffer_size_in_cells(); - return buff_size; - } else { - return level0_sub_table.get_buffer_size_in_cells(); - } - } - - std::vector<size_t> get_buffer_sizes_in_cells() const { - std::vector<size_t> sizes; - if (_is_partitioned) { - for (size_t i = 0; i < NUM_LEVEL1_SUB_TABLES; ++i) { - sizes.push_back(level1_sub_tables[i].get_buffer_size_in_cells()); - } - } else { - sizes.push_back(level0_sub_table.get_buffer_size_in_cells()); - } - return sizes; - } - - void reset_resize_timer() { - if (_is_partitioned) { - for (auto& impl : level1_sub_tables) { - impl.reset_resize_timer(); - } - } else { - level0_sub_table.reset_resize_timer(); - } - } - int64_t get_resize_timer_value() const { - if (_is_partitioned) { - int64_t resize_timer_ns = 0; - for (const auto& impl : level1_sub_tables) { - resize_timer_ns += impl.get_resize_timer_value(); - } - return resize_timer_ns; - } else { - return level0_sub_table.get_resize_timer_value(); - } - } - - bool has_null_key_data() const { return false; } - template <typename MappedType> - char* get_null_key_data() { - return nullptr; - } - -protected: - typename Impl::iterator begin_of_next_non_empty_sub_table_idx(size_t& sub_table_idx) { - while (sub_table_idx != NUM_LEVEL1_SUB_TABLES && level1_sub_tables[sub_table_idx].empty()) - ++sub_table_idx; - - if (sub_table_idx != NUM_LEVEL1_SUB_TABLES) return level1_sub_tables[sub_table_idx].begin(); - - --sub_table_idx; - return level1_sub_tables[MAX_SUB_TABLE].end(); - } - - typename Impl::const_iterator begin_of_next_non_empty_sub_table_idx( - size_t& sub_table_idx) const { - while (sub_table_idx != NUM_LEVEL1_SUB_TABLES && level1_sub_tables[sub_table_idx].empty()) - ++sub_table_idx; - - if (sub_table_idx != NUM_LEVEL1_SUB_TABLES) return level1_sub_tables[sub_table_idx].begin(); - - --sub_table_idx; - return level1_sub_tables[MAX_SUB_TABLE].end(); - } - -public: - void set_partitioned_threshold(int threshold) { - level0_sub_table.set_partitioned_threshold(threshold); - } - - class iterator /// NOLINT - { - Self* container {}; - size_t sub_table_idx {}; - typename Impl::iterator current_it {}; - - friend class PartitionedHashTable; - - iterator(Self* container_, size_t sub_table_idx_, typename Impl::iterator current_it_) - : container(container_), sub_table_idx(sub_table_idx_), current_it(current_it_) {} - - public: - iterator() = default; - - bool operator==(const iterator& rhs) const { - return sub_table_idx == rhs.sub_table_idx && current_it == rhs.current_it; - } - bool operator!=(const iterator& rhs) const { return !(*this == rhs); } - - iterator& operator++() { - ++current_it; - if (container->_is_partitioned) { - if (current_it == container->level1_sub_tables[sub_table_idx].end()) { - ++sub_table_idx; - current_it = container->begin_of_next_non_empty_sub_table_idx(sub_table_idx); - } - } - - return *this; - } - - auto& operator*() { return *current_it; } - auto* operator->() { return current_it.get_ptr(); } - - auto* get_ptr() { return current_it.get_ptr(); } - size_t get_hash() { return current_it.get_hash(); } - }; - - class const_iterator /// NOLINT - { - Self* container {}; - size_t sub_table_idx {}; - typename Impl::const_iterator current_it {}; - - friend class PartitionedHashTable; - - const_iterator(Self* container_, size_t sub_table_idx_, - typename Impl::const_iterator current_it_) - : container(container_), sub_table_idx(sub_table_idx_), current_it(current_it_) {} - - public: - const_iterator() = default; - const_iterator(const iterator& rhs) - : container(rhs.container), - sub_table_idx(rhs.sub_table_idx), - current_it(rhs.current_it) {} /// NOLINT - - bool operator==(const const_iterator& rhs) const { - return sub_table_idx == rhs.sub_table_idx && current_it == rhs.current_it; - } - bool operator!=(const const_iterator& rhs) const { return !(*this == rhs); } - - const_iterator& operator++() { - ++current_it; - if (container->_is_partitioned) { - if (current_it == container->level1_sub_tables[sub_table_idx].end()) { - ++sub_table_idx; - current_it = container->begin_of_next_non_empty_sub_table_idx(sub_table_idx); - } - } - - return *this; - } - - const auto& operator*() const { return *current_it; } - const auto* operator->() const { return current_it->get_ptr(); } - - const auto* get_ptr() const { return current_it.get_ptr(); } - size_t get_hash() const { return current_it.get_hash(); } - }; - - const_iterator begin() const { - if (_is_partitioned) { - size_t sub_table_idx = 0; - typename Impl::const_iterator impl_it = - begin_of_next_non_empty_sub_table_idx(sub_table_idx); - return {this, sub_table_idx, impl_it}; - } else { - return {this, NUM_LEVEL1_SUB_TABLES, level0_sub_table.begin()}; - } - } - - iterator begin() { - if (_is_partitioned) { - size_t sub_table_idx = 0; - typename Impl::iterator impl_it = begin_of_next_non_empty_sub_table_idx(sub_table_idx); - return {this, sub_table_idx, impl_it}; - } else { - return {this, NUM_LEVEL1_SUB_TABLES, level0_sub_table.begin()}; - } - } - - const_iterator end() const { - if (_is_partitioned) { - return {this, MAX_SUB_TABLE, level1_sub_tables[MAX_SUB_TABLE].end()}; - } else { - return {this, NUM_LEVEL1_SUB_TABLES, level0_sub_table.end()}; - } - } - iterator end() { - if (_is_partitioned) { - return {this, MAX_SUB_TABLE, level1_sub_tables[MAX_SUB_TABLE].end()}; - } else { - return {this, NUM_LEVEL1_SUB_TABLES, level0_sub_table.end()}; - } - } - - /// Insert a value. In the case of any more complex values, it is better to use the `emplace` function. - std::pair<LookupResult, bool> ALWAYS_INLINE insert(const value_type& x) { - size_t hash_value = hash(cell_type::get_key(x)); - - std::pair<LookupResult, bool> res; - emplace(cell_type::get_key(x), res.first, res.second, hash_value); - - if (res.second) insert_set_mapped(lookup_result_get_mapped(res.first), x); - - return res; - } - - void expanse_for_add_elem(size_t num_elem) { - if (_is_partitioned) { - size_t num_elem_per_sub_table = - (num_elem + NUM_LEVEL1_SUB_TABLES - 1) / NUM_LEVEL1_SUB_TABLES; - for (size_t i = 0; i < NUM_LEVEL1_SUB_TABLES; ++i) { - level1_sub_tables[i].expanse_for_add_elem(num_elem_per_sub_table); - } - } else { - level0_sub_table.expanse_for_add_elem(num_elem); - if (UNLIKELY(level0_sub_table.need_partition())) { - convert_to_partitioned(); - } - } - } - - template <bool READ> - void ALWAYS_INLINE prefetch(const Key& key, size_t hash_value) { - if (_is_partitioned) { - const auto sub_table_idx = get_sub_table_from_hash(hash_value); - level1_sub_tables[sub_table_idx].template prefetch<READ>(hash_value); - } else { - level0_sub_table.template prefetch<READ>(hash_value); - } - } - - /** Insert the key, - * return an iterator to a position that can be used for `placement new` of value, - * as well as the flag - whether a new key was inserted. - * - * You have to make `placement new` values if you inserted a new key, - * since when destroying a hash table, the destructor will be invoked for it! - * - * Example usage: - * - * Map::iterator it; - * bool inserted; - * map.emplace(key, it, inserted); - * if (inserted) - * new(&it->second) Mapped(value); - */ - template <typename KeyHolder> - void ALWAYS_INLINE emplace(KeyHolder&& key_holder, LookupResult& it, bool& inserted) { - size_t hash_value = hash(key_holder); - emplace(key_holder, it, inserted, hash_value); - } - - /// Same, but with a precalculated values of hash function. - template <typename KeyHolder> - void ALWAYS_INLINE emplace(KeyHolder&& key_holder, LookupResult& it, bool& inserted, - size_t hash_value) { - if (_is_partitioned) { - size_t sub_table_idx = get_sub_table_from_hash(hash_value); - level1_sub_tables[sub_table_idx].emplace(key_holder, it, inserted, hash_value); - } else { - level0_sub_table.emplace(key_holder, it, inserted, hash_value); - if (UNLIKELY(level0_sub_table.need_partition())) { - convert_to_partitioned(); - - // The hash table was converted to partitioned, so we have to re-find the key. - size_t sub_table_id = get_sub_table_from_hash(hash_value); - it = level1_sub_tables[sub_table_id].find(key_holder, hash_value); - } - } - } - - template <typename KeyHolder> - void ALWAYS_INLINE emplace(KeyHolder&& key_holder, LookupResult& it, size_t hash_value, - bool& inserted) { - emplace(key_holder, it, inserted, hash_value); - } - - template <typename KeyHolder, typename Func> - void ALWAYS_INLINE lazy_emplace(KeyHolder&& key_holder, LookupResult& it, Func&& f) { - size_t hash_value = hash(key_holder); - lazy_emplace(key_holder, it, hash_value, std::forward<Func>(f)); - } - - template <typename KeyHolder, typename Func> - void ALWAYS_INLINE lazy_emplace(KeyHolder&& key_holder, LookupResult& it, size_t hash_value, - Func&& f) { - if (_is_partitioned) { - size_t sub_table_idx = get_sub_table_from_hash(hash_value); - level1_sub_tables[sub_table_idx].lazy_emplace(key_holder, it, hash_value, - std::forward<Func>(f)); - } else { - level0_sub_table.lazy_emplace(key_holder, it, hash_value, std::forward<Func>(f)); - if (UNLIKELY(level0_sub_table.need_partition())) { - convert_to_partitioned(); - - // The hash table was converted to partitioned, so we have to re-find the key. - size_t sub_table_id = get_sub_table_from_hash(hash_value); - it = level1_sub_tables[sub_table_id].find(key_holder, hash_value); - } - } - } - - LookupResult ALWAYS_INLINE find(Key x, size_t hash_value) { - if (_is_partitioned) { - size_t sub_table_idx = get_sub_table_from_hash(hash_value); - return level1_sub_tables[sub_table_idx].find(x, hash_value); - } else { - return level0_sub_table.find(x, hash_value); - } - } - - ConstLookupResult ALWAYS_INLINE find(Key x, size_t hash_value) const { - return const_cast<std::decay_t<decltype(*this)>*>(this)->find(x, hash_value); - } - - LookupResult ALWAYS_INLINE find(Key x) { return find(x, hash(x)); } - - ConstLookupResult ALWAYS_INLINE find(Key x) const { return find(x, hash(x)); } - - size_t size() const { - if (_is_partitioned) { - size_t res = 0; - for (size_t i = 0; i < NUM_LEVEL1_SUB_TABLES; ++i) { - res += level1_sub_tables[i].size(); - } - return res; - } else { - return level0_sub_table.size(); - } - } - - std::vector<size_t> sizes() const { - std::vector<size_t> sizes; - if (_is_partitioned) { - for (size_t i = 0; i < NUM_LEVEL1_SUB_TABLES; ++i) { - sizes.push_back(level1_sub_tables[i].size()); - } - } else { - sizes.push_back(level0_sub_table.size()); - } - return sizes; - } - - bool empty() const { - if (_is_partitioned) { - for (size_t i = 0; i < NUM_LEVEL1_SUB_TABLES; ++i) - if (!level1_sub_tables[i].empty()) return false; - return true; - } else { - return level0_sub_table.empty(); - } - } - - bool add_elem_size_overflow(size_t row) const { - return !_is_partitioned && level0_sub_table.add_elem_size_overflow(row); - } - -private: - void convert_to_partitioned() { - SCOPED_RAW_TIMER(&_convert_timer_ns); - - DCHECK(!_is_partitioned); - _is_partitioned = true; - - auto bucket_count = level0_sub_table.get_buffer_size_in_cells(); - for (size_t i = 0; i < NUM_LEVEL1_SUB_TABLES; ++i) { - level1_sub_tables[i] = std::move(Impl(bucket_count / NUM_LEVEL1_SUB_TABLES)); - } - - auto it = level0_sub_table.begin(); - - /// It is assumed that the zero key (stored separately) is first in iteration order. - if (it != level0_sub_table.end() && it.get_ptr()->is_zero(level0_sub_table)) { - insert(it->get_value()); - ++it; - } - - for (; it != level0_sub_table.end(); ++it) { - const auto* cell = it.get_ptr(); - size_t hash_value = cell->get_hash(level0_sub_table); - size_t sub_table_idx = get_sub_table_from_hash(hash_value); - level1_sub_tables[sub_table_idx].insert_unique_non_zero(cell, hash_value); - } - - level0_sub_table.clear_and_shrink(); - } - - /// NOTE Bad for hash tables with more than 2^32 cells. - static size_t get_sub_table_from_hash(size_t hash_value) { - return (hash_value >> (32 - BITS_FOR_SUB_TABLE)) & MAX_SUB_TABLE; - } -}; diff --git a/be/src/vec/common/hash_table/ph_hash_map.h b/be/src/vec/common/hash_table/ph_hash_map.h index 50cf218dc87..de320425223 100644 --- a/be/src/vec/common/hash_table/ph_hash_map.h +++ b/be/src/vec/common/hash_table/ph_hash_map.h @@ -30,8 +30,7 @@ ALWAYS_INLINE inline auto lookup_result_get_mapped(std::pair<const Key, Mapped>* return &(it->second); } -template <typename Key, typename Mapped, typename HashMethod = DefaultHash<Key>, - bool PartitionedHashTable = false> +template <typename Key, typename Mapped, typename HashMethod = DefaultHash<Key>> class PHHashMap : private boost::noncopyable { public: using Self = PHHashMap; @@ -58,9 +57,6 @@ public: PHHashMap& operator=(PHHashMap&& rhs) { _hash_map.clear(); _hash_map = std::move(rhs._hash_map); - std::swap(_need_partition, rhs._need_partition); - std::swap(_partitioned_threshold, rhs._partitioned_threshold); - return *this; } @@ -130,19 +126,11 @@ public: inserted = true; ctor(key_holder, nullptr); }); - - if constexpr (PartitionedHashTable) { - _check_if_need_partition(); - } } template <typename KeyHolder, typename Func> void ALWAYS_INLINE lazy_emplace(KeyHolder&& key_holder, LookupResult& it, Func&& f) { it = &*_hash_map.lazy_emplace(key_holder, [&](const auto& ctor) { f(ctor, key_holder); }); - - if constexpr (PartitionedHashTable) { - _check_if_need_partition(); - } } template <typename KeyHolder> @@ -157,10 +145,6 @@ public: ctor(key, mapped_type()); } }); - - if constexpr (PartitionedHashTable) { - _check_if_need_partition(); - } } template <typename KeyHolder, typename Func> @@ -168,10 +152,6 @@ public: Func&& f) { it = &*_hash_map.lazy_emplace_with_hash(key, hash_value, [&](const auto& ctor) { f(ctor, key, key); }); - - if constexpr (PartitionedHashTable) { - _check_if_need_partition(); - } } void ALWAYS_INLINE insert(const Key& key, size_t hash_value, const Mapped& value) { @@ -225,18 +205,6 @@ public: } bool has_null_key_data() const { return false; } - bool need_partition() { return _need_partition; } - - void set_partitioned_threshold(int threshold) { _partitioned_threshold = threshold; } - - bool check_if_need_partition(size_t bucket_count) { - if constexpr (PartitionedHashTable) { - return _partitioned_threshold > 0 && bucket_count >= _partitioned_threshold; - } else { - return false; - } - } - bool empty() const { return _hash_map.empty(); } void clear_and_shrink() { _hash_map.clear(); } @@ -244,19 +212,5 @@ public: void expanse_for_add_elem(size_t num_elem) { _hash_map.reserve(num_elem); } private: - void _check_if_need_partition() { - if (UNLIKELY(check_if_need_partition(_hash_map.size() + 1))) { - _need_partition = add_elem_size_overflow(1); - } - } - HashMapImpl _hash_map; - // the bucket count threshold above which it's converted to partioned hash table - // > 0: enable convert dynamically - // 0: convert is disabled - int _partitioned_threshold = 0; - // if need resize and bucket count after resize will be >= _partitioned_threshold, - // this flag is set to true, and resize does not actually happen, - // PartitionedHashTable will convert this hash table to partitioned hash table - bool _need_partition; }; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org