[GitHub] [doris] yiguolei commented on a diff in pull request #13654: [improvement](hashjoin) support partitioned hash table in hash join

GitBox Sun, 13 Nov 2022 19:56:43 -0800


yiguolei commented on code in PR #13654:
URL: https://github.com/apache/doris/pull/13654#discussion_r1021054512



##########
be/src/vec/common/hash_table/partitioned_hash_table.h:
##########
@@ -0,0 +1,412 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/HashTable/TwoLevelHashTable.h
+// and modified by Doris
+#pragma once
+
+#include "vec/common/hash_table/hash_table.h"
+
+/** Two-level hash table.
+  * Represents 256 (or 1ULL << BITS_FOR_BUCKET) small hash tables (buckets of the first level).
+  * To determine which one to use, one of the bytes of the hash function is taken.
+  *
+  * Usually works a little slower than a simple hash table.
+  * However, it has advantages in some cases:
+  * - if you need to merge two hash tables together, then you can easily parallelize it by buckets;
+  * - delay during resizes is amortized, since the small hash tables will be resized separately;
+  * - in theory, resizes are cache-local in a larger range of sizes.
+  */
+
+template <size_t initial_size_degree = 8>
+struct PartitionedHashTableGrower : public 
HashTableGrowerWithPrecalculation<initial_size_degree> {
+    /// Increase the size of the hash table.
+    void increase_size() { this->increase_size_degree(this->size_degree() >= 
15 ? 1 : 2); }
+};
+
+template <typename Key, typename Cell, typename Hash, typename Grower, 
typename Allocator,
+          typename ImplTable = HashTable<Key, Cell, Hash, Grower, Allocator>,
+          size_t BITS_FOR_BUCKET = 4>
+class PartitionedHashTable : private boost::noncopyable,
+                             protected Hash /// empty base optimization
+{
+protected:
+    friend class const_iterator;
+    friend class iterator;
+
+    using HashValue = size_t;
+    using Self = PartitionedHashTable;
+
+public:
+    using Impl = ImplTable;
+
+    static constexpr size_t NUM_BUCKETS = 1ULL << BITS_FOR_BUCKET;
+    static constexpr size_t MAX_BUCKET = NUM_BUCKETS - 1;
+
+    //factor that will trigger growing the hash table on insert.
+    static constexpr float MAX_BUCKET_OCCUPANCY_FRACTION = 0.5f;
+
+    size_t hash(const Key& x) const { return Hash::operator()(x); }
+
+    /// NOTE Bad for hash tables with more than 2^32 cells.
+    static size_t getBucketFromHash(size_t hash_value) {
+        return (hash_value >> (32 - BITS_FOR_BUCKET)) & MAX_BUCKET;
+    }
+
+    float get_factor() const { return MAX_BUCKET_OCCUPANCY_FRACTION; }
+
+    bool should_be_shrink(int64_t valid_row) { return false; }
+
+    void init_buf_size(size_t reserve_for_num_elements) {}
+
+    void delete_zero_key(Key key) {}
+
+    size_t get_buffer_size_in_bytes() const {
+        size_t buff_size = 0;
+        for (const auto& impl : impls) buff_size += 
impl.get_buffer_size_in_bytes();
+        return buff_size;
+    }
+
+    size_t get_buffer_size_in_cells() const {
+        size_t buff_size = 0;
+        for (const auto& impl : impls) buff_size += 
impl.get_buffer_size_in_cells();
+        return buff_size;
+    }
+
+    size_t* get_buffer_sizes_in_cells(size_t& num_buckets) const {
+        num_buckets = NUM_BUCKETS;

Review Comment:
   Is this method called?



##########
be/src/vec/common/hash_table/partitioned_hash_table.h:
##########
@@ -0,0 +1,412 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/HashTable/TwoLevelHashTable.h
+// and modified by Doris
+#pragma once
+
+#include "vec/common/hash_table/hash_table.h"
+
+/** Two-level hash table.
+  * Represents 256 (or 1ULL << BITS_FOR_BUCKET) small hash tables (buckets of the first level).
+  * To determine which one to use, one of the bytes of the hash function is taken.
+  *
+  * Usually works a little slower than a simple hash table.
+  * However, it has advantages in some cases:
+  * - if you need to merge two hash tables together, then you can easily parallelize it by buckets;
+  * - delay during resizes is amortized, since the small hash tables will be resized separately;
+  * - in theory, resizes are cache-local in a larger range of sizes.
+  */
+
+template <size_t initial_size_degree = 8>
+struct PartitionedHashTableGrower : public 
HashTableGrowerWithPrecalculation<initial_size_degree> {
+    /// Increase the size of the hash table.
+    void increase_size() { this->increase_size_degree(this->size_degree() >= 
15 ? 1 : 2); }
+};
+
+template <typename Key, typename Cell, typename Hash, typename Grower, 
typename Allocator,
+          typename ImplTable = HashTable<Key, Cell, Hash, Grower, Allocator>,
+          size_t BITS_FOR_BUCKET = 4>
+class PartitionedHashTable : private boost::noncopyable,
+                             protected Hash /// empty base optimization
+{
+protected:
+    friend class const_iterator;
+    friend class iterator;
+
+    using HashValue = size_t;
+    using Self = PartitionedHashTable;
+
+public:
+    using Impl = ImplTable;
+
+    static constexpr size_t NUM_BUCKETS = 1ULL << BITS_FOR_BUCKET;
+    static constexpr size_t MAX_BUCKET = NUM_BUCKETS - 1;
+
+    //factor that will trigger growing the hash table on insert.
+    static constexpr float MAX_BUCKET_OCCUPANCY_FRACTION = 0.5f;
+
+    size_t hash(const Key& x) const { return Hash::operator()(x); }
+
+    /// NOTE Bad for hash tables with more than 2^32 cells.
+    static size_t getBucketFromHash(size_t hash_value) {
+        return (hash_value >> (32 - BITS_FOR_BUCKET)) & MAX_BUCKET;
+    }
+
+    float get_factor() const { return MAX_BUCKET_OCCUPANCY_FRACTION; }
+
+    bool should_be_shrink(int64_t valid_row) { return false; }
+
+    void init_buf_size(size_t reserve_for_num_elements) {}
+
+    void delete_zero_key(Key key) {}
+
+    size_t get_buffer_size_in_bytes() const {
+        size_t buff_size = 0;
+        for (const auto& impl : impls) buff_size += 
impl.get_buffer_size_in_bytes();
+        return buff_size;
+    }
+
+    size_t get_buffer_size_in_cells() const {
+        size_t buff_size = 0;
+        for (const auto& impl : impls) buff_size += 
impl.get_buffer_size_in_cells();
+        return buff_size;
+    }
+
+    size_t* get_buffer_sizes_in_cells(size_t& num_buckets) const {
+        num_buckets = NUM_BUCKETS;

Review Comment:
   Is this method used?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

[GitHub] [doris] yiguolei commented on a diff in pull request #13654: [improvement](hashjoin) support partitioned hash table in hash join

Reply via email to