github-actions[bot] commented on code in PR #24966:
URL: https://github.com/apache/doris/pull/24966#discussion_r1340996953


##########
be/src/vec/columns/column_decimal.h:
##########
@@ -159,14 +159,13 @@ class ColumnDecimal final : public COWHelper<ColumnVectorHelper, ColumnDecimal<T
     StringRef serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const override;
     const char* deserialize_and_insert_from_arena(const char* pos) override;
 
-    virtual size_t get_max_row_byte_size() const override;
+    size_t get_max_row_byte_size() const override;

Review Comment:
   warning: function 'get_max_row_byte_size' should be marked [[nodiscard]] [modernize-use-nodiscard]
   
   ```suggestion
       [[nodiscard]] size_t get_max_row_byte_size() const override;
   ```
   



##########
be/src/vec/common/hash_table/hash_map_context.h:
##########
@@ -0,0 +1,449 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <type_traits>
+
+#include "runtime/descriptors.h"
+#include "util/stack_util.h"
+#include "vec/columns/column_nullable.h"
+#include "vec/common/arena.h"
+#include "vec/common/assert_cast.h"
+#include "vec/common/columns_hashing.h"
+#include "vec/common/hash_table/partitioned_hash_map.h"
+#include "vec/common/string_ref.h"
+#include "vec/utils/util.hpp"
+
+namespace doris::vectorized {
+
+template <typename HashMap>
+struct MethodBase {
+    using Key = typename HashMap::key_type;
+    using Mapped = typename HashMap::mapped_type;
+    using Value = typename HashMap::value_type;
+    using Iterator = typename HashMap::iterator;
+
+    std::shared_ptr<HashMap> hash_table;
+    Iterator iterator;
+    bool inited_iterator = false;
+    Key* keys;
+    std::unique_ptr<Arena> arena;
+    std::vector<size_t> hash_values;
+
+    MethodBase() {
+        arena.reset(new Arena());
+        hash_table.reset(new HashMap());
+    }
+    virtual ~MethodBase() = default;
+
+    virtual void reset() {
+        arena.reset(new Arena());
+        inited_iterator = false;
+    }
+
+    void init_iterator() {
+        if (!inited_iterator) {
+            inited_iterator = true;
+            iterator = hash_table->begin();
+        }
+    }
+    virtual void init_serialized_keys(const ColumnRawPtrs& key_columns, const 
Sizes& key_sizes,
+                                      size_t num_rows, const uint8_t* null_map 
= nullptr) = 0;
+
+    void init_hash_values(size_t num_rows, const uint8_t* null_map) {
+        if (null_map == nullptr) {
+            init_hash_values(num_rows);
+            return;
+        }
+        hash_values.resize(num_rows);
+        for (size_t k = 0; k < num_rows; ++k) {
+            if (null_map[k]) {
+                continue;
+            }
+
+            hash_values[k] = hash_table->hash(keys[k]);
+        }
+    }
+    void init_hash_values(size_t num_rows) {
+        hash_values.resize(num_rows);
+        for (size_t k = 0; k < num_rows; ++k) {
+            hash_values[k] = hash_table->hash(keys[k]);
+        }
+    }
+
+    virtual void insert_keys_into_columns(std::vector<Key>& keys, 
MutableColumns& key_columns,
+                                          const size_t num_rows, const Sizes&) 
= 0;
+};
+
+template <typename TData>
+struct MethodSerialized : public MethodBase<TData> {
+    using Base = MethodBase<TData>;
+    using Base::init_iterator;
+    using State =
+            ColumnsHashing::HashMethodSerialized<typename Base::Value, 
typename Base::Mapped, true>;
+
+    std::vector<StringRef> stored_keys;
+
+    StringRef serialize_keys_to_pool_contiguous(size_t i, size_t keys_size,
+                                                const ColumnRawPtrs& 
key_columns, Arena& pool) {
+        const char* begin = nullptr;
+
+        size_t sum_size = 0;
+        for (size_t j = 0; j < keys_size; ++j) {
+            sum_size += key_columns[j]->serialize_value_into_arena(i, pool, 
begin).size;
+        }
+
+        return {begin, sum_size};
+    }
+
+    void init_serialized_keys(const ColumnRawPtrs& key_columns, const Sizes& 
key_sizes,
+                              size_t num_rows, const uint8_t* null_map = 
nullptr) override {
+        stored_keys.resize(num_rows);
+
+        size_t max_one_row_byte_size = 0;
+        for (const auto& column : key_columns) {
+            max_one_row_byte_size += column->get_max_row_byte_size();
+        }
+        size_t total_bytes = max_one_row_byte_size * num_rows;
+
+        if (total_bytes > config::pre_serialize_keys_limit_bytes) {
+            // reach mem limit, don't serialize in batch
+            Base::arena->clear();
+            size_t keys_size = key_columns.size();
+            for (size_t i = 0; i < num_rows; ++i) {
+                stored_keys[i] =
+                        serialize_keys_to_pool_contiguous(i, keys_size, 
key_columns, *Base::arena);
+            }
+        } else {
+            uint8_t* serialized_key_buffer =
+                    
reinterpret_cast<uint8_t*>(Base::arena->alloc(total_bytes));
+
+            for (size_t i = 0; i < num_rows; ++i) {
+                stored_keys[i].data =
+                        reinterpret_cast<char*>(serialized_key_buffer + i * 
max_one_row_byte_size);
+                stored_keys[i].size = 0;
+            }
+
+            for (const auto& column : key_columns) {
+                column->serialize_vec(stored_keys, num_rows, 
max_one_row_byte_size);
+            }
+        }
+        Base::keys = stored_keys.data();
+        Base::init_hash_values(num_rows, null_map);
+    }
+
+    void insert_keys_into_columns(std::vector<StringRef>& keys, 
MutableColumns& key_columns,
+                                  const size_t num_rows, const Sizes&) 
override {
+        for (auto& column : key_columns) {
+            column->deserialize_vec(keys, num_rows);
+        }
+    }
+};
+
+inline size_t get_bitmap_size(size_t key_number) {
+    return (key_number + 7) / 8;
+}
+
+template <typename TData>
+struct MethodStringNoCache : public MethodBase<TData> {
+    using Base = MethodBase<TData>;
+    using Base::init_iterator;
+    using State = ColumnsHashing::HashMethodString<typename Base::Value, 
typename Base::Mapped,
+                                                   true, false>;
+
+    std::vector<StringRef> stored_keys;
+
+    void init_serialized_keys(const ColumnRawPtrs& key_columns, const Sizes& 
key_sizes,
+                              size_t num_rows, const uint8_t* null_map = 
nullptr) override {
+        const IColumn& column = *key_columns[0];
+        const ColumnString& column_string = assert_cast<const ColumnString&>(
+                column.is_nullable()
+                        ? assert_cast<const 
ColumnNullable&>(column).get_nested_column()
+                        : column);
+        auto offsets = column_string.get_offsets().data();
+        auto chars = column_string.get_chars().data();
+
+        auto* buffer = Base::arena->alloc(column_string.get_chars().size());
+        memcpy(buffer, chars, column_string.get_chars().size());
+
+        stored_keys.resize(column_string.size());
+        for (size_t row = 0; row < column_string.size(); row++) {
+            stored_keys[row] =
+                    StringRef(buffer + offsets[row - 1], offsets[row] - 
offsets[row - 1]);
+        }
+
+        Base::keys = stored_keys.data();
+        Base::init_hash_values(num_rows, null_map);
+    }
+
+    void insert_keys_into_columns(std::vector<StringRef>& keys, 
MutableColumns& key_columns,
+                                  const size_t num_rows, const Sizes&) 
override {
+        key_columns[0]->reserve(num_rows);
+        key_columns[0]->insert_many_strings(keys.data(), num_rows);
+    }
+};
+
+/// For the case where there is one numeric key.
+/// FieldType is UInt8/16/32/64 for any type with corresponding bit width.
+template <typename FieldType, typename TData>
+struct MethodOneNumber : public MethodBase<TData> {
+    using Base = MethodBase<TData>;
+    using Base::init_iterator;
+    using State = ColumnsHashing::HashMethodOneNumber<typename Base::Value, 
typename Base::Mapped,
+                                                      FieldType, false>;
+
+    void init_serialized_keys(const ColumnRawPtrs& key_columns, const Sizes& 
key_sizes,
+                              size_t num_rows, const uint8_t* null_map = 
nullptr) override {
+        Base::keys = (FieldType*)(key_columns[0]->is_nullable()
+                                          ? assert_cast<const 
ColumnNullable*>(key_columns[0])
+                                                    ->get_nested_column_ptr()
+                                          : key_columns[0])
+                             ->get_raw_data()
+                             .data;
+        std::string name = key_columns[0]->get_name();
+        Base::init_hash_values(num_rows, null_map);
+    }
+
+    void insert_keys_into_columns(std::vector<typename Base::Key>& keys,
+                                  MutableColumns& key_columns, const size_t 
num_rows,
+                                  const Sizes&) override {
+        key_columns[0]->reserve(num_rows);
+        auto* column = static_cast<ColumnVectorHelper*>(key_columns[0].get());
+        for (size_t i = 0; i != num_rows; ++i) {
+            const auto* key_holder = reinterpret_cast<const char*>(&keys[i]);
+            column->insert_raw_data<sizeof(FieldType)>(key_holder);
+        }
+    }
+};
+
+template <typename TData, bool has_nullable_keys = false>
+struct MethodKeysFixed : public MethodBase<TData> {
+    using Base = MethodBase<TData>;
+    using typename Base::Key;
+    using typename Base::Mapped;
+    using Base::keys;
+    using Base::hash_table;
+    using Base::iterator;
+
+    using State = ColumnsHashing::HashMethodKeysFixed<typename Base::Value, 
Key, Mapped,
+                                                      has_nullable_keys, 
false>;
+
+    std::vector<Key> stored_keys;
+
+    template <typename T>
+    std::vector<T> pack_fixeds(size_t row_numbers, const ColumnRawPtrs& 
key_columns,
+                               const Sizes& key_sizes, const ColumnRawPtrs& 
nullmap_columns) {
+        size_t bitmap_size = get_bitmap_size(nullmap_columns.size());
+
+        std::vector<T> result(row_numbers);
+        size_t offset = 0;
+        if (bitmap_size > 0) {
+            for (size_t j = 0; j < nullmap_columns.size(); j++) {
+                if (!nullmap_columns[j]) {
+                    continue;
+                }
+                size_t bucket = j / 8;
+                size_t offset = j % 8;
+                const auto& data =
+                        assert_cast<const 
ColumnUInt8&>(*nullmap_columns[j]).get_data().data();
+                for (size_t i = 0; i < row_numbers; ++i) {
+                    *((char*)(&result[i]) + bucket) |= data[i] << offset;
+                }
+            }
+            offset += bitmap_size;
+        }
+
+        for (size_t j = 0; j < key_columns.size(); ++j) {
+            const char* data = key_columns[j]->get_raw_data().data;
+
+            auto foo = [&]<typename Fixed>(Fixed zero) {
+                CHECK_EQ(sizeof(Fixed), key_sizes[j]);
+                if (nullmap_columns.size() && nullmap_columns[j]) {
+                    const auto& nullmap =
+                            assert_cast<const 
ColumnUInt8&>(*nullmap_columns[j]).get_data().data();
+                    for (size_t i = 0; i < row_numbers; ++i) {
+                        // make sure null cell is filled by 0x0
+                        memcpy_fixed<Fixed>((char*)(&result[i]) + offset,
+                                            nullmap[i] ? (char*)&zero : data + 
i * sizeof(Fixed));
+                    }
+                } else {
+                    for (size_t i = 0; i < row_numbers; ++i) {
+                        memcpy_fixed<Fixed>((char*)(&result[i]) + offset, data 
+ i * sizeof(Fixed));
+                    }
+                }
+            };
+
+            if (key_sizes[j] == 1) {
+                foo(int8_t());
+            } else if (key_sizes[j] == 2) {
+                foo(int16_t());
+            } else if (key_sizes[j] == 4) {
+                foo(int32_t());
+            } else if (key_sizes[j] == 8) {
+                foo(int64_t());
+            } else if (key_sizes[j] == 16) {
+                foo(UInt128());
+            } else {
+                throw Exception(ErrorCode::INTERNAL_ERROR,
+                                "pack_fixeds input invalid key size, key_size={}", key_sizes[j]);
+            }
+            offset += key_sizes[j];
+        }
+        return result;
+    }
+
+    void init_serialized_keys(const ColumnRawPtrs& key_columns, const Sizes& 
key_sizes,
+                              size_t num_rows, const uint8_t* null_map = 
nullptr) override {
+        ColumnRawPtrs actual_columns;

Review Comment:
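   warning: variable 'actual_columns' is not initialized [cppcoreguidelines-init-variables]
   
   Note: `ColumnRawPtrs` is used as a container in the quoted code (it is iterated, indexed and queried with `.size()`), so the auto-generated `= 0` initializer is not appropriate for it. If the check has to be silenced, value-initialize the variable instead:
   
   ```suggestion
           ColumnRawPtrs actual_columns{};
   ```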
   



##########
be/src/vec/common/hash_table/hash_map_context.h:
##########
@@ -0,0 +1,449 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <type_traits>
+
+#include "runtime/descriptors.h"
+#include "util/stack_util.h"
+#include "vec/columns/column_nullable.h"
+#include "vec/common/arena.h"
+#include "vec/common/assert_cast.h"
+#include "vec/common/columns_hashing.h"
+#include "vec/common/hash_table/partitioned_hash_map.h"
+#include "vec/common/string_ref.h"
+#include "vec/utils/util.hpp"
+
+namespace doris::vectorized {
+
+template <typename HashMap>
+struct MethodBase {
+    using Key = typename HashMap::key_type;
+    using Mapped = typename HashMap::mapped_type;
+    using Value = typename HashMap::value_type;
+    using Iterator = typename HashMap::iterator;
+
+    std::shared_ptr<HashMap> hash_table;
+    Iterator iterator;
+    bool inited_iterator = false;
+    Key* keys;
+    std::unique_ptr<Arena> arena;
+    std::vector<size_t> hash_values;
+
+    MethodBase() {
+        arena.reset(new Arena());
+        hash_table.reset(new HashMap());
+    }
+    virtual ~MethodBase() = default;
+
+    virtual void reset() {
+        arena.reset(new Arena());
+        inited_iterator = false;
+    }
+
+    void init_iterator() {
+        if (!inited_iterator) {
+            inited_iterator = true;
+            iterator = hash_table->begin();
+        }
+    }
+    virtual void init_serialized_keys(const ColumnRawPtrs& key_columns, const 
Sizes& key_sizes,
+                                      size_t num_rows, const uint8_t* null_map 
= nullptr) = 0;
+
+    void init_hash_values(size_t num_rows, const uint8_t* null_map) {
+        if (null_map == nullptr) {
+            init_hash_values(num_rows);
+            return;
+        }
+        hash_values.resize(num_rows);
+        for (size_t k = 0; k < num_rows; ++k) {
+            if (null_map[k]) {
+                continue;
+            }
+
+            hash_values[k] = hash_table->hash(keys[k]);
+        }
+    }
+    void init_hash_values(size_t num_rows) {
+        hash_values.resize(num_rows);
+        for (size_t k = 0; k < num_rows; ++k) {
+            hash_values[k] = hash_table->hash(keys[k]);
+        }
+    }
+
+    virtual void insert_keys_into_columns(std::vector<Key>& keys, 
MutableColumns& key_columns,
+                                          const size_t num_rows, const Sizes&) 
= 0;
+};
+
+template <typename TData>
+struct MethodSerialized : public MethodBase<TData> {
+    using Base = MethodBase<TData>;
+    using Base::init_iterator;
+    using State =
+            ColumnsHashing::HashMethodSerialized<typename Base::Value, 
typename Base::Mapped, true>;
+
+    std::vector<StringRef> stored_keys;
+
+    StringRef serialize_keys_to_pool_contiguous(size_t i, size_t keys_size,
+                                                const ColumnRawPtrs& 
key_columns, Arena& pool) {
+        const char* begin = nullptr;
+
+        size_t sum_size = 0;
+        for (size_t j = 0; j < keys_size; ++j) {
+            sum_size += key_columns[j]->serialize_value_into_arena(i, pool, 
begin).size;
+        }
+
+        return {begin, sum_size};
+    }
+
+    void init_serialized_keys(const ColumnRawPtrs& key_columns, const Sizes& 
key_sizes,
+                              size_t num_rows, const uint8_t* null_map = 
nullptr) override {
+        stored_keys.resize(num_rows);
+
+        size_t max_one_row_byte_size = 0;
+        for (const auto& column : key_columns) {
+            max_one_row_byte_size += column->get_max_row_byte_size();
+        }
+        size_t total_bytes = max_one_row_byte_size * num_rows;
+
+        if (total_bytes > config::pre_serialize_keys_limit_bytes) {
+            // reach mem limit, don't serialize in batch
+            Base::arena->clear();
+            size_t keys_size = key_columns.size();
+            for (size_t i = 0; i < num_rows; ++i) {
+                stored_keys[i] =
+                        serialize_keys_to_pool_contiguous(i, keys_size, 
key_columns, *Base::arena);
+            }
+        } else {
+            uint8_t* serialized_key_buffer =
+                    
reinterpret_cast<uint8_t*>(Base::arena->alloc(total_bytes));
+
+            for (size_t i = 0; i < num_rows; ++i) {
+                stored_keys[i].data =
+                        reinterpret_cast<char*>(serialized_key_buffer + i * 
max_one_row_byte_size);
+                stored_keys[i].size = 0;
+            }
+
+            for (const auto& column : key_columns) {
+                column->serialize_vec(stored_keys, num_rows, 
max_one_row_byte_size);
+            }
+        }
+        Base::keys = stored_keys.data();
+        Base::init_hash_values(num_rows, null_map);
+    }
+
+    void insert_keys_into_columns(std::vector<StringRef>& keys, 
MutableColumns& key_columns,
+                                  const size_t num_rows, const Sizes&) 
override {
+        for (auto& column : key_columns) {
+            column->deserialize_vec(keys, num_rows);
+        }
+    }
+};
+
+inline size_t get_bitmap_size(size_t key_number) {
+    return (key_number + 7) / 8;
+}
+
+template <typename TData>
+struct MethodStringNoCache : public MethodBase<TData> {
+    using Base = MethodBase<TData>;
+    using Base::init_iterator;
+    using State = ColumnsHashing::HashMethodString<typename Base::Value, 
typename Base::Mapped,
+                                                   true, false>;
+
+    std::vector<StringRef> stored_keys;
+
+    void init_serialized_keys(const ColumnRawPtrs& key_columns, const Sizes& 
key_sizes,
+                              size_t num_rows, const uint8_t* null_map = 
nullptr) override {
+        const IColumn& column = *key_columns[0];
+        const ColumnString& column_string = assert_cast<const ColumnString&>(
+                column.is_nullable()
+                        ? assert_cast<const 
ColumnNullable&>(column).get_nested_column()
+                        : column);
+        auto offsets = column_string.get_offsets().data();
+        auto chars = column_string.get_chars().data();
+
+        auto* buffer = Base::arena->alloc(column_string.get_chars().size());
+        memcpy(buffer, chars, column_string.get_chars().size());
+
+        stored_keys.resize(column_string.size());
+        for (size_t row = 0; row < column_string.size(); row++) {
+            stored_keys[row] =
+                    StringRef(buffer + offsets[row - 1], offsets[row] - 
offsets[row - 1]);
+        }
+
+        Base::keys = stored_keys.data();
+        Base::init_hash_values(num_rows, null_map);
+    }
+
+    void insert_keys_into_columns(std::vector<StringRef>& keys, 
MutableColumns& key_columns,
+                                  const size_t num_rows, const Sizes&) 
override {
+        key_columns[0]->reserve(num_rows);
+        key_columns[0]->insert_many_strings(keys.data(), num_rows);
+    }
+};
+
+/// For the case where there is one numeric key.
+/// FieldType is UInt8/16/32/64 for any type with corresponding bit width.
+template <typename FieldType, typename TData>
+struct MethodOneNumber : public MethodBase<TData> {
+    using Base = MethodBase<TData>;
+    using Base::init_iterator;
+    using State = ColumnsHashing::HashMethodOneNumber<typename Base::Value, 
typename Base::Mapped,
+                                                      FieldType, false>;
+
+    void init_serialized_keys(const ColumnRawPtrs& key_columns, const Sizes& 
key_sizes,
+                              size_t num_rows, const uint8_t* null_map = 
nullptr) override {
+        Base::keys = (FieldType*)(key_columns[0]->is_nullable()
+                                          ? assert_cast<const 
ColumnNullable*>(key_columns[0])
+                                                    ->get_nested_column_ptr()
+                                          : key_columns[0])
+                             ->get_raw_data()
+                             .data;
+        std::string name = key_columns[0]->get_name();

Review Comment:
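   warning: variable 'name' is not initialized [cppcoreguidelines-init-variables]
   
   Note: this is a false positive, because the declaration already has an initializer, and the auto-generated suggestion (`std::string name = 0 = key_columns[0]->get_name();`) is not valid C++. Keep the line as written; `name` is not read again before the function returns, so it could also simply be removed.
   
   ```suggestion
           std::string name = key_columns[0]->get_name();
   ```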
   



##########
be/src/vec/common/hash_table/hash_map_context.h:
##########
@@ -0,0 +1,449 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <type_traits>
+
+#include "runtime/descriptors.h"
+#include "util/stack_util.h"
+#include "vec/columns/column_nullable.h"
+#include "vec/common/arena.h"
+#include "vec/common/assert_cast.h"
+#include "vec/common/columns_hashing.h"
+#include "vec/common/hash_table/partitioned_hash_map.h"
+#include "vec/common/string_ref.h"
+#include "vec/utils/util.hpp"
+
+namespace doris::vectorized {
+
+template <typename HashMap>
+struct MethodBase {
+    using Key = typename HashMap::key_type;
+    using Mapped = typename HashMap::mapped_type;
+    using Value = typename HashMap::value_type;
+    using Iterator = typename HashMap::iterator;
+
+    std::shared_ptr<HashMap> hash_table;
+    Iterator iterator;
+    bool inited_iterator = false;
+    Key* keys;
+    std::unique_ptr<Arena> arena;
+    std::vector<size_t> hash_values;
+
+    MethodBase() {
+        arena.reset(new Arena());
+        hash_table.reset(new HashMap());
+    }
+    virtual ~MethodBase() = default;
+
+    virtual void reset() {
+        arena.reset(new Arena());
+        inited_iterator = false;
+    }
+
+    void init_iterator() {
+        if (!inited_iterator) {
+            inited_iterator = true;
+            iterator = hash_table->begin();
+        }
+    }
+    virtual void init_serialized_keys(const ColumnRawPtrs& key_columns, const 
Sizes& key_sizes,
+                                      size_t num_rows, const uint8_t* null_map 
= nullptr) = 0;
+
+    void init_hash_values(size_t num_rows, const uint8_t* null_map) {
+        if (null_map == nullptr) {
+            init_hash_values(num_rows);
+            return;
+        }
+        hash_values.resize(num_rows);
+        for (size_t k = 0; k < num_rows; ++k) {
+            if (null_map[k]) {
+                continue;
+            }
+
+            hash_values[k] = hash_table->hash(keys[k]);
+        }
+    }
+    void init_hash_values(size_t num_rows) {
+        hash_values.resize(num_rows);
+        for (size_t k = 0; k < num_rows; ++k) {
+            hash_values[k] = hash_table->hash(keys[k]);
+        }
+    }
+
+    virtual void insert_keys_into_columns(std::vector<Key>& keys, 
MutableColumns& key_columns,
+                                          const size_t num_rows, const Sizes&) 
= 0;
+};
+
+template <typename TData>
+struct MethodSerialized : public MethodBase<TData> {
+    using Base = MethodBase<TData>;
+    using Base::init_iterator;
+    using State =
+            ColumnsHashing::HashMethodSerialized<typename Base::Value, 
typename Base::Mapped, true>;
+
+    std::vector<StringRef> stored_keys;
+
+    StringRef serialize_keys_to_pool_contiguous(size_t i, size_t keys_size,
+                                                const ColumnRawPtrs& 
key_columns, Arena& pool) {
+        const char* begin = nullptr;
+
+        size_t sum_size = 0;
+        for (size_t j = 0; j < keys_size; ++j) {
+            sum_size += key_columns[j]->serialize_value_into_arena(i, pool, 
begin).size;
+        }
+
+        return {begin, sum_size};
+    }
+
+    void init_serialized_keys(const ColumnRawPtrs& key_columns, const Sizes& 
key_sizes,
+                              size_t num_rows, const uint8_t* null_map = 
nullptr) override {
+        stored_keys.resize(num_rows);
+
+        size_t max_one_row_byte_size = 0;
+        for (const auto& column : key_columns) {
+            max_one_row_byte_size += column->get_max_row_byte_size();
+        }
+        size_t total_bytes = max_one_row_byte_size * num_rows;
+
+        if (total_bytes > config::pre_serialize_keys_limit_bytes) {
+            // reach mem limit, don't serialize in batch
+            Base::arena->clear();
+            size_t keys_size = key_columns.size();
+            for (size_t i = 0; i < num_rows; ++i) {
+                stored_keys[i] =
+                        serialize_keys_to_pool_contiguous(i, keys_size, 
key_columns, *Base::arena);
+            }
+        } else {
+            uint8_t* serialized_key_buffer =
+                    
reinterpret_cast<uint8_t*>(Base::arena->alloc(total_bytes));
+
+            for (size_t i = 0; i < num_rows; ++i) {
+                stored_keys[i].data =
+                        reinterpret_cast<char*>(serialized_key_buffer + i * 
max_one_row_byte_size);
+                stored_keys[i].size = 0;
+            }
+
+            for (const auto& column : key_columns) {
+                column->serialize_vec(stored_keys, num_rows, 
max_one_row_byte_size);
+            }
+        }
+        Base::keys = stored_keys.data();
+        Base::init_hash_values(num_rows, null_map);
+    }
+
+    void insert_keys_into_columns(std::vector<StringRef>& keys, 
MutableColumns& key_columns,
+                                  const size_t num_rows, const Sizes&) 
override {
+        for (auto& column : key_columns) {
+            column->deserialize_vec(keys, num_rows);
+        }
+    }
+};
+
+inline size_t get_bitmap_size(size_t key_number) {
+    return (key_number + 7) / 8;
+}
+
+template <typename TData>
+struct MethodStringNoCache : public MethodBase<TData> {
+    using Base = MethodBase<TData>;
+    using Base::init_iterator;
+    using State = ColumnsHashing::HashMethodString<typename Base::Value, 
typename Base::Mapped,
+                                                   true, false>;
+
+    std::vector<StringRef> stored_keys;
+
+    void init_serialized_keys(const ColumnRawPtrs& key_columns, const Sizes& 
key_sizes,
+                              size_t num_rows, const uint8_t* null_map = 
nullptr) override {
+        const IColumn& column = *key_columns[0];
+        const ColumnString& column_string = assert_cast<const ColumnString&>(
+                column.is_nullable()
+                        ? assert_cast<const 
ColumnNullable&>(column).get_nested_column()
+                        : column);
+        auto offsets = column_string.get_offsets().data();
+        auto chars = column_string.get_chars().data();
+
+        auto* buffer = Base::arena->alloc(column_string.get_chars().size());
+        memcpy(buffer, chars, column_string.get_chars().size());
+
+        stored_keys.resize(column_string.size());
+        for (size_t row = 0; row < column_string.size(); row++) {
+            stored_keys[row] =
+                    StringRef(buffer + offsets[row - 1], offsets[row] - 
offsets[row - 1]);
+        }
+
+        Base::keys = stored_keys.data();
+        Base::init_hash_values(num_rows, null_map);
+    }
+
+    void insert_keys_into_columns(std::vector<StringRef>& keys, 
MutableColumns& key_columns,
+                                  const size_t num_rows, const Sizes&) 
override {
+        key_columns[0]->reserve(num_rows);
+        key_columns[0]->insert_many_strings(keys.data(), num_rows);
+    }
+};
+
+/// For the case where there is one numeric key.
+/// FieldType is UInt8/16/32/64 for any type with corresponding bit width.
+template <typename FieldType, typename TData>
+struct MethodOneNumber : public MethodBase<TData> {
+    using Base = MethodBase<TData>;
+    using Base::init_iterator;
+    using State = ColumnsHashing::HashMethodOneNumber<typename Base::Value, 
typename Base::Mapped,
+                                                      FieldType, false>;
+
+    void init_serialized_keys(const ColumnRawPtrs& key_columns, const Sizes& 
key_sizes,
+                              size_t num_rows, const uint8_t* null_map = 
nullptr) override {
+        Base::keys = (FieldType*)(key_columns[0]->is_nullable()
+                                          ? assert_cast<const 
ColumnNullable*>(key_columns[0])
+                                                    ->get_nested_column_ptr()
+                                          : key_columns[0])
+                             ->get_raw_data()
+                             .data;
+        std::string name = key_columns[0]->get_name();
+        Base::init_hash_values(num_rows, null_map);
+    }
+
+    void insert_keys_into_columns(std::vector<typename Base::Key>& keys,
+                                  MutableColumns& key_columns, const size_t 
num_rows,
+                                  const Sizes&) override {
+        key_columns[0]->reserve(num_rows);
+        auto* column = static_cast<ColumnVectorHelper*>(key_columns[0].get());
+        for (size_t i = 0; i != num_rows; ++i) {
+            const auto* key_holder = reinterpret_cast<const char*>(&keys[i]);
+            column->insert_raw_data<sizeof(FieldType)>(key_holder);
+        }
+    }
+};
+
+template <typename TData, bool has_nullable_keys = false>
+struct MethodKeysFixed : public MethodBase<TData> {
+    using Base = MethodBase<TData>;
+    using typename Base::Key;
+    using typename Base::Mapped;
+    using Base::keys;
+    using Base::hash_table;
+    using Base::iterator;
+
+    using State = ColumnsHashing::HashMethodKeysFixed<typename Base::Value, Key, Mapped,
+                                                      has_nullable_keys, false>;
+
+    std::vector<Key> stored_keys;
+
+    template <typename T>
+    std::vector<T> pack_fixeds(size_t row_numbers, const ColumnRawPtrs& key_columns,
+                               const Sizes& key_sizes, const ColumnRawPtrs& nullmap_columns) {
+        size_t bitmap_size = get_bitmap_size(nullmap_columns.size());
+
+        std::vector<T> result(row_numbers);
+        size_t offset = 0;
+        if (bitmap_size > 0) {
+            for (size_t j = 0; j < nullmap_columns.size(); j++) {
+                if (!nullmap_columns[j]) {
+                    continue;
+                }
+                size_t bucket = j / 8;
+                size_t offset = j % 8;
+                const auto& data =
+                        assert_cast<const ColumnUInt8&>(*nullmap_columns[j]).get_data().data();
+                for (size_t i = 0; i < row_numbers; ++i) {
+                    *((char*)(&result[i]) + bucket) |= data[i] << offset;
+                }
+            }
+            offset += bitmap_size;
+        }
+
+        for (size_t j = 0; j < key_columns.size(); ++j) {
+            const char* data = key_columns[j]->get_raw_data().data;
+
+            auto foo = [&]<typename Fixed>(Fixed zero) {
+                CHECK_EQ(sizeof(Fixed), key_sizes[j]);
+                if (nullmap_columns.size() && nullmap_columns[j]) {
+                    const auto& nullmap =
+                            assert_cast<const ColumnUInt8&>(*nullmap_columns[j]).get_data().data();
+                    for (size_t i = 0; i < row_numbers; ++i) {
+                        // make sure null cell is filled by 0x0
+                        memcpy_fixed<Fixed>((char*)(&result[i]) + offset,
+                                            nullmap[i] ? (char*)&zero : data + i * sizeof(Fixed));
+                    }
+                } else {
+                    for (size_t i = 0; i < row_numbers; ++i) {
+                        memcpy_fixed<Fixed>((char*)(&result[i]) + offset, data + i * sizeof(Fixed));
+                    }
+                }
+            };
+
+            if (key_sizes[j] == 1) {
+                foo(int8_t());
+            } else if (key_sizes[j] == 2) {
+                foo(int16_t());
+            } else if (key_sizes[j] == 4) {
+                foo(int32_t());
+            } else if (key_sizes[j] == 8) {
+                foo(int64_t());
+            } else if (key_sizes[j] == 16) {
+                foo(UInt128());
+            } else {
+                throw Exception(ErrorCode::INTERNAL_ERROR,
+                                "pack_fixeds input invalid key size, key_size={}", key_sizes[j]);
+            }
+            offset += key_sizes[j];
+        }
+        return result;
+    }
+
+    void init_serialized_keys(const ColumnRawPtrs& key_columns, const Sizes& key_sizes,
+                              size_t num_rows, const uint8_t* null_map = nullptr) override {
+        ColumnRawPtrs actual_columns;
+        ColumnRawPtrs null_maps;

Review Comment:
   warning: variable 'null_maps' is not initialized [cppcoreguidelines-init-variables]
   
   ```suggestion
           ColumnRawPtrs null_maps = 0;
   ```
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org


Reply via email to