github-actions[bot] commented on code in PR #24966: URL: https://github.com/apache/doris/pull/24966#discussion_r1338269935
##########
be/src/vec/common/hash_table/hash_map_context.h:
##########
@@ -0,0 +1,386 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "runtime/descriptors.h"
+#include "vec/common/arena.h"
+#include "vec/common/columns_hashing.h"
+#include "vec/common/hash_table/partitioned_hash_map.h"
+#include "vec/common/string_ref.h"
+#include "vec/utils/util.hpp"
+
+namespace doris::vectorized {
+
+template <typename TData>
+struct MethodSerialized {
+    using Data = TData;
+    using Key = typename Data::key_type;
+    using Mapped = typename Data::mapped_type;
+    using Iterator = typename Data::iterator;
+    using State = ColumnsHashing::HashMethodSerialized<typename Data::value_type, Mapped, true>;
+
+    Data data;
+    Iterator iterator;
+    bool inited = false;
+    std::vector<Key> keys;
+    size_t keys_memory_usage = 0;
+    MethodSerialized() : _serialized_key_buffer_size(0), _serialized_key_buffer(nullptr) {
+        _arena.reset(new Arena());
+        _serialize_key_arena.reset(new Arena());
+    }
+
+    size_t serialize_keys(const ColumnRawPtrs& key_columns, size_t num_rows) {
+        keys.resize(num_rows);
+
+        size_t max_one_row_byte_size = 0;
+        for (const auto& column : key_columns) {
+            max_one_row_byte_size += column->get_max_row_byte_size();
+        }
+        size_t total_bytes = max_one_row_byte_size * num_rows;
+
+        if (total_bytes > config::pre_serialize_keys_limit_bytes) {
+            // reach mem limit, don't serialize in batch
+            _arena->clear();
+            size_t keys_size = key_columns.size();
+            for (size_t i = 0; i < num_rows; ++i) {
+                keys[i] = serialize_keys_to_pool_contiguous(i, keys_size, key_columns, *_arena);
+            }
+            keys_memory_usage = _arena->size();
+        } else {
+            _arena->clear();
+            if (total_bytes > _serialized_key_buffer_size) {
+                _serialized_key_buffer_size = total_bytes;
+                _serialize_key_arena->clear();
+                _serialized_key_buffer = reinterpret_cast<uint8_t*>(
+                        _serialize_key_arena->alloc(_serialized_key_buffer_size));
+            }
+
+            for (size_t i = 0; i < num_rows; ++i) {
+                keys[i].data =
+                        reinterpret_cast<char*>(_serialized_key_buffer + i * max_one_row_byte_size);
+                keys[i].size = 0;
+            }
+
+            for (const auto& column : key_columns) {
+                column->serialize_vec(keys, num_rows, max_one_row_byte_size);
+            }
+            keys_memory_usage = _serialized_key_buffer_size;
+        }
+        return max_one_row_byte_size;
+    }
+
+    static void insert_keys_into_columns(std::vector<StringRef>& keys, MutableColumns& key_columns,
+                                         const size_t num_rows, const Sizes&) {
+        for (auto& column : key_columns) {
+            column->deserialize_vec(keys, num_rows);
+        }
+    }
+
+    void init_once() {
+        if (!inited) {
+            inited = true;
+            iterator = data.begin();
+        }
+    }
+
+    void reset() {
+        _arena.reset(new Arena());
+        keys_memory_usage = 0;
+        _serialized_key_buffer_size = 0;
+    }
+
+private:
+    size_t _serialized_key_buffer_size;
+    uint8_t* _serialized_key_buffer;
+    std::unique_ptr<Arena> _serialize_key_arena;
+    std::unique_ptr<Arena> _arena;
+};
+
+template <typename TData>
+struct MethodStringNoCache {
+    using Data = TData;
+    using Key = typename Data::key_type;
+    using Mapped = typename Data::mapped_type;
+    using Iterator = typename Data::iterator;
+    using State = ColumnsHashing::HashMethodString<typename Data::value_type, Mapped, true, false>;
+
+    Data data;
+    Iterator iterator;
+    bool inited = false;
+
+    static void insert_keys_into_columns(std::vector<StringRef>& keys, MutableColumns& key_columns,
+                                         const size_t num_rows, const Sizes&) {
+        key_columns[0]->reserve(num_rows);
+        key_columns[0]->insert_many_strings(keys.data(), num_rows);
+    }
+
+    void init_once() {
+        if (!inited) {
+            inited = true;
+            iterator = data.begin();
+        }
+    }
+};
+
+/// For the case where there is one numeric key.
+/// FieldType is UInt8/16/32/64 for any type with corresponding bit width.
+template <typename FieldType, typename TData>
+struct MethodOneNumber {
+    using Data = TData;
+    using Key = typename Data::key_type;
+    using Mapped = typename Data::mapped_type;
+    using Iterator = typename Data::iterator;
+    using State = ColumnsHashing::HashMethodOneNumber<typename Data::value_type, Mapped, FieldType,
+                                                      false>;
+
+    Data data;
+    Iterator iterator;
+    bool inited = false;
+
+    static void insert_keys_into_columns(std::vector<Key>& keys, MutableColumns& key_columns,
+                                         const size_t num_rows, const Sizes&) {
+        key_columns[0]->reserve(num_rows);
+        auto* column = static_cast<ColumnVectorHelper*>(key_columns[0].get());
+        for (size_t i = 0; i != num_rows; ++i) {
+            const auto* key_holder = reinterpret_cast<const char*>(&keys[i]);
+            column->insert_raw_data<sizeof(FieldType)>(key_holder);
+        }
+    }
+
+    void init_once() {
+        if (!inited) {
+            inited = true;
+            iterator = data.begin();
+        }
+    }
+};
+
+template <typename TData, bool has_nullable_keys = false>
+struct MethodKeysFixed {
+    using Data = TData;
+    using Key = typename Data::key_type;
+    using Mapped = typename Data::mapped_type;
+    using Iterator = typename Data::iterator;
+    using State = ColumnsHashing::HashMethodKeysFixed<typename Data::value_type, Key, Mapped,
+                                                      has_nullable_keys, false>;
+
+    Data data;
+    Iterator iterator;
+    bool inited = false;
+
+    static void insert_keys_into_columns(std::vector<Key>& keys, MutableColumns& key_columns,
+                                         const size_t num_rows, const Sizes& key_sizes) {
+        // In any hash key value, column values to be read start just after the bitmap, if it exists.
+        size_t pos = has_nullable_keys ? get_bitmap_size(key_columns.size()) : 0;
+
+        for (size_t i = 0; i < key_columns.size(); ++i) {
+            size_t size = key_sizes[i];
+            char* data = nullptr;
+            key_columns[i]->resize(num_rows);
+            // If we have a nullable column, get its nested column and its null map.
+            if (is_column_nullable(*key_columns[i])) {
+                ColumnNullable& nullable_col = assert_cast<ColumnNullable&>(*key_columns[i]);
+
+                data = const_cast<char*>(nullable_col.get_nested_column().get_raw_data().data);
+                UInt8* nullmap = assert_cast<ColumnUInt8*>(&nullable_col.get_null_map_column())
+                                         ->get_data()
+                                         .data();
+
+                // The current column is nullable. Check if the value of the
+                // corresponding key is nullable. Update the null map accordingly.
+                size_t bucket = i / 8;
+                size_t offset = i % 8;
+                for (size_t j = 0; j < num_rows; j++) {
+                    nullmap[j] = (reinterpret_cast<const UInt8*>(&keys[j])[bucket] >> offset) & 1;
+                }
+            } else {
+                data = const_cast<char*>(key_columns[i]->get_raw_data().data);
+            }
+
+            auto foo = [&]<typename Fixed>(Fixed zero) {
+                CHECK_EQ(sizeof(Fixed), size);
+                for (size_t j = 0; j < num_rows; j++) {
+                    memcpy_fixed<Fixed>(data + j * sizeof(Fixed), (char*)(&keys[j]) + pos);
+                }
+            };
+
+            if (size == 1) {
+                foo(int8_t());
+            } else if (size == 2) {
+                foo(int16_t());
+            } else if (size == 4) {
+                foo(int32_t());
+            } else if (size == 8) {
+                foo(int64_t());
+            } else if (size == 16) {
+                foo(UInt128());
+            } else {
+                throw Exception(ErrorCode::INTERNAL_ERROR,
+                                "pack_fixeds input invalid key size, key_size={}", size);
+            }
+
+            pos += size;
+        }
+    }
+
+    void init_once() {
+        if (!inited) {
+            inited = true;
+            iterator = data.begin();
+        }
+    }
+};
+
+template <typename Base>
+struct DataWithNullKey : public Base {
+    using Base::Base;
+
+    bool& has_null_key_data() { return has_null_key; }
+    bool has_null_key_data() const { return has_null_key; }
+    template <typename MappedType>
+    MappedType& get_null_key_data() const {
+        return (MappedType&)null_key_data;
+    }
+    size_t size() const { return Base::size() + has_null_key; }
+    bool empty() const { return Base::empty() && !has_null_key; }

Review Comment:
warning: function 'empty' should be marked [[nodiscard]] [modernize-use-nodiscard]

```suggestion
    [[nodiscard]] bool empty() const { return Base::empty() && !has_null_key; }
```
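Editor's note on the `MethodKeysFixed::insert_keys_into_columns` hunk quoted above: the code assumes a packed fixed-size key whose leading bytes are a null bitmap (one bit per key column), followed by the fixed-width column values. The following standalone sketch illustrates that layout and the `bucket = i / 8`, `offset = i % 8` decoding; `PackedKey`, `NUM_COLS`, and the `int32_t` columns are illustrative assumptions, not Doris code.

```cpp
// Sketch of a packed key: [null bitmap][value0][value1][value2].
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>

constexpr std::size_t NUM_COLS = 3;                     // three nullable key columns (assumed)
constexpr std::size_t BITMAP_BYTES = (NUM_COLS + 7) / 8;

struct PackedKey {
    uint8_t bytes[BITMAP_BYTES + NUM_COLS * sizeof(int32_t)] = {};
};

// Pack: set the null bit for column i, then append the 4-byte value after the bitmap.
PackedKey pack(const int32_t* values, const bool* is_null) {
    PackedKey key;
    std::size_t pos = BITMAP_BYTES;
    for (std::size_t i = 0; i < NUM_COLS; ++i) {
        if (is_null[i]) {
            key.bytes[i / 8] |= uint8_t(1) << (i % 8);
        }
        std::memcpy(key.bytes + pos, &values[i], sizeof(int32_t));
        pos += sizeof(int32_t);
    }
    return key;
}

int main() {
    int32_t values[NUM_COLS] = {7, 0, 42};
    bool is_null[NUM_COLS] = {false, true, false};
    PackedKey key = pack(values, is_null);

    // Unpack with the same bucket/offset arithmetic as the quoted code.
    std::size_t pos = BITMAP_BYTES;
    for (std::size_t i = 0; i < NUM_COLS; ++i) {
        uint8_t null_flag = (key.bytes[i / 8] >> (i % 8)) & 1;
        int32_t v;
        std::memcpy(&v, key.bytes + pos, sizeof(int32_t));
        pos += sizeof(int32_t);
        std::cout << "col " << i << ": null=" << int(null_flag) << " value=" << v << '\n';
    }
    return 0;
}
```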
##########
be/src/vec/common/hash_table/hash_map_context.h:
##########
(The hunk quoted here is the same full-file hunk shown above; only the flagged lines are repeated.)

+template <typename Base>
+struct DataWithNullKey : public Base {
+    using Base::Base;
+
+    bool& has_null_key_data() { return has_null_key; }
+    bool has_null_key_data() const { return has_null_key; }

Review Comment:
warning: function 'has_null_key_data' should be marked [[nodiscard]] [modernize-use-nodiscard]

```suggestion
    [[nodiscard]] bool has_null_key_data() const { return has_null_key; }
```
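Editor's note on the repeated modernize-use-nodiscard findings in this review: `[[nodiscard]]` turns an accidentally discarded return value into a compiler diagnostic, which is why clang-tidy flags const accessors like `empty()`, `size()`, and `has_null_key_data()`. A minimal standalone sketch (the `TinyTable` type is made up for illustration, not Doris code):

```cpp
#include <cstddef>
#include <iostream>

struct TinyTable {
    std::size_t n = 0;
    bool has_null_key = false;

    // Mirrors the flagged accessors: pure queries whose result should never be dropped.
    [[nodiscard]] std::size_t size() const { return n + has_null_key; }
    [[nodiscard]] bool empty() const { return n == 0 && !has_null_key; }
};

int main() {
    TinyTable t;
    t.empty();           // warning: ignoring return value of nodiscard function
    if (t.empty()) {     // intended usage: the result is actually consumed
        std::cout << "empty, size=" << t.size() << '\n';
    }
    return 0;
}
```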
##########
be/src/vec/common/hash_table/hash_map_context.h:
##########
(Same full-file hunk as above; only the flagged lines are repeated.)

+template <typename Base>
+struct DataWithNullKey : public Base {
+    using Base::Base;
+
+    bool& has_null_key_data() { return has_null_key; }
+    bool has_null_key_data() const { return has_null_key; }
+    template <typename MappedType>
+    MappedType& get_null_key_data() const {
+        return (MappedType&)null_key_data;
+    }
+    size_t size() const { return Base::size() + has_null_key; }

Review Comment:
warning: function 'size' should be marked [[nodiscard]] [modernize-use-nodiscard]

```suggestion
    [[nodiscard]] size_t size() const { return Base::size() + has_null_key; }
```
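Editor's note on the `DataWithNullKey` wrapper that the flagged `size()`/`empty()` belong to: the underlying table cannot store a "null" key, so the wrapper keeps one extra side slot and folds it into the counts. A minimal sketch of that pattern over `std::unordered_map` (the `WithNullKey` name and `int`/`double` types are illustrative assumptions):

```cpp
#include <cstddef>
#include <iostream>
#include <optional>
#include <unordered_map>

template <typename Key, typename Mapped>
struct WithNullKey {
    std::unordered_map<Key, Mapped> base;   // stand-in for the real hash table
    bool has_null_key = false;
    Mapped null_key_data {};                // value associated with the null key

    Mapped& get_or_insert(const std::optional<Key>& key) {
        if (!key) {                         // the "null" key goes to the side slot
            has_null_key = true;
            return null_key_data;
        }
        return base[*key];
    }

    // Folds the side slot into the counts, like the flagged accessors above.
    [[nodiscard]] std::size_t size() const { return base.size() + has_null_key; }
    [[nodiscard]] bool empty() const { return base.empty() && !has_null_key; }
};

int main() {
    WithNullKey<int, double> table;
    table.get_or_insert(1) = 1.5;
    table.get_or_insert(std::nullopt) = -1.0;   // insert under the null key
    std::cout << table.size() << ' ' << std::boolalpha << table.empty() << '\n';  // 2 false
    return 0;
}
```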
##########
be/src/vec/common/hash_table/hash_map_context.h:
##########
(Same full-file hunk as above; the quoted context below starts at DataWithNullKey and continues to the flagged constructor.)

+template <typename Base>
+struct DataWithNullKey : public Base {
+    using Base::Base;
+
+    bool& has_null_key_data() { return has_null_key; }
+    bool has_null_key_data() const { return has_null_key; }
+    template <typename MappedType>
+    MappedType& get_null_key_data() const {
+        return (MappedType&)null_key_data;
+    }
+    size_t size() const { return Base::size() + has_null_key; }
+    bool empty() const { return Base::empty() && !has_null_key; }
+
+    void clear() {
+        Base::clear();
+        has_null_key = false;
+    }
+
+    void clear_and_shrink() {
+        Base::clear_and_shrink();
+        has_null_key = false;
+    }
+
+private:
+    bool has_null_key = false;
+    // null_key_data store AggregateDataPtr on agg node, store PartitionBlocks on partition sort node.
+    void* null_key_data = nullptr;
+};
+
+/// Single low cardinality column.
+template <typename SingleColumnMethod>
+struct MethodSingleNullableColumn : public SingleColumnMethod {
+    using Base = SingleColumnMethod;
+    using BaseState = typename Base::State;
+
+    using Data = typename Base::Data;
+    using Key = typename Base::Key;
+    using Mapped = typename Base::Mapped;
+
+    using Base::data;
+
+    using State = ColumnsHashing::HashMethodSingleLowNullableColumn<BaseState, Mapped, true>;
+
+    static void insert_keys_into_columns(std::vector<Key>& keys, MutableColumns& key_columns,
+                                         const size_t num_rows, const Sizes&) {
+        auto col = key_columns[0].get();
+        col->reserve(num_rows);
+        if constexpr (std::is_same_v<Key, StringRef>) {
+            col->insert_many_strings(keys.data(), num_rows);
+        } else {
+            col->insert_many_raw_data(reinterpret_cast<char*>(keys.data()), num_rows);
+        }
+    }
+};
+
+template <typename RowRefListType>
+struct SerializedHashTableContext {
+    using Mapped = RowRefListType;
+    using HashTable = PartitionedHashMap<StringRef, Mapped>;
+    using State =
+            ColumnsHashing::HashMethodSerialized<typename HashTable::value_type, Mapped, true>;
+    using Iter = typename HashTable::iterator;
+
+    SerializedHashTableContext() { _arena.reset(new Arena()); }

Review Comment:
warning: use '= default' to define a trivial default constructor [modernize-use-equals-default]

```cpp
    SerializedHashTableContext() { _arena.reset(new Arena()); }
    ^
```
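Editor's note on the modernize-use-equals-default finding: applying the hint literally would change behavior, since the constructor currently allocates the `Arena`. Defaulting it only works if that allocation moves into a default member initializer. A hedged sketch under that assumption (`Arena` is a placeholder type here, not the Doris class):

```cpp
#include <memory>

struct Arena {};  // stand-in for doris::vectorized::Arena

template <typename RowRefListType>
struct SerializedHashTableContextSketch {
    // Now genuinely defaultable: the member initializer below does the allocation.
    SerializedHashTableContextSketch() = default;

private:
    std::unique_ptr<Arena> _arena = std::make_unique<Arena>();
};

int main() {
    SerializedHashTableContextSketch<int> ctx;  // _arena is still allocated
    return 0;
}
```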
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org