This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch dev-1.1.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/dev-1.1.1 by this push:
     new 7e44079212 [improvement 1.1.1]fix_vec_hash_table_resize_dev1.1.1 (#10968)
7e44079212 is described below

commit 7e44079212188157871659f93e0eae3f326a36a8
Author: Xinyi Zou <zouxiny...@gmail.com>
AuthorDate: Mon Jul 18 14:37:04 2022 +0800

    [improvement 1.1.1]fix_vec_hash_table_resize_dev1.1.1 (#10968)
---
 be/CMakeLists.txt | 9 +++++++++
 be/src/vec/common/hash_table/hash_table.h | 31 +++++++++++++++++++++++++------
 be/src/vec/exec/join/vhash_join_node.cpp | 1 +
 build.sh | 6 ++++++
 run-be-ut.sh | 1 +
 5 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index d2368f4b7b..6118cae793 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -415,6 +415,15 @@ if (WITH_LZO)
     set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -DDORIS_WITH_LZO")
 endif()
 
+# STRICT_MEMORY_USE=ON` expects BE to use less memory, and gives priority to ensuring stability
+# when the cluster memory is limited.
+# TODO In the future, expect a dynamic soft memory limit, combined with real-time memory usage of the cluster,
+# to control the main memory consumers, including HashTable, LRU Cache elimination strategy,
+# ChunkAllocator cache strategy, Disk IO buffer cache strategy, etc.
+if (STRICT_MEMORY_USE)
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -DSTRICT_MEMORY_USE")
+endif()
+
 if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
     set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -faligned-new")
 endif()
diff --git a/be/src/vec/common/hash_table/hash_table.h b/be/src/vec/common/hash_table/hash_table.h
index 920c819694..a60e71e42b 100644
--- a/be/src/vec/common/hash_table/hash_table.h
+++ b/be/src/vec/common/hash_table/hash_table.h
@@ -244,11 +244,22 @@ template <size_t initial_size_degree = 10>
 struct HashTableGrower {
     /// The state of this structure is enough to get the buffer size of the hash table.
     doris::vectorized::UInt8 size_degree = initial_size_degree;
+    doris::vectorized::Int64 double_grow_degree = 31; // 2GB
 
     /// The size of the hash table in the cells.
     size_t buf_size() const { return 1ULL << size_degree; }
 
+#ifndef STRICT_MEMORY_USE
     size_t max_fill() const { return 1ULL << (size_degree - 1); }
+#else
+    // When capacity is greater than 2G, grow when 75% of the capacity is satisfied.
+    size_t max_fill() const {
+        return size_degree < double_grow_degree
+                       ? 1ULL << (size_degree - 1)
+                       : (1ULL << size_degree) - (1ULL << (size_degree - 2));
+    }
+#endif
+
     size_t mask() const { return buf_size() - 1; }
 
     /// From the hash value, get the cell number in the hash table.
@@ -268,12 +279,20 @@ struct HashTableGrower {
 
     /// Set the buffer size by the number of elements in the hash table. Used when deserializing a hash table.
     void set(size_t num_elems) {
-        size_degree =
-                num_elems <= 1
-                        ? initial_size_degree
-                        : ((initial_size_degree > static_cast<size_t>(log2(num_elems - 1)) + 2)
-                                   ? initial_size_degree
-                                   : (static_cast<size_t>(log2(num_elems - 1)) + 2));
+#ifndef STRICT_MEMORY_USE
+        size_t fill_capacity = static_cast<size_t>(log2(num_elems - 1)) + 2;
+#else
+        size_t fill_capacity = static_cast<size_t>(log2(num_elems - 1)) + 1;
+        fill_capacity =
+                fill_capacity < double_grow_degree
+                        ? fill_capacity + 1
+                        : (num_elems < (1ULL << fill_capacity) - (1ULL << (fill_capacity - 2))
+                                   ? fill_capacity
+                                   : fill_capacity + 1);
+#endif
+        size_degree = num_elems <= 1 ? initial_size_degree
+                                     : (initial_size_degree > fill_capacity ? initial_size_degree
+                                                                            : fill_capacity);
     }
 
     void set_buf_size(size_t buf_size_) {
diff --git a/be/src/vec/exec/join/vhash_join_node.cpp b/be/src/vec/exec/join/vhash_join_node.cpp
index 0309b4fa94..efab21aaf1 100644
--- a/be/src/vec/exec/join/vhash_join_node.cpp
+++ b/be/src/vec/exec/join/vhash_join_node.cpp
@@ -69,6 +69,7 @@ struct ProcessHashTableBuild {
         SCOPED_TIMER(_join_node->_build_table_insert_timer);
         // only not build_unique, we need expanse hash table before insert data
         if constexpr (!build_unique) {
+            // _rows contains null row, which will cause hash table resize to be large.
             hash_table_ctx.hash_table.expanse_for_add_elem(_rows);
         }
         hash_table_ctx.hash_table.reset_resize_timer();
diff --git a/build.sh b/build.sh
index 7d1071fd19..4f53378a8e 100755
--- a/build.sh
+++ b/build.sh
@@ -215,6 +215,10 @@ if [[ -z ${STRIP_DEBUG_INFO} ]]; then
     STRIP_DEBUG_INFO=OFF
 fi
 
+if [[ -z ${STRICT_MEMORY_USE} ]]; then
+    STRICT_MEMORY_USE=OFF
+fi
+
 echo "Get params:
     BUILD_BE -- $BUILD_BE
     BUILD_FE -- $BUILD_FE
@@ -231,6 +235,7 @@ echo "Get params:
     BUILD_META_TOOL -- $BUILD_META_TOOL
     USE_LLD -- $USE_LLD
     STRIP_DEBUG_INFO -- $STRIP_DEBUG_INFO
+    STRICT_MEMORY_USE -- $STRICT_MEMORY_USE
 "
 
 # Clean and build generated code
@@ -267,6 +272,7 @@ if [ ${BUILD_BE} -eq 1 ] ; then
             -DBUILD_META_TOOL=${BUILD_META_TOOL} \
             -DUSE_LLD=${USE_LLD} \
             -DSTRIP_DEBUG_INFO=${STRIP_DEBUG_INFO} \
+            -DSTRICT_MEMORY_USE=${STRICT_MEMORY_USE} \
             -DUSE_AVX2=${USE_AVX2} \
             -DGLIBC_COMPATIBILITY=${GLIBC_COMPATIBILITY} ../
     ${BUILD_SYSTEM} -j ${PARALLEL}
diff --git a/run-be-ut.sh b/run-be-ut.sh
index af8ae5bac7..930c013918 100755
--- a/run-be-ut.sh
+++ b/run-be-ut.sh
@@ -142,6 +142,7 @@ ${CMAKE_CMD} -G "${GENERATOR}" \
     -DBUILD_META_TOOL=OFF \
     -DWITH_MYSQL=OFF \
     -DWITH_KERBEROS=OFF \
+    -DSTRICT_MEMORY_USE=OFF \
     ${CMAKE_USE_CCACHE} ../
 ${BUILD_SYSTEM} -j ${PARALLEL} $RUN_FILE
 

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional
commands, e-mail: commits-h...@doris.apache.org