yiguolei commented on code in PR #21361: URL: https://github.com/apache/doris/pull/21361#discussion_r1248371936
########## be/src/vec/exec/join/process_hash_table_probe_impl.h: ########## @@ -177,22 +177,51 @@ Status ProcessHashTableProbe<JoinOpType>::do_process(HashTableType& hash_table_c KeyGetter key_getter(probe_raw_ptrs, _join_node->_probe_key_sz, nullptr); if (probe_index == 0) { - size_t old_probe_keys_memory_usage = 0; - if (_arena) { - old_probe_keys_memory_usage = _arena->size(); + if (!_arena) { + _arena.reset(new Arena()); } - _arena.reset(new Arena()); // TODO arena reuse by clear()? if constexpr (ColumnsHashing::IsPreSerializedKeysHashMethodTraits<KeyGetter>::value) { if (_probe_keys.size() < probe_rows) { _probe_keys.resize(probe_rows); } - size_t keys_size = probe_raw_ptrs.size(); - for (size_t i = 0; i < probe_rows; ++i) { - _probe_keys[i] = - serialize_keys_to_pool_contiguous(i, keys_size, probe_raw_ptrs, *_arena); + size_t max_one_row_byte_size = 0; + for (const auto column : probe_raw_ptrs) { + max_one_row_byte_size += column->get_max_row_byte_size(); + } + size_t total_bytes = max_one_row_byte_size * probe_rows; + + if (total_bytes > config::pre_serialize_keys_limit_bytes) { + // reach mem limit, don't serialize in batch + _arena->clear(); + size_t keys_size = probe_raw_ptrs.size(); + for (size_t i = 0; i < probe_rows; ++i) { + _probe_keys[i] = serialize_keys_to_pool_contiguous(i, keys_size, probe_raw_ptrs, + *_arena); + } + _join_node->_probe_arena_memory_usage->add(_arena->size()); + } else { + _arena->clear(); + if (!_serialize_key_arena) { + _serialize_key_arena.reset(new Arena); + } + if (total_bytes > _serialized_key_buffer_size) { + _serialized_key_buffer_size = total_bytes; + _serialize_key_arena->clear(); + _serialized_key_buffer = reinterpret_cast<uint8_t*>( + _serialize_key_arena->alloc(_serialized_key_buffer_size)); + } + + for (size_t i = 0; i < probe_rows; ++i) { + _probe_keys[i].data = reinterpret_cast<char*>(_serialized_key_buffer + Review Comment: using config pre_serialize_keys_limit_bytes to control not use this way. 
For example, if one string value is very long (100 KB) while the other strings are very small (100 bytes), the max row byte size is 100 KB, so sizing the buffer by it uses too much memory. Please also add this explanation as a code comment so that we know why this config was added. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org