This is an automated email from the ASF dual-hosted git repository.

lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new b37148e2cb0 [opt](join)Support MethodOneString to optimize hash join with a single string key (#40559) b37148e2cb0 is described below commit b37148e2cb0d466fe70320ec2b6778e8bb067152 Author: Mryange <59914473+mrya...@users.noreply.github.com> AuthorDate: Wed Sep 11 18:52:12 2024 +0800 [opt](join)Support MethodOneString to optimize hash join with a single string key (#40559) ``` mysql [test]>SELECT count() from hits_10m WHERE SearchPhrase IN (SELECT SearchPhrase from hits_10m); +----------+ | count(*) | +----------+ | 10000000 | +----------+ 1 row in set (1.15 sec) now mysql [test]>SELECT count() from hits_10m WHERE SearchPhrase IN (SELECT SearchPhrase from hits_10m); +----------+ | count(*) | +----------+ | 10000000 | +----------+ 1 row in set (0.66 sec) ``` --- be/src/pipeline/common/join_utils.h | 4 +- be/src/pipeline/dependency.h | 10 ++++- be/src/pipeline/exec/hashjoin_build_sink.cpp | 6 +++ .../exec/join/process_hash_table_probe_impl.h | 1 + be/src/vec/common/hash_table/hash_map_context.h | 48 +++++++++++++++------- 5 files changed, 51 insertions(+), 18 deletions(-) diff --git a/be/src/pipeline/common/join_utils.h b/be/src/pipeline/common/join_utils.h index cd3374995f7..7fcf669d42e 100644 --- a/be/src/pipeline/common/join_utils.h +++ b/be/src/pipeline/common/join_utils.h @@ -43,7 +43,7 @@ using I32HashTableContext = vectorized::PrimaryTypeHashTableContext<vectorized:: using I64HashTableContext = vectorized::PrimaryTypeHashTableContext<UInt64>; using I128HashTableContext = vectorized::PrimaryTypeHashTableContext<UInt128>; using I256HashTableContext = vectorized::PrimaryTypeHashTableContext<UInt256>; - +using MethodOneString = vectorized::MethodStringNoCache<JoinHashMap<StringRef>>; template <bool has_null> using I64FixedKeyHashTableContext = vectorized::FixedKeyHashTableContext<UInt64, has_null>; @@ -63,6 +63,6 @@ using HashTableVariants = I64FixedKeyHashTableContext<false>, 
I128FixedKeyHashTableContext<true>, I128FixedKeyHashTableContext<false>, I256FixedKeyHashTableContext<true>, I256FixedKeyHashTableContext<false>, I136FixedKeyHashTableContext<true>, - I136FixedKeyHashTableContext<false>>; + I136FixedKeyHashTableContext<false>, MethodOneString>; } // namespace doris::pipeline diff --git a/be/src/pipeline/dependency.h b/be/src/pipeline/dependency.h index e5738e48f93..863458d3bde 100644 --- a/be/src/pipeline/dependency.h +++ b/be/src/pipeline/dependency.h @@ -656,8 +656,8 @@ public: }; using SetHashTableVariants = - std::variant<std::monostate, - vectorized::MethodSerialized<HashMap<StringRef, RowRefListWithFlags>>, + std::variant<std::monostate, vectorized::SetSerializedHashTableContext, + vectorized::SetMethodOneString, vectorized::SetPrimaryTypeHashTableContext<vectorized::UInt8>, vectorized::SetPrimaryTypeHashTableContext<vectorized::UInt16>, vectorized::SetPrimaryTypeHashTableContext<vectorized::UInt32>, @@ -735,6 +735,12 @@ public: case TYPE_DATETIMEV2: hash_table_variants->emplace<vectorized::SetPrimaryTypeHashTableContext<UInt64>>(); break; + case TYPE_CHAR: + case TYPE_VARCHAR: + case TYPE_STRING: { + hash_table_variants->emplace<vectorized::SetMethodOneString>(); + break; + } case TYPE_LARGEINT: case TYPE_DECIMALV2: case TYPE_DECIMAL128I: diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp index 0bee88ed537..8f7b176a979 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.cpp +++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp @@ -377,6 +377,12 @@ void HashJoinBuildSinkLocalState::_hash_table_init(RuntimeState* state) { } break; } + case TYPE_CHAR: + case TYPE_VARCHAR: + case TYPE_STRING: { + _shared_state->hash_table_variants->emplace<MethodOneString>(); + break; + } default: _shared_state->hash_table_variants ->emplace<vectorized::SerializedHashTableContext>(); diff --git a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h 
b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h index 3ffdb9cb990..653cc8ab447 100644 --- a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h +++ b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h @@ -739,6 +739,7 @@ struct ExtractType<T(U)> { INSTANTIATION(JoinOpType, (I256FixedKeyHashTableContext<true>)); \ INSTANTIATION(JoinOpType, (I256FixedKeyHashTableContext<false>)); \ INSTANTIATION(JoinOpType, (I136FixedKeyHashTableContext<true>)); \ + INSTANTIATION(JoinOpType, (MethodOneString)); \ INSTANTIATION(JoinOpType, (I136FixedKeyHashTableContext<false>)); } // namespace doris::pipeline diff --git a/be/src/vec/common/hash_table/hash_map_context.h b/be/src/vec/common/hash_table/hash_map_context.h index 6ca0653f7a9..0df0c8997f0 100644 --- a/be/src/vec/common/hash_table/hash_map_context.h +++ b/be/src/vec/common/hash_table/hash_map_context.h @@ -30,6 +30,7 @@ #include "vec/common/hash_table/partitioned_hash_map.h" #include "vec/common/hash_table/string_hash_map.h" #include "vec/common/string_ref.h" +#include "vec/common/typeid_cast.h" #include "vec/core/types.h" #include "vec/utils/util.hpp" @@ -284,29 +285,47 @@ struct MethodStringNoCache : public MethodBase<TData> { using State = ColumnsHashing::HashMethodString<typename Base::Value, typename Base::Mapped, true>; - std::vector<StringRef> stored_keys; + // need keep until the hash probe end. + std::vector<StringRef> _build_stored_keys; + // refresh each time probe + std::vector<StringRef> _stored_keys; size_t serialized_keys_size(bool is_build) const override { - return stored_keys.size() * sizeof(StringRef); + return is_build ? 
(_build_stored_keys.size() * sizeof(StringRef)) + : (_stored_keys.size() * sizeof(StringRef)); } - void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, - const uint8_t* null_map = nullptr, bool is_join = false, - bool is_build = false, uint32_t bucket_size = 0) override { + void init_serialized_keys_impl(const ColumnRawPtrs& key_columns, size_t num_rows, + std::vector<StringRef>& stored_keys) { const IColumn& column = *key_columns[0]; - const auto& column_string = assert_cast<const ColumnString&>( + const auto& nested_column = column.is_nullable() ? assert_cast<const ColumnNullable&>(column).get_nested_column() - : column); - const auto& offsets = column_string.get_offsets(); - const auto* chars = column_string.get_chars().data(); - - stored_keys.resize(column_string.size()); - for (size_t row = 0; row < column_string.size(); row++) { - stored_keys[row] = StringRef(chars + offsets[row - 1], offsets[row] - offsets[row - 1]); + : column; + auto serialized_str = [](const auto& column_string, std::vector<StringRef>& stored_keys) { + const auto& offsets = column_string.get_offsets(); + const auto* chars = column_string.get_chars().data(); + stored_keys.resize(column_string.size()); + for (size_t row = 0; row < column_string.size(); row++) { + stored_keys[row] = + StringRef(chars + offsets[row - 1], offsets[row] - offsets[row - 1]); + } + }; + if (nested_column.is_column_string64()) { + const auto& column_string = assert_cast<const ColumnString64&>(nested_column); + serialized_str(column_string, stored_keys); + } else { + const auto& column_string = assert_cast<const ColumnString&>(nested_column); + serialized_str(column_string, stored_keys); } - Base::keys = stored_keys.data(); + } + + void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t num_rows, + const uint8_t* null_map = nullptr, bool is_join = false, + bool is_build = false, uint32_t bucket_size = 0) override { + init_serialized_keys_impl(key_columns, num_rows, + is_build ? 
_build_stored_keys : _stored_keys); if (is_join) { Base::init_join_bucket_num(num_rows, bucket_size, null_map); } else { @@ -606,5 +625,6 @@ using SetPrimaryTypeHashTableContext = using SetSerializedHashTableContext = MethodSerialized<HashMap<StringRef, pipeline::RowRefListWithFlags>>; +using SetMethodOneString = MethodStringNoCache<HashMap<StringRef, pipeline::RowRefListWithFlags>>; } // namespace doris::vectorized \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org