This is an automated email from the ASF dual-hosted git repository.

lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new b37148e2cb0 [opt](join)Support MethodOneString to optimize hash join 
with a single string key (#40559)
b37148e2cb0 is described below

commit b37148e2cb0d466fe70320ec2b6778e8bb067152
Author: Mryange <59914473+mrya...@users.noreply.github.com>
AuthorDate: Wed Sep 11 18:52:12 2024 +0800

    [opt](join)Support MethodOneString to optimize hash join with a single 
string key (#40559)
    
    ```
    before
    mysql [test]>SELECT count() from hits_10m WHERE SearchPhrase IN (SELECT 
SearchPhrase from hits_10m);
    +----------+
    | count(*) |
    +----------+
    | 10000000 |
    +----------+
    1 row in set (1.15 sec)
    
    now
    mysql [test]>SELECT count() from hits_10m WHERE SearchPhrase IN (SELECT 
SearchPhrase from hits_10m);
    +----------+
    | count(*) |
    +----------+
    | 10000000 |
    +----------+
    1 row in set (0.66 sec)
    
    ```
---
 be/src/pipeline/common/join_utils.h                |  4 +-
 be/src/pipeline/dependency.h                       | 10 ++++-
 be/src/pipeline/exec/hashjoin_build_sink.cpp       |  6 +++
 .../exec/join/process_hash_table_probe_impl.h      |  1 +
 be/src/vec/common/hash_table/hash_map_context.h    | 48 +++++++++++++++-------
 5 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/be/src/pipeline/common/join_utils.h 
b/be/src/pipeline/common/join_utils.h
index cd3374995f7..7fcf669d42e 100644
--- a/be/src/pipeline/common/join_utils.h
+++ b/be/src/pipeline/common/join_utils.h
@@ -43,7 +43,7 @@ using I32HashTableContext = 
vectorized::PrimaryTypeHashTableContext<vectorized::
 using I64HashTableContext = vectorized::PrimaryTypeHashTableContext<UInt64>;
 using I128HashTableContext = vectorized::PrimaryTypeHashTableContext<UInt128>;
 using I256HashTableContext = vectorized::PrimaryTypeHashTableContext<UInt256>;
-
+using MethodOneString = 
vectorized::MethodStringNoCache<JoinHashMap<StringRef>>;
 template <bool has_null>
 using I64FixedKeyHashTableContext = 
vectorized::FixedKeyHashTableContext<UInt64, has_null>;
 
@@ -63,6 +63,6 @@ using HashTableVariants =
                      I64FixedKeyHashTableContext<false>, 
I128FixedKeyHashTableContext<true>,
                      I128FixedKeyHashTableContext<false>, 
I256FixedKeyHashTableContext<true>,
                      I256FixedKeyHashTableContext<false>, 
I136FixedKeyHashTableContext<true>,
-                     I136FixedKeyHashTableContext<false>>;
+                     I136FixedKeyHashTableContext<false>, MethodOneString>;
 
 } // namespace doris::pipeline
diff --git a/be/src/pipeline/dependency.h b/be/src/pipeline/dependency.h
index e5738e48f93..863458d3bde 100644
--- a/be/src/pipeline/dependency.h
+++ b/be/src/pipeline/dependency.h
@@ -656,8 +656,8 @@ public:
 };
 
 using SetHashTableVariants =
-        std::variant<std::monostate,
-                     vectorized::MethodSerialized<HashMap<StringRef, 
RowRefListWithFlags>>,
+        std::variant<std::monostate, vectorized::SetSerializedHashTableContext,
+                     vectorized::SetMethodOneString,
                      
vectorized::SetPrimaryTypeHashTableContext<vectorized::UInt8>,
                      
vectorized::SetPrimaryTypeHashTableContext<vectorized::UInt16>,
                      
vectorized::SetPrimaryTypeHashTableContext<vectorized::UInt32>,
@@ -735,6 +735,12 @@ public:
             case TYPE_DATETIMEV2:
                 
hash_table_variants->emplace<vectorized::SetPrimaryTypeHashTableContext<UInt64>>();
                 break;
+            case TYPE_CHAR:
+            case TYPE_VARCHAR:
+            case TYPE_STRING: {
+                hash_table_variants->emplace<vectorized::SetMethodOneString>();
+                break;
+            }
             case TYPE_LARGEINT:
             case TYPE_DECIMALV2:
             case TYPE_DECIMAL128I:
diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp 
b/be/src/pipeline/exec/hashjoin_build_sink.cpp
index 0bee88ed537..8f7b176a979 100644
--- a/be/src/pipeline/exec/hashjoin_build_sink.cpp
+++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp
@@ -377,6 +377,12 @@ void 
HashJoinBuildSinkLocalState::_hash_table_init(RuntimeState* state) {
                         }
                         break;
                     }
+                    case TYPE_CHAR:
+                    case TYPE_VARCHAR:
+                    case TYPE_STRING: {
+                        
_shared_state->hash_table_variants->emplace<MethodOneString>();
+                        break;
+                    }
                     default:
                         _shared_state->hash_table_variants
                                 
->emplace<vectorized::SerializedHashTableContext>();
diff --git a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h 
b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
index 3ffdb9cb990..653cc8ab447 100644
--- a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
+++ b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
@@ -739,6 +739,7 @@ struct ExtractType<T(U)> {
     INSTANTIATION(JoinOpType, (I256FixedKeyHashTableContext<true>));     \
     INSTANTIATION(JoinOpType, (I256FixedKeyHashTableContext<false>));    \
     INSTANTIATION(JoinOpType, (I136FixedKeyHashTableContext<true>));     \
+    INSTANTIATION(JoinOpType, (MethodOneString));                        \
     INSTANTIATION(JoinOpType, (I136FixedKeyHashTableContext<false>));
 
 } // namespace doris::pipeline
diff --git a/be/src/vec/common/hash_table/hash_map_context.h 
b/be/src/vec/common/hash_table/hash_map_context.h
index 6ca0653f7a9..0df0c8997f0 100644
--- a/be/src/vec/common/hash_table/hash_map_context.h
+++ b/be/src/vec/common/hash_table/hash_map_context.h
@@ -30,6 +30,7 @@
 #include "vec/common/hash_table/partitioned_hash_map.h"
 #include "vec/common/hash_table/string_hash_map.h"
 #include "vec/common/string_ref.h"
+#include "vec/common/typeid_cast.h"
 #include "vec/core/types.h"
 #include "vec/utils/util.hpp"
 
@@ -284,29 +285,47 @@ struct MethodStringNoCache : public MethodBase<TData> {
     using State =
             ColumnsHashing::HashMethodString<typename Base::Value, typename 
Base::Mapped, true>;
 
-    std::vector<StringRef> stored_keys;
+    // Needs to be kept until the hash probe ends.
+    std::vector<StringRef> _build_stored_keys;
+    // Refreshed on every probe.
+    std::vector<StringRef> _stored_keys;
 
     size_t serialized_keys_size(bool is_build) const override {
-        return stored_keys.size() * sizeof(StringRef);
+        return is_build ? (_build_stored_keys.size() * sizeof(StringRef))
+                        : (_stored_keys.size() * sizeof(StringRef));
     }
 
-    void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t 
num_rows,
-                              const uint8_t* null_map = nullptr, bool is_join 
= false,
-                              bool is_build = false, uint32_t bucket_size = 0) 
override {
+    void init_serialized_keys_impl(const ColumnRawPtrs& key_columns, size_t 
num_rows,
+                                   std::vector<StringRef>& stored_keys) {
         const IColumn& column = *key_columns[0];
-        const auto& column_string = assert_cast<const ColumnString&>(
+        const auto& nested_column =
                 column.is_nullable()
                         ? assert_cast<const 
ColumnNullable&>(column).get_nested_column()
-                        : column);
-        const auto& offsets = column_string.get_offsets();
-        const auto* chars = column_string.get_chars().data();
-
-        stored_keys.resize(column_string.size());
-        for (size_t row = 0; row < column_string.size(); row++) {
-            stored_keys[row] = StringRef(chars + offsets[row - 1], 
offsets[row] - offsets[row - 1]);
+                        : column;
+        auto serialized_str = [](const auto& column_string, 
std::vector<StringRef>& stored_keys) {
+            const auto& offsets = column_string.get_offsets();
+            const auto* chars = column_string.get_chars().data();
+            stored_keys.resize(column_string.size());
+            for (size_t row = 0; row < column_string.size(); row++) {
+                stored_keys[row] =
+                        StringRef(chars + offsets[row - 1], offsets[row] - 
offsets[row - 1]);
+            }
+        };
+        if (nested_column.is_column_string64()) {
+            const auto& column_string = assert_cast<const 
ColumnString64&>(nested_column);
+            serialized_str(column_string, stored_keys);
+        } else {
+            const auto& column_string = assert_cast<const 
ColumnString&>(nested_column);
+            serialized_str(column_string, stored_keys);
         }
-
         Base::keys = stored_keys.data();
+    }
+
+    void init_serialized_keys(const ColumnRawPtrs& key_columns, size_t 
num_rows,
+                              const uint8_t* null_map = nullptr, bool is_join 
= false,
+                              bool is_build = false, uint32_t bucket_size = 0) 
override {
+        init_serialized_keys_impl(key_columns, num_rows,
+                                  is_build ? _build_stored_keys : 
_stored_keys);
         if (is_join) {
             Base::init_join_bucket_num(num_rows, bucket_size, null_map);
         } else {
@@ -606,5 +625,6 @@ using SetPrimaryTypeHashTableContext =
 
 using SetSerializedHashTableContext =
         MethodSerialized<HashMap<StringRef, pipeline::RowRefListWithFlags>>;
+using SetMethodOneString = MethodStringNoCache<HashMap<StringRef, 
pipeline::RowRefListWithFlags>>;
 
 } // namespace doris::vectorized
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to