[doris] 01/01: Revert "[improvement](function) optimize substr performance (#10169)"

morningman Thu, 23 Jun 2022 19:59:29 -0700

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch revert-10169-substr_opt
in repository https://gitbox.apache.org/repos/asf/doris.git


commit 1d45b1ff2bcc156dea1cacdde7f8acebebeab7d2
Author: Mingyu Chen <morningman....@gmail.com>
AuthorDate: Fri Jun 24 10:59:16 2022 +0800

    Revert "[improvement](function) optimize substr performance (#10169)"
    
    This reverts commit 2335d233f1f52eb64a380b4c9959becdf182b71b.
---
 be/src/exprs/like_predicate.h       |   2 +-
 be/src/runtime/string_search.hpp    |  29 +-
 be/src/vec/common/string_searcher.h | 860 ------------------------------------
 be/src/vec/common/volnitsky.h       | 713 ------------------------------
 be/src/vec/functions/like.cpp       |  39 +-
 be/src/vec/functions/like.h         |   2 +-
 6 files changed, 10 insertions(+), 1635 deletions(-)

diff --git a/be/src/exprs/like_predicate.h b/be/src/exprs/like_predicate.h
index 7f0822ef07..c530e32f6f 100644
--- a/be/src/exprs/like_predicate.h
+++ b/be/src/exprs/like_predicate.h
@@ -73,7 +73,7 @@ private:
         void set_search_string(const std::string& search_string_arg) {
             search_string = search_string_arg;
             search_string_sv = StringValue(search_string);
-            substring_pattern.set_pattern(&search_string_sv);
+            substring_pattern = StringSearch(&search_string_sv);
         }
     };
 
diff --git a/be/src/runtime/string_search.hpp b/be/src/runtime/string_search.hpp
index 463719f279..3f657b6fa4 100644
--- a/be/src/runtime/string_search.hpp
+++ b/be/src/runtime/string_search.hpp
@@ -23,7 +23,6 @@
 
 #include "common/logging.h"
 #include "runtime/string_value.h"
-#include "vec/common/volnitsky.h"
 
 namespace doris {
 
@@ -32,18 +31,18 @@ public:
     virtual ~StringSearch() {}
     StringSearch() : _pattern(nullptr) {}
 
-    StringSearch(const StringValue* pattern) { set_pattern(pattern); }
-
-    void set_pattern(const StringValue* pattern) {
-        _pattern = pattern;
-        _vol_searcher.reset(new Volnitsky(pattern->ptr, pattern->len));
-    }
+    StringSearch(const StringValue* pattern) : _pattern(pattern) {}
 
     // search for this pattern in str.
     //   Returns the offset into str if the pattern exists
     //   Returns -1 if the pattern is not found
     int search(const StringValue* str) const {
-        auto it = search(str->ptr, str->len);
+        if (!str || !_pattern || _pattern->len == 0) {
+            return -1;
+        }
+
+        auto it = std::search(str->ptr, str->ptr + str->len,
+                              std::default_searcher(_pattern->ptr, 
_pattern->ptr + _pattern->len));
         if (it == str->ptr + str->len) {
             return -1;
         } else {
@@ -51,22 +50,8 @@ public:
         }
     }
 
-    // search for this pattern in str.
-    //   Returns the offset into str if the pattern exists
-    //   Returns str+len if the pattern is not found
-    const char* search(char* str, size_t len) const {
-        if (!str || !_pattern || _pattern->len == 0) {
-            return str + len;
-        }
-
-        return _vol_searcher->search(str, len);
-    }
-
-    inline size_t get_pattern_length() { return _pattern ? _pattern->len : 0; }
-
 private:
     const StringValue* _pattern;
-    std::unique_ptr<Volnitsky> _vol_searcher;
 };
 
 } // namespace doris
diff --git a/be/src/vec/common/string_searcher.h 
b/be/src/vec/common/string_searcher.h
deleted file mode 100644
index a1cc996771..0000000000
--- a/be/src/vec/common/string_searcher.h
+++ /dev/null
@@ -1,860 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-// This file is copied from
-// 
https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/StringSearcher.h
-// and modified by Doris
-
-#pragma once
-
-#include <stdint.h>
-#include <string.h>
-
-#include <algorithm>
-#include <limits>
-#include <vector>
-
-#include "vec/common/string_utils/string_utils.h"
-
-#ifdef __SSE2__
-#include <emmintrin.h>
-#endif
-
-#ifdef __SSE4_1__
-#include <smmintrin.h>
-#endif
-
-namespace doris {
-
-// namespace ErrorCodes
-// {
-//     extern const int BAD_ARGUMENTS;
-// }
-
-/** Variants for searching a substring in a string.
-  * In most cases, performance is less than Volnitsky (see Volnitsky.h).
-  */
-
-class StringSearcherBase {
-public:
-    bool force_fallback = false;
-#ifdef __SSE2__
-protected:
-    static constexpr auto n = sizeof(__m128i);
-    const int page_size = sysconf(_SC_PAGESIZE); //::getPageSize();
-
-    bool page_safe(const void* const ptr) const {
-        return ((page_size - 1) & reinterpret_cast<std::uintptr_t>(ptr)) <= 
page_size - n;
-    }
-#endif
-};
-
-/// Performs case-sensitive and case-insensitive search of UTF-8 strings
-template <bool CaseSensitive, bool ASCII>
-class StringSearcher;
-
-// comment out since it's not used in doris and UTF8 dependency is not easy to 
meet
-// /// Case-insensitive UTF-8 searcher
-// template <>
-// class StringSearcher<false, false> : public StringSearcherBase
-// {
-// private:
-//     using UTF8SequenceBuffer = uint8_t[6];
-
-//     /// substring to be searched for
-//     const uint8_t * const needle;
-//     const size_t needle_size;
-//     const uint8_t * const needle_end = needle + needle_size;
-//     /// lower and uppercase variants of the first octet of the first 
character in `needle`
-//     bool first_needle_symbol_is_ascii{};
-//     uint8_t l{};
-//     uint8_t u{};
-
-// #ifdef __SSE4_1__
-//     /// vectors filled with `l` and `u`, for determining leftmost position 
of the first symbol
-//     __m128i patl;
-//     __m128i patu;
-//     /// lower and uppercase vectors of first 16 characters of `needle`
-//     __m128i cachel = _mm_setzero_si128();
-//     __m128i cacheu = _mm_setzero_si128();
-//     int cachemask{};
-//     size_t cache_valid_len{};
-//     size_t cache_actual_len{};
-// #endif
-
-// public:
-//     template <typename CharT>
-//     // requires (sizeof(CharT) == 1)
-//     StringSearcher(const CharT * needle_, const size_t needle_size_)
-//         : needle{reinterpret_cast<const uint8_t *>(needle_)}, 
needle_size{needle_size_}
-//     {
-//         if (0 == needle_size)
-//             return;
-
-//         UTF8SequenceBuffer l_seq;
-//         UTF8SequenceBuffer u_seq;
-
-//         if (*needle < 0x80u)
-//         {
-//             first_needle_symbol_is_ascii = true;
-//             l = std::tolower(*needle);
-//             u = std::toupper(*needle);
-//         }
-//         else
-//         {
-//             auto first_u32 = UTF8::convertUTF8ToCodePoint(needle, 
needle_size);
-
-//             /// Invalid UTF-8
-//             if (!first_u32)
-//             {
-//                 /// Process it verbatim as a sequence of bytes.
-//                 size_t src_len = UTF8::seqLength(*needle);
-
-//                 memcpy(l_seq, needle, src_len);
-//                 memcpy(u_seq, needle, src_len);
-//             }
-//             else
-//             {
-//                 uint32_t first_l_u32 = Poco::Unicode::toLower(*first_u32);
-//                 uint32_t first_u_u32 = Poco::Unicode::toUpper(*first_u32);
-
-//                 /// lower and uppercase variants of the first octet of the 
first character in `needle`
-//                 size_t length_l = UTF8::convertCodePointToUTF8(first_l_u32, 
l_seq, sizeof(l_seq));
-//                 size_t length_u = UTF8::convertCodePointToUTF8(first_u_u32, 
u_seq, sizeof(u_seq));
-
-//                 if (length_l != length_u)
-//                     force_fallback = true;
-//             }
-
-//             l = l_seq[0];
-//             u = u_seq[0];
-
-//             if (force_fallback)
-//                 return;
-//         }
-
-// #ifdef __SSE4_1__
-//         /// for detecting leftmost position of the first symbol
-//         patl = _mm_set1_epi8(l);
-//         patu = _mm_set1_epi8(u);
-//         /// lower and uppercase vectors of first 16 octets of `needle`
-
-//         const auto * needle_pos = needle;
-
-//         for (size_t i = 0; i < n;)
-//         {
-//             if (needle_pos == needle_end)
-//             {
-//                 cachel = _mm_srli_si128(cachel, 1);
-//                 cacheu = _mm_srli_si128(cacheu, 1);
-//                 ++i;
-
-//                 continue;
-//             }
-
-//             size_t src_len = std::min<size_t>(needle_end - needle_pos, 
UTF8::seqLength(*needle_pos));
-//             auto c_u32 = UTF8::convertUTF8ToCodePoint(needle_pos, src_len);
-
-//             if (c_u32)
-//             {
-//                 int c_l_u32 = Poco::Unicode::toLower(*c_u32);
-//                 int c_u_u32 = Poco::Unicode::toUpper(*c_u32);
-
-//                 size_t dst_l_len = UTF8::convertCodePointToUTF8(c_l_u32, 
l_seq, sizeof(l_seq));
-//                 size_t dst_u_len = UTF8::convertCodePointToUTF8(c_u_u32, 
u_seq, sizeof(u_seq));
-
-//                 /// @note Unicode standard states it is a rare but possible 
occasion
-//                 if (!(dst_l_len == dst_u_len && dst_u_len == src_len))
-//                 {
-//                     force_fallback = true;
-//                     return;
-//                 }
-//             }
-
-//             cache_actual_len += src_len;
-//             if (cache_actual_len < n)
-//                 cache_valid_len += src_len;
-
-//             for (size_t j = 0; j < src_len && i < n; ++j, ++i)
-//             {
-//                 cachel = _mm_srli_si128(cachel, 1);
-//                 cacheu = _mm_srli_si128(cacheu, 1);
-
-//                 if (needle_pos != needle_end)
-//                 {
-//                     cachel = _mm_insert_epi8(cachel, l_seq[j], n - 1);
-//                     cacheu = _mm_insert_epi8(cacheu, u_seq[j], n - 1);
-
-//                     cachemask |= 1 << i;
-//                     ++needle_pos;
-//                 }
-//             }
-//         }
-// #endif
-//     }
-
-//     template <typename CharT>
-//     // requires (sizeof(CharT) == 1)
-//     ALWAYS_INLINE bool compareTrivial(const CharT * haystack_pos, const 
CharT * const haystack_end, const uint8_t * needle_pos) const
-//     {
-//         while (haystack_pos < haystack_end && needle_pos < needle_end)
-//         {
-//             auto haystack_code_point = 
UTF8::convertUTF8ToCodePoint(haystack_pos, haystack_end - haystack_pos);
-//             auto needle_code_point = 
UTF8::convertUTF8ToCodePoint(needle_pos, needle_end - needle_pos);
-
-//             /// Invalid UTF-8, should not compare equals
-//             if (!haystack_code_point || !needle_code_point)
-//                 break;
-
-//             /// Not equals case insensitive.
-//             if (Poco::Unicode::toLower(*haystack_code_point) != 
Poco::Unicode::toLower(*needle_code_point))
-//                 break;
-
-//             auto len = UTF8::seqLength(*haystack_pos);
-//             haystack_pos += len;
-
-//             len = UTF8::seqLength(*needle_pos);
-//             needle_pos += len;
-//         }
-
-//         return needle_pos == needle_end;
-//     }
-
-//     template <typename CharT>
-//     // requires (sizeof(CharT) == 1)
-//     ALWAYS_INLINE bool compare(const CharT * /*haystack*/, const CharT * 
haystack_end, const CharT * pos) const
-//     {
-
-// #ifdef __SSE4_1__
-//         if (page_safe(pos) && !force_fallback)
-//         {
-//             const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const 
__m128i *>(pos));
-//             const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
-//             const auto v_against_u = _mm_cmpeq_epi8(v_haystack, cacheu);
-//             const auto v_against_l_or_u = _mm_or_si128(v_against_l, 
v_against_u);
-//             const auto mask = _mm_movemask_epi8(v_against_l_or_u);
-
-//             if (0xffff == cachemask)
-//             {
-//                 if (mask == cachemask)
-//                 {
-//                     if (compareTrivial(pos, haystack_end, needle))
-//                         return true;
-//                 }
-//             }
-//             else if ((mask & cachemask) == cachemask)
-//             {
-//                 if (compareTrivial(pos, haystack_end, needle))
-//                     return true;
-//             }
-
-//             return false;
-//         }
-// #endif
-
-//         if (*pos == l || *pos == u)
-//         {
-//             pos += first_needle_symbol_is_ascii;
-//             const auto * needle_pos = needle + first_needle_symbol_is_ascii;
-
-//             if (compareTrivial(pos, haystack_end, needle_pos))
-//                 return true;
-//         }
-
-//         return false;
-//     }
-
-//     /** Returns haystack_end if not found.
-//       */
-//     template <typename CharT>
-//     // requires (sizeof(CharT) == 1)
-//     const CharT * search(const CharT * haystack, const CharT * const 
haystack_end) const
-//     {
-//         if (0 == needle_size)
-//             return haystack;
-
-//         while (haystack < haystack_end)
-//         {
-// #ifdef __SSE4_1__
-//             if (haystack + n <= haystack_end && page_safe(haystack) && 
!force_fallback)
-//             {
-//                 const auto v_haystack = 
_mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
-//                 const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl);
-//                 const auto v_against_u = _mm_cmpeq_epi8(v_haystack, patu);
-//                 const auto v_against_l_or_u = _mm_or_si128(v_against_l, 
v_against_u);
-
-//                 const auto mask = _mm_movemask_epi8(v_against_l_or_u);
-
-//                 if (mask == 0)
-//                 {
-//                     haystack += n;
-//                     UTF8::syncForward(haystack, haystack_end);
-//                     continue;
-//                 }
-
-//                 const auto offset = __builtin_ctz(mask);
-//                 haystack += offset;
-
-//                 if (haystack + n <= haystack_end && page_safe(haystack))
-//                 {
-//                     const auto v_haystack_offset = 
_mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
-//                     const auto v_against_l_offset = 
_mm_cmpeq_epi8(v_haystack_offset, cachel);
-//                     const auto v_against_u_offset = 
_mm_cmpeq_epi8(v_haystack_offset, cacheu);
-//                     const auto v_against_l_or_u_offset = 
_mm_or_si128(v_against_l_offset, v_against_u_offset);
-//                     const auto mask_offset_both = 
_mm_movemask_epi8(v_against_l_or_u_offset);
-
-//                     if (0xffff == cachemask)
-//                     {
-//                         if (mask_offset_both == cachemask)
-//                         {
-//                             if (compareTrivial(haystack, haystack_end, 
needle))
-//                                 return haystack;
-//                         }
-//                     }
-//                     else if ((mask_offset_both & cachemask) == cachemask)
-//                     {
-//                         if (compareTrivial(haystack, haystack_end, needle))
-//                             return haystack;
-//                     }
-
-//                     /// first octet was ok, but not the first 16, move to 
start of next sequence and reapply
-//                     haystack += UTF8::seqLength(*haystack);
-//                     continue;
-//                 }
-//             }
-// #endif
-
-//             if (haystack == haystack_end)
-//                 return haystack_end;
-
-//             if (*haystack == l || *haystack == u)
-//             {
-//                 auto haystack_pos = haystack + first_needle_symbol_is_ascii;
-//                 const auto * needle_pos = needle + 
first_needle_symbol_is_ascii;
-
-//                 if (compareTrivial(haystack_pos, haystack_end, needle_pos))
-//                     return haystack;
-//             }
-
-//             /// advance to the start of the next sequence
-//             haystack += UTF8::seqLength(*haystack);
-//         }
-
-//         return haystack_end;
-//     }
-
-//     template <typename CharT>
-//     // requires (sizeof(CharT) == 1)
-//     const CharT * search(const CharT * haystack, const size_t 
haystack_size) const
-//     {
-//         return search(haystack, haystack + haystack_size);
-//     }
-// };
-
-// /// Case-insensitive ASCII searcher
-// template <>
-// class StringSearcher<false, true> : public StringSearcherBase
-// {
-// private:
-//     /// string to be searched for
-//     const uint8_t * const needle;
-//     const uint8_t * const needle_end;
-//     /// lower and uppercase variants of the first character in `needle`
-//     uint8_t l{};
-//     uint8_t u{};
-
-// #ifdef __SSE4_1__
-//     /// vectors filled with `l` and `u`, for determining leftmost position 
of the first symbol
-//     __m128i patl, patu;
-//     /// lower and uppercase vectors of first 16 characters of `needle`
-//     __m128i cachel = _mm_setzero_si128(), cacheu = _mm_setzero_si128();
-//     int cachemask{};
-// #endif
-
-// public:
-//     template <typename CharT>
-//     // requires (sizeof(CharT) == 1)
-//     StringSearcher(const CharT * needle_, const size_t needle_size)
-//         : needle{reinterpret_cast<const uint8_t *>(needle_)}, 
needle_end{needle + needle_size}
-//     {
-//         if (0 == needle_size)
-//             return;
-
-//         l = static_cast<uint8_t>(std::tolower(*needle));
-//         u = static_cast<uint8_t>(std::toupper(*needle));
-
-// #ifdef __SSE4_1__
-//         patl = _mm_set1_epi8(l);
-//         patu = _mm_set1_epi8(u);
-
-//         const auto * needle_pos = needle;
-
-//         for (const auto i : collections::range(0, n))
-//         {
-//             cachel = _mm_srli_si128(cachel, 1);
-//             cacheu = _mm_srli_si128(cacheu, 1);
-
-//             if (needle_pos != needle_end)
-//             {
-//                 cachel = _mm_insert_epi8(cachel, std::tolower(*needle_pos), 
n - 1);
-//                 cacheu = _mm_insert_epi8(cacheu, std::toupper(*needle_pos), 
n - 1);
-//                 cachemask |= 1 << i;
-//                 ++needle_pos;
-//             }
-//         }
-// #endif
-//     }
-
-//     template <typename CharT>
-//     // requires (sizeof(CharT) == 1)
-//     ALWAYS_INLINE bool compare(const CharT * /*haystack*/, const CharT * 
/*haystack_end*/, const CharT * pos) const
-//     {
-// #ifdef __SSE4_1__
-//         if (page_safe(pos))
-//         {
-//             const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const 
__m128i *>(pos));
-//             const auto v_against_l = _mm_cmpeq_epi8(v_haystack, cachel);
-//             const auto v_against_u = _mm_cmpeq_epi8(v_haystack, cacheu);
-//             const auto v_against_l_or_u = _mm_or_si128(v_against_l, 
v_against_u);
-//             const auto mask = _mm_movemask_epi8(v_against_l_or_u);
-
-//             if (0xffff == cachemask)
-//             {
-//                 if (mask == cachemask)
-//                 {
-//                     pos += n;
-//                     const auto * needle_pos = needle + n;
-
-//                     while (needle_pos < needle_end && std::tolower(*pos) == 
std::tolower(*needle_pos))
-//                     {
-//                         ++pos;
-//                         ++needle_pos;
-//                     }
-
-//                     if (needle_pos == needle_end)
-//                         return true;
-//                 }
-//             }
-//             else if ((mask & cachemask) == cachemask)
-//                 return true;
-
-//             return false;
-//         }
-// #endif
-
-//         if (*pos == l || *pos == u)
-//         {
-//             ++pos;
-//             const auto * needle_pos = needle + 1;
-
-//             while (needle_pos < needle_end && std::tolower(*pos) == 
std::tolower(*needle_pos))
-//             {
-//                 ++pos;
-//                 ++needle_pos;
-//             }
-
-//             if (needle_pos == needle_end)
-//                 return true;
-//         }
-
-//         return false;
-//     }
-
-//     template <typename CharT>
-//     // requires (sizeof(CharT) == 1)
-//     const CharT * search(const CharT * haystack, const CharT * const 
haystack_end) const
-//     {
-//         if (needle == needle_end)
-//             return haystack;
-
-//         while (haystack < haystack_end)
-//         {
-// #ifdef __SSE4_1__
-//             if (haystack + n <= haystack_end && page_safe(haystack))
-//             {
-//                 const auto v_haystack = 
_mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
-//                 const auto v_against_l = _mm_cmpeq_epi8(v_haystack, patl);
-//                 const auto v_against_u = _mm_cmpeq_epi8(v_haystack, patu);
-//                 const auto v_against_l_or_u = _mm_or_si128(v_against_l, 
v_against_u);
-
-//                 const auto mask = _mm_movemask_epi8(v_against_l_or_u);
-
-//                 if (mask == 0)
-//                 {
-//                     haystack += n;
-//                     continue;
-//                 }
-
-//                 const auto offset = __builtin_ctz(mask);
-//                 haystack += offset;
-
-//                 if (haystack + n <= haystack_end && page_safe(haystack))
-//                 {
-//                     const auto v_haystack_offset = 
_mm_loadu_si128(reinterpret_cast<const __m128i *>(haystack));
-//                     const auto v_against_l_offset = 
_mm_cmpeq_epi8(v_haystack_offset, cachel);
-//                     const auto v_against_u_offset = 
_mm_cmpeq_epi8(v_haystack_offset, cacheu);
-//                     const auto v_against_l_or_u_offset = 
_mm_or_si128(v_against_l_offset, v_against_u_offset);
-//                     const auto mask_offset = 
_mm_movemask_epi8(v_against_l_or_u_offset);
-
-//                     if (0xffff == cachemask)
-//                     {
-//                         if (mask_offset == cachemask)
-//                         {
-//                             const auto * haystack_pos = haystack + n;
-//                             const auto * needle_pos = needle + n;
-
-//                             while (haystack_pos < haystack_end && 
needle_pos < needle_end &&
-//                                    std::tolower(*haystack_pos) == 
std::tolower(*needle_pos))
-//                             {
-//                                 ++haystack_pos;
-//                                 ++needle_pos;
-//                             }
-
-//                             if (needle_pos == needle_end)
-//                                 return haystack;
-//                         }
-//                     }
-//                     else if ((mask_offset & cachemask) == cachemask)
-//                         return haystack;
-
-//                     ++haystack;
-//                     continue;
-//                 }
-//             }
-// #endif
-
-//             if (haystack == haystack_end)
-//                 return haystack_end;
-
-//             if (*haystack == l || *haystack == u)
-//             {
-//                 const auto * haystack_pos = haystack + 1;
-//                 const auto * needle_pos = needle + 1;
-
-//                 while (haystack_pos < haystack_end && needle_pos < 
needle_end &&
-//                        std::tolower(*haystack_pos) == 
std::tolower(*needle_pos))
-//                 {
-//                     ++haystack_pos;
-//                     ++needle_pos;
-//                 }
-
-//                 if (needle_pos == needle_end)
-//                     return haystack;
-//             }
-
-//             ++haystack;
-//         }
-
-//         return haystack_end;
-//     }
-
-//     template <typename CharT>
-//     // requires (sizeof(CharT) == 1)
-//     const CharT * search(const CharT * haystack, const size_t 
haystack_size) const
-//     {
-//         return search(haystack, haystack + haystack_size);
-//     }
-// };
-
-/// Case-sensitive searcher (both ASCII and UTF-8)
-template <bool ASCII>
-class StringSearcher<true, ASCII> : public StringSearcherBase {
-private:
-    /// string to be searched for
-    const uint8_t* const needle;
-    const uint8_t* const needle_end;
-    /// first character in `needle`
-    uint8_t first {};
-
-#ifdef __SSE4_1__
-    /// vector filled `first` for determining leftmost position of the first 
symbol
-    __m128i pattern;
-    /// vector of first 16 characters of `needle`
-    __m128i cache = _mm_setzero_si128();
-    int cachemask {};
-#endif
-
-public:
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    StringSearcher(const CharT* needle_, const size_t needle_size)
-            : needle {reinterpret_cast<const uint8_t*>(needle_)},
-              needle_end {needle + needle_size} {
-        if (0 == needle_size) return;
-
-        first = *needle;
-
-#ifdef __SSE4_1__
-        pattern = _mm_set1_epi8(first);
-
-        const auto* needle_pos = needle;
-
-        //for (const auto i : collections::range(0, n))
-        for (size_t i = 0; i < n; i++) {
-            cache = _mm_srli_si128(cache, 1);
-
-            if (needle_pos != needle_end) {
-                cache = _mm_insert_epi8(cache, *needle_pos, n - 1);
-                cachemask |= 1 << i;
-                ++needle_pos;
-            }
-        }
-#endif
-    }
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    ALWAYS_INLINE bool compare(const CharT* /*haystack*/, const CharT* 
/*haystack_end*/,
-                               const CharT* pos) const {
-#ifdef __SSE4_1__
-        if (page_safe(pos)) {
-            const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const 
__m128i*>(pos));
-            const auto v_against_cache = _mm_cmpeq_epi8(v_haystack, cache);
-            const auto mask = _mm_movemask_epi8(v_against_cache);
-
-            if (0xffff == cachemask) {
-                if (mask == cachemask) {
-                    pos += n;
-                    const auto* needle_pos = needle + n;
-
-                    while (needle_pos < needle_end && *pos == *needle_pos) 
++pos, ++needle_pos;
-
-                    if (needle_pos == needle_end) return true;
-                }
-            } else if ((mask & cachemask) == cachemask)
-                return true;
-
-            return false;
-        }
-#endif
-
-        if (*pos == first) {
-            ++pos;
-            const auto* needle_pos = needle + 1;
-
-            while (needle_pos < needle_end && *pos == *needle_pos) ++pos, 
++needle_pos;
-
-            if (needle_pos == needle_end) return true;
-        }
-
-        return false;
-    }
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    const CharT* search(const CharT* haystack, const CharT* const 
haystack_end) const {
-        if (needle == needle_end) return haystack;
-
-        while (haystack < haystack_end) {
-#ifdef __SSE4_1__
-            if (haystack + n <= haystack_end && page_safe(haystack)) {
-                /// find first character
-                const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const 
__m128i*>(haystack));
-                const auto v_against_pattern = _mm_cmpeq_epi8(v_haystack, 
pattern);
-
-                const auto mask = _mm_movemask_epi8(v_against_pattern);
-
-                /// first character not present in 16 octets starting at 
`haystack`
-                if (mask == 0) {
-                    haystack += n;
-                    continue;
-                }
-
-                const auto offset = __builtin_ctz(mask);
-                haystack += offset;
-
-                if (haystack + n <= haystack_end && page_safe(haystack)) {
-                    /// check for first 16 octets
-                    const auto v_haystack_offset =
-                            _mm_loadu_si128(reinterpret_cast<const 
__m128i*>(haystack));
-                    const auto v_against_cache = 
_mm_cmpeq_epi8(v_haystack_offset, cache);
-                    const auto mask_offset = 
_mm_movemask_epi8(v_against_cache);
-
-                    if (0xffff == cachemask) {
-                        if (mask_offset == cachemask) {
-                            const auto* haystack_pos = haystack + n;
-                            const auto* needle_pos = needle + n;
-
-                            while (haystack_pos < haystack_end && needle_pos < 
needle_end &&
-                                   *haystack_pos == *needle_pos)
-                                ++haystack_pos, ++needle_pos;
-
-                            if (needle_pos == needle_end) return haystack;
-                        }
-                    } else if ((mask_offset & cachemask) == cachemask)
-                        return haystack;
-
-                    ++haystack;
-                    continue;
-                }
-            }
-#endif
-
-            if (haystack == haystack_end) return haystack_end;
-
-            if (*haystack == first) {
-                const auto* haystack_pos = haystack + 1;
-                const auto* needle_pos = needle + 1;
-
-                while (haystack_pos < haystack_end && needle_pos < needle_end 
&&
-                       *haystack_pos == *needle_pos)
-                    ++haystack_pos, ++needle_pos;
-
-                if (needle_pos == needle_end) return haystack;
-            }
-
-            ++haystack;
-        }
-
-        return haystack_end;
-    }
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    const CharT* search(const CharT* haystack, const size_t haystack_size) 
const {
-        return search(haystack, haystack + haystack_size);
-    }
-};
-
-// Searches for needle surrounded by token-separators.
-// Separators are anything inside ASCII (0-128) and not alphanum.
-// Any value outside of basic ASCII (>=128) is considered a non-separator 
symbol, hence UTF-8 strings
-// should work just fine. But any Unicode whitespace is not considered a token 
separtor.
-template <typename StringSearcher>
-class TokenSearcher : public StringSearcherBase {
-    StringSearcher searcher;
-    size_t needle_size;
-
-public:
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    TokenSearcher(const CharT* needle_, const size_t needle_size_)
-            : searcher {needle_, needle_size_}, needle_size(needle_size_) {
-        if (std::any_of(needle_, needle_ + needle_size_, isTokenSeparator)) {
-            //throw Exception{"Needle must not contain whitespace or separator 
characters", ErrorCodes::BAD_ARGUMENTS};
-        }
-    }
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    ALWAYS_INLINE bool compare(const CharT* haystack, const CharT* 
haystack_end,
-                               const CharT* pos) const {
-        // use searcher only if pos is in the beginning of token and pos + 
searcher.needle_size is end of token.
-        if (isToken(haystack, haystack_end, pos))
-            return searcher.compare(haystack, haystack_end, pos);
-
-        return false;
-    }
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    const CharT* search(const CharT* haystack, const CharT* const 
haystack_end) const {
-        // use searcher.search(), then verify that returned value is a token
-        // if it is not, skip it and re-run
-
-        const auto* pos = haystack;
-        while (pos < haystack_end) {
-            pos = searcher.search(pos, haystack_end);
-            if (pos == haystack_end || isToken(haystack, haystack_end, pos)) 
return pos;
-
-            // assuming that heendle does not contain any token separators.
-            pos += needle_size;
-        }
-        return haystack_end;
-    }
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    const CharT* search(const CharT* haystack, const size_t haystack_size) 
const {
-        return search(haystack, haystack + haystack_size);
-    }
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    ALWAYS_INLINE bool isToken(const CharT* haystack, const CharT* const 
haystack_end,
-                               const CharT* p) const {
-        return (p == haystack || isTokenSeparator(*(p - 1))) &&
-               (p + needle_size >= haystack_end || isTokenSeparator(*(p + 
needle_size)));
-    }
-
-    ALWAYS_INLINE static bool isTokenSeparator(const uint8_t c) {
-        return !(is_alpha_numeric_ascii(c) || !is_ascii(c));
-    }
-};
-
-using ASCIICaseSensitiveStringSearcher = StringSearcher<true, true>;
-// using ASCIICaseInsensitiveStringSearcher = StringSearcher<false, true>;
-using UTF8CaseSensitiveStringSearcher = StringSearcher<true, false>;
-// using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>;
-using ASCIICaseSensitiveTokenSearcher = 
TokenSearcher<ASCIICaseSensitiveStringSearcher>;
-// using ASCIICaseInsensitiveTokenSearcher = 
TokenSearcher<ASCIICaseInsensitiveStringSearcher>;
-
-/** Uses functions from libc.
-  * It makes sense to use only with short haystacks when cheap initialization 
is required.
-  * There is no option for case-insensitive search for UTF-8 strings.
-  * It is required that strings are zero-terminated.
-  */
-
-struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase {
-    const char* const needle;
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    LibCASCIICaseSensitiveStringSearcher(const CharT* const needle_, const 
size_t /* needle_size */)
-            : needle(reinterpret_cast<const char*>(needle_)) {}
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    const CharT* search(const CharT* haystack, const CharT* const 
haystack_end) const {
-        const auto* res = strstr(reinterpret_cast<const char*>(haystack),
-                                 reinterpret_cast<const char*>(needle));
-        if (!res) return haystack_end;
-        return reinterpret_cast<const CharT*>(res);
-    }
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    const CharT* search(const CharT* haystack, const size_t haystack_size) 
const {
-        return search(haystack, haystack + haystack_size);
-    }
-};
-
-struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase {
-    const char* const needle;
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    LibCASCIICaseInsensitiveStringSearcher(const CharT* const needle_,
-                                           const size_t /* needle_size */)
-            : needle(reinterpret_cast<const char*>(needle_)) {}
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    const CharT* search(const CharT* haystack, const CharT* const 
haystack_end) const {
-        const auto* res = strcasestr(reinterpret_cast<const char*>(haystack),
-                                     reinterpret_cast<const char*>(needle));
-        if (!res) return haystack_end;
-        return reinterpret_cast<const CharT*>(res);
-    }
-
-    template <typename CharT>
-    // requires (sizeof(CharT) == 1)
-    const CharT* search(const CharT* haystack, const size_t haystack_size) 
const {
-        return search(haystack, haystack + haystack_size);
-    }
-};
-
-} // namespace doris
diff --git a/be/src/vec/common/volnitsky.h b/be/src/vec/common/volnitsky.h
deleted file mode 100644
index 97883e75df..0000000000
--- a/be/src/vec/common/volnitsky.h
+++ /dev/null
@@ -1,713 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-// This file is copied from
-// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/Volnitsky.h
-// and modified by Doris
-
-#pragma once
-
-#include <stdint.h>
-#include <string.h>
-
-#include <algorithm>
-#include <limits>
-#include <vector>
-
-#include "vec/common/string_searcher.h"
-#include "vec/common/unaligned.h"
-
-/** Search for a substring in a string by Volnitsky's algorithm
-  * http://volnitsky.com/project/str_search/
-  *
-  * `haystack` and `needle` can contain zero bytes.
-  *
-  * Algorithm:
-  * - if the `needle` is too small or too large, or too small `haystack`, use 
std::search or memchr;
-  * - when initializing, fill in an open-addressing linear probing hash table 
of the form
-  *    hash from the bigram of needle -> the position of this bigram in needle 
+ 1.
-  *    (one is added only to distinguish zero offset from an empty cell)
-  * - the keys are not stored in the hash table, only the values are stored;
-  * - bigrams can be inserted several times if they occur in the needle 
several times;
-  * - when searching, take from haystack bigram, which should correspond to 
the last bigram of needle (comparing from the end);
-  * - look for it in the hash table, if found - get the offset from the hash 
table and compare the string bytewise;
-  * - if it did not match, we check the next cell of the hash table from the 
collision resolution chain;
-  * - if not found, skip to haystack almost the size of the needle bytes;
-  *
-  * MultiVolnitsky - search for multiple substrings in a string:
-  * - Add bigrams to hash table with string index. Then the usual Volnitsky 
search is used.
-  * - We are adding while searching, limiting the number of fallback searchers 
and the total number of added bigrams
-  */
-
-namespace doris {
-using UInt8 = uint8_t;
-using UInt16 = uint16_t;
-using UInt64 = uint64_t;
-
-namespace VolnitskyTraits {
-using Offset =
-        UInt8; /// Offset in the needle. For the basic algorithm, the length 
of the needle must not be greater than 255.
-using Id =
-        UInt8; /// Index of the string (within the array of multiple needles), 
must not be greater than 255.
-using Ngram = UInt16; /// n-gram (2 bytes).
-
-/** Fits into the L2 cache (of common Intel CPUs).
-     * This number is extremely good for compilers as it is 
numeric_limits<Uint16>::max() and there are optimizations with movzwl and other 
instructions with 2 bytes
-     */
-static constexpr size_t hash_size = 64 * 1024;
-
-/// min haystack size to use main algorithm instead of fallback
-static constexpr size_t min_haystack_size_for_algorithm = 20000;
-
-static inline bool isFallbackNeedle(const size_t needle_size, size_t 
haystack_size_hint = 0) {
-    return needle_size < 2 * sizeof(Ngram) || needle_size >= 
std::numeric_limits<Offset>::max() ||
-           (haystack_size_hint && haystack_size_hint < 
min_haystack_size_for_algorithm);
-}
-
-static inline Ngram toNGram(const UInt8* const pos) {
-    return unaligned_load<Ngram>(pos);
-}
-
-template <typename Callback>
-static inline bool putNGramASCIICaseInsensitive(const UInt8* pos, int offset,
-                                                Callback&& putNGramBase) {
-    struct Chars {
-        UInt8 c0;
-        UInt8 c1;
-    };
-
-    union {
-        Ngram n;
-        Chars chars;
-    };
-
-    n = toNGram(pos);
-
-    const auto c0_al = isAlphaASCII(chars.c0);
-    const auto c1_al = isAlphaASCII(chars.c1);
-
-    if (c0_al && c1_al) {
-        /// 4 combinations: AB, aB, Ab, ab
-        putNGramBase(n, offset);
-        chars.c0 = alternateCaseIfAlphaASCII(chars.c0);
-        putNGramBase(n, offset);
-        chars.c1 = alternateCaseIfAlphaASCII(chars.c1);
-        putNGramBase(n, offset);
-        chars.c0 = alternateCaseIfAlphaASCII(chars.c0);
-        putNGramBase(n, offset);
-    } else if (c0_al) {
-        /// 2 combinations: A1, a1
-        putNGramBase(n, offset);
-        chars.c0 = alternateCaseIfAlphaASCII(chars.c0);
-        putNGramBase(n, offset);
-    } else if (c1_al) {
-        /// 2 combinations: 0B, 0b
-        putNGramBase(n, offset);
-        chars.c1 = alternateCaseIfAlphaASCII(chars.c1);
-        putNGramBase(n, offset);
-    } else
-        /// 1 combination: 01
-        putNGramBase(n, offset);
-
-    return true;
-}
-
-// comment out since it's not used in doris and UTF8 dependency is not easy to 
meet
-// template <typename Callback>
-// static inline bool putNGramUTF8CaseInsensitive(
-//     const UInt8 * pos, int offset, const UInt8 * begin, size_t size, 
Callback && putNGramBase)
-// {
-//     const UInt8 * end = begin + size;
-
-//     struct Chars
-//     {
-//         UInt8 c0;
-//         UInt8 c1;
-//     };
-
-//     union
-//     {
-//         VolnitskyTraits::Ngram n;
-//         Chars chars;
-//     };
-
-//     n = toNGram(pos);
-
-//     if (isascii(chars.c0) && isascii(chars.c1))
-//     {
-//         return putNGramASCIICaseInsensitive(pos, offset, putNGramBase);
-//     }
-//     else
-//     {
-//         /** n-gram (in the case of n = 2)
-//             *  can be entirely located within one code point,
-//             *  or intersect with two code points.
-//             *
-//             * In the first case, you need to consider up to two 
alternatives - this code point in upper and lower case,
-//             *  and in the second case - up to four alternatives - fragments 
of two code points in all combinations of cases.
-//             *
-//             * It does not take into account the dependence of the 
case-transformation from the locale (for example - Turkish `Ii`)
-//             *  as well as composition / decomposition and other features.
-//             *
-//             * It also does not work if characters with lower and upper 
cases are represented by different number of bytes or code points.
-//             */
-
-//         using Seq = UInt8[6];
-
-//         if (UTF8::isContinuationOctet(chars.c1))
-//         {
-//             /// ngram is inside a sequence
-//             const auto * seq_pos = pos;
-//             UTF8::syncBackward(seq_pos, begin);
-
-//             auto u32 = UTF8::convertUTF8ToCodePoint(seq_pos, end - seq_pos);
-//             /// Invalid UTF-8
-//             if (!u32)
-//             {
-//                 putNGramBase(n, offset);
-//             }
-//             else
-//             {
-//                 int l_u32 = Poco::Unicode::toLower(*u32);
-//                 int u_u32 = Poco::Unicode::toUpper(*u32);
-
-//                 /// symbol is case-independent
-//                 if (l_u32 == u_u32)
-//                 {
-//                     putNGramBase(n, offset);
-//                 }
-//                 else
-//                 {
-//                     /// where is the given ngram in respect to the start of 
UTF-8 sequence?
-//                     size_t seq_ngram_offset = pos - seq_pos;
-
-//                     Seq seq_l;
-//                     size_t length_l = UTF8::convertCodePointToUTF8(l_u32, 
seq_l, sizeof(seq_l));
-
-//                     Seq seq_r;
-//                     size_t length_r = UTF8::convertCodePointToUTF8(u_u32, 
seq_r, sizeof(seq_r));
-
-//                     if (length_l != length_r)
-//                         return false;
-
-//                     assert(length_l >= 2 && length_r >= 2);
-
-//                     chars.c0 = seq_l[seq_ngram_offset];
-//                     chars.c1 = seq_l[seq_ngram_offset + 1];
-//                     putNGramBase(n, offset);
-
-//                     chars.c0 = seq_r[seq_ngram_offset]; //-V519
-//                     chars.c1 = seq_r[seq_ngram_offset + 1]; //-V519
-//                     putNGramBase(n, offset);
-
-//                 }
-//             }
-//         }
-//         else
-//         {
-//             /// ngram is on the boundary of two sequences
-//             /// first sequence may start before u_pos if it is not ASCII
-//             const auto * first_seq_pos = pos;
-//             UTF8::syncBackward(first_seq_pos, begin);
-//             /// where is the given ngram in respect to the start of first 
UTF-8 sequence?
-//             size_t seq_ngram_offset = pos - first_seq_pos;
-
-//             auto first_u32 = UTF8::convertUTF8ToCodePoint(first_seq_pos, 
end - first_seq_pos);
-//             int first_l_u32 = 0;
-//             int first_u_u32 = 0;
-
-//             if (first_u32)
-//             {
-//                 first_l_u32 = Poco::Unicode::toLower(*first_u32);
-//                 first_u_u32 = Poco::Unicode::toUpper(*first_u32);
-//             }
-
-//             /// second sequence always start immediately after u_pos
-//             const auto * second_seq_pos = pos + 1;
-
-//             auto second_u32 = UTF8::convertUTF8ToCodePoint(second_seq_pos, 
end - second_seq_pos);
-//             int second_l_u32 = 0;
-//             int second_u_u32 = 0;
-
-//             if (second_u32)
-//             {
-//                 second_l_u32 = Poco::Unicode::toLower(*second_u32);
-//                 second_u_u32 = Poco::Unicode::toUpper(*second_u32);
-//             }
-
-//             /// both symbols are case-independent
-//             if (first_l_u32 == first_u_u32 && second_l_u32 == second_u_u32)
-//             {
-//                 putNGramBase(n, offset);
-//             }
-//             else if (first_l_u32 == first_u_u32)
-//             {
-//                 /// first symbol is case-independent
-//                 Seq seq_l;
-//                 size_t size_l = UTF8::convertCodePointToUTF8(second_l_u32, 
seq_l, sizeof(seq_l));
-
-//                 Seq seq_u;
-//                 size_t size_u = UTF8::convertCodePointToUTF8(second_u_u32, 
seq_u, sizeof(seq_u));
-
-//                 if (size_l != size_u)
-//                     return false;
-
-//                 assert(size_l >= 1 && size_u >= 1);
-//                 chars.c1 = seq_l[0];
-//                 putNGramBase(n, offset);
-
-//                 /// put ngram from uppercase, if it is different
-//                 if (chars.c1 != seq_u[0])
-//                 {
-//                     chars.c1 = seq_u[0];
-//                     putNGramBase(n, offset);
-//                 }
-//             }
-//             else if (second_l_u32 == second_u_u32)
-//             {
-//                 /// second symbol is case-independent
-
-//                 Seq seq_l;
-//                 size_t size_l = UTF8::convertCodePointToUTF8(first_l_u32, 
seq_l, sizeof(seq_l));
-//                 Seq seq_u;
-//                 size_t size_u = UTF8::convertCodePointToUTF8(first_u_u32, 
seq_u, sizeof(seq_u));
-
-//                 if (size_l != size_u)
-//                     return false;
-
-//                 assert(size_l > seq_ngram_offset && size_u > 
seq_ngram_offset);
-
-//                 chars.c0 = seq_l[seq_ngram_offset];
-//                 putNGramBase(n, offset);
-
-//                 /// put ngram for uppercase, if it is different
-//                 if (chars.c0 != seq_u[seq_ngram_offset])
-//                 {
-//                     chars.c0 = seq_u[seq_ngram_offset];
-//                     putNGramBase(n, offset);
-//                 }
-//             }
-//             else
-//             {
-//                 Seq first_l_seq;
-//                 Seq first_u_seq;
-//                 Seq second_l_seq;
-//                 Seq second_u_seq;
-
-//                 size_t size_first_l = 
UTF8::convertCodePointToUTF8(first_l_u32, first_l_seq, sizeof(first_l_seq));
-//                 size_t size_first_u = 
UTF8::convertCodePointToUTF8(first_u_u32, first_u_seq, sizeof(first_u_seq));
-//                 size_t size_second_l = 
UTF8::convertCodePointToUTF8(second_l_u32, second_l_seq, sizeof(second_l_seq));
-//                 size_t size_second_u = 
UTF8::convertCodePointToUTF8(second_u_u32, second_u_seq, sizeof(second_u_seq));
-//                 if (size_first_l != size_first_u || size_second_l != 
size_second_u)
-//                     return false;
-
-//                 assert(size_first_l > seq_ngram_offset);
-//                 assert(size_first_u > seq_ngram_offset);
-//                 assert(size_second_l > 0);
-//                 assert(size_second_u > 0);
-
-//                 auto c0l = first_l_seq[seq_ngram_offset];
-//                 auto c0u = first_u_seq[seq_ngram_offset];
-//                 auto c1l = second_l_seq[0];
-//                 auto c1u = second_u_seq[0];
-
-//                 /// ngram for ll
-//                 chars.c0 = c0l;
-//                 chars.c1 = c1l;
-//                 putNGramBase(n, offset);
-
-//                 if (c0l != c0u)
-//                 {
-//                     /// ngram for Ul
-//                     chars.c0 = c0u;
-//                     chars.c1 = c1l; //-V1048
-//                     putNGramBase(n, offset);
-//                 }
-
-//                 if (c1l != c1u)
-//                 {
-//                     /// ngram for lU
-//                     chars.c0 = c0l;
-//                     chars.c1 = c1u;
-//                     putNGramBase(n, offset);
-//                 }
-
-//                 if (c0l != c0u && c1l != c1u)
-//                 {
-//                     /// ngram for UU
-//                     chars.c0 = c0u;
-//                     chars.c1 = c1u;
-//                     putNGramBase(n, offset);
-//                 }
-//             }
-//         }
-//     }
-//     return true;
-// }
-
-template <bool CaseSensitive, bool ASCII, typename Callback>
-static inline bool putNGram(const UInt8* pos, int offset, [[maybe_unused]] 
const UInt8* begin,
-                            size_t size, Callback&& putNGramBase) {
-    if constexpr (CaseSensitive) {
-        putNGramBase(toNGram(pos), offset);
-        return true;
-    } else if constexpr (ASCII) {
-        return putNGramASCIICaseInsensitive(pos, offset, 
std::forward<Callback>(putNGramBase));
-    } else {
-        // return putNGramUTF8CaseInsensitive(pos, offset, begin, size, 
std::forward<Callback>(putNGramBase));
-        return false;
-    }
-}
-} // namespace VolnitskyTraits
-
-/// @todo store lowercase needle to speed up in case there are numerous 
occurrences of bigrams from needle in haystack
-template <bool CaseSensitive, bool ASCII, typename FallbackSearcher>
-class VolnitskyBase {
-protected:
-    const UInt8* needle;
-    size_t needle_size;
-    const UInt8* needle_end = needle + needle_size;
-    /// For how long we move, if the n-gram from haystack is not found in the 
hash table.
-    size_t step = needle_size - sizeof(VolnitskyTraits::Ngram) + 1;
-
-    /** max needle length is 255, max distinct ngrams for case-sensitive is 
(255 - 1), case-insensitive is 4 * (255 - 1)
-      *  storage of 64K ngrams (n = 2, 128 KB) should be large enough for both 
cases */
-    std::unique_ptr<VolnitskyTraits::Offset[]> hash; /// Hash table.
-
-    bool fallback; /// Do we need to use the fallback algorithm.
-
-    FallbackSearcher fallback_searcher;
-
-public:
-    using Searcher = FallbackSearcher;
-
-    /** haystack_size_hint - the expected total size of the haystack for 
`search` calls. Optional (zero means unspecified).
-      * If you specify it small enough, the fallback algorithm will be used,
-      *  since it is considered that it's useless to waste time initializing 
the hash table.
-      */
-    VolnitskyBase(const char* const needle_, const size_t needle_size_,
-                  size_t haystack_size_hint = 0)
-            : needle {reinterpret_cast<const UInt8*>(needle_)},
-              needle_size {needle_size_},
-              fallback {VolnitskyTraits::isFallbackNeedle(needle_size, 
haystack_size_hint)},
-              fallback_searcher {needle_, needle_size} {
-        if (fallback || fallback_searcher.force_fallback) return;
-
-        hash = std::unique_ptr<VolnitskyTraits::Offset[]>(
-                new VolnitskyTraits::Offset[VolnitskyTraits::hash_size] {});
-
-        auto callback = [this](const VolnitskyTraits::Ngram ngram, const int 
offset) {
-            return this->putNGramBase(ngram, offset);
-        };
-        /// ssize_t is used here because unsigned can't be used with condition 
like `i >= 0`, unsigned always >= 0
-        /// And also adding from the end guarantees that we will find first 
occurrence because we will lookup bigger offsets first.
-        for (auto i = static_cast<ssize_t>(needle_size - 
sizeof(VolnitskyTraits::Ngram)); i >= 0;
-             --i) {
-            bool ok = VolnitskyTraits::putNGram<CaseSensitive, ASCII>(needle + 
i, i + 1, needle,
-                                                                      
needle_size, callback);
-
-            /** `putNGramUTF8CaseInsensitive` does not work if characters with 
lower and upper cases
-              * are represented by different number of bytes or code points.
-              * So, use fallback if error occurred.
-              */
-            if (!ok) {
-                fallback_searcher.force_fallback = true;
-                hash = nullptr;
-                return;
-            }
-        }
-    }
-
-    /// If not found, the end of the haystack is returned.
-    const UInt8* search(const UInt8* const haystack, const size_t 
haystack_size) const {
-        if (needle_size == 0) return haystack;
-
-        const auto* haystack_end = haystack + haystack_size;
-
-        if (fallback || haystack_size <= needle_size || 
fallback_searcher.force_fallback)
-            return fallback_searcher.search(haystack, haystack_end);
-
-        /// Let's "apply" the needle to the haystack and compare the n-gram 
from the end of the needle.
-        const auto* pos = haystack + needle_size - 
sizeof(VolnitskyTraits::Ngram);
-        for (; pos <= haystack_end - needle_size; pos += step) {
-            /// We look at all the cells of the hash table that can correspond 
to the n-gram from haystack.
-            for (size_t cell_num = VolnitskyTraits::toNGram(pos) % 
VolnitskyTraits::hash_size;
-                 hash[cell_num]; cell_num = (cell_num + 1) % 
VolnitskyTraits::hash_size) {
-                /// When found - compare bytewise, using the offset from the 
hash table.
-                const auto* res = pos - (hash[cell_num] - 1);
-
-                /// pointer in the code is always padded array so we can use 
pagesafe semantics
-                if (fallback_searcher.compare(haystack, haystack_end, res)) 
return res;
-            }
-        }
-
-        return fallback_searcher.search(pos - step + 1, haystack_end);
-    }
-
-    const char* search(const char* haystack, size_t haystack_size) const {
-        return reinterpret_cast<const char*>(
-                search(reinterpret_cast<const UInt8*>(haystack), 
haystack_size));
-    }
-
-protected:
-    void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset) {
-        /// Put the offset for the n-gram in the corresponding cell or the 
nearest free cell.
-        size_t cell_num = ngram % VolnitskyTraits::hash_size;
-
-        while (hash[cell_num])
-            cell_num =
-                    (cell_num + 1) % VolnitskyTraits::hash_size; /// Search 
for the next free cell.
-
-        hash[cell_num] = offset;
-    }
-};
-
-template <bool CaseSensitive, bool ASCII, typename FallbackSearcher>
-class MultiVolnitskyBase {
-private:
-    /// needles and their offsets
-    const std::vector<StringRef>& needles;
-
-    /// fallback searchers
-    std::vector<size_t> fallback_needles;
-    std::vector<FallbackSearcher> fallback_searchers;
-
-    /// because std::pair<> is not POD
-    struct OffsetId {
-        VolnitskyTraits::Id id;
-        VolnitskyTraits::Offset off;
-    };
-
-    std::unique_ptr<OffsetId[]> hash;
-
-    /// step for each bunch of strings
-    size_t step;
-
-    /// last index of offsets that was not processed
-    size_t last;
-
-    /// limit for adding to hashtable. In worst case with case insentive 
search, the table will be filled at most as half
-    static constexpr size_t small_limit = VolnitskyTraits::hash_size / 8;
-
-public:
-    explicit MultiVolnitskyBase(const std::vector<StringRef>& needles_)
-            : needles {needles_}, step {0}, last {0} {
-        fallback_searchers.reserve(needles.size());
-        hash = std::unique_ptr<OffsetId[]>(
-                new OffsetId[VolnitskyTraits::
-                                     hash_size]); /// No zero initialization, 
it will be done later.
-    }
-
-    /**
-     * This function is needed to initialize hash table
-     * Returns `true` if there is nothing to initialize
-     * and `false` if we have something to initialize and initializes it.
-     * This function is a kind of fallback if there are many needles.
-     * We actually destroy the hash table and initialize it with uninitialized 
needles
-     * and search through the haystack again.
-     * The actual usage of this function is like this:
-     * while (hasMoreToSearch())
-     * {
-     *     search inside the haystack with the known needles
-     * }
-     */
-    bool hasMoreToSearch() {
-        if (last == needles.size()) return false;
-
-        memset(hash.get(), 0, VolnitskyTraits::hash_size * sizeof(OffsetId));
-        fallback_needles.clear();
-        step = std::numeric_limits<size_t>::max();
-
-        size_t buf = 0;
-        size_t size = needles.size();
-
-        for (; last < size; ++last) {
-            const char* cur_needle_data = needles[last].data;
-            const size_t cur_needle_size = needles[last].size;
-
-            /// save the indices of fallback searchers
-            if (VolnitskyTraits::isFallbackNeedle(cur_needle_size)) {
-                fallback_needles.push_back(last);
-            } else {
-                /// put all bigrams
-                auto callback = [this](const VolnitskyTraits::Ngram ngram, 
const int offset) {
-                    return this->putNGramBase(ngram, offset, this->last);
-                };
-
-                buf += cur_needle_size - sizeof(VolnitskyTraits::Ngram) + 1;
-
-                /// this is the condition when we actually need to stop and 
start searching with known needles
-                if (buf > small_limit) break;
-
-                step = std::min(step, cur_needle_size - 
sizeof(VolnitskyTraits::Ngram) + 1);
-                for (auto i = static_cast<int>(cur_needle_size - 
sizeof(VolnitskyTraits::Ngram));
-                     i >= 0; --i) {
-                    VolnitskyTraits::putNGram<CaseSensitive, ASCII>(
-                            reinterpret_cast<const UInt8*>(cur_needle_data) + 
i, i + 1,
-                            reinterpret_cast<const UInt8*>(cur_needle_data), 
cur_needle_size,
-                            callback);
-                }
-            }
-            fallback_searchers.emplace_back(cur_needle_data, cur_needle_size);
-        }
-        return true;
-    }
-
-    inline bool searchOne(const UInt8* haystack, const UInt8* haystack_end) 
const {
-        const size_t fallback_size = fallback_needles.size();
-        for (size_t i = 0; i < fallback_size; ++i)
-            if (fallback_searchers[fallback_needles[i]].search(haystack, 
haystack_end) !=
-                haystack_end)
-                return true;
-
-        /// check if we have one non empty volnitsky searcher
-        if (step != std::numeric_limits<size_t>::max()) {
-            const auto* pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
-            for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos 
+= step) {
-                for (size_t cell_num = VolnitskyTraits::toNGram(pos) % 
VolnitskyTraits::hash_size;
-                     hash[cell_num].off; cell_num = (cell_num + 1) % 
VolnitskyTraits::hash_size) {
-                    if (pos >= haystack + hash[cell_num].off - 1) {
-                        const auto res = pos - (hash[cell_num].off - 1);
-                        const size_t ind = hash[cell_num].id;
-                        if (res + needles[ind].size <= haystack_end &&
-                            fallback_searchers[ind].compare(haystack, 
haystack_end, res))
-                            return true;
-                    }
-                }
-            }
-        }
-        return false;
-    }
-
-    inline size_t searchOneFirstIndex(const UInt8* haystack, const UInt8* 
haystack_end) const {
-        const size_t fallback_size = fallback_needles.size();
-
-        size_t answer = std::numeric_limits<size_t>::max();
-
-        for (size_t i = 0; i < fallback_size; ++i)
-            if (fallback_searchers[fallback_needles[i]].search(haystack, 
haystack_end) !=
-                haystack_end)
-                answer = std::min(answer, fallback_needles[i]);
-
-        /// check if we have one non empty volnitsky searcher
-        if (step != std::numeric_limits<size_t>::max()) {
-            const auto* pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
-            for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos 
+= step) {
-                for (size_t cell_num = VolnitskyTraits::toNGram(pos) % 
VolnitskyTraits::hash_size;
-                     hash[cell_num].off; cell_num = (cell_num + 1) % 
VolnitskyTraits::hash_size) {
-                    if (pos >= haystack + hash[cell_num].off - 1) {
-                        const auto res = pos - (hash[cell_num].off - 1);
-                        const size_t ind = hash[cell_num].id;
-                        if (res + needles[ind].size <= haystack_end &&
-                            fallback_searchers[ind].compare(haystack, 
haystack_end, res))
-                            answer = std::min(answer, ind);
-                    }
-                }
-            }
-        }
-
-        /*
-        * if nothing was found, answer + 1 will be equal to zero and we can
-        * assign it into the result because we need to return the position 
starting with one
-        */
-        return answer + 1;
-    }
-
-    template <typename CountCharsCallback>
-    inline UInt64 searchOneFirstPosition(const UInt8* haystack, const UInt8* 
haystack_end,
-                                         const CountCharsCallback& 
count_chars) const {
-        const size_t fallback_size = fallback_needles.size();
-
-        UInt64 answer = std::numeric_limits<UInt64>::max();
-
-        for (size_t i = 0; i < fallback_size; ++i)
-            if (auto pos = 
fallback_searchers[fallback_needles[i]].search(haystack, haystack_end);
-                pos != haystack_end)
-                answer = std::min<UInt64>(answer, pos - haystack);
-
-        /// check if we have one non empty volnitsky searcher
-        if (step != std::numeric_limits<size_t>::max()) {
-            const auto* pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
-            for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos 
+= step) {
-                for (size_t cell_num = VolnitskyTraits::toNGram(pos) % 
VolnitskyTraits::hash_size;
-                     hash[cell_num].off; cell_num = (cell_num + 1) % 
VolnitskyTraits::hash_size) {
-                    if (pos >= haystack + hash[cell_num].off - 1) {
-                        const auto res = pos - (hash[cell_num].off - 1);
-                        const size_t ind = hash[cell_num].id;
-                        if (res + needles[ind].size <= haystack_end &&
-                            fallback_searchers[ind].compare(haystack, 
haystack_end, res))
-                            answer = std::min<UInt64>(answer, res - haystack);
-                    }
-                }
-            }
-        }
-        if (answer == std::numeric_limits<UInt64>::max()) return 0;
-        return count_chars(haystack, haystack + answer);
-    }
-
-    template <typename CountCharsCallback, typename AnsType>
-    inline void searchOneAll(const UInt8* haystack, const UInt8* haystack_end, 
AnsType* answer,
-                             const CountCharsCallback& count_chars) const {
-        const size_t fallback_size = fallback_needles.size();
-        for (size_t i = 0; i < fallback_size; ++i) {
-            const UInt8* ptr =
-                    fallback_searchers[fallback_needles[i]].search(haystack, 
haystack_end);
-            if (ptr != haystack_end) answer[fallback_needles[i]] = 
count_chars(haystack, ptr);
-        }
-
-        /// check if we have one non empty volnitsky searcher
-        if (step != std::numeric_limits<size_t>::max()) {
-            const auto* pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
-            for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos 
+= step) {
-                for (size_t cell_num = VolnitskyTraits::toNGram(pos) % 
VolnitskyTraits::hash_size;
-                     hash[cell_num].off; cell_num = (cell_num + 1) % 
VolnitskyTraits::hash_size) {
-                    if (pos >= haystack + hash[cell_num].off - 1) {
-                        const auto* res = pos - (hash[cell_num].off - 1);
-                        const size_t ind = hash[cell_num].id;
-                        if (answer[ind] == 0 && res + needles[ind].size <= 
haystack_end &&
-                            fallback_searchers[ind].compare(haystack, 
haystack_end, res))
-                            answer[ind] = count_chars(haystack, res);
-                    }
-                }
-            }
-        }
-    }
-
-    void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset, 
const size_t num) {
-        size_t cell_num = ngram % VolnitskyTraits::hash_size;
-
-        while (hash[cell_num].off) cell_num = (cell_num + 1) % 
VolnitskyTraits::hash_size;
-
-        hash[cell_num] = {static_cast<VolnitskyTraits::Id>(num),
-                          static_cast<VolnitskyTraits::Offset>(offset)};
-    }
-};
-
-using Volnitsky = VolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>;
-using VolnitskyUTF8 =
-        VolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>; /// 
exactly same as Volnitsky
-// using VolnitskyCaseInsensitive = VolnitskyBase<false, true, 
ASCIICaseInsensitiveStringSearcher>; /// ignores non-ASCII bytes
-// using VolnitskyCaseInsensitiveUTF8 = VolnitskyBase<false, false, 
UTF8CaseInsensitiveStringSearcher>;
-
-using VolnitskyCaseSensitiveToken = VolnitskyBase<true, true, 
ASCIICaseSensitiveTokenSearcher>;
-// using VolnitskyCaseInsensitiveToken = VolnitskyBase<false, true, 
ASCIICaseInsensitiveTokenSearcher>;
-
-using MultiVolnitsky = MultiVolnitskyBase<true, true, 
ASCIICaseSensitiveStringSearcher>;
-using MultiVolnitskyUTF8 = MultiVolnitskyBase<true, false, 
ASCIICaseSensitiveStringSearcher>;
-// using MultiVolnitskyCaseInsensitive = MultiVolnitskyBase<false, true, 
ASCIICaseInsensitiveStringSearcher>;
-// using MultiVolnitskyCaseInsensitiveUTF8 = MultiVolnitskyBase<false, false, 
UTF8CaseInsensitiveStringSearcher>;
-
-} // namespace doris
diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp
index 1c61454b71..e11e27c5e7 100644
--- a/be/src/vec/functions/like.cpp
+++ b/be/src/vec/functions/like.cpp
@@ -101,8 +101,7 @@ Status FunctionLikeBase::execute_impl(FunctionContext* 
context, Block& block,
     // result column
     auto res = ColumnUInt8::create();
     ColumnUInt8::Container& vec_res = res->get_data();
-    // set default value to 0, and match functions only need to set 1/true
-    vec_res.resize_fill(values->size());
+    vec_res.resize(values->size());
 
     auto* state = reinterpret_cast<LikeState*>(
             context->get_function_state(FunctionContext::THREAD_LOCAL));
@@ -130,42 +129,6 @@ Status FunctionLikeBase::vector_vector(const 
ColumnString::Chars& values,
                                        const ColumnString::Offsets& 
pattern_offsets,
                                        ColumnUInt8::Container& result, const 
LikeFn& function,
                                        LikeSearchState* search_state) {
-    // for constant_substring_fn, use long run length search for performance
-    if (constant_substring_fn ==
-        *(function.target<doris::Status (*)(LikeSearchState * state, const 
StringValue&,
-                                            const StringValue&, unsigned 
char*)>())) {
-        // treat continous multi string data as a long string data
-        const UInt8* begin = values.data();
-        const UInt8* end = begin + values.size();
-        const UInt8* pos = begin;
-
-        /// Current index in the array of strings.
-        size_t i = 0;
-        size_t needle_size = 
search_state->substring_pattern.get_pattern_length();
-
-        /// We will search for the next occurrence in all strings at once.
-        while (pos < end) {
-            // search return matched substring start offset
-            pos = (UInt8*)search_state->substring_pattern.search((char*)pos, 
end - pos);
-            if (pos >= end) break;
-
-            /// Determine which index it refers to.
-            /// begin + value_offsets[i] is the start offset of string at i+1
-            while (begin + value_offsets[i] <= pos) ++i;
-
-            /// We check that the entry does not pass through the boundaries 
of strings.
-            if (pos + needle_size < begin + value_offsets[i]) {
-                result[i] = 1;
-            }
-
-            // move to next string offset
-            pos = begin + value_offsets[i];
-            ++i;
-        }
-
-        return Status::OK();
-    }
-
     const auto size = value_offsets.size();
 
     for (int i = 0; i < size; ++i) {
diff --git a/be/src/vec/functions/like.h b/be/src/vec/functions/like.h
index e19b92d0c2..e66692a0fa 100644
--- a/be/src/vec/functions/like.h
+++ b/be/src/vec/functions/like.h
@@ -61,7 +61,7 @@ struct LikeSearchState {
     void set_search_string(const std::string& search_string_arg) {
         search_string = search_string_arg;
         search_string_sv = StringValue(search_string);
-        substring_pattern.set_pattern(&search_string_sv);
+        substring_pattern = StringSearch(&search_string_sv);
     }
 };
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

[doris] 01/01: Revert "[improvement](function) optimize substr performance (#10169)"

Reply via email to