This is an automated email from the ASF dual-hosted git repository.

zhangstar333 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 8843efc1a3d [feature](function)support url domain functions (#42488)
8843efc1a3d is described below

commit 8843efc1a3d28518e76b1681ca54a0715e491213
Author: zhangstar333 <87313068+zhangstar...@users.noreply.github.com>
AuthorDate: Wed Oct 30 17:33:39 2024 +0800

    [feature](function)support url domain functions (#42488)
    
    ## Proposed changes
    
    support
    
top_level_domain/first_significant_subdomain/cut_to_first_significant_subdomain
    functions
    doc: https://github.com/apache/doris-website/pull/1230
    
    <!--Describe your changes.-->
---
 be/src/vec/functions/url/domain.h                  | 127 +++++-
 be/src/vec/functions/url/find_symbols.h            | 481 +++++++++++++++++++++
 be/src/vec/functions/url/function_url.cpp          |  23 +
 be/src/vec/functions/url/functions_url.h           |  11 -
 be/src/vec/functions/url/tldLookup.generated.cpp   | 140 ++++++
 be/src/vec/functions/url/tldLookup.h               |  34 ++
 .../doris/catalog/BuiltinScalarFunctions.java      |   6 +
 .../scalar/CutToFirstSignificantSubdomain.java     |  68 +++
 .../scalar/FirstSignificantSubdomain.java          |  68 +++
 .../functions/scalar/TopLevelDomain.java           |  68 +++
 .../expressions/visitor/ScalarFunctionVisitor.java |  16 +
 gensrc/script/doris_builtins_functions.py          |   5 +-
 .../string_functions/test_url_functions.out        | 121 ++++++
 .../string_functions/test_url_functions.groovy     |  79 ++++
 14 files changed, 1234 insertions(+), 13 deletions(-)

diff --git a/be/src/vec/functions/url/domain.h 
b/be/src/vec/functions/url/domain.h
index 54361134eff..b2ec5e0c9d9 100644
--- a/be/src/vec/functions/url/domain.h
+++ b/be/src/vec/functions/url/domain.h
@@ -20,11 +20,12 @@
 
 #pragma once
 
-// #include <base/find_symbols.h>
 #include <cstring>
 
 #include "vec/common/string_utils/string_utils.h"
+#include "vec/functions/url/find_symbols.h"
 #include "vec/functions/url/protocol.h"
+#include "vec/functions/url/tldLookup.h"
 
 namespace doris::vectorized {
 
@@ -144,4 +145,128 @@ struct ExtractDomain {
     }
 };
 
+/// Extracts the top-level domain of a URL: the text after the last '.' of the host.
+/// The result is empty when the URL has no host, the host contains no '.',
+/// or the character after the last '.' is a digit (treated as an IPv4 address).
+struct ExtractTopLevelDomain {
+    // NOTE(review): presumably a per-row size hint for reserving the result buffer — confirm at the caller.
+    static size_t get_reserve_length_for_element() { return 5; }
+
+    /// On return [res_data, res_data + res_size) points into the input buffer;
+    /// res_size == 0 means nothing was extracted.
+    static void execute(const char* data, size_t size, const char*& res_data, size_t& res_size) {
+        res_data = data;
+        res_size = 0;
+        StringRef host = get_url_host(data, size);
+
+        if (host.size == 0) {
+            return;
+        } else {
+            auto host_view = host.to_string_view();
+            /// Ignore a single trailing dot ("example.com." -> "example.com").
+            if (host_view[host_view.size() - 1] == '.') {
+                host_view.remove_suffix(1);
+            }
+
+            const auto* host_end = host_view.data() + host_view.size();
+            const char* last_dot = find_last_symbols_or_null<'.'>(host_view.data(), host_end);
+            if (!last_dot) {
+                return;
+            }
+
+            /// For IPv4 addresses select nothing.
+            ///
+            /// NOTE: it is safe to access last_dot[1]: last_dot lies inside host_view, and it can
+            /// only be the last character of host_view when a trailing dot was removed above,
+            /// in which case last_dot[1] is that removed dot, still inside the original host.
+            if (is_numeric_ascii(last_dot[1])) {
+                return;
+            }
+
+            res_data = last_dot + 1;
+            res_size = host_end - res_data;
+        }
+    }
+};
+
+/// Extracts the "first significant subdomain" from the domain of a URL.
+///
+/// With D = domain returned by ExtractDomain<true> (one trailing dot ignored):
+///  - no dot in D            -> all of D;
+///  - exactly one dot        -> the label before the dot;
+///  - two or more dots       -> if the text after the second-to-last dot is accepted by
+///                              tldLookup::is_valid, the label before it; otherwise the label
+///                              between the second-to-last and the last dot.
+/// The result is empty when no domain is found.
+struct ExtractFirstSignificantSubdomain {
+    // NOTE(review): presumably a per-row size hint for reserving the result buffer — confirm at the caller.
+    static size_t get_reserve_length_for_element() { return 10; }
+
+    /// @param out_domain_end  optional; when a domain is found it receives the end of the domain
+    ///                        as returned by ExtractDomain (i.e. before the trailing dot is cut).
+    static void execute(const Pos data, const size_t size, Pos& res_data, size_t& res_size,
+                        Pos* out_domain_end = nullptr) {
+        res_data = data;
+        res_size = 0;
+
+        /// Initialised so that it never holds an indeterminate value.
+        Pos tmp = data;
+        size_t domain_length = 0;
+        ExtractDomain<true>::execute(data, size, tmp, domain_length);
+
+        if (domain_length == 0) {
+            return;
+        }
+        if (out_domain_end) {
+            *out_domain_end = tmp + domain_length;
+        }
+
+        /// cut useless dot
+        if (tmp[domain_length - 1] == '.') {
+            --domain_length;
+        }
+
+        res_data = tmp;
+        res_size = domain_length;
+
+        const auto* begin = tmp;
+        const auto* end = begin + domain_length;
+        /// Positions of the last three dots: [0] is the last dot, [2] the third from the end.
+        std::array<const char*, 3> last_periods {};
+
+        const auto* pos = find_first_symbols<'.'>(begin, end);
+        while (pos < end) {
+            last_periods[2] = last_periods[1];
+            last_periods[1] = last_periods[0];
+            last_periods[0] = pos;
+            pos = find_first_symbols<'.'>(pos + 1, end);
+        }
+
+        /// No dot at all: the whole domain is the answer (already stored above).
+        if (!last_periods[0]) {
+            return;
+        }
+
+        /// A single dot: the label before it.
+        if (!last_periods[1]) {
+            res_size = last_periods[0] - begin;
+            return;
+        }
+
+        /// Start of the label that precedes the second-to-last dot. Computed directly instead of
+        /// storing `begin - 1` as a fake dot position, which would be an out-of-range pointer.
+        const char* third_label_begin = last_periods[2] ? last_periods[2] + 1 : begin;
+
+        /// find_first_symbols returns `end` (never nullptr) when no '/' is present,
+        /// so no null check is needed here.
+        const auto* end_of_level_domain = find_first_symbols<'/'>(last_periods[0], end);
+
+        auto host_len = static_cast<size_t>(end_of_level_domain - last_periods[1] - 1);
+        StringRef host {last_periods[1] + 1, host_len};
+        if (tldLookup::is_valid(host.data, host.size)) {
+            res_data = third_label_begin;
+            res_size = last_periods[1] - third_label_begin;
+        } else {
+            res_data = last_periods[1] + 1;
+            res_size = last_periods[0] - last_periods[1] - 1;
+        }
+    }
+};
+
+/// Returns the part of the domain that starts at the first significant subdomain
+/// (as found by ExtractFirstSignificantSubdomain) and runs to the end of the domain.
+/// The result is empty when no significant subdomain is found.
+struct CutToFirstSignificantSubdomain {
+    // NOTE(review): presumably a per-row size hint for reserving the result buffer — confirm at the caller.
+    static size_t get_reserve_length_for_element() { return 15; }
+
+    static void execute(const Pos data, const size_t size, Pos& res_data, size_t& res_size) {
+        res_data = data;
+        res_size = 0;
+
+        Pos tmp_data = data;
+        /// Initialised explicitly so the check below never depends on the callee writing it.
+        size_t tmp_length = 0;
+        Pos domain_end = data;
+        ExtractFirstSignificantSubdomain::execute(data, size, tmp_data, tmp_length, &domain_end);
+
+        if (tmp_length == 0) {
+            return;
+        }
+        res_data = tmp_data;
+        res_size = domain_end - tmp_data;
+    }
+};
 } // namespace doris::vectorized
diff --git a/be/src/vec/functions/url/find_symbols.h 
b/be/src/vec/functions/url/find_symbols.h
new file mode 100644
index 00000000000..7af95ce06bd
--- /dev/null
+++ b/be/src/vec/functions/url/find_symbols.h
@@ -0,0 +1,481 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+// 
https://github.com/ClickHouse/ClickHouse/blob/master/base/base/find_symbols.h
+// and modified by Doris
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#if defined(__SSE2__)
+#include <emmintrin.h>
+#endif
+
+#if defined(__SSE4_2__)
+#include <nmmintrin.h>
+#endif
+
+/** find_first_symbols<c1, c2, ...>(begin, end):
+  *
+  * Searches for the next character from the set of 'symbols...' in a string.
+  * It is similar to 'strpbrk', 'strcspn' (and 'strchr', 'memchr' in the case of one symbol and '\0'),
+  * but with the following differences:
+  * - works with any memory ranges, including those containing zero bytes;
+  * - doesn't require a terminating zero byte: the end of the memory range is passed explicitly;
+  * - if not found, returns a pointer to end instead of nullptr;
+  * - the maximum number of symbols to search for is 16.
+  *
+  * Uses SSE 2 for a small number of search symbols and SSE 4.2 for a large number of symbols,
+  *  which has more than a 2x performance advantage over a trivial loop
+  *  when parsing a tab-separated dump with (probably escaped) string fields.
+  * When parsing a tab-separated dump with short strings, there is no performance degradation over a trivial loop.
+  *
+  * Note: the optimal threshold for choosing between SSE 2 and SSE 4.2 may depend on the CPU model.
+  *
+  * find_last_symbols_or_null<c1, c2, ...>(begin, end):
+  *
+  * Searches for the last matching character in a string.
+  * If there is no such character, returns nullptr.
+  */
+
+/// A set of search symbols chosen at run time, for the non-template find_* overloads.
+/// `str` holds the symbols; under SSE 4.2 `simd_vector` additionally holds them
+/// zero-padded to 16 bytes.
+struct SearchSymbols {
+    static constexpr auto BUFFER_SIZE = 16;
+
+    SearchSymbols() = default;
+
+    /// @throws std::runtime_error when more than BUFFER_SIZE symbols are given on a build that
+    ///         uses a SIMD search path.
+    explicit SearchSymbols(std::string in) : str(std::move(in)) {
+#if defined(__SSE2__)
+        // Both vectorised paths are limited to 16 symbols (the SSE 2 path stores its needles in
+        // a fixed array of 16), so the limit is enforced whenever either of them is compiled in,
+        // not only under SSE 4.2.
+        if (str.size() > BUFFER_SIZE) {
+            throw std::runtime_error("SearchSymbols can contain at most " +
+                                     std::to_string(BUFFER_SIZE) + " symbols and " +
+                                     std::to_string(str.size()) + " was provided\n");
+        }
+#endif
+
+#if defined(__SSE4_2__)
+        // Copy through a zero-filled 16-byte buffer so the unaligned load never reads past `str`.
+        char tmp_safety_buffer[BUFFER_SIZE] = {0};
+
+        memcpy(tmp_safety_buffer, str.data(), str.size());
+
+        simd_vector = _mm_loadu_si128(reinterpret_cast<const __m128i*>(tmp_safety_buffer));
+#endif
+    }
+
+#if defined(__SSE4_2__)
+    // Value-initialised so a default-constructed object never exposes an indeterminate vector.
+    __m128i simd_vector {};
+#endif
+    std::string str;
+};
+
+namespace detail {
+/// True when x equals any of the template characters `chars...`.
+template <char... chars>
+constexpr bool is_in(char x) {
+    return ((x == chars) || ...);
+} // NOLINT(misc-redundant-expression)
+
+/// True when c equals one of the first num_chars characters of `symbols`.
+/// Declared `inline` rather than `static`: this is a header, and `static` would give every
+/// including translation unit its own private copy.
+inline bool is_in(char c, const char* symbols, size_t num_chars) {
+    for (size_t i = 0U; i < num_chars; ++i) {
+        if (c == symbols[i]) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+#if defined(__SSE2__)
+/// Base case: byte-wise mask of the lanes of `bytes` that equal s0.
+template <char s0>
+inline __m128i mm_is_in(__m128i bytes) {
+    __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0));
+    return eq0;
+}
+
+/// Recursive case: byte-wise mask of the lanes of `bytes` that equal any of s0, s1, tail...
+template <char s0, char s1, char... tail>
+inline __m128i mm_is_in(__m128i bytes) {
+    __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0));
+    __m128i eq = mm_is_in<s1, tail...>(bytes);
+    return _mm_or_si128(eq0, eq);
+}
+
+/// Run-time variant: byte-wise mask of the lanes of `bytes` that equal any of the first
+/// num_chars characters of `symbols`. With num_chars == 0 the mask is all zeros.
+inline __m128i mm_is_in(__m128i bytes, const char* symbols, size_t num_chars) {
+    __m128i accumulator = _mm_setzero_si128();
+    for (size_t i = 0; i < num_chars; ++i) {
+        __m128i eq = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(symbols[i]));
+        accumulator = _mm_or_si128(accumulator, eq);
+    }
+
+    return accumulator;
+}
+
+/// Broadcasts each search symbol into its own 16-byte vector ("needle").
+///
+/// At most 16 symbols are used; any further ones are ignored instead of being written past the
+/// end of the array. Slots beyond the supplied symbols repeat the first symbol rather than
+/// staying zero, so comparing against all 16 needles cannot produce a false match on NUL bytes.
+/// With num_chars == 0 there is no symbol to repeat and every needle is zero.
+inline std::array<__m128i, 16u> mm_is_in_prepare(const char* symbols, size_t num_chars) {
+    std::array<__m128i, 16u> result {};
+    if (num_chars == 0) {
+        return result;
+    }
+
+    const size_t used = num_chars < result.size() ? num_chars : result.size();
+    for (size_t i = 0; i < result.size(); ++i) {
+        result[i] = _mm_set1_epi8(symbols[i < used ? i : 0]);
+    }
+
+    return result;
+}
+
+/// Byte-wise mask of the lanes of `bytes` that equal any of the 16 needles.
+/// NOTE: all 16 needles are compared, so a needle left zero-valued matches NUL bytes in `bytes`.
+inline __m128i mm_is_in_execute(__m128i bytes, const std::array<__m128i, 16u>& needles) {
+    __m128i accumulator = _mm_setzero_si128();
+
+    for (const auto& needle : needles) {
+        __m128i eq = _mm_cmpeq_epi8(bytes, needle);
+        accumulator = _mm_or_si128(accumulator, eq);
+    }
+
+    return accumulator;
+}
+#endif
+
+/// Returns x unchanged when positive is true and !x when positive is false.
+template <bool positive>
+constexpr bool maybe_negate(bool x) {
+    return x == positive;
+}
+
+/// Bit-mask variant: returns x unchanged when positive is true and its bitwise complement otherwise.
+template <bool positive>
+constexpr uint16_t maybe_negate(uint16_t x) {
+    if constexpr (positive)
+        return x;
+    else
+        return ~x;
+}
+
+/// What the search functions return when nothing is found.
+enum class ReturnMode : uint8_t {
+    End,     // return the `end` pointer
+    Nullptr, // return nullptr
+};
+
+/// Search with a compile-time symbol set: returns the first position in [begin, end) whose byte
+/// is (positive == true) or is not (positive == false) one of `symbols...`.
+/// When nothing matches, returns `end` for ReturnMode::End and nullptr for ReturnMode::Nullptr.
+template <bool positive, ReturnMode return_mode, char... symbols>
+inline const char* find_first_symbols_sse2(const char* const begin, const char* const end) {
+    const char* pos = begin;
+
+#if defined(__SSE2__)
+    // Handle 16 bytes per iteration while a full 16-byte block remains.
+    for (; pos + 15 < end; pos += 16) {
+        __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos));
+
+        __m128i eq = mm_is_in<symbols...>(bytes);
+
+        // One mask bit per byte; the lowest set bit is the first hit inside the block.
+        uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq)));
+        if (bit_mask) return pos + __builtin_ctz(bit_mask);
+    }
+#endif
+
+    // Scalar tail (and the whole range when SSE 2 is not available).
+    for (; pos < end; ++pos)
+        if (maybe_negate<positive>(is_in<symbols...>(*pos))) return pos;
+
+    return return_mode == ReturnMode::End ? end : nullptr;
+}
+
+/// Search with a run-time symbol set: returns the first position in [begin, end) whose byte
+/// is (positive == true) or is not (positive == false) one of the first num_chars characters of
+/// `symbols`. When nothing matches, returns `end` for ReturnMode::End and nullptr for
+/// ReturnMode::Nullptr.
+template <bool positive, ReturnMode return_mode>
+inline const char* find_first_symbols_sse2(const char* const begin, const char* const end,
+                                           const char* symbols, size_t num_chars) {
+    const char* pos = begin;
+
+#if defined(__SSE2__)
+    // mm_is_in_prepare() fills a fixed array of 16 needles, so the vector path is only valid for
+    // at most 16 symbols; larger sets are handled entirely by the scalar loop below.
+    if (num_chars <= 16) {
+        const auto needles = mm_is_in_prepare(symbols, num_chars);
+        // `end - pos >= 16` instead of `pos + 15 < end`: never forms a pointer past `end`.
+        for (; end - pos >= 16; pos += 16) {
+            __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos));
+
+            // Compare against the first num_chars needles only. Comparing against all 16 would
+            // let zero-valued padding needles report a match on every NUL byte of the input.
+            __m128i eq = _mm_setzero_si128();
+            for (size_t i = 0; i < num_chars; ++i) {
+                eq = _mm_or_si128(eq, _mm_cmpeq_epi8(bytes, needles[i]));
+            }
+
+            // One mask bit per byte; the lowest set bit is the first hit inside the block.
+            uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq)));
+            if (bit_mask) return pos + __builtin_ctz(bit_mask);
+        }
+    }
+#endif
+
+    // Scalar tail (and the whole range when the vector path is not taken).
+    for (; pos < end; ++pos)
+        if (maybe_negate<positive>(is_in(*pos, symbols, num_chars))) return pos;
+
+    return return_mode == ReturnMode::End ? end : nullptr;
+}
+
+/// Backward search with a compile-time symbol set: returns the last position in [begin, end)
+/// whose byte is (positive == true) or is not (positive == false) one of `symbols...`.
+/// When nothing matches, returns `end` for ReturnMode::End and nullptr for ReturnMode::Nullptr.
+template <bool positive, ReturnMode return_mode, char... symbols>
+inline const char* find_last_symbols_sse2(const char* const begin, const char* const end) {
+    const char* pos = end;
+
+#if defined(__SSE2__)
+    // `pos - begin >= 16` instead of `pos - 16 >= begin`: never forms a pointer before `begin`.
+    for (; pos - begin >= 16; pos -= 16) {
+        __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos - 16));
+
+        __m128i eq = mm_is_in<symbols...>(bytes);
+
+        uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq)));
+        if (bit_mask)
+            return pos - 1 -
+                   (__builtin_clz(bit_mask) -
+                    16); /// because __builtin_clz works with mask as uint32.
+    }
+#endif
+
+    // Scalar head: decrement before the test so `pos` never moves below `begin`.
+    while (pos > begin) {
+        --pos;
+        if (maybe_negate<positive>(is_in<symbols...>(*pos))) return pos;
+    }
+
+    return return_mode == ReturnMode::End ? end : nullptr;
+}
+
+/// Search with a compile-time symbol set using the SSE 4.2 string-compare instructions.
+/// `num_chars` is the number of meaningful symbols among c01..c16; the remaining ones default to 0.
+/// Returns the first position in [begin, end) whose byte is (positive == true) or is not
+/// (positive == false) one of the first num_chars symbols; when nothing matches, returns `end`
+/// for ReturnMode::End and nullptr for ReturnMode::Nullptr.
+template <bool positive, ReturnMode return_mode, size_t num_chars, char c01, char c02 = 0,
+          char c03 = 0, char c04 = 0, char c05 = 0, char c06 = 0, char c07 = 0, char c08 = 0,
+          char c09 = 0, char c10 = 0, char c11 = 0, char c12 = 0, char c13 = 0, char c14 = 0,
+          char c15 = 0, char c16 = 0>
+inline const char* find_first_symbols_sse42(const char* const begin, const char* const end) {
+    const char* pos = begin;
+
+#if defined(__SSE4_2__)
+    constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT;
+
+    __m128i set = _mm_setr_epi8(c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13,
+                                c14, c15, c16);
+
+    // Handle 16 bytes per iteration while a full 16-byte block remains.
+    for (; pos + 15 < end; pos += 16) {
+        __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos));
+
+        if constexpr (positive) {
+            if (_mm_cmpestrc(set, num_chars, bytes, 16, mode))
+                return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode);
+        } else {
+            if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY))
+                return pos +
+                       _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY);
+        }
+    }
+#endif
+
+    // Scalar tail (and the whole range without SSE 4.2): only the clause whose count equals
+    // num_chars can be true, so exactly the first num_chars symbols are tested.
+    for (; pos < end; ++pos)
+        if ((num_chars == 1 && maybe_negate<positive>(is_in<c01>(*pos))) ||
+            (num_chars == 2 && maybe_negate<positive>(is_in<c01, c02>(*pos))) ||
+            (num_chars == 3 && maybe_negate<positive>(is_in<c01, c02, c03>(*pos))) ||
+            (num_chars == 4 && maybe_negate<positive>(is_in<c01, c02, c03, c04>(*pos))) ||
+            (num_chars == 5 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05>(*pos))) ||
+            (num_chars == 6 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06>(*pos))) ||
+            (num_chars == 7 &&
+             maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07>(*pos))) ||
+            (num_chars == 8 &&
+             maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08>(*pos))) ||
+            (num_chars == 9 &&
+             maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09>(*pos))) ||
+            (num_chars == 10 &&
+             maybe_negate<positive>(
+                     is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10>(*pos))) ||
+            (num_chars == 11 &&
+             maybe_negate<positive>(
+                     is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11>(*pos))) ||
+            (num_chars == 12 &&
+             maybe_negate<positive>(
+                     is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12>(*pos))) ||
+            (num_chars == 13 &&
+             maybe_negate<positive>(
+                     is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13>(
+                             *pos))) ||
+            (num_chars == 14 &&
+             maybe_negate<positive>(
+                     is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14>(
+                             *pos))) ||
+            (num_chars == 15 &&
+             maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11,
+                                          c12, c13, c14, c15>(*pos))) ||
+            (num_chars == 16 &&
+             maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11,
+                                          c12, c13, c14, c15, c16>(*pos))))
+            return pos;
+    return return_mode == ReturnMode::End ? end : nullptr;
+}
+
+/// Search with a run-time symbol set (SearchSymbols) using the SSE 4.2 string-compare instructions.
+/// Returns the first position in [begin, end) whose byte is (positive == true) or is not
+/// (positive == false) one of `symbols.str`; when nothing matches, returns `end` for
+/// ReturnMode::End and nullptr for ReturnMode::Nullptr.
+template <bool positive, ReturnMode return_mode>
+inline const char* find_first_symbols_sse42(const char* const begin, const char* const end,
+                                            const SearchSymbols& symbols) {
+    const char* pos = begin;
+
+    const auto num_chars = symbols.str.size();
+
+#if defined(__SSE4_2__)
+    constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT;
+
+    // The 16-byte copy of the symbols prepared by SearchSymbols' constructor.
+    const __m128i set = symbols.simd_vector;
+
+    // Handle 16 bytes per iteration while a full 16-byte block remains.
+    for (; pos + 15 < end; pos += 16) {
+        __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos));
+
+        if constexpr (positive) {
+            if (_mm_cmpestrc(set, num_chars, bytes, 16, mode))
+                return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode);
+        } else {
+            if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY))
+                return pos +
+                       _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY);
+        }
+    }
+#endif
+
+    // Scalar tail (and the whole range without SSE 4.2).
+    for (; pos < end; ++pos)
+        if (maybe_negate<positive>(is_in(*pos, symbols.str.data(), num_chars))) return pos;
+
+    return return_mode == ReturnMode::End ? end : nullptr;
+}
+
+/// NOTE: There is no SSE 4.2 implementation of find_last_symbols_or_null; it is not worth writing one.
+
+/// Selects the implementation for a compile-time symbol set: the SSE 4.2 variant for five or
+/// more symbols when SSE 4.2 is available, otherwise the SSE 2 / scalar variant.
+/// The constraint no longer spells out `0 <= sizeof...(symbols)`, which is always true for an
+/// unsigned value.
+template <bool positive, ReturnMode return_mode, char... symbols>
+inline const char* find_first_symbols_dispatch(const char* begin, const char* end)
+    requires(sizeof...(symbols) <= 16)
+{
+#if defined(__SSE4_2__)
+    // The condition is a compile-time constant; `if constexpr` keeps the branch that is not
+    // taken from being instantiated.
+    if constexpr (sizeof...(symbols) >= 5) {
+        return find_first_symbols_sse42<positive, return_mode, sizeof...(symbols), symbols...>(
+                begin, end);
+    } else
+#endif
+    {
+        return find_first_symbols_sse2<positive, return_mode, symbols...>(begin, end);
+    }
+}
+
+/// Selects the implementation for a run-time symbol set: the SSE 4.2 variant for five or more
+/// symbols when SSE 4.2 is available, otherwise the SSE 2 / scalar variant.
+template <bool positive, ReturnMode return_mode>
+inline const char* find_first_symbols_dispatch(const std::string_view haystack,
+                                               const SearchSymbols& symbols) {
+    // Take raw pointers from data()/size(): string_view::begin()/end() return iterators, which
+    // are not guaranteed to be `const char*` on every standard library.
+    const char* const begin = haystack.data();
+    const char* const end = begin + haystack.size();
+
+#if defined(__SSE4_2__)
+    if (symbols.str.size() >= 5)
+        return find_first_symbols_sse42<positive, return_mode>(begin, end, symbols);
+    else
+#endif
+        return find_first_symbols_sse2<positive, return_mode>(begin, end, symbols.str.data(),
+                                                              symbols.str.size());
+}
+
+} // namespace detail
+
+/// Returns the first position in [begin, end) that holds one of `symbols...`, or `end` if none does.
+template <char... symbols>
+inline const char* find_first_symbols(const char* begin, const char* end) {
+    return detail::find_first_symbols_dispatch<true, detail::ReturnMode::End, symbols...>(begin,
+                                                                                          end);
+}
+
+/// Returns a non-const result for non-const arguments.
+/// This is convenient when the function is used to iterate through a non-const buffer.
+/// Same search as the const overload; the result is cast back to `char*`.
+template <char... symbols>
+inline char* find_first_symbols(char* begin, char* end) {
+    return const_cast<char*>(
+            detail::find_first_symbols_dispatch<true, detail::ReturnMode::End, symbols...>(begin,
+                                                                                           end));
+}
+
+/// Returns the first position in `haystack` that holds one of the run-time `symbols`,
+/// or the end of `haystack` if none does.
+inline const char* find_first_symbols(std::string_view haystack, const SearchSymbols& symbols) {
+    return detail::find_first_symbols_dispatch<true, detail::ReturnMode::End>(haystack, symbols);
+}
+
+/// Returns the first position in [begin, end) whose byte is NOT one of `symbols...`,
+/// or `end` if every byte is one of them.
+template <char... symbols>
+inline const char* find_first_not_symbols(const char* begin, const char* end) {
+    return detail::find_first_symbols_dispatch<false, detail::ReturnMode::End, symbols...>(begin,
+                                                                                           end);
+}
+
+/// Non-const overload of find_first_not_symbols: same search, result cast back to `char*`.
+template <char... symbols>
+inline char* find_first_not_symbols(char* begin, char* end) {
+    return const_cast<char*>(
+            detail::find_first_symbols_dispatch<false, detail::ReturnMode::End, symbols...>(begin,
+                                                                                            end));
+}
+
+/// Returns the first position in `haystack` whose byte is NOT one of the run-time `symbols`,
+/// or the end of `haystack` if every byte is one of them.
+inline const char* find_first_not_symbols(std::string_view haystack, const SearchSymbols& symbols) {
+    return detail::find_first_symbols_dispatch<false, detail::ReturnMode::End>(haystack, symbols);
+}
+
+/// Returns the first position in [begin, end) that holds one of `symbols...`, or nullptr if none does.
+template <char... symbols>
+inline const char* find_first_symbols_or_null(const char* begin, const char* end) {
+    return detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr, symbols...>(begin,
+                                                                                              end);
+}
+
+/// Non-const overload of find_first_symbols_or_null: same search, result cast back to `char*`.
+template <char... symbols>
+inline char* find_first_symbols_or_null(char* begin, char* end) {
+    return const_cast<char*>(
+            detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr, symbols...>(
+                    begin, end));
+}
+
+/// Returns the first position in `haystack` that holds one of the run-time `symbols`,
+/// or nullptr if none does.
+inline const char* find_first_symbols_or_null(std::string_view haystack,
+                                              const SearchSymbols& symbols) {
+    return detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr>(haystack,
+                                                                                  symbols);
+}
+
+/// Returns the first position in [begin, end) whose byte is NOT one of `symbols...`,
+/// or nullptr if every byte is one of them.
+template <char... symbols>
+inline const char* find_first_not_symbols_or_null(const char* begin, const char* end) {
+    return detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr, symbols...>(
+            begin, end);
+}
+
+/// Non-const overload of find_first_not_symbols_or_null: same search, result cast back to `char*`.
+template <char... symbols>
+inline char* find_first_not_symbols_or_null(char* begin, char* end) {
+    return const_cast<char*>(
+            detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr, symbols...>(
+                    begin, end));
+}
+
+/// Returns the first position in `haystack` whose byte is NOT one of the run-time `symbols`,
+/// or nullptr if every byte is one of them.
+inline const char* find_first_not_symbols_or_null(std::string_view haystack,
+                                                  const SearchSymbols& symbols) {
+    return detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr>(haystack,
+                                                                                   symbols);
+}
+
+/// Returns the last position in [begin, end) that holds one of `symbols...`, or nullptr if none does.
+template <char... symbols>
+inline const char* find_last_symbols_or_null(const char* begin, const char* end) {
+    return detail::find_last_symbols_sse2<true, detail::ReturnMode::Nullptr, symbols...>(begin,
+                                                                                         end);
+}
+
+/// Non-const overload of find_last_symbols_or_null: same search, result cast back to `char*`.
+template <char... symbols>
+inline char* find_last_symbols_or_null(char* begin, char* end) {
+    return const_cast<char*>(
+            detail::find_last_symbols_sse2<true, detail::ReturnMode::Nullptr, symbols...>(begin,
+                                                                                          end));
+}
+
+/// Returns the last position in [begin, end) whose byte is NOT one of `symbols...`,
+/// or nullptr if every byte is one of them.
+template <char... symbols>
+inline const char* find_last_not_symbols_or_null(const char* begin, const char* end) {
+    return detail::find_last_symbols_sse2<false, detail::ReturnMode::Nullptr, symbols...>(begin,
+                                                                                          end);
+}
+
+/// Non-const overload of find_last_not_symbols_or_null: same search, result cast back to `char*`.
+template <char... symbols>
+inline char* find_last_not_symbols_or_null(char* begin, char* end) {
+    return const_cast<char*>(
+            detail::find_last_symbols_sse2<false, detail::ReturnMode::Nullptr, symbols...>(begin,
+                                                                                           end));
+}
+
+/// Slightly resembles boost::split. The drawback of boost::split is that it fires a false positive in clang static analyzer.
+/// See https://github.com/boostorg/algorithm/issues/63
+/// And https://bugs.llvm.org/show_bug.cgi?id=41141
+///
+/// Splits `what` at every character from `symbols...` and appends each piece to `to` with
+/// emplace_back(pointer, length). With token_compress == true empty pieces are skipped.
+/// A delimiter at the very end of `what` does not produce a trailing empty piece.
+/// Returns `to`.
+template <char... symbols, typename To>
+inline To& splitInto(To& to, std::string_view what, bool token_compress = false) {
+    const char* pos = what.data();
+    const char* end = pos + what.size();
+    while (pos < end) {
+        const char* delimiter_or_end = find_first_symbols<symbols...>(pos, end);
+
+        if (!token_compress || pos < delimiter_or_end) to.emplace_back(pos, delimiter_or_end - pos);
+
+        // Step over the delimiter, or stop at the end of the input.
+        if (delimiter_or_end < end)
+            pos = delimiter_or_end + 1;
+        else
+            pos = delimiter_or_end;
+    }
+
+    return to;
+}
diff --git a/be/src/vec/functions/url/function_url.cpp 
b/be/src/vec/functions/url/function_url.cpp
index e25af6f7f27..47afe076b74 100644
--- a/be/src/vec/functions/url/function_url.cpp
+++ b/be/src/vec/functions/url/function_url.cpp
@@ -46,10 +46,33 @@ struct NameProtocol {
 using FunctionProtocol =
         FunctionStringToString<ExtractSubstringImpl<ExtractProtocol>, 
NameProtocol>;
 
+/// Carries the function name "top_level_domain".
+struct NameTopLevelDomain {
+    static constexpr auto name = "top_level_domain";
+};
+/// String-to-string function built from ExtractTopLevelDomain via ExtractSubstringImpl.
+using FunctionTopLevelDomain =
+        FunctionStringToString<ExtractSubstringImpl<ExtractTopLevelDomain>, NameTopLevelDomain>;
+
+/// Carries the function name "first_significant_subdomain".
+struct NameFirstSignificantSubdomain {
+    static constexpr auto name = "first_significant_subdomain";
+};
+/// String-to-string function built from ExtractFirstSignificantSubdomain via ExtractSubstringImpl.
+using FunctionFirstSignificantSubdomain =
+        FunctionStringToString<ExtractSubstringImpl<ExtractFirstSignificantSubdomain>,
+                               NameFirstSignificantSubdomain>;
+
+/// Carries the function name "cut_to_first_significant_subdomain".
+struct NameCutToFirstSignificantSubdomain {
+    static constexpr auto name = "cut_to_first_significant_subdomain";
+};
+/// String-to-string function built from CutToFirstSignificantSubdomain via ExtractSubstringImpl.
+using FunctionCutToFirstSignificantSubdomain =
+        FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain>,
+                               NameCutToFirstSignificantSubdomain>;
+
+/// Registers the URL function types defined in this file with the given factory.
 void register_function_url(SimpleFunctionFactory& factory) {
     factory.register_function<FunctionDomain>();
     factory.register_function<FunctionDomainWithoutWWW>();
     factory.register_function<FunctionProtocol>();
+    factory.register_function<FunctionTopLevelDomain>();
+    factory.register_function<FunctionFirstSignificantSubdomain>();
+    factory.register_function<FunctionCutToFirstSignificantSubdomain>();
 }
 
 } // namespace doris::vectorized
diff --git a/be/src/vec/functions/url/functions_url.h 
b/be/src/vec/functions/url/functions_url.h
index f9f02a17a66..b6736496d24 100644
--- a/be/src/vec/functions/url/functions_url.h
+++ b/be/src/vec/functions/url/functions_url.h
@@ -89,7 +89,6 @@ struct ExtractSubstringImpl {
         for (size_t i = 0; i < size; ++i) {
             Extractor::execute(reinterpret_cast<const 
char*>(&data[prev_offset]),
                                offsets[i] - prev_offset, start, length);
-
             res_data.resize(res_data.size() + length);
             memcpy_small_allow_read_write_overflow15(&res_data[res_offset], 
start, length);
             res_offset += length;
@@ -105,11 +104,6 @@ struct ExtractSubstringImpl {
         Extractor::execute(data.data(), data.size(), start, length);
         res_data.assign(start, length);
     }
-
-    // static void vector_fixed(const ColumnString::Chars &, size_t, 
ColumnString::Chars &)
-    // {
-    //     throw Exception("Column of type FixedString is not supported by URL 
functions", ErrorCodes::ILLEGAL_COLUMN);
-    // }
 };
 
 /** Delete part of string using the Extractor.
@@ -155,11 +149,6 @@ struct CutSubstringImpl {
         res_data.append(data.data(), start);
         res_data.append(start + length, data.data() + data.size());
     }
-
-    // static void vector_fixed(const ColumnString::Chars &, size_t, 
ColumnString::Chars &)
-    // {
-    //     throw Exception("Column of type FixedString is not supported by URL 
functions", ErrorCodes::ILLEGAL_COLUMN);
-    // }
 };
 
 } // namespace doris::vectorized
diff --git a/be/src/vec/functions/url/tldLookup.generated.cpp 
b/be/src/vec/functions/url/tldLookup.generated.cpp
new file mode 100644
index 00000000000..9b9471c094d
--- /dev/null
+++ b/be/src/vec/functions/url/tldLookup.generated.cpp
@@ -0,0 +1,140 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+// 
https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/tldLookup.generated.cpp
+// and modified by Doris
+
+// clang-format off
+/* C++ code produced by gperf version 3.1 */
+/* Command-line: /usr/bin/gperf --output-file=tldLookup.generated.cpp 
tldLookup.gperf  */
+/* Computed positions: -k'1-11,13-14,17,$' */
+
+#if !( \
+        (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) && ('%' == 
37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) && (')' == 41) && ('*' == 42) 
&& ('+' == 43) && (',' == 44) && ('-' == 45) && ('.' == 46) && ('/' == 47) && 
('0' == 48) && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) && ('5' 
== 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) && ('9' == 57) && (':' == 
58) && (';' == 59) && ('<' == 60) && ('=' == 61) && ('>' == 62) && ('?' == 63) 
&& ('A' == 65) && ('B [...]
+/* The character set is not based on ISO-646.  */
+#error "gperf generated tables don't work with this execution character set. 
Please report a bug to <bug-gp...@gnu.org>."
+#endif
+
+#line 7 "tldLookup.gperf"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
+#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant"
+#pragma GCC diagnostic ignored "-Wunused-macros"
+#include <cstring>
+
+#define TOTAL_KEYWORDS 5045
+#define MIN_WORD_LENGTH 4
+#define MAX_WORD_LENGTH 34
+#define MIN_HASH_VALUE 75
+#define MAX_HASH_VALUE 110600
+/* maximum key range = 110526, duplicates = 0 */
+
+class TopLevelDomainLookupHash {
+private:
+    static inline unsigned int hash(const char* str, size_t len); // key length plus asso_values[] weights of selected bytes
+
+public:
+    static const char* is_valid(const char* str, size_t len); // wordlist entry equal to str[0..len), or nullptr if none
+};
+
+inline unsigned int TopLevelDomainLookupHash::hash(const char* str, size_t 
len) { // gperf-generated hash: key length plus asso_values[] weights of selected key bytes
+    static const unsigned int asso_values[] = {110601, 110601, 110601, 110601, 
110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 
110601, 110601,
+                                               110601, 110601, 110601, 110601, 
110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 
110601, 110601,
+                                               110601, 110601, 110601, 110601, 
110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 3905, 
0, 5,
+                                               11617, 15312, 10, 5, 25, 0, 25, 
0, 5, 0, 0, 110601, 110601, 110601, 5, 110601,
+                                               110601, 110601, 110601, 110601, 
30, 20, 5, 15, 10, 65, 45, 80, 70, 55, 110601, 110601,
+                                               110601, 110601, 110601, 110601, 
110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 
110601, 110601,
+                                               110601, 2570, 9477, 1350, 15, 
130, 5915, 1830, 4360, 2210, 5405, 63, 3190, 20, 1165, 5,
+                                               6120, 5863, 470, 2315, 175, 0, 
815, 40, 13577, 115, 5680, 1030, 11798, 23179, 345, 1097,
+                                               28079, 13839, 245, 25674, 
31874, 75, 31774, 7351, 27474, 190, 16044, 8040, 50, 25, 35, 55,
+                                               0, 0, 30, 0, 10, 0, 0, 0, 35, 
0, 55, 10, 5, 65, 0, 60,
+                                               0, 25, 5, 30, 0, 5, 10, 0, 20, 
5, 5, 35, 5, 0, 0, 0,
+                                               0, 0, 15, 0, 5, 5, 0, 5, 5, 5, 
0, 0, 0, 0, 0, 15,
+                                               5, 110601, 110601, 5, 10, 45, 
5, 110601, 0, 110601, 110601, 110601, 110601, 110601, 110601, 110601,
+                                               0, 0, 0, 0, 110601, 110601, 
110601, 45, 0, 0, 0, 0, 110601, 110601, 110601, 110601,
+                                               0, 0, 110601, 0, 0, 0, 0, 5, 0, 
5, 30, 0, 0, 110601, 110601, 110601,
+                                               110601, 110601, 110601, 110601, 
0, 110601, 110601, 110601, 0, 0, 5, 0, 20, 40, 110601, 110601,
+                                               110601, 110601, 110601, 110601, 
110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 
110601, 110601,
+                                               110601, 110601, 110601, 110601};
+    unsigned int hval = len; // the hash starts from the key length
+
+    switch (hval) { // cases fall through on purpose: a longer key mixes in more byte positions
+    default: // any length other than 1..16; reads str[16], so the caller must guarantee len >= 17 here
+        hval += asso_values[static_cast<unsigned char>(str[16])];
+    /*FALLTHROUGH*/
+    case 16:
+    case 15:
+    case 14:
+        hval += asso_values[static_cast<unsigned char>(str[13] + 1)];
+    /*FALLTHROUGH*/
+    case 13:
+        hval += asso_values[static_cast<unsigned char>(str[12])];
+    /*FALLTHROUGH*/
+    case 12:
+    case 11:
+        hval += asso_values[static_cast<unsigned char>(str[10])];
+    /*FALLTHROUGH*/
+    case 10:
+        hval += asso_values[static_cast<unsigned char>(str[9])];
+    /*FALLTHROUGH*/
+    case 9:
+        hval += asso_values[static_cast<unsigned char>(str[8] + 1)];
+    /*FALLTHROUGH*/
+    case 8:
+        hval += asso_values[static_cast<unsigned char>(str[7])];
+    /*FALLTHROUGH*/
+    case 7:
+        hval += asso_values[static_cast<unsigned char>(str[6] + 3)];
+    /*FALLTHROUGH*/
+    case 6:
+        hval += asso_values[static_cast<unsigned char>(str[5])];
+    /*FALLTHROUGH*/
+    case 5:
+        hval += asso_values[static_cast<unsigned char>(str[4] + 2)];
+    /*FALLTHROUGH*/
+    case 4:
+        hval += asso_values[static_cast<unsigned char>(str[3] + 1)];
+    /*FALLTHROUGH*/
+    case 3:
+        hval += asso_values[static_cast<unsigned char>(str[2])];
+    /*FALLTHROUGH*/
+    case 2:
+        hval += asso_values[static_cast<unsigned char>(str[1])];
+    /*FALLTHROUGH*/
+    case 1:
+        hval += asso_values[static_cast<unsigned char>(str[0] + 20)];
+        break;
+    }
+    return hval + asso_values[static_cast<unsigned char>(str[len - 1])]; // the last byte always contributes; reads str[len - 1], so len must be >= 1
+}
+
+const char* TopLevelDomainLookupHash::is_valid(const char* str, size_t len) {
+    static const char* const wordlist[] = 
{"","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","co.tm","","\340\270\227\340\270\253\340\270\262\340\270\243.\340\271\204\340\270\227\340\270\242","","","","com.mu","","","","","com.so","","\340\270\243\340\270\261\340\270\220\340\270\232\340\270\262\340\270\245.\340\271\
 [...]
+    if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) {
+        unsigned int key = hash(str, len);
+
+        if (key <= MAX_HASH_VALUE) {
+            const char* s = wordlist[key];
+
+            if (*str == *s && !strncmp(str + 1, s + 1, len - 1) && s[len] == 
'\0')
+                return s;
+        }
+    }
+    return nullptr;
+}
+#line 5060 "tldLookup.gperf"
\ No newline at end of file
diff --git a/be/src/vec/functions/url/tldLookup.h 
b/be/src/vec/functions/url/tldLookup.h
new file mode 100644
index 00000000000..9be88890c14
--- /dev/null
+++ b/be/src/vec/functions/url/tldLookup.h
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+// 
https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/tldLookup.h
+// and modified by Doris
+
+#pragma once
+
+#include <cstdlib>
+
+// Declaration of the gperf-generated lookup class; its members are defined in tldLookup.generated.cpp (built from tldLookup.gperf)
+class TopLevelDomainLookupHash {
+private:
+    static inline unsigned int hash(const char* str, size_t len);
+
+public:
+    static const char* is_valid(const char* str, size_t len); // matching table entry for str[0..len), or nullptr if none
+};
+
+using tldLookup = TopLevelDomainLookupHash;
\ No newline at end of file
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
index 8dea4eeb8d2..ed3f2895cc8 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
@@ -139,6 +139,7 @@ import 
org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentDate;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentTime;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentUser;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.CutIpv6;
+import 
org.apache.doris.nereids.trees.expressions.functions.scalar.CutToFirstSignificantSubdomain;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Database;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Date;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.DateDiff;
@@ -180,6 +181,7 @@ import 
org.apache.doris.nereids.trees.expressions.functions.scalar.Exp;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.ExtractUrlParameter;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Field;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.FindInSet;
+import 
org.apache.doris.nereids.trees.expressions.functions.scalar.FirstSignificantSubdomain;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Floor;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Fmod;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Fpow;
@@ -440,6 +442,7 @@ import 
org.apache.doris.nereids.trees.expressions.functions.scalar.ToIso8601;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.ToMonday;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.ToQuantileState;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Tokenize;
+import 
org.apache.doris.nereids.trees.expressions.functions.scalar.TopLevelDomain;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Translate;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Trim;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.TrimIn;
@@ -606,6 +609,7 @@ public class BuiltinScalarFunctions implements 
FunctionHelper {
             scalar(CurrentTime.class, "curtime", "current_time"),
             scalar(CurrentUser.class, "current_user"),
             scalar(CutIpv6.class, "cut_ipv6"),
+            scalar(CutToFirstSignificantSubdomain.class, 
"cut_to_first_significant_subdomain"),
             scalar(Database.class, "database", "schema"),
             scalar(Date.class, "date"),
             scalar(DateDiff.class, "datediff"),
@@ -647,6 +651,7 @@ public class BuiltinScalarFunctions implements 
FunctionHelper {
             scalar(ExtractUrlParameter.class, "extract_url_parameter"),
             scalar(Field.class, "field"),
             scalar(FindInSet.class, "find_in_set"),
+            scalar(FirstSignificantSubdomain.class, 
"first_significant_subdomain"),
             scalar(Floor.class, "floor"),
             scalar(Fmod.class, "fmod"),
             scalar(Fpow.class, "fpow"),
@@ -926,6 +931,7 @@ public class BuiltinScalarFunctions implements 
FunctionHelper {
             scalar(ToIso8601.class, "to_iso8601"),
             scalar(Tokenize.class, "tokenize"),
             scalar(ToMonday.class, "to_monday"),
+            scalar(TopLevelDomain.class, "top_level_domain"),
             scalar(ToQuantileState.class, "to_quantile_state"),
             scalar(Translate.class, "translate"),
             scalar(Trim.class, "trim"),
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/CutToFirstSignificantSubdomain.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/CutToFirstSignificantSubdomain.java
new file mode 100644
index 00000000000..a2e77531e43
--- /dev/null
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/CutToFirstSignificantSubdomain.java
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import 
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.StringType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'cut_to_first_significant_subdomain' (STRING -> STRING). Generated by GenerateFunction.
+ */
+public class CutToFirstSignificantSubdomain extends ScalarFunction
+        implements UnaryExpression, ExplicitlyCastableSignature, 
PropagateNullable {
+
+    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+            
FunctionSignature.ret(StringType.INSTANCE).args(StringType.INSTANCE)
+    );
+
+    /**
+     * Constructor with 1 argument; passes the SQL name "cut_to_first_significant_subdomain" to ScalarFunction.
+     */
+    public CutToFirstSignificantSubdomain(Expression arg) {
+        super("cut_to_first_significant_subdomain", arg);
+    }
+
+    /**
+     * Returns a new instance built from {@code children}; exactly one child is required.
+     */
+    @Override
+    public CutToFirstSignificantSubdomain withChildren(List<Expression> 
children) {
+        Preconditions.checkArgument(children.size() == 1);
+        return new CutToFirstSignificantSubdomain(children.get(0));
+    }
+
+    @Override
+    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+        return visitor.visitCutToFirstSignificantSubdomain(this, context);
+    }
+
+    @Override
+    public List<FunctionSignature> getSignatures() {
+        return SIGNATURES;
+    }
+}
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/FirstSignificantSubdomain.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/FirstSignificantSubdomain.java
new file mode 100644
index 00000000000..1af4dd96e6d
--- /dev/null
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/FirstSignificantSubdomain.java
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import 
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.StringType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'first_significant_subdomain' (STRING -> STRING). Generated by GenerateFunction.
+ */
+public class FirstSignificantSubdomain extends ScalarFunction
+        implements UnaryExpression, ExplicitlyCastableSignature, 
PropagateNullable {
+
+    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+            
FunctionSignature.ret(StringType.INSTANCE).args(StringType.INSTANCE)
+    );
+
+    /**
+     * Constructor with 1 argument; passes the SQL name "first_significant_subdomain" to ScalarFunction.
+     */
+    public FirstSignificantSubdomain(Expression arg) {
+        super("first_significant_subdomain", arg);
+    }
+
+    /**
+     * Returns a new instance built from {@code children}; exactly one child is required.
+     */
+    @Override
+    public FirstSignificantSubdomain withChildren(List<Expression> children) {
+        Preconditions.checkArgument(children.size() == 1);
+        return new FirstSignificantSubdomain(children.get(0));
+    }
+
+    @Override
+    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+        return visitor.visitFirstSignificantSubdomain(this, context);
+    }
+
+    @Override
+    public List<FunctionSignature> getSignatures() {
+        return SIGNATURES;
+    }
+}
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/TopLevelDomain.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/TopLevelDomain.java
new file mode 100644
index 00000000000..05997659a2e
--- /dev/null
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/TopLevelDomain.java
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import 
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.StringType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'top_level_domain' (STRING -> STRING). Generated by GenerateFunction.
+ */
+public class TopLevelDomain extends ScalarFunction
+        implements UnaryExpression, ExplicitlyCastableSignature, 
PropagateNullable {
+
+    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+            
FunctionSignature.ret(StringType.INSTANCE).args(StringType.INSTANCE)
+    );
+
+    /**
+     * Constructor with 1 argument; passes the SQL name "top_level_domain" to ScalarFunction.
+     */
+    public TopLevelDomain(Expression arg) {
+        super("top_level_domain", arg);
+    }
+
+    /**
+     * Returns a new instance built from {@code children}; exactly one child is required.
+     */
+    @Override
+    public TopLevelDomain withChildren(List<Expression> children) {
+        Preconditions.checkArgument(children.size() == 1);
+        return new TopLevelDomain(children.get(0));
+    }
+
+    @Override
+    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+        return visitor.visitTopLevelDomain(this, context);
+    }
+
+    @Override
+    public List<FunctionSignature> getSignatures() {
+        return SIGNATURES;
+    }
+}
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
index c5e9688d3c1..2619731cfc8 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
@@ -146,6 +146,7 @@ import 
org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentDate;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentTime;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentUser;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.CutIpv6;
+import 
org.apache.doris.nereids.trees.expressions.functions.scalar.CutToFirstSignificantSubdomain;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Database;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Date;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.DateDiff;
@@ -188,6 +189,7 @@ import 
org.apache.doris.nereids.trees.expressions.functions.scalar.Exp;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.ExtractUrlParameter;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Field;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.FindInSet;
+import 
org.apache.doris.nereids.trees.expressions.functions.scalar.FirstSignificantSubdomain;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Floor;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Fmod;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Fpow;
@@ -437,6 +439,7 @@ import 
org.apache.doris.nereids.trees.expressions.functions.scalar.ToIso8601;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.ToMonday;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.ToQuantileState;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Tokenize;
+import 
org.apache.doris.nereids.trees.expressions.functions.scalar.TopLevelDomain;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Translate;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Trim;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.TrimIn;
@@ -903,6 +906,11 @@ public interface ScalarFunctionVisitor<R, C> {
         return visitScalarFunction(charFunc, context);
     }
 
+    default R 
visitCutToFirstSignificantSubdomain(CutToFirstSignificantSubdomain 
cutToFirstSignificantSubdomain,
+            C context) {
+        return visitScalarFunction(cutToFirstSignificantSubdomain, context); // delegates to the generic handler
+    }
+
     default R visitEncodeAsSmallInt(EncodeAsSmallInt encode, C context) {
         return visitScalarFunction(encode, context);
     }
@@ -1187,6 +1195,10 @@ public interface ScalarFunctionVisitor<R, C> {
         return visitScalarFunction(findInSet, context);
     }
 
+    default R visitFirstSignificantSubdomain(FirstSignificantSubdomain 
firstSignificantSubdomain, C context) {
+        return visitScalarFunction(firstSignificantSubdomain, context); // delegates to the generic handler
+    }
+
     default R visitFloor(Floor floor, C context) {
         return visitScalarFunction(floor, context);
     }
@@ -2111,6 +2123,10 @@ public interface ScalarFunctionVisitor<R, C> {
         return visitScalarFunction(tokenize, context);
     }
 
+    default R visitTopLevelDomain(TopLevelDomain topLevelDomain, C context) {
+        return visitScalarFunction(topLevelDomain, context); // delegates to the generic handler
+    }
+
     default R visitToQuantileState(ToQuantileState toQuantileState, C context) 
{
         return visitScalarFunction(toQuantileState, context);
     }
diff --git a/gensrc/script/doris_builtins_functions.py 
b/gensrc/script/doris_builtins_functions.py
index 73e68badcda..31b02f9b979 100644
--- a/gensrc/script/doris_builtins_functions.py
+++ b/gensrc/script/doris_builtins_functions.py
@@ -2077,7 +2077,10 @@ visible_functions = {
     "Url": [
         [['domain'], 'STRING', ['STRING'], ''],
         [['domain_without_www'], 'STRING', ['STRING'], ''],
-        [['protocol'], 'STRING', ['STRING'], '']
+        [['protocol'], 'STRING', ['STRING'], ''],
+        [['top_level_domain'], 'STRING', ['STRING'], ''],
+        [['cut_to_first_significant_subdomain'], 'STRING', ['STRING'], ''],
+        [['first_significant_subdomain'], 'STRING', ['STRING'], '']
     ],
 
     # search functions
diff --git 
a/regression-test/data/query_p0/sql_functions/string_functions/test_url_functions.out
 
b/regression-test/data/query_p0/sql_functions/string_functions/test_url_functions.out
new file mode 100644
index 00000000000..ce1ef717975
--- /dev/null
+++ 
b/regression-test/data/query_p0/sql_functions/string_functions/test_url_functions.out
@@ -0,0 +1,121 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !empty_nullable1 --
+
+-- !empty_nullable2 --
+
+-- !empty_nullable3 --
+
+-- !empty_not_nullable1 --
+
+-- !empty_not_nullable2 --
+
+-- !empty_not_nullable3 --
+
+-- !empty_null1 --
+\N
+
+-- !empty_null2 --
+\N
+
+-- !empty_null3 --
+\N
+
+-- !empty_const1 --
+com
+
+-- !empty_const2 --
+baidu
+
+-- !empty_const3 --
+baidu.com
+
+-- !empty_const4 --
+cn
+
+-- !empty_const5 --
+google
+
+-- !empty_const6 --
+google.com.cn
+
+-- !empty_const7 --
+
+
+-- !empty_const8 --
+
+
+-- !empty_const9 --
+
+
+-- !nullable1 --
+1      www.baidu.com   com
+10     https://news.clickhouse.com.tr/ tr
+2      www.google.com.cn       cn
+3      invalid url     
+4              
+5              
+6      \N      \N
+7      xxxxxxxx        
+8      http://www.example.com/a/b/c?a=b        com
+9      https://news.clickhouse.com/    com
+
+-- !nullable2 --
+1      www.baidu.com   baidu
+10     https://news.clickhouse.com.tr/ clickhouse
+2      www.google.com.cn       google
+3      invalid url     
+4              
+5              
+6      \N      \N
+7      xxxxxxxx        
+8      http://www.example.com/a/b/c?a=b        example
+9      https://news.clickhouse.com/    clickhouse
+
+-- !nullable3 --
+1      www.baidu.com   baidu.com
+10     https://news.clickhouse.com.tr/ clickhouse.com.tr
+2      www.google.com.cn       google.com.cn
+3      invalid url     
+4              
+5              
+6      \N      \N
+7      xxxxxxxx        
+8      http://www.example.com/a/b/c?a=b        example.com
+9      https://news.clickhouse.com/    clickhouse.com
+
+-- !not_nullable1 --
+1      www.baidu.com   com
+10     https://news.clickhouse.com.tr/ tr
+2      www.google.com.cn       cn
+3      invalid url     
+4              
+5              
+6              
+7      xxxxxxxx        
+8      http://www.example.com/a/b/c?a=b        com
+9      https://news.clickhouse.com/    com
+
+-- !not_nullable2 --
+1      www.baidu.com   baidu
+10     https://news.clickhouse.com.tr/ clickhouse
+2      www.google.com.cn       google
+3      invalid url     
+4              
+5              
+6              
+7      xxxxxxxx        
+8      http://www.example.com/a/b/c?a=b        example
+9      https://news.clickhouse.com/    clickhouse
+
+-- !not_nullable3 --
+1      www.baidu.com   baidu.com
+10     https://news.clickhouse.com.tr/ clickhouse.com.tr
+2      www.google.com.cn       google.com.cn
+3      invalid url     
+4              
+5              
+6              
+7      xxxxxxxx        
+8      http://www.example.com/a/b/c?a=b        example.com
+9      https://news.clickhouse.com/    clickhouse.com
+
diff --git 
a/regression-test/suites/query_p0/sql_functions/string_functions/test_url_functions.groovy
 
b/regression-test/suites/query_p0/sql_functions/string_functions/test_url_functions.groovy
new file mode 100644
index 00000000000..389020b63e2
--- /dev/null
+++ 
b/regression-test/suites/query_p0/sql_functions/string_functions/test_url_functions.groovy
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_url_functions") {
+    sql " drop table if exists test_url_functions"
+    sql """
+        create table test_url_functions (
+            id int,
+            s1 string not null,
+            s2 string null
+        )
+        DISTRIBUTED BY HASH(id)
+        PROPERTIES
+        (
+            "replication_num" = "1"
+        );
+    """
+
+    // Empty table (no rows inserted yet): run each function on the nullable column s2 and the not-null column s1.
+    order_qt_empty_nullable1 "select top_level_domain(s2) from 
test_url_functions"
+    order_qt_empty_nullable2 "select first_significant_subdomain(s2) from 
test_url_functions"
+    order_qt_empty_nullable3 "select cut_to_first_significant_subdomain(s2) 
from test_url_functions"
+    order_qt_empty_not_nullable1 "select top_level_domain(s1) from 
test_url_functions"
+    order_qt_empty_not_nullable2 "select first_significant_subdomain(s1) from 
test_url_functions"
+    order_qt_empty_not_nullable3 "select 
cut_to_first_significant_subdomain(s1) from test_url_functions"
+
+    // Constant NULL argument, no table involved.
+    order_qt_empty_null1 "select top_level_domain(NULL)"
+    order_qt_empty_null2 "select first_significant_subdomain(NULL)"
+    order_qt_empty_null3 "select cut_to_first_significant_subdomain(NULL)"
+    
+    // Valid host names passed as string constants (single-label and two-label suffixes).
+    order_qt_empty_const1 "select top_level_domain('www.baidu.com')"
+    order_qt_empty_const2 "select first_significant_subdomain('www.baidu.com')"
+    order_qt_empty_const3 "select 
cut_to_first_significant_subdomain('www.baidu.com')"
+    order_qt_empty_const4 "select top_level_domain('www.google.com.cn')"
+    order_qt_empty_const5 "select 
first_significant_subdomain('www.google.com.cn')"
+    order_qt_empty_const6 "select 
cut_to_first_significant_subdomain('www.google.com.cn')"
+    
+    // Invalid URL passed as a string constant (the literal's spelling is runtime text; keep it as is).
+    order_qt_empty_const7 "select top_level_domain('I am invaild url')"
+    order_qt_empty_const8 "select first_significant_subdomain('I am invaild 
url')"
+    order_qt_empty_const9 "select cut_to_first_significant_subdomain('I am 
invaild url')"
+    
+    // Load rows: host names, a non-URL string, empty and blank strings, a NULL s2 (id 6), a dotless token, full URLs.
+    sql """ insert into test_url_functions values (1, 'www.baidu.com', 
'www.baidu.com'); """
+    sql """ insert into test_url_functions values (2, 'www.google.com.cn', 
'www.google.com.cn'); """
+    sql """ insert into test_url_functions values (3, 'invalid url', 'invalid 
url'); """
+    sql """ insert into test_url_functions values (4, '', ''); """
+    sql """ insert into test_url_functions values (5, ' ', ' '); """
+    sql """ insert into test_url_functions values (6, ' ', NULL); """
+    sql """ insert into test_url_functions values (7, 'xxxxxxxx', 'xxxxxxxx'); 
"""
+    sql """ insert into test_url_functions values (8, 
'http://www.example.com/a/b/c?a=b', 'http://www.example.com/a/b/c?a=b'); """
+    sql """ insert into test_url_functions values (9, 
'https://news.clickhouse.com/', 'https://news.clickhouse.com/'); """
+    sql """ insert into test_url_functions values (10, 
'https://news.clickhouse.com.tr/', 'https://news.clickhouse.com.tr/'); """
+
+    order_qt_nullable1 "select id,s2,top_level_domain(s2) from 
test_url_functions order by id"
+    order_qt_nullable2 "select id,s2,first_significant_subdomain(s2) from 
test_url_functions order by id"
+    order_qt_nullable3 "select id,s2,cut_to_first_significant_subdomain(s2) 
from test_url_functions order by id"
+
+    order_qt_not_nullable1 "select id,s1,top_level_domain(s1) from 
test_url_functions order by id"
+    order_qt_not_nullable2 "select id,s1,first_significant_subdomain(s1) from 
test_url_functions order by id"
+    order_qt_not_nullable3 "select 
id,s1,cut_to_first_significant_subdomain(s1) from test_url_functions order by 
id"
+
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to