This is an automated email from the ASF dual-hosted git repository. zhangstar333 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 8843efc1a3d [feature](function)support url domain functions (#42488) 8843efc1a3d is described below commit 8843efc1a3d28518e76b1681ca54a0715e491213 Author: zhangstar333 <87313068+zhangstar...@users.noreply.github.com> AuthorDate: Wed Oct 30 17:33:39 2024 +0800 [feature](function)support url domain functions (#42488) ## Proposed changes support top_level_domain/first_significant_subdomain/cut_to_first_significant_subdomain functions doc: https://github.com/apache/doris-website/pull/1230 <!--Describe your changes.--> --- be/src/vec/functions/url/domain.h | 127 +++++- be/src/vec/functions/url/find_symbols.h | 481 +++++++++++++++++++++ be/src/vec/functions/url/function_url.cpp | 23 + be/src/vec/functions/url/functions_url.h | 11 - be/src/vec/functions/url/tldLookup.generated.cpp | 140 ++++++ be/src/vec/functions/url/tldLookup.h | 34 ++ .../doris/catalog/BuiltinScalarFunctions.java | 6 + .../scalar/CutToFirstSignificantSubdomain.java | 68 +++ .../scalar/FirstSignificantSubdomain.java | 68 +++ .../functions/scalar/TopLevelDomain.java | 68 +++ .../expressions/visitor/ScalarFunctionVisitor.java | 16 + gensrc/script/doris_builtins_functions.py | 5 +- .../string_functions/test_url_functions.out | 121 ++++++ .../string_functions/test_url_functions.groovy | 79 ++++ 14 files changed, 1234 insertions(+), 13 deletions(-) diff --git a/be/src/vec/functions/url/domain.h b/be/src/vec/functions/url/domain.h index 54361134eff..b2ec5e0c9d9 100644 --- a/be/src/vec/functions/url/domain.h +++ b/be/src/vec/functions/url/domain.h @@ -20,11 +20,12 @@ #pragma once -// #include <base/find_symbols.h> #include <cstring> #include "vec/common/string_utils/string_utils.h" +#include "vec/functions/url/find_symbols.h" #include "vec/functions/url/protocol.h" +#include "vec/functions/url/tldLookup.h" namespace doris::vectorized { @@ -144,4 +145,128 @@ struct ExtractDomain { } }; +struct ExtractTopLevelDomain { + static size_t 
get_reserve_length_for_element() { return 5; } + + static void execute(const char* data, size_t size, const char*& res_data, size_t& res_size) { + res_data = data; + res_size = 0; + StringRef host = get_url_host(data, size); + + if (host.size == 0) { + return; + } else { + auto host_view = host.to_string_view(); + if (host_view[host_view.size() - 1] == '.') { + host_view.remove_suffix(1); + } + + const auto* host_end = host_view.data() + host_view.size(); + const char* last_dot = find_last_symbols_or_null<'.'>(host_view.data(), host_end); + if (!last_dot) { + return; + } + + /// For IPv4 addresses select nothing. + /// + /// NOTE: it is safe to access last_dot[1] + /// since getURLHost() will not return a host if there is symbol after dot. + if (is_numeric_ascii(last_dot[1])) { + return; + } + + res_data = last_dot + 1; + res_size = host_end - res_data; + } + } +}; + +struct ExtractFirstSignificantSubdomain { + static size_t get_reserve_length_for_element() { return 10; } + + static void execute(const Pos data, const size_t size, Pos& res_data, size_t& res_size, + Pos* out_domain_end = nullptr) { + res_data = data; + res_size = 0; + + Pos tmp; + size_t domain_length = 0; + ExtractDomain<true>::execute(data, size, tmp, domain_length); + + if (domain_length == 0) { + return; + } + if (out_domain_end) { + *out_domain_end = tmp + domain_length; + } + + /// cut useless dot + if (tmp[domain_length - 1] == '.') { + --domain_length; + } + + res_data = tmp; + res_size = domain_length; + + const auto* begin = tmp; + const auto* end = begin + domain_length; + std::array<const char*, 3> last_periods {}; + + const auto* pos = find_first_symbols<'.'>(begin, end); + while (pos < end) { + last_periods[2] = last_periods[1]; + last_periods[1] = last_periods[0]; + last_periods[0] = pos; + pos = find_first_symbols<'.'>(pos + 1, end); + } + + if (!last_periods[0]) { + return; + } + + if (!last_periods[1]) { + res_size = last_periods[0] - begin; + return; + } + + if (!last_periods[2]) { 
+ last_periods[2] = begin - 1; + } + + const auto* end_of_level_domain = find_first_symbols<'/'>(last_periods[0], end); + if (!end_of_level_domain) { + end_of_level_domain = end; + } + + auto host_len = static_cast<size_t>(end_of_level_domain - last_periods[1] - 1); + StringRef host {last_periods[1] + 1, host_len}; + if (tldLookup::is_valid(host.data, host.size)) { + res_data += last_periods[2] + 1 - begin; + res_size = last_periods[1] - last_periods[2] - 1; + } else { + res_data += last_periods[1] + 1 - begin; + res_size = last_periods[0] - last_periods[1] - 1; + } + } +}; + +struct CutToFirstSignificantSubdomain { + static size_t get_reserve_length_for_element() { return 15; } + + static void execute(const Pos data, const size_t size, Pos& res_data, size_t& res_size) { + res_data = data; + res_size = 0; + + Pos tmp_data = data; + size_t tmp_length; + Pos domain_end = data; + ExtractFirstSignificantSubdomain::execute(data, size, tmp_data, tmp_length, &domain_end); + + if (tmp_length == 0) { + return; + } + res_data = tmp_data; + res_size = domain_end - tmp_data; + } +}; } // namespace doris::vectorized diff --git a/be/src/vec/functions/url/find_symbols.h b/be/src/vec/functions/url/find_symbols.h new file mode 100644 index 00000000000..7af95ce06bd --- /dev/null +++ b/be/src/vec/functions/url/find_symbols.h @@ -0,0 +1,481 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/base/base/find_symbols.h +// and modified by Doris + +#pragma once + +#include <array> +#include <cstdint> +#include <string> + +#if defined(__SSE4_2__) +#include <nmmintrin.h> +#endif + +/** find_first_symbols<c1, c2, ...>(begin, end): + * + * Allows searching for the next character from the set of 'symbols...' in a string. + * It is similar to 'strpbrk', 'strcspn' (and 'strchr', 'memchr' in the case of one symbol and '\0'), + * but with the following differences: + * - works with any memory range, including ones containing zero bytes; + * - doesn't require a terminating zero byte: the end of the memory range is passed explicitly; + * - if not found, returns a pointer to end instead of nullptr; + * - the maximum number of symbols to search for is 16. + * + * Uses SSE 2 for a small number of search symbols and SSE 4.2 for a large number of symbols, + * which gives more than a 2x performance advantage over a trivial loop + * when parsing a tab-separated dump with (probably escaped) string fields. + * When parsing a tab-separated dump with short strings, there is no performance degradation compared to a trivial loop. + * + * Note: the optimal threshold to choose between SSE 2 and SSE 4.2 may depend on the CPU model. + * + * find_last_symbols_or_null<c1, c2, ...>(begin, end): + * + * Allows searching for the last matching character in a string. + * If there is no such character, returns nullptr.
+ */ + +struct SearchSymbols { + static constexpr auto BUFFER_SIZE = 16; + + SearchSymbols() = default; + + explicit SearchSymbols(std::string in) : str(std::move(in)) { +#if defined(__SSE4_2__) + if (str.size() > BUFFER_SIZE) { + throw std::runtime_error("SearchSymbols can contain at most " + + std::to_string(BUFFER_SIZE) + " symbols and " + + std::to_string(str.size()) + " was provided\n"); + } + + char tmp_safety_buffer[BUFFER_SIZE] = {0}; + + memcpy(tmp_safety_buffer, str.data(), str.size()); + + simd_vector = _mm_loadu_si128(reinterpret_cast<const __m128i*>(tmp_safety_buffer)); +#endif + } + +#if defined(__SSE4_2__) + __m128i simd_vector; +#endif + std::string str; +}; + +namespace detail { +template <char... chars> +constexpr bool is_in(char x) { + return ((x == chars) || ...); +} // NOLINT(misc-redundant-expression) + +static bool is_in(char c, const char* symbols, size_t num_chars) { + for (size_t i = 0U; i < num_chars; ++i) { + if (c == symbols[i]) { + return true; + } + } + + return false; +} + +#if defined(__SSE2__) +template <char s0> +inline __m128i mm_is_in(__m128i bytes) { + __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0)); + return eq0; +} + +template <char s0, char s1, char... 
tail> +inline __m128i mm_is_in(__m128i bytes) { + __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0)); + __m128i eq = mm_is_in<s1, tail...>(bytes); + return _mm_or_si128(eq0, eq); +} + +inline __m128i mm_is_in(__m128i bytes, const char* symbols, size_t num_chars) { + __m128i accumulator = _mm_setzero_si128(); + for (size_t i = 0; i < num_chars; ++i) { + __m128i eq = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(symbols[i])); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} + +inline std::array<__m128i, 16u> mm_is_in_prepare(const char* symbols, size_t num_chars) { + std::array<__m128i, 16u> result {}; + + for (size_t i = 0; i < num_chars; ++i) { + result[i] = _mm_set1_epi8(symbols[i]); + } + + return result; +} + +inline __m128i mm_is_in_execute(__m128i bytes, const std::array<__m128i, 16u>& needles) { + __m128i accumulator = _mm_setzero_si128(); + + for (const auto& needle : needles) { + __m128i eq = _mm_cmpeq_epi8(bytes, needle); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} +#endif + +template <bool positive> +constexpr bool maybe_negate(bool x) { + return x == positive; +} + +template <bool positive> +constexpr uint16_t maybe_negate(uint16_t x) { + if constexpr (positive) + return x; + else + return ~x; +} + +enum class ReturnMode : uint8_t { + End, + Nullptr, +}; + +template <bool positive, ReturnMode return_mode, char... 
symbols> +inline const char* find_first_symbols_sse2(const char* const begin, const char* const end) { + const char* pos = begin; + +#if defined(__SSE2__) + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + __m128i eq = mm_is_in<symbols...>(bytes); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) return pos + __builtin_ctz(bit_mask); + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in<symbols...>(*pos))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_sse2(const char* const begin, const char* const end, + const char* symbols, size_t num_chars) { + const char* pos = begin; + +#if defined(__SSE2__) + const auto needles = mm_is_in_prepare(symbols, num_chars); + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + __m128i eq = mm_is_in_execute(bytes, needles); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) return pos + __builtin_ctz(bit_mask); + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in(*pos, symbols, num_chars))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode, char... symbols> +inline const char* find_last_symbols_sse2(const char* const begin, const char* const end) { + const char* pos = end; + +#if defined(__SSE2__) + for (; pos - 16 >= begin; + pos -= + 16) /// Assuming the pointer cannot overflow. Assuming we can compare these pointers. 
+ { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos - 16)); + + __m128i eq = mm_is_in<symbols...>(bytes); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) + return pos - 1 - + (__builtin_clz(bit_mask) - + 16); /// because __builtin_clz works with mask as uint32. + } +#endif + + --pos; + for (; pos >= begin; --pos) + if (maybe_negate<positive>(is_in<symbols...>(*pos))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode, size_t num_chars, char c01, char c02 = 0, + char c03 = 0, char c04 = 0, char c05 = 0, char c06 = 0, char c07 = 0, char c08 = 0, + char c09 = 0, char c10 = 0, char c11 = 0, char c12 = 0, char c13 = 0, char c14 = 0, + char c15 = 0, char c16 = 0> +inline const char* find_first_symbols_sse42(const char* const begin, const char* const end) { + const char* pos = begin; + +#if defined(__SSE4_2__) + constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + + __m128i set = _mm_setr_epi8(c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, + c14, c15, c16); + + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + if constexpr (positive) { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode); + } else { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) + return pos + + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY); + } + } +#endif + + for (; pos < end; ++pos) + if ((num_chars == 1 && maybe_negate<positive>(is_in<c01>(*pos))) || + (num_chars == 2 && maybe_negate<positive>(is_in<c01, c02>(*pos))) || + (num_chars == 3 && maybe_negate<positive>(is_in<c01, c02, c03>(*pos))) || + (num_chars == 4 && maybe_negate<positive>(is_in<c01, c02, c03, c04>(*pos))) || + (num_chars == 5 && 
maybe_negate<positive>(is_in<c01, c02, c03, c04, c05>(*pos))) || + (num_chars == 6 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06>(*pos))) || + (num_chars == 7 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07>(*pos))) || + (num_chars == 8 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08>(*pos))) || + (num_chars == 9 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09>(*pos))) || + (num_chars == 10 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10>(*pos))) || + (num_chars == 11 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11>(*pos))) || + (num_chars == 12 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12>(*pos))) || + (num_chars == 13 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13>( + *pos))) || + (num_chars == 14 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14>( + *pos))) || + (num_chars == 15 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, + c12, c13, c14, c15>(*pos))) || + (num_chars == 16 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, + c12, c13, c14, c15, c16>(*pos)))) + return pos; + return return_mode == ReturnMode::End ? 
end : nullptr; +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_sse42(const char* const begin, const char* const end, + const SearchSymbols& symbols) { + const char* pos = begin; + + const auto num_chars = symbols.str.size(); + +#if defined(__SSE4_2__) + constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + + const __m128i set = symbols.simd_vector; + + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + if constexpr (positive) { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode); + } else { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) + return pos + + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY); + } + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in(*pos, symbols.str.data(), num_chars))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +/// NOTE No SSE 4.2 implementation for find_last_symbols_or_null. Not worth to do. + +template <bool positive, ReturnMode return_mode, char... 
symbols> +inline const char* find_first_symbols_dispatch(const char* begin, const char* end) + requires(0 <= sizeof...(symbols) && sizeof...(symbols) <= 16) +{ +#if defined(__SSE4_2__) + if (sizeof...(symbols) >= 5) + return find_first_symbols_sse42<positive, return_mode, sizeof...(symbols), symbols...>( + begin, end); + else +#endif + return find_first_symbols_sse2<positive, return_mode, symbols...>(begin, end); +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_dispatch(const std::string_view haystack, + const SearchSymbols& symbols) { +#if defined(__SSE4_2__) + if (symbols.str.size() >= 5) + return find_first_symbols_sse42<positive, return_mode>(haystack.begin(), haystack.end(), + symbols); + else +#endif + return find_first_symbols_sse2<positive, return_mode>( + haystack.begin(), haystack.end(), symbols.str.data(), symbols.str.size()); +} + +} // namespace detail + +template <char... symbols> +inline const char* find_first_symbols(const char* begin, const char* end) { + return detail::find_first_symbols_dispatch<true, detail::ReturnMode::End, symbols...>(begin, + end); +} + +/// Returning non const result for non const arguments. +/// It is convenient when you are using this function to iterate through non-const buffer. +template <char... symbols> +inline char* find_first_symbols(char* begin, char* end) { + return const_cast<char*>( + detail::find_first_symbols_dispatch<true, detail::ReturnMode::End, symbols...>(begin, + end)); +} + +inline const char* find_first_symbols(std::string_view haystack, const SearchSymbols& symbols) { + return detail::find_first_symbols_dispatch<true, detail::ReturnMode::End>(haystack, symbols); +} + +template <char... symbols> +inline const char* find_first_not_symbols(const char* begin, const char* end) { + return detail::find_first_symbols_dispatch<false, detail::ReturnMode::End, symbols...>(begin, + end); +} + +template <char... 
symbols> +inline char* find_first_not_symbols(char* begin, char* end) { + return const_cast<char*>( + detail::find_first_symbols_dispatch<false, detail::ReturnMode::End, symbols...>(begin, + end)); +} + +inline const char* find_first_not_symbols(std::string_view haystack, const SearchSymbols& symbols) { + return detail::find_first_symbols_dispatch<false, detail::ReturnMode::End>(haystack, symbols); +} + +template <char... symbols> +inline const char* find_first_symbols_or_null(const char* begin, const char* end) { + return detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr, symbols...>(begin, + end); +} + +template <char... symbols> +inline char* find_first_symbols_or_null(char* begin, char* end) { + return const_cast<char*>( + detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr, symbols...>( + begin, end)); +} + +inline const char* find_first_symbols_or_null(std::string_view haystack, + const SearchSymbols& symbols) { + return detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr>(haystack, + symbols); +} + +template <char... symbols> +inline const char* find_first_not_symbols_or_null(const char* begin, const char* end) { + return detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr, symbols...>( + begin, end); +} + +template <char... symbols> +inline char* find_first_not_symbols_or_null(char* begin, char* end) { + return const_cast<char*>( + detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr, symbols...>( + begin, end)); +} + +inline const char* find_first_not_symbols_or_null(std::string_view haystack, + const SearchSymbols& symbols) { + return detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr>(haystack, + symbols); +} + +template <char... 
symbols> +inline const char* find_last_symbols_or_null(const char* begin, const char* end) { + return detail::find_last_symbols_sse2<true, detail::ReturnMode::Nullptr, symbols...>(begin, + end); +} + +template <char... symbols> +inline char* find_last_symbols_or_null(char* begin, char* end) { + return const_cast<char*>( + detail::find_last_symbols_sse2<true, detail::ReturnMode::Nullptr, symbols...>(begin, + end)); +} + +template <char... symbols> +inline const char* find_last_not_symbols_or_null(const char* begin, const char* end) { + return detail::find_last_symbols_sse2<false, detail::ReturnMode::Nullptr, symbols...>(begin, + end); +} + +template <char... symbols> +inline char* find_last_not_symbols_or_null(char* begin, char* end) { + return const_cast<char*>( + detail::find_last_symbols_sse2<false, detail::ReturnMode::Nullptr, symbols...>(begin, + end)); +} + +/// Slightly resembles boost::split. The drawback of boost::split is that it fires a false positive in clang static analyzer. +/// See https://github.com/boostorg/algorithm/issues/63 +/// And https://bugs.llvm.org/show_bug.cgi?id=41141 +template <char... 
symbols, typename To> +inline To& splitInto(To& to, std::string_view what, bool token_compress = false) { + const char* pos = what.data(); + const char* end = pos + what.size(); + while (pos < end) { + const char* delimiter_or_end = find_first_symbols<symbols...>(pos, end); + + if (!token_compress || pos < delimiter_or_end) to.emplace_back(pos, delimiter_or_end - pos); + + if (delimiter_or_end < end) + pos = delimiter_or_end + 1; + else + pos = delimiter_or_end; + } + + return to; +} diff --git a/be/src/vec/functions/url/function_url.cpp b/be/src/vec/functions/url/function_url.cpp index e25af6f7f27..47afe076b74 100644 --- a/be/src/vec/functions/url/function_url.cpp +++ b/be/src/vec/functions/url/function_url.cpp @@ -46,10 +46,33 @@ struct NameProtocol { using FunctionProtocol = FunctionStringToString<ExtractSubstringImpl<ExtractProtocol>, NameProtocol>; +struct NameTopLevelDomain { + static constexpr auto name = "top_level_domain"; +}; +using FunctionTopLevelDomain = + FunctionStringToString<ExtractSubstringImpl<ExtractTopLevelDomain>, NameTopLevelDomain>; + +struct NameFirstSignificantSubdomain { + static constexpr auto name = "first_significant_subdomain"; +}; +using FunctionFirstSignificantSubdomain = + FunctionStringToString<ExtractSubstringImpl<ExtractFirstSignificantSubdomain>, + NameFirstSignificantSubdomain>; + +struct NameCutToFirstSignificantSubdomain { + static constexpr auto name = "cut_to_first_significant_subdomain"; +}; +using FunctionCutToFirstSignificantSubdomain = + FunctionStringToString<ExtractSubstringImpl<CutToFirstSignificantSubdomain>, + NameCutToFirstSignificantSubdomain>; + void register_function_url(SimpleFunctionFactory& factory) { factory.register_function<FunctionDomain>(); factory.register_function<FunctionDomainWithoutWWW>(); factory.register_function<FunctionProtocol>(); + factory.register_function<FunctionTopLevelDomain>(); + factory.register_function<FunctionFirstSignificantSubdomain>(); + 
factory.register_function<FunctionCutToFirstSignificantSubdomain>(); } } // namespace doris::vectorized diff --git a/be/src/vec/functions/url/functions_url.h b/be/src/vec/functions/url/functions_url.h index f9f02a17a66..b6736496d24 100644 --- a/be/src/vec/functions/url/functions_url.h +++ b/be/src/vec/functions/url/functions_url.h @@ -89,7 +89,6 @@ struct ExtractSubstringImpl { for (size_t i = 0; i < size; ++i) { Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]), offsets[i] - prev_offset, start, length); - res_data.resize(res_data.size() + length); memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length); res_offset += length; @@ -105,11 +104,6 @@ struct ExtractSubstringImpl { Extractor::execute(data.data(), data.size(), start, length); res_data.assign(start, length); } - - // static void vector_fixed(const ColumnString::Chars &, size_t, ColumnString::Chars &) - // { - // throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN); - // } }; /** Delete part of string using the Extractor. @@ -155,11 +149,6 @@ struct CutSubstringImpl { res_data.append(data.data(), start); res_data.append(start + length, data.data() + data.size()); } - - // static void vector_fixed(const ColumnString::Chars &, size_t, ColumnString::Chars &) - // { - // throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN); - // } }; } // namespace doris::vectorized diff --git a/be/src/vec/functions/url/tldLookup.generated.cpp b/be/src/vec/functions/url/tldLookup.generated.cpp new file mode 100644 index 00000000000..9b9471c094d --- /dev/null +++ b/be/src/vec/functions/url/tldLookup.generated.cpp @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/tldLookup.generated.cpp +// and modified by Doris + +// clang-format off +/* C++ code produced by gperf version 3.1 */ +/* Command-line: /usr/bin/gperf --output-file=tldLookup.generated.cpp tldLookup.gperf */ +/* Computed positions: -k'1-11,13-14,17,$' */ + +#if !( \ + (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) && ('B [...] +/* The character set is not based on ISO-646. */ +#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gp...@gnu.org>." 
+#endif + +#line 7 "tldLookup.gperf" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wimplicit-fallthrough" +#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant" +#pragma GCC diagnostic ignored "-Wunused-macros" +#include <cstring> + +#define TOTAL_KEYWORDS 5045 +#define MIN_WORD_LENGTH 4 +#define MAX_WORD_LENGTH 34 +#define MIN_HASH_VALUE 75 +#define MAX_HASH_VALUE 110600 +/* maximum key range = 110526, duplicates = 0 */ + +class TopLevelDomainLookupHash { +private: + static inline unsigned int hash(const char* str, size_t len); + +public: + static const char* is_valid(const char* str, size_t len); +}; + +inline unsigned int TopLevelDomainLookupHash::hash(const char* str, size_t len) { + static const unsigned int asso_values[] = {110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, + 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, + 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 3905, 0, 5, + 11617, 15312, 10, 5, 25, 0, 25, 0, 5, 0, 0, 110601, 110601, 110601, 5, 110601, + 110601, 110601, 110601, 110601, 30, 20, 5, 15, 10, 65, 45, 80, 70, 55, 110601, 110601, + 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, + 110601, 2570, 9477, 1350, 15, 130, 5915, 1830, 4360, 2210, 5405, 63, 3190, 20, 1165, 5, + 6120, 5863, 470, 2315, 175, 0, 815, 40, 13577, 115, 5680, 1030, 11798, 23179, 345, 1097, + 28079, 13839, 245, 25674, 31874, 75, 31774, 7351, 27474, 190, 16044, 8040, 50, 25, 35, 55, + 0, 0, 30, 0, 10, 0, 0, 0, 35, 0, 55, 10, 5, 65, 0, 60, + 0, 25, 5, 30, 0, 5, 10, 0, 20, 5, 5, 35, 5, 0, 0, 0, + 0, 0, 15, 0, 5, 5, 0, 5, 5, 5, 0, 0, 0, 0, 0, 15, + 5, 110601, 110601, 5, 10, 45, 5, 110601, 0, 110601, 110601, 110601, 110601, 110601, 110601, 110601, + 0, 0, 0, 0, 
110601, 110601, 110601, 45, 0, 0, 0, 0, 110601, 110601, 110601, 110601, + 0, 0, 110601, 0, 0, 0, 0, 5, 0, 5, 30, 0, 0, 110601, 110601, 110601, + 110601, 110601, 110601, 110601, 0, 110601, 110601, 110601, 0, 0, 5, 0, 20, 40, 110601, 110601, + 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, + 110601, 110601, 110601, 110601}; + unsigned int hval = len; + + switch (hval) { + default: + hval += asso_values[static_cast<unsigned char>(str[16])]; + /*FALLTHROUGH*/ + case 16: + case 15: + case 14: + hval += asso_values[static_cast<unsigned char>(str[13] + 1)]; + /*FALLTHROUGH*/ + case 13: + hval += asso_values[static_cast<unsigned char>(str[12])]; + /*FALLTHROUGH*/ + case 12: + case 11: + hval += asso_values[static_cast<unsigned char>(str[10])]; + /*FALLTHROUGH*/ + case 10: + hval += asso_values[static_cast<unsigned char>(str[9])]; + /*FALLTHROUGH*/ + case 9: + hval += asso_values[static_cast<unsigned char>(str[8] + 1)]; + /*FALLTHROUGH*/ + case 8: + hval += asso_values[static_cast<unsigned char>(str[7])]; + /*FALLTHROUGH*/ + case 7: + hval += asso_values[static_cast<unsigned char>(str[6] + 3)]; + /*FALLTHROUGH*/ + case 6: + hval += asso_values[static_cast<unsigned char>(str[5])]; + /*FALLTHROUGH*/ + case 5: + hval += asso_values[static_cast<unsigned char>(str[4] + 2)]; + /*FALLTHROUGH*/ + case 4: + hval += asso_values[static_cast<unsigned char>(str[3] + 1)]; + /*FALLTHROUGH*/ + case 3: + hval += asso_values[static_cast<unsigned char>(str[2])]; + /*FALLTHROUGH*/ + case 2: + hval += asso_values[static_cast<unsigned char>(str[1])]; + /*FALLTHROUGH*/ + case 1: + hval += asso_values[static_cast<unsigned char>(str[0] + 20)]; + break; + } + return hval + asso_values[static_cast<unsigned char>(str[len - 1])]; +} + +const char* TopLevelDomainLookupHash::is_valid(const char* str, size_t len) { + static const char* const wordlist[] = 
{"","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","co.tm","","\340\270\227\340\270\253\340\270\262\340\270\243.\340\271\204\340\270\227\340\270\242","","","","com.mu","","","","","com.so","","\340\270\243\340\270\261\340\270\220\340\270\232\340\270\262\340\270\245.\340\271\ [...] + if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) { + unsigned int key = hash(str, len); + + if (key <= MAX_HASH_VALUE) { + const char* s = wordlist[key]; + + if (*str == *s && !strncmp(str + 1, s + 1, len - 1) && s[len] == '\0') + return s; + } + } + return nullptr; +} +#line 5060 "tldLookup.gperf" \ No newline at end of file diff --git a/be/src/vec/functions/url/tldLookup.h b/be/src/vec/functions/url/tldLookup.h new file mode 100644 index 00000000000..9be88890c14 --- /dev/null +++ b/be/src/vec/functions/url/tldLookup.h @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/tldLookup.h +// and modified by Doris + +#pragma once + +#include <cstdlib> + +// Definition of the class generated by gperf, present on gperf/tldLookup.gperf +class TopLevelDomainLookupHash { +private: + static inline unsigned int hash(const char* str, size_t len); + +public: + static const char* is_valid(const char* str, size_t len); +}; + +using tldLookup = TopLevelDomainLookupHash; \ No newline at end of file diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java index 8dea4eeb8d2..ed3f2895cc8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java @@ -139,6 +139,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentDate; import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentTime; import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentUser; import org.apache.doris.nereids.trees.expressions.functions.scalar.CutIpv6; +import org.apache.doris.nereids.trees.expressions.functions.scalar.CutToFirstSignificantSubdomain; import org.apache.doris.nereids.trees.expressions.functions.scalar.Database; import org.apache.doris.nereids.trees.expressions.functions.scalar.Date; import org.apache.doris.nereids.trees.expressions.functions.scalar.DateDiff; @@ -180,6 +181,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Exp; import org.apache.doris.nereids.trees.expressions.functions.scalar.ExtractUrlParameter; import org.apache.doris.nereids.trees.expressions.functions.scalar.Field; import org.apache.doris.nereids.trees.expressions.functions.scalar.FindInSet; +import org.apache.doris.nereids.trees.expressions.functions.scalar.FirstSignificantSubdomain; import 
org.apache.doris.nereids.trees.expressions.functions.scalar.Floor; import org.apache.doris.nereids.trees.expressions.functions.scalar.Fmod; import org.apache.doris.nereids.trees.expressions.functions.scalar.Fpow; @@ -440,6 +442,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.ToIso8601; import org.apache.doris.nereids.trees.expressions.functions.scalar.ToMonday; import org.apache.doris.nereids.trees.expressions.functions.scalar.ToQuantileState; import org.apache.doris.nereids.trees.expressions.functions.scalar.Tokenize; +import org.apache.doris.nereids.trees.expressions.functions.scalar.TopLevelDomain; import org.apache.doris.nereids.trees.expressions.functions.scalar.Translate; import org.apache.doris.nereids.trees.expressions.functions.scalar.Trim; import org.apache.doris.nereids.trees.expressions.functions.scalar.TrimIn; @@ -606,6 +609,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(CurrentTime.class, "curtime", "current_time"), scalar(CurrentUser.class, "current_user"), scalar(CutIpv6.class, "cut_ipv6"), + scalar(CutToFirstSignificantSubdomain.class, "cut_to_first_significant_subdomain"), scalar(Database.class, "database", "schema"), scalar(Date.class, "date"), scalar(DateDiff.class, "datediff"), @@ -647,6 +651,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(ExtractUrlParameter.class, "extract_url_parameter"), scalar(Field.class, "field"), scalar(FindInSet.class, "find_in_set"), + scalar(FirstSignificantSubdomain.class, "first_significant_subdomain"), scalar(Floor.class, "floor"), scalar(Fmod.class, "fmod"), scalar(Fpow.class, "fpow"), @@ -926,6 +931,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(ToIso8601.class, "to_iso8601"), scalar(Tokenize.class, "tokenize"), scalar(ToMonday.class, "to_monday"), + scalar(TopLevelDomain.class, "top_level_domain"), scalar(ToQuantileState.class, "to_quantile_state"), scalar(Translate.class, "translate"), 
scalar(Trim.class, "trim"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/CutToFirstSignificantSubdomain.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/CutToFirstSignificantSubdomain.java new file mode 100644 index 00000000000..a2e77531e43 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/CutToFirstSignificantSubdomain.java @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.trees.expressions.functions.scalar; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; +import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.StringType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * ScalarFunction 'CutToFirstSignificantSubdomain'. This class is generated by GenerateFunction. + */ +public class CutToFirstSignificantSubdomain extends ScalarFunction + implements UnaryExpression, ExplicitlyCastableSignature, PropagateNullable { + + public static final List<FunctionSignature> SIGNATURES = ImmutableList.of( + FunctionSignature.ret(StringType.INSTANCE).args(StringType.INSTANCE) + ); + + /** + * constructor with 1 argument. + */ + public CutToFirstSignificantSubdomain(Expression arg) { + super("cut_to_first_significant_subdomain", arg); + } + + /** + * withChildren. 
+ */ + @Override + public CutToFirstSignificantSubdomain withChildren(List<Expression> children) { + Preconditions.checkArgument(children.size() == 1); + return new CutToFirstSignificantSubdomain(children.get(0)); + } + + @Override + public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) { + return visitor.visitCutToFirstSignificantSubdomain(this, context); + } + + @Override + public List<FunctionSignature> getSignatures() { + return SIGNATURES; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/FirstSignificantSubdomain.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/FirstSignificantSubdomain.java new file mode 100644 index 00000000000..1af4dd96e6d --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/FirstSignificantSubdomain.java @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.trees.expressions.functions.scalar; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; +import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.StringType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * ScalarFunction 'FirstSignificantSubdomain'. This class is generated by GenerateFunction. + */ +public class FirstSignificantSubdomain extends ScalarFunction + implements UnaryExpression, ExplicitlyCastableSignature, PropagateNullable { + + public static final List<FunctionSignature> SIGNATURES = ImmutableList.of( + FunctionSignature.ret(StringType.INSTANCE).args(StringType.INSTANCE) + ); + + /** + * constructor with 1 argument. + */ + public FirstSignificantSubdomain(Expression arg) { + super("first_significant_subdomain", arg); + } + + /** + * withChildren. 
+ */ + @Override + public FirstSignificantSubdomain withChildren(List<Expression> children) { + Preconditions.checkArgument(children.size() == 1); + return new FirstSignificantSubdomain(children.get(0)); + } + + @Override + public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) { + return visitor.visitFirstSignificantSubdomain(this, context); + } + + @Override + public List<FunctionSignature> getSignatures() { + return SIGNATURES; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/TopLevelDomain.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/TopLevelDomain.java new file mode 100644 index 00000000000..05997659a2e --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/TopLevelDomain.java @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.trees.expressions.functions.scalar; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; +import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.StringType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * ScalarFunction 'TopLevelDomain'. This class is generated by GenerateFunction. + */ +public class TopLevelDomain extends ScalarFunction + implements UnaryExpression, ExplicitlyCastableSignature, PropagateNullable { + + public static final List<FunctionSignature> SIGNATURES = ImmutableList.of( + FunctionSignature.ret(StringType.INSTANCE).args(StringType.INSTANCE) + ); + + /** + * constructor with 1 argument. + */ + public TopLevelDomain(Expression arg) { + super("top_level_domain", arg); + } + + /** + * withChildren. 
+ */ + @Override + public TopLevelDomain withChildren(List<Expression> children) { + Preconditions.checkArgument(children.size() == 1); + return new TopLevelDomain(children.get(0)); + } + + @Override + public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) { + return visitor.visitTopLevelDomain(this, context); + } + + @Override + public List<FunctionSignature> getSignatures() { + return SIGNATURES; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java index c5e9688d3c1..2619731cfc8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java @@ -146,6 +146,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentDate; import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentTime; import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentUser; import org.apache.doris.nereids.trees.expressions.functions.scalar.CutIpv6; +import org.apache.doris.nereids.trees.expressions.functions.scalar.CutToFirstSignificantSubdomain; import org.apache.doris.nereids.trees.expressions.functions.scalar.Database; import org.apache.doris.nereids.trees.expressions.functions.scalar.Date; import org.apache.doris.nereids.trees.expressions.functions.scalar.DateDiff; @@ -188,6 +189,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Exp; import org.apache.doris.nereids.trees.expressions.functions.scalar.ExtractUrlParameter; import org.apache.doris.nereids.trees.expressions.functions.scalar.Field; import org.apache.doris.nereids.trees.expressions.functions.scalar.FindInSet; +import org.apache.doris.nereids.trees.expressions.functions.scalar.FirstSignificantSubdomain; import 
org.apache.doris.nereids.trees.expressions.functions.scalar.Floor; import org.apache.doris.nereids.trees.expressions.functions.scalar.Fmod; import org.apache.doris.nereids.trees.expressions.functions.scalar.Fpow; @@ -437,6 +439,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.ToIso8601; import org.apache.doris.nereids.trees.expressions.functions.scalar.ToMonday; import org.apache.doris.nereids.trees.expressions.functions.scalar.ToQuantileState; import org.apache.doris.nereids.trees.expressions.functions.scalar.Tokenize; +import org.apache.doris.nereids.trees.expressions.functions.scalar.TopLevelDomain; import org.apache.doris.nereids.trees.expressions.functions.scalar.Translate; import org.apache.doris.nereids.trees.expressions.functions.scalar.Trim; import org.apache.doris.nereids.trees.expressions.functions.scalar.TrimIn; @@ -903,6 +906,11 @@ public interface ScalarFunctionVisitor<R, C> { return visitScalarFunction(charFunc, context); } + default R visitCutToFirstSignificantSubdomain(CutToFirstSignificantSubdomain cutToFirstSignificantSubdomain, + C context) { + return visitScalarFunction(cutToFirstSignificantSubdomain, context); + } + default R visitEncodeAsSmallInt(EncodeAsSmallInt encode, C context) { return visitScalarFunction(encode, context); } @@ -1187,6 +1195,10 @@ public interface ScalarFunctionVisitor<R, C> { return visitScalarFunction(findInSet, context); } + default R visitFirstSignificantSubdomain(FirstSignificantSubdomain firstSignificantSubdomain, C context) { + return visitScalarFunction(firstSignificantSubdomain, context); + } + default R visitFloor(Floor floor, C context) { return visitScalarFunction(floor, context); } @@ -2111,6 +2123,10 @@ public interface ScalarFunctionVisitor<R, C> { return visitScalarFunction(tokenize, context); } + default R visitTopLevelDomain(TopLevelDomain topLevelDomain, C context) { + return visitScalarFunction(topLevelDomain, context); + } + default R visitToQuantileState(ToQuantileState 
toQuantileState, C context) { return visitScalarFunction(toQuantileState, context); } diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index 73e68badcda..31b02f9b979 100644 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -2077,7 +2077,10 @@ visible_functions = { "Url": [ [['domain'], 'STRING', ['STRING'], ''], [['domain_without_www'], 'STRING', ['STRING'], ''], - [['protocol'], 'STRING', ['STRING'], ''] + [['protocol'], 'STRING', ['STRING'], ''], + [['top_level_domain'], 'STRING', ['STRING'], ''], + [['cut_to_first_significant_subdomain'], 'STRING', ['STRING'], ''], + [['first_significant_subdomain'], 'STRING', ['STRING'], ''] ], # search functions diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_url_functions.out b/regression-test/data/query_p0/sql_functions/string_functions/test_url_functions.out new file mode 100644 index 00000000000..ce1ef717975 --- /dev/null +++ b/regression-test/data/query_p0/sql_functions/string_functions/test_url_functions.out @@ -0,0 +1,121 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !empty_nullable1 -- + +-- !empty_nullable2 -- + +-- !empty_nullable3 -- + +-- !empty_not_nullable1 -- + +-- !empty_not_nullable2 -- + +-- !empty_not_nullable3 -- + +-- !empty_null1 -- +\N + +-- !empty_null2 -- +\N + +-- !empty_null3 -- +\N + +-- !empty_const1 -- +com + +-- !empty_const2 -- +baidu + +-- !empty_const3 -- +baidu.com + +-- !empty_const4 -- +cn + +-- !empty_const5 -- +google + +-- !empty_const6 -- +google.com.cn + +-- !empty_const7 -- + + +-- !empty_const8 -- + + +-- !empty_const9 -- + + +-- !nullable1 -- +1 www.baidu.com com +10 https://news.clickhouse.com.tr/ tr +2 www.google.com.cn cn +3 invalid url +4 +5 +6 \N \N +7 xxxxxxxx +8 http://www.example.com/a/b/c?a=b com +9 https://news.clickhouse.com/ com + +-- !nullable2 -- +1 www.baidu.com baidu +10 https://news.clickhouse.com.tr/ clickhouse +2 www.google.com.cn google +3 invalid url +4 +5 +6 \N \N +7 xxxxxxxx +8 http://www.example.com/a/b/c?a=b example +9 https://news.clickhouse.com/ clickhouse + +-- !nullable3 -- +1 www.baidu.com baidu.com +10 https://news.clickhouse.com.tr/ clickhouse.com.tr +2 www.google.com.cn google.com.cn +3 invalid url +4 +5 +6 \N \N +7 xxxxxxxx +8 http://www.example.com/a/b/c?a=b example.com +9 https://news.clickhouse.com/ clickhouse.com + +-- !not_nullable1 -- +1 www.baidu.com com +10 https://news.clickhouse.com.tr/ tr +2 www.google.com.cn cn +3 invalid url +4 +5 +6 +7 xxxxxxxx +8 http://www.example.com/a/b/c?a=b com +9 https://news.clickhouse.com/ com + +-- !not_nullable2 -- +1 www.baidu.com baidu +10 https://news.clickhouse.com.tr/ clickhouse +2 www.google.com.cn google +3 invalid url +4 +5 +6 +7 xxxxxxxx +8 http://www.example.com/a/b/c?a=b example +9 https://news.clickhouse.com/ clickhouse + +-- !not_nullable3 -- +1 www.baidu.com baidu.com +10 https://news.clickhouse.com.tr/ clickhouse.com.tr +2 www.google.com.cn google.com.cn +3 invalid url +4 +5 +6 +7 xxxxxxxx +8 http://www.example.com/a/b/c?a=b example.com +9 
https://news.clickhouse.com/ clickhouse.com + diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_url_functions.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_url_functions.groovy new file mode 100644 index 00000000000..389020b63e2 --- /dev/null +++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_url_functions.groovy @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_url_functions") { + sql " drop table if exists test_url_functions" + sql """ + create table test_url_functions ( + id int, + s1 string not null, + s2 string null + ) + DISTRIBUTED BY HASH(id) + PROPERTIES + ( + "replication_num" = "1" + ); + """ + + //empty table + order_qt_empty_nullable1 "select top_level_domain(s2) from test_url_functions" + order_qt_empty_nullable2 "select first_significant_subdomain(s2) from test_url_functions" + order_qt_empty_nullable3 "select cut_to_first_significant_subdomain(s2) from test_url_functions" + order_qt_empty_not_nullable1 "select top_level_domain(s1) from test_url_functions" + order_qt_empty_not_nullable2 "select first_significant_subdomain(s1) from test_url_functions" + order_qt_empty_not_nullable3 "select cut_to_first_significant_subdomain(s1) from test_url_functions" + + //null / const + order_qt_empty_null1 "select top_level_domain(NULL)" + order_qt_empty_null2 "select first_significant_subdomain(NULL)" + order_qt_empty_null3 "select cut_to_first_significant_subdomain(NULL)" + + //valid url + order_qt_empty_const1 "select top_level_domain('www.baidu.com')" + order_qt_empty_const2 "select first_significant_subdomain('www.baidu.com')" + order_qt_empty_const3 "select cut_to_first_significant_subdomain('www.baidu.com')" + order_qt_empty_const4 "select top_level_domain('www.google.com.cn')" + order_qt_empty_const5 "select first_significant_subdomain('www.google.com.cn')" + order_qt_empty_const6 "select cut_to_first_significant_subdomain('www.google.com.cn')" + + //invalid url + order_qt_empty_const7 "select top_level_domain('I am invaild url')" + order_qt_empty_const8 "select first_significant_subdomain('I am invaild url')" + order_qt_empty_const9 "select cut_to_first_significant_subdomain('I am invaild url')" + + + sql """ insert into test_url_functions values (1, 'www.baidu.com', 'www.baidu.com'); """ + sql """ insert into test_url_functions values (2, 'www.google.com.cn', 'www.google.com.cn'); """ + sql """
insert into test_url_functions values (3, 'invalid url', 'invalid url'); """ + sql """ insert into test_url_functions values (4, '', ''); """ + sql """ insert into test_url_functions values (5, ' ', ' '); """ + sql """ insert into test_url_functions values (6, ' ', NULL); """ + sql """ insert into test_url_functions values (7, 'xxxxxxxx', 'xxxxxxxx'); """ + sql """ insert into test_url_functions values (8, 'http://www.example.com/a/b/c?a=b', 'http://www.example.com/a/b/c?a=b'); """ + sql """ insert into test_url_functions values (9, 'https://news.clickhouse.com/', 'https://news.clickhouse.com/'); """ + sql """ insert into test_url_functions values (10, 'https://news.clickhouse.com.tr/', 'https://news.clickhouse.com.tr/'); """ + + order_qt_nullable1 "select id,s2,top_level_domain(s2) from test_url_functions order by id" + order_qt_nullable2 "select id,s2,first_significant_subdomain(s2) from test_url_functions order by id" + order_qt_nullable3 "select id,s2,cut_to_first_significant_subdomain(s2) from test_url_functions order by id" + + order_qt_not_nullable1 "select id,s1,top_level_domain(s1) from test_url_functions order by id" + order_qt_not_nullable2 "select id,s1,first_significant_subdomain(s1) from test_url_functions order by id" + order_qt_not_nullable3 "select id,s1,cut_to_first_significant_subdomain(s1) from test_url_functions order by id" + +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org