This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new 6c3d42e09a4 [cherry-pick](branch-21) cherry-pick pr about (#42488) (#42099) (#42055) (#42916) 6c3d42e09a4 is described below commit 6c3d42e09a48dd6c40289d6232b0dac14684a661 Author: zhangstar333 <87313068+zhangstar...@users.noreply.github.com> AuthorDate: Thu Oct 31 14:14:19 2024 +0800 [cherry-pick](branch-21) cherry-pick pr about (#42488) (#42099) (#42055) (#42916) ## Proposed changes Issue Number: close #xxx <!--Describe your changes.--> --- be/src/vec/functions/function_bit_test.cpp | 156 +++++++ be/src/vec/functions/function_string.cpp | 1 + be/src/vec/functions/function_string.h | 116 +++++ be/src/vec/functions/simple_function_factory.h | 2 + be/src/vec/functions/url/domain.h | 127 +++++- be/src/vec/functions/url/find_symbols.h | 481 +++++++++++++++++++++ be/src/vec/functions/url/function_url.cpp | 23 + be/src/vec/functions/url/functions_url.h | 11 - be/src/vec/functions/url/tldLookup.generated.cpp | 140 ++++++ be/src/vec/functions/url/tldLookup.h | 34 ++ .../doris/catalog/BuiltinScalarFunctions.java | 10 + .../expressions/functions/scalar/BitTest.java | 75 ++++ .../functions/scalar/CountSubstring.java | 70 +++ .../scalar/CutToFirstSignificantSubdomain.java | 68 +++ .../scalar/FirstSignificantSubdomain.java | 68 +++ .../functions/scalar/TopLevelDomain.java | 68 +++ .../expressions/visitor/ScalarFunctionVisitor.java | 26 ++ gensrc/script/doris_builtins_functions.py | 14 +- .../data/correctness_p0/test_bit_test_function.out | 191 ++++++++ .../string_functions/test_count_substrings.out | 147 +++++++ .../string_functions/test_url_functions.out | 121 ++++++ .../correctness_p0/test_bit_test_function.groovy | 91 ++++ .../string_functions/test_count_substrings.groovy | 76 ++++ .../string_functions/test_url_functions.groovy | 79 ++++ 24 files changed, 2180 insertions(+), 15 deletions(-) diff --git a/be/src/vec/functions/function_bit_test.cpp b/be/src/vec/functions/function_bit_test.cpp new 
file mode 100644 index 00000000000..8e010fd9446 --- /dev/null +++ b/be/src/vec/functions/function_bit_test.cpp @@ -0,0 +1,156 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <bit> +#include <bitset> + +#include "common/status.h" +#include "vec/columns/column.h" +#include "vec/columns/column_vector.h" +#include "vec/common/assert_cast.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type_number.h" +#include "vec/functions/cast_type_to_either.h" +#include "vec/functions/simple_function_factory.h" + +namespace doris::vectorized { + +class FunctionBitTest : public IFunction { +public: + static constexpr auto name = "bit_test"; + + static FunctionPtr create() { return std::make_shared<FunctionBitTest>(); } + + String get_name() const override { return name; } + + size_t get_number_of_arguments() const override { return 0; } + + bool is_variadic() const override { return true; } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + return std::make_shared<DataTypeInt8>(); + } + + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) const override { + bool valid = + 
cast_type(block.get_by_position(arguments[0]).type.get(), [&](const auto& type) { + using DataType = std::decay_t<decltype(type)>; + using T = typename DataType::FieldType; + if (auto col = check_and_get_column<ColumnVector<T>>( + block.get_by_position(arguments[0]).column.get()) || + is_column_const(*block.get_by_position(arguments[0]).column)) { + execute_inner<T>(block, arguments, result, input_rows_count); + return true; + } + return false; + }); + if (!valid) { + return Status::RuntimeError( + "{}'s argument does not match the expected data type, type: {}, column: {}", + get_name(), block.get_by_position(arguments[0]).type->get_name(), + block.get_by_position(arguments[0]).column->dump_structure()); + } + return Status::OK(); + } + + template <typename F> + static bool cast_type(const IDataType* type, F&& f) { + return cast_type_to_either<DataTypeInt8, DataTypeInt16, DataTypeInt32, DataTypeInt64, + DataTypeInt128>(type, std::forward<F>(f)); + } + + template <typename T> + void execute_inner(Block& block, const ColumnNumbers& arguments, size_t result, + size_t input_rows_count) const { + size_t argument_size = arguments.size(); + std::vector<ColumnPtr> argument_columns(argument_size); + auto result_data_column = ColumnInt8::create(input_rows_count, 1); + auto& res_data = result_data_column->get_data(); + + // maybe most user is bit_test(column, const), so only handle this case + if (argument_size == 2) { + std::vector<uint8_t> is_consts(argument_size); + std::tie(argument_columns[0], is_consts[0]) = + unpack_if_const(block.get_by_position(arguments[0]).column); + std::tie(argument_columns[1], is_consts[1]) = + unpack_if_const(block.get_by_position(arguments[1]).column); + execute_for_two_argument<T>(argument_columns, is_consts, res_data, input_rows_count); + } else { + for (size_t i = 0; i < argument_size; ++i) { + argument_columns[i] = block.get_by_position(arguments[i]) + .column->convert_to_full_column_if_const(); + } + 
execute_for_others_arg<T>(argument_columns, res_data, argument_size, input_rows_count); + } + + block.replace_by_position(result, std::move(result_data_column)); + } + + template <typename T> + void execute_for_two_argument(std::vector<ColumnPtr>& argument_columns, + std::vector<uint8_t>& is_consts, ColumnInt8::Container& res_data, + size_t input_rows_count) const { + const auto& first_column_data = + assert_cast<const ColumnVector<T>&>(*argument_columns[0].get()).get_data(); + const auto& second_column_data = + assert_cast<const ColumnVector<T>&>(*argument_columns[1].get()).get_data(); + for (int i = 0; i < input_rows_count; ++i) { + auto first_value = first_column_data[index_check_const(i, is_consts[0])]; + auto second_value = second_column_data[index_check_const(i, is_consts[1])]; + // the pos is invalid, set result = 0 + if (second_value < 0 || second_value >= sizeof(T) * 8) { + res_data[i] = 0; + continue; + } + res_data[i] = ((first_value >> second_value) & 1); + } + } + + template <typename T> + void execute_for_others_arg(std::vector<ColumnPtr>& argument_columns, + ColumnInt8::Container& res_data, size_t argument_size, + size_t input_rows_count) const { + const auto& first_column_data = + assert_cast<const ColumnVector<T>&>(*argument_columns[0].get()).get_data(); + for (int i = 0; i < input_rows_count; ++i) { + auto first_value = first_column_data[i]; + for (int col = 1; col < argument_size; ++col) { + const auto& arg_column_data = + assert_cast<const ColumnVector<T>&>(*argument_columns[col].get()) + .get_data(); + // the pos is invalid, set result = 0 + if (arg_column_data[i] < 0 || arg_column_data[i] >= sizeof(T) * 8) { + res_data[i] = 0; + break; + } + // if one of pos & result is 0, could set res = 0, and return directly. 
+ if (!((first_value >> arg_column_data[i]) & 1)) { + res_data[i] = 0; + break; + } + } + } + } +}; + +void register_function_bit_test(SimpleFunctionFactory& factory) { + factory.register_function<FunctionBitTest>(); + factory.register_alias("bit_test", "bit_test_all"); +} + +} // namespace doris::vectorized diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp index edf43300f94..15e977ecbb5 100644 --- a/be/src/vec/functions/function_string.cpp +++ b/be/src/vec/functions/function_string.cpp @@ -1175,6 +1175,7 @@ void register_function_string(SimpleFunctionFactory& factory) { factory.register_function<FunctionFromBase64>(); factory.register_function<FunctionSplitPart>(); factory.register_function<FunctionSplitByString>(); + factory.register_function<FunctionCountSubString>(); factory.register_function<FunctionSubstringIndex>(); factory.register_function<FunctionExtractURLParameter>(); factory.register_function<FunctionStringParseUrl>(); diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 48887af85f0..256e5943990 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -2733,6 +2733,122 @@ private: } }; +class FunctionCountSubString : public IFunction { +public: + static constexpr auto name = "count_substrings"; + + static FunctionPtr create() { return std::make_shared<FunctionCountSubString>(); } + using NullMapType = PaddedPODArray<UInt8>; + + String get_name() const override { return name; } + + size_t get_number_of_arguments() const override { return 2; } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + DCHECK(is_string(arguments[0])) + << "first argument for function: " << name << " should be string" + << " and arguments[0] is " << arguments[0]->get_name(); + DCHECK(is_string(arguments[1])) + << "second argument for function: " << name << " should be string" + << " and arguments[1] is " << 
arguments[1]->get_name(); + return std::make_shared<DataTypeInt32>(); + } + + Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) const override { + DCHECK_EQ(arguments.size(), 2); + const auto& [src_column, left_const] = + unpack_if_const(block.get_by_position(arguments[0]).column); + const auto& [right_column, right_const] = + unpack_if_const(block.get_by_position(arguments[1]).column); + + const auto* col_left = check_and_get_column<ColumnString>(src_column.get()); + if (!col_left) { + return Status::InternalError("Left operator of function {} can not be {}", get_name(), + block.get_by_position(arguments[0]).type->get_name()); + } + + const auto* col_right = check_and_get_column<ColumnString>(right_column.get()); + if (!col_right) { + return Status::InternalError("Right operator of function {} can not be {}", get_name(), + block.get_by_position(arguments[1]).type->get_name()); + } + + auto dest_column_ptr = ColumnInt32::create(input_rows_count, 0); + // count_substring(ColumnString, "xxx") + if (right_const) { + _execute_constant_pattern(*col_left, col_right->get_data_at(0), + dest_column_ptr->get_data(), input_rows_count); + } else if (left_const) { + // count_substring("xxx", ColumnString) + _execute_constant_src_string(col_left->get_data_at(0), *col_right, + dest_column_ptr->get_data(), input_rows_count); + } else { + // count_substring(ColumnString, ColumnString) + _execute_vector(*col_left, *col_right, dest_column_ptr->get_data(), input_rows_count); + } + + block.replace_by_position(result, std::move(dest_column_ptr)); + return Status::OK(); + } + +private: + void _execute_constant_pattern(const ColumnString& src_column_string, + const StringRef& pattern_ref, + ColumnInt32::Container& dest_column_data, + size_t input_rows_count) const { + for (size_t i = 0; i < input_rows_count; i++) { + const StringRef str_ref = src_column_string.get_data_at(i); + dest_column_data[i] = 
find_str_count(str_ref, pattern_ref); + } + } + + void _execute_vector(const ColumnString& src_column_string, const ColumnString& pattern_column, + ColumnInt32::Container& dest_column_data, size_t input_rows_count) const { + for (size_t i = 0; i < input_rows_count; i++) { + const StringRef pattern_ref = pattern_column.get_data_at(i); + const StringRef str_ref = src_column_string.get_data_at(i); + dest_column_data[i] = find_str_count(str_ref, pattern_ref); + } + } + + void _execute_constant_src_string(const StringRef& str_ref, const ColumnString& pattern_col, + ColumnInt32::Container& dest_column_data, + size_t input_rows_count) const { + for (size_t i = 0; i < input_rows_count; ++i) { + const StringRef pattern_ref = pattern_col.get_data_at(i); + dest_column_data[i] = find_str_count(str_ref, pattern_ref); + } + } + + size_t find_pos(size_t pos, const StringRef str_ref, const StringRef pattern_ref) const { + size_t old_size = pos; + size_t str_size = str_ref.size; + while (pos < str_size && memcmp_small_allow_overflow15(str_ref.data + pos, pattern_ref.data, + pattern_ref.size)) { + pos++; + } + return pos - old_size; + } + + int find_str_count(const StringRef str_ref, StringRef pattern_ref) const { + int count = 0; + if (str_ref.size == 0 || pattern_ref.size == 0) { + return 0; + } else { + for (size_t str_pos = 0; str_pos <= str_ref.size;) { + const size_t res_pos = find_pos(str_pos, str_ref, pattern_ref); + if (res_pos == (str_ref.size - str_pos)) { + break; // not find + } + count++; + str_pos = str_pos + res_pos + pattern_ref.size; + } + } + return count; + } +}; + struct SM3Sum { static constexpr auto name = "sm3sum"; using ObjectData = SM3Digest; diff --git a/be/src/vec/functions/simple_function_factory.h b/be/src/vec/functions/simple_function_factory.h index d164e40abb1..a859a4685e2 100644 --- a/be/src/vec/functions/simple_function_factory.h +++ b/be/src/vec/functions/simple_function_factory.h @@ -106,6 +106,7 @@ void 
register_function_tokenize(SimpleFunctionFactory& factory); void register_function_url(SimpleFunctionFactory& factory); void register_function_ip(SimpleFunctionFactory& factory); void register_function_multi_match(SimpleFunctionFactory& factory); +void register_function_bit_test(SimpleFunctionFactory& factory); class SimpleFunctionFactory { using Creator = std::function<FunctionBuilderPtr()>; @@ -297,6 +298,7 @@ public: register_function_ignore(instance); register_function_variant_element(instance); register_function_multi_match(instance); + register_function_bit_test(instance); }); return instance; } diff --git a/be/src/vec/functions/url/domain.h b/be/src/vec/functions/url/domain.h index 54361134eff..b2ec5e0c9d9 100644 --- a/be/src/vec/functions/url/domain.h +++ b/be/src/vec/functions/url/domain.h @@ -20,11 +20,12 @@ #pragma once -// #include <base/find_symbols.h> #include <cstring> #include "vec/common/string_utils/string_utils.h" +#include "vec/functions/url/find_symbols.h" #include "vec/functions/url/protocol.h" +#include "vec/functions/url/tldLookup.h" namespace doris::vectorized { @@ -144,4 +145,128 @@ struct ExtractDomain { } }; +struct ExtractTopLevelDomain { + static size_t get_reserve_length_for_element() { return 5; } + + static void execute(const char* data, size_t size, const char*& res_data, size_t& res_size) { + res_data = data; + res_size = 0; + StringRef host = get_url_host(data, size); + + if (host.size == 0) { + return; + } else { + auto host_view = host.to_string_view(); + if (host_view[host_view.size() - 1] == '.') { + host_view.remove_suffix(1); + } + + const auto* host_end = host_view.data() + host_view.size(); + const char* last_dot = find_last_symbols_or_null<'.'>(host_view.data(), host_end); + if (!last_dot) { + return; + } + + /// For IPv4 addresses select nothing. + /// + /// NOTE: it is safe to access last_dot[1] + /// since getURLHost() will not return a host if there is symbol after dot. 
+ if (is_numeric_ascii(last_dot[1])) { + return; + } + + res_data = last_dot + 1; + res_size = host_end - res_data; + } + } +}; + +struct ExtractFirstSignificantSubdomain { + static size_t get_reserve_length_for_element() { return 10; } + + static void execute(const Pos data, const size_t size, Pos& res_data, size_t& res_size, + Pos* out_domain_end = nullptr) { + res_data = data; + res_size = 0; + + Pos tmp; + size_t domain_length = 0; + ExtractDomain<true>::execute(data, size, tmp, domain_length); + + if (domain_length == 0) { + return; + } + if (out_domain_end) { + *out_domain_end = tmp + domain_length; + } + + /// cut useless dot + if (tmp[domain_length - 1] == '.') { + --domain_length; + } + + res_data = tmp; + res_size = domain_length; + + const auto* begin = tmp; + const auto* end = begin + domain_length; + std::array<const char*, 3> last_periods {}; + + const auto* pos = find_first_symbols<'.'>(begin, end); + while (pos < end) { + last_periods[2] = last_periods[1]; + last_periods[1] = last_periods[0]; + last_periods[0] = pos; + pos = find_first_symbols<'.'>(pos + 1, end); + } + + if (!last_periods[0]) { + return; + } + + if (!last_periods[1]) { + res_size = last_periods[0] - begin; + return; + } + + if (!last_periods[2]) { + last_periods[2] = begin - 1; + } + + const auto* end_of_level_domain = find_first_symbols<'/'>(last_periods[0], end); + if (!end_of_level_domain) { + end_of_level_domain = end; + } + + auto host_len = static_cast<size_t>(end_of_level_domain - last_periods[1] - 1); + StringRef host {last_periods[1] + 1, host_len}; + if (tldLookup::is_valid(host.data, host.size)) { + res_data += last_periods[2] + 1 - begin; + res_size = last_periods[1] - last_periods[2] - 1; + } else { + res_data += last_periods[1] + 1 - begin; + res_size = last_periods[0] - last_periods[1] - 1; + } + } +}; + +struct CutToFirstSignificantSubdomain { + static size_t get_reserve_length_for_element() { return 15; } + + static void execute(const Pos data, const size_t size, 
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/base/base/find_symbols.h
// and modified by Doris

#pragma once

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <string>
#include <string_view>
#include <utility>

// __m128i and the _mm_* byte operations used below are declared in
// <emmintrin.h>; it must not be left to a transitive include.
#if defined(__SSE2__)
#include <emmintrin.h>
#endif

#if defined(__SSE4_2__)
#include <nmmintrin.h>
#endif

/** find_first_symbols<c1, c2, ...>(begin, end):
  *
  * Allow to search for next character from the set of 'symbols...' in a string.
  * It is similar to 'strpbrk', 'strcspn' (and 'strchr', 'memchr' in the case of one symbol and '\0'),
  * but with the following differences:
  * - works with any memory ranges, including containing zero bytes;
  * - doesn't require terminating zero byte: end of memory range is passed explicitly;
  * - if not found, returns pointer to end instead of nullptr;
  * - maximum number of symbols to search is 16.
  *
  * Uses SSE 2 in case of small number of symbols for search and SSE 4.2 in the case of large number of symbols,
  * that have more than 2x performance advantage over trivial loop
  * in the case of parsing tab-separated dump with (probably escaped) string fields.
  * In the case of parsing tab separated dump with short strings, there is no performance degradation over trivial loop.
  *
  * Note: the optimal threshold to choose between SSE 2 and SSE 4.2 may depend on CPU model.
  *
  * find_last_symbols_or_null<c1, c2, ...>(begin, end):
  *
  * Allow to search for the last matching character in a string.
  * If no such characters, returns nullptr.
  */

/// Set of symbols chosen at run time, for the overloads taking a string_view.
struct SearchSymbols {
    static constexpr auto BUFFER_SIZE = 16;

    SearchSymbols() = default;

    /// When built with SSE 4.2, throws std::runtime_error if more than
    /// BUFFER_SIZE symbols are given.
    explicit SearchSymbols(std::string in) : str(std::move(in)) {
#if defined(__SSE4_2__)
        if (str.size() > static_cast<size_t>(BUFFER_SIZE)) {
            throw std::runtime_error("SearchSymbols can contain at most " +
                                     std::to_string(BUFFER_SIZE) + " symbols and " +
                                     std::to_string(str.size()) + " was provided\n");
        }

        // Copy through a zero-filled buffer so that the 16-byte load never
        // reads past the end of `str`.
        char tmp_safety_buffer[BUFFER_SIZE] = {0};
        memcpy(tmp_safety_buffer, str.data(), str.size());
        simd_vector = _mm_loadu_si128(reinterpret_cast<const __m128i*>(tmp_safety_buffer));
#endif
    }

#if defined(__SSE4_2__)
    __m128i simd_vector = _mm_setzero_si128();
#endif
    std::string str;
};

namespace detail {

/// True when `x` equals one of the compile-time `chars`.
template <char... chars>
constexpr bool is_in(char x) {
    return ((x == chars) || ...);
} // NOLINT(misc-redundant-expression)

/// True when `c` equals one of the first `num_chars` bytes of `symbols`.
inline bool is_in(char c, const char* symbols, size_t num_chars) {
    for (size_t i = 0U; i < num_chars; ++i) {
        if (c == symbols[i]) {
            return true;
        }
    }
    return false;
}

#if defined(__SSE2__)
/// Byte-wise mask: 0xFF where the byte of `bytes` equals s0.
template <char s0>
inline __m128i mm_is_in(__m128i bytes) {
    return _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0));
}

/// Byte-wise mask: 0xFF where the byte of `bytes` is one of the symbols.
template <char s0, char s1, char... tail>
inline __m128i mm_is_in(__m128i bytes) {
    const __m128i head = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0));
    const __m128i rest = mm_is_in<s1, tail...>(bytes);
    return _mm_or_si128(head, rest);
}

/// Same as above for a run-time symbol set of any length (also empty).
inline __m128i mm_is_in(__m128i bytes, const char* symbols, size_t num_chars) {
    __m128i accumulator = _mm_setzero_si128();
    for (size_t i = 0; i < num_chars; ++i) {
        const __m128i eq = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(symbols[i]));
        accumulator = _mm_or_si128(accumulator, eq);
    }
    return accumulator;
}

/// Broadcasts up to 16 symbols into one vector each, for mm_is_in_execute.
///
/// Unused slots repeat the first symbol. A zero-filled slot would compare
/// equal to every NUL byte of the haystack and report it as a match. At most
/// 16 symbols are taken, so the table is never written out of bounds. An empty
/// set cannot be represented; callers must not use the table in that case.
inline std::array<__m128i, 16u> mm_is_in_prepare(const char* symbols, size_t num_chars) {
    std::array<__m128i, 16u> result {};
    if (num_chars == 0) {
        return result;
    }

    const size_t used = num_chars < result.size() ? num_chars : result.size();
    for (size_t i = 0; i < used; ++i) {
        result[i] = _mm_set1_epi8(symbols[i]);
    }
    for (size_t i = used; i < result.size(); ++i) {
        result[i] = result[0];
    }
    return result;
}

/// Byte-wise mask: 0xFF where the byte of `bytes` equals any of the needles.
inline __m128i mm_is_in_execute(__m128i bytes, const std::array<__m128i, 16u>& needles) {
    __m128i accumulator = _mm_setzero_si128();
    for (const auto& needle : needles) {
        const __m128i eq = _mm_cmpeq_epi8(bytes, needle);
        accumulator = _mm_or_si128(accumulator, eq);
    }
    return accumulator;
}
#endif

/// positive == true keeps `x`; positive == false inverts it.
template <bool positive>
constexpr bool maybe_negate(bool x) {
    return x == positive;
}

template <bool positive>
constexpr uint16_t maybe_negate(uint16_t x) {
    if constexpr (positive) {
        return x;
    } else {
        return ~x;
    }
}

/// What a search returns when nothing is found.
enum class ReturnMode : uint8_t {
    End,
    Nullptr,
};

/// First position in [begin, end) whose byte is (positive) / is not (negative)
/// one of the compile-time symbols.
template <bool positive, ReturnMode return_mode, char... symbols>
inline const char* find_first_symbols_sse2(const char* const begin, const char* const end) {
    const char* pos = begin;

#if defined(__SSE2__)
    // Whole 16-byte chunks; the remainder is handled by the scalar loop.
    for (; end - pos >= 16; pos += 16) {
        const __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos));
        const __m128i eq = mm_is_in<symbols...>(bytes);
        const uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq)));
        if (bit_mask) {
            return pos + __builtin_ctz(bit_mask);
        }
    }
#endif

    for (; pos < end; ++pos) {
        if (maybe_negate<positive>(is_in<symbols...>(*pos))) {
            return pos;
        }
    }

    return return_mode == ReturnMode::End ? end : nullptr;
}

/// Same search for a run-time symbol set of `num_chars` bytes.
template <bool positive, ReturnMode return_mode>
inline const char* find_first_symbols_sse2(const char* const begin, const char* const end,
                                           const char* symbols, size_t num_chars) {
    const char* pos = begin;

#if defined(__SSE2__)
    // The needle table holds 1..16 symbols; any other count is left entirely
    // to the scalar loop below, which has no such limit.
    if (num_chars >= 1 && num_chars <= 16) {
        const auto needles = mm_is_in_prepare(symbols, num_chars);
        for (; end - pos >= 16; pos += 16) {
            const __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos));
            const __m128i eq = mm_is_in_execute(bytes, needles);
            const uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq)));
            if (bit_mask) {
                return pos + __builtin_ctz(bit_mask);
            }
        }
    }
#endif

    for (; pos < end; ++pos) {
        if (maybe_negate<positive>(is_in(*pos, symbols, num_chars))) {
            return pos;
        }
    }

    return return_mode == ReturnMode::End ? end : nullptr;
}

/// Last position in [begin, end) whose byte is (positive) / is not (negative)
/// one of the compile-time symbols.
template <bool positive, ReturnMode return_mode, char... symbols>
inline const char* find_last_symbols_sse2(const char* const begin, const char* const end) {
    const char* pos = end;

#if defined(__SSE2__)
    // Walk backwards over whole 16-byte chunks [pos - 16, pos).
    for (; pos - begin >= 16; pos -= 16) {
        const __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos - 16));
        const __m128i eq = mm_is_in<symbols...>(bytes);
        const uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq)));
        if (bit_mask) {
            /// "- 16" because __builtin_clz works with mask as uint32.
            return pos - 1 - (__builtin_clz(bit_mask) - 16);
        }
    }
#endif

    while (pos > begin) {
        --pos;
        if (maybe_negate<positive>(is_in<symbols...>(*pos))) {
            return pos;
        }
    }

    return return_mode == ReturnMode::End ? end : nullptr;
}

/// SSE 4.2 variant for compile-time symbols. Only the first `num_chars` of
/// c01..c16 belong to the set; the rest are fillers.
template <bool positive, ReturnMode return_mode, size_t num_chars, char c01, char c02 = 0,
          char c03 = 0, char c04 = 0, char c05 = 0, char c06 = 0, char c07 = 0, char c08 = 0,
          char c09 = 0, char c10 = 0, char c11 = 0, char c12 = 0, char c13 = 0, char c14 = 0,
          char c15 = 0, char c16 = 0>
inline const char* find_first_symbols_sse42(const char* const begin, const char* const end) {
    const char* pos = begin;

#if defined(__SSE4_2__)
    constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT;

    const __m128i set = _mm_setr_epi8(c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12,
                                      c13, c14, c15, c16);

    for (; end - pos >= 16; pos += 16) {
        const __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos));

        if constexpr (positive) {
            if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) {
                return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode);
            }
        } else {
            if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) {
                return pos +
                       _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY);
            }
        }
    }
#endif

    // Scalar tail: compare against the first num_chars entries only, so the
    // zero fillers never match. A count outside 1..16 matches nothing.
    static constexpr char symbol_set[16] = {c01, c02, c03, c04, c05, c06, c07, c08,
                                            c09, c10, c11, c12, c13, c14, c15, c16};
    constexpr bool valid_count = num_chars >= 1 && num_chars <= 16;
    for (; pos < end; ++pos) {
        if (valid_count && maybe_negate<positive>(is_in(*pos, symbol_set, num_chars))) {
            return pos;
        }
    }

    return return_mode == ReturnMode::End ? end : nullptr;
}

/// SSE 4.2 variant for a run-time symbol set.
template <bool positive, ReturnMode return_mode>
inline const char* find_first_symbols_sse42(const char* const begin, const char* const end,
                                            const SearchSymbols& symbols) {
    const char* pos = begin;

    const auto num_chars = symbols.str.size();

#if defined(__SSE4_2__)
    constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT;

    const __m128i set = symbols.simd_vector;
    const int set_size = static_cast<int>(num_chars);

    for (; end - pos >= 16; pos += 16) {
        const __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos));

        if constexpr (positive) {
            if (_mm_cmpestrc(set, set_size, bytes, 16, mode)) {
                return pos + _mm_cmpestri(set, set_size, bytes, 16, mode);
            }
        } else {
            if (_mm_cmpestrc(set, set_size, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) {
                return pos +
                       _mm_cmpestri(set, set_size, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY);
            }
        }
    }
#endif

    for (; pos < end; ++pos) {
        if (maybe_negate<positive>(is_in(*pos, symbols.str.data(), num_chars))) {
            return pos;
        }
    }

    return return_mode == ReturnMode::End ? end : nullptr;
}

/// NOTE No SSE 4.2 implementation for find_last_symbols_or_null. Not worth to do.

/// Picks the SSE 4.2 variant for five or more symbols (when available) and
/// the SSE 2 / scalar variant otherwise.
template <bool positive, ReturnMode return_mode, char... symbols>
inline const char* find_first_symbols_dispatch(const char* begin, const char* end) {
    // A static_assert instead of a requires-clause keeps the header usable
    // from C++17 translation units as well.
    static_assert(sizeof...(symbols) <= 16, "at most 16 symbols can be searched for");
#if defined(__SSE4_2__)
    if constexpr (sizeof...(symbols) >= 5) {
        return find_first_symbols_sse42<positive, return_mode, sizeof...(symbols), symbols...>(
                begin, end);
    } else
#endif
    {
        return find_first_symbols_sse2<positive, return_mode, symbols...>(begin, end);
    }
}

template <bool positive, ReturnMode return_mode>
inline const char* find_first_symbols_dispatch(const std::string_view haystack,
                                               const SearchSymbols& symbols) {
    // data()/size() instead of begin()/end(): string_view iterators are not
    // guaranteed to be raw pointers.
    const char* const begin = haystack.data();
    const char* const end = begin + haystack.size();
#if defined(__SSE4_2__)
    if (symbols.str.size() >= 5) {
        return find_first_symbols_sse42<positive, return_mode>(begin, end, symbols);
    }
#endif
    return find_first_symbols_sse2<positive, return_mode>(begin, end, symbols.str.data(),
                                                          symbols.str.size());
}

} // namespace detail

template <char... symbols>
inline const char* find_first_symbols(const char* begin, const char* end) {
    return detail::find_first_symbols_dispatch<true, detail::ReturnMode::End, symbols...>(begin,
                                                                                          end);
}

/// Returning non const result for non const arguments.
/// It is convenient when you are using this function to iterate through non-const buffer.
template <char... symbols>
inline char* find_first_symbols(char* begin, char* end) {
    return const_cast<char*>(
            detail::find_first_symbols_dispatch<true, detail::ReturnMode::End, symbols...>(begin,
                                                                                           end));
}

inline const char* find_first_symbols(std::string_view haystack, const SearchSymbols& symbols) {
    return detail::find_first_symbols_dispatch<true, detail::ReturnMode::End>(haystack, symbols);
}

template <char... symbols>
inline const char* find_first_not_symbols(const char* begin, const char* end) {
    return detail::find_first_symbols_dispatch<false, detail::ReturnMode::End, symbols...>(begin,
                                                                                           end);
}

template <char... symbols>
inline char* find_first_not_symbols(char* begin, char* end) {
    return const_cast<char*>(
            detail::find_first_symbols_dispatch<false, detail::ReturnMode::End, symbols...>(begin,
                                                                                            end));
}

inline const char* find_first_not_symbols(std::string_view haystack, const SearchSymbols& symbols) {
    return detail::find_first_symbols_dispatch<false, detail::ReturnMode::End>(haystack, symbols);
}

template <char... symbols>
inline const char* find_first_symbols_or_null(const char* begin, const char* end) {
    return detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr, symbols...>(begin,
                                                                                              end);
}

template <char... symbols>
inline char* find_first_symbols_or_null(char* begin, char* end) {
    return const_cast<char*>(
            detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr, symbols...>(
                    begin, end));
}

inline const char* find_first_symbols_or_null(std::string_view haystack,
                                              const SearchSymbols& symbols) {
    return detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr>(haystack,
                                                                                  symbols);
}

template <char... symbols>
inline const char* find_first_not_symbols_or_null(const char* begin, const char* end) {
    return detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr, symbols...>(
            begin, end);
}

template <char... symbols>
inline char* find_first_not_symbols_or_null(char* begin, char* end) {
    return const_cast<char*>(
            detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr, symbols...>(
                    begin, end));
}

inline const char* find_first_not_symbols_or_null(std::string_view haystack,
                                                  const SearchSymbols& symbols) {
    return detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr>(haystack,
                                                                                   symbols);
}

template <char... symbols>
inline const char* find_last_symbols_or_null(const char* begin, const char* end) {
    return detail::find_last_symbols_sse2<true, detail::ReturnMode::Nullptr, symbols...>(begin,
                                                                                         end);
}

template <char... symbols>
inline char* find_last_symbols_or_null(char* begin, char* end) {
    return const_cast<char*>(
            detail::find_last_symbols_sse2<true, detail::ReturnMode::Nullptr, symbols...>(begin,
                                                                                          end));
}

template <char... symbols>
inline const char* find_last_not_symbols_or_null(const char* begin, const char* end) {
    return detail::find_last_symbols_sse2<false, detail::ReturnMode::Nullptr, symbols...>(begin,
                                                                                          end);
}

template <char... symbols>
inline char* find_last_not_symbols_or_null(char* begin, char* end) {
    return const_cast<char*>(
            detail::find_last_symbols_sse2<false, detail::ReturnMode::Nullptr, symbols...>(begin,
                                                                                           end));
}

/// Slightly resembles boost::split. The drawback of boost::split is that it fires a false positive in clang static analyzer.
/// See https://github.com/boostorg/algorithm/issues/63
/// And https://bugs.llvm.org/show_bug.cgi?id=41141
///
/// Appends the pieces of `what` separated by any of `symbols` to `to` and
/// returns `to`. With token_compress, empty pieces are skipped.
template <char... symbols, typename To>
inline To& splitInto(To& to, std::string_view what, bool token_compress = false) {
    const char* pos = what.data();
    const char* end = pos + what.size();
    while (pos < end) {
        const char* delimiter_or_end = find_first_symbols<symbols...>(pos, end);

        if (!token_compress || pos < delimiter_or_end) {
            to.emplace_back(pos, delimiter_or_end - pos);
        }

        // Step over the delimiter, or stop at the end of the input.
        pos = delimiter_or_end < end ? delimiter_or_end + 1 : delimiter_or_end;
    }

    return to;
}
factory.register_function<FunctionCutToFirstSignificantSubdomain>(); } } // namespace doris::vectorized diff --git a/be/src/vec/functions/url/functions_url.h b/be/src/vec/functions/url/functions_url.h index f9f02a17a66..b6736496d24 100644 --- a/be/src/vec/functions/url/functions_url.h +++ b/be/src/vec/functions/url/functions_url.h @@ -89,7 +89,6 @@ struct ExtractSubstringImpl { for (size_t i = 0; i < size; ++i) { Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]), offsets[i] - prev_offset, start, length); - res_data.resize(res_data.size() + length); memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length); res_offset += length; @@ -105,11 +104,6 @@ struct ExtractSubstringImpl { Extractor::execute(data.data(), data.size(), start, length); res_data.assign(start, length); } - - // static void vector_fixed(const ColumnString::Chars &, size_t, ColumnString::Chars &) - // { - // throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN); - // } }; /** Delete part of string using the Extractor. @@ -155,11 +149,6 @@ struct CutSubstringImpl { res_data.append(data.data(), start); res_data.append(start + length, data.data() + data.size()); } - - // static void vector_fixed(const ColumnString::Chars &, size_t, ColumnString::Chars &) - // { - // throw Exception("Column of type FixedString is not supported by URL functions", ErrorCodes::ILLEGAL_COLUMN); - // } }; } // namespace doris::vectorized diff --git a/be/src/vec/functions/url/tldLookup.generated.cpp b/be/src/vec/functions/url/tldLookup.generated.cpp new file mode 100644 index 00000000000..9b9471c094d --- /dev/null +++ b/be/src/vec/functions/url/tldLookup.generated.cpp @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/tldLookup.generated.cpp +// and modified by Doris + +// clang-format off +/* C++ code produced by gperf version 3.1 */ +/* Command-line: /usr/bin/gperf --output-file=tldLookup.generated.cpp tldLookup.gperf */ +/* Computed positions: -k'1-11,13-14,17,$' */ + +#if !( \ + (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) && ('B [...] +/* The character set is not based on ISO-646. */ +#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gp...@gnu.org>." 
+#endif + +#line 7 "tldLookup.gperf" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wimplicit-fallthrough" +#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant" +#pragma GCC diagnostic ignored "-Wunused-macros" +#include <cstring> + +#define TOTAL_KEYWORDS 5045 +#define MIN_WORD_LENGTH 4 +#define MAX_WORD_LENGTH 34 +#define MIN_HASH_VALUE 75 +#define MAX_HASH_VALUE 110600 +/* maximum key range = 110526, duplicates = 0 */ + +class TopLevelDomainLookupHash { +private: + static inline unsigned int hash(const char* str, size_t len); + +public: + static const char* is_valid(const char* str, size_t len); +}; + +inline unsigned int TopLevelDomainLookupHash::hash(const char* str, size_t len) { + static const unsigned int asso_values[] = {110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, + 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, + 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 3905, 0, 5, + 11617, 15312, 10, 5, 25, 0, 25, 0, 5, 0, 0, 110601, 110601, 110601, 5, 110601, + 110601, 110601, 110601, 110601, 30, 20, 5, 15, 10, 65, 45, 80, 70, 55, 110601, 110601, + 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, + 110601, 2570, 9477, 1350, 15, 130, 5915, 1830, 4360, 2210, 5405, 63, 3190, 20, 1165, 5, + 6120, 5863, 470, 2315, 175, 0, 815, 40, 13577, 115, 5680, 1030, 11798, 23179, 345, 1097, + 28079, 13839, 245, 25674, 31874, 75, 31774, 7351, 27474, 190, 16044, 8040, 50, 25, 35, 55, + 0, 0, 30, 0, 10, 0, 0, 0, 35, 0, 55, 10, 5, 65, 0, 60, + 0, 25, 5, 30, 0, 5, 10, 0, 20, 5, 5, 35, 5, 0, 0, 0, + 0, 0, 15, 0, 5, 5, 0, 5, 5, 5, 0, 0, 0, 0, 0, 15, + 5, 110601, 110601, 5, 10, 45, 5, 110601, 0, 110601, 110601, 110601, 110601, 110601, 110601, 110601, + 0, 0, 0, 0, 
110601, 110601, 110601, 45, 0, 0, 0, 0, 110601, 110601, 110601, 110601, + 0, 0, 110601, 0, 0, 0, 0, 5, 0, 5, 30, 0, 0, 110601, 110601, 110601, + 110601, 110601, 110601, 110601, 0, 110601, 110601, 110601, 0, 0, 5, 0, 20, 40, 110601, 110601, + 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, 110601, + 110601, 110601, 110601, 110601}; + unsigned int hval = len; + + switch (hval) { + default: + hval += asso_values[static_cast<unsigned char>(str[16])]; + /*FALLTHROUGH*/ + case 16: + case 15: + case 14: + hval += asso_values[static_cast<unsigned char>(str[13] + 1)]; + /*FALLTHROUGH*/ + case 13: + hval += asso_values[static_cast<unsigned char>(str[12])]; + /*FALLTHROUGH*/ + case 12: + case 11: + hval += asso_values[static_cast<unsigned char>(str[10])]; + /*FALLTHROUGH*/ + case 10: + hval += asso_values[static_cast<unsigned char>(str[9])]; + /*FALLTHROUGH*/ + case 9: + hval += asso_values[static_cast<unsigned char>(str[8] + 1)]; + /*FALLTHROUGH*/ + case 8: + hval += asso_values[static_cast<unsigned char>(str[7])]; + /*FALLTHROUGH*/ + case 7: + hval += asso_values[static_cast<unsigned char>(str[6] + 3)]; + /*FALLTHROUGH*/ + case 6: + hval += asso_values[static_cast<unsigned char>(str[5])]; + /*FALLTHROUGH*/ + case 5: + hval += asso_values[static_cast<unsigned char>(str[4] + 2)]; + /*FALLTHROUGH*/ + case 4: + hval += asso_values[static_cast<unsigned char>(str[3] + 1)]; + /*FALLTHROUGH*/ + case 3: + hval += asso_values[static_cast<unsigned char>(str[2])]; + /*FALLTHROUGH*/ + case 2: + hval += asso_values[static_cast<unsigned char>(str[1])]; + /*FALLTHROUGH*/ + case 1: + hval += asso_values[static_cast<unsigned char>(str[0] + 20)]; + break; + } + return hval + asso_values[static_cast<unsigned char>(str[len - 1])]; +} + +const char* TopLevelDomainLookupHash::is_valid(const char* str, size_t len) { + static const char* const wordlist[] = 
{"","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","","co.tm","","\340\270\227\340\270\253\340\270\262\340\270\243.\340\271\204\340\270\227\340\270\242","","","","com.mu","","","","","com.so","","\340\270\243\340\270\261\340\270\220\340\270\232\340\270\262\340\270\245.\340\271\ [...] + if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH) { + unsigned int key = hash(str, len); + + if (key <= MAX_HASH_VALUE) { + const char* s = wordlist[key]; + + if (*str == *s && !strncmp(str + 1, s + 1, len - 1) && s[len] == '\0') + return s; + } + } + return nullptr; +} +#line 5060 "tldLookup.gperf" \ No newline at end of file diff --git a/be/src/vec/functions/url/tldLookup.h b/be/src/vec/functions/url/tldLookup.h new file mode 100644 index 00000000000..9be88890c14 --- /dev/null +++ b/be/src/vec/functions/url/tldLookup.h @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/tldLookup.h +// and modified by Doris + +#pragma once + +#include <cstdlib> + +// Definition of the class generated by gperf, present on gperf/tldLookup.gperf +class TopLevelDomainLookupHash { +private: + static inline unsigned int hash(const char* str, size_t len); + +public: + static const char* is_valid(const char* str, size_t len); +}; + +using tldLookup = TopLevelDomainLookupHash; \ No newline at end of file diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java index f84bda52178..b09ea1033b5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java @@ -77,6 +77,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.BitCount; import org.apache.doris.nereids.trees.expressions.functions.scalar.BitLength; import org.apache.doris.nereids.trees.expressions.functions.scalar.BitShiftLeft; import org.apache.doris.nereids.trees.expressions.functions.scalar.BitShiftRight; +import org.apache.doris.nereids.trees.expressions.functions.scalar.BitTest; import org.apache.doris.nereids.trees.expressions.functions.scalar.BitmapAnd; import org.apache.doris.nereids.trees.expressions.functions.scalar.BitmapAndCount; import org.apache.doris.nereids.trees.expressions.functions.scalar.BitmapAndNot; @@ -122,6 +123,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Cos; import org.apache.doris.nereids.trees.expressions.functions.scalar.Cosh; import org.apache.doris.nereids.trees.expressions.functions.scalar.CosineDistance; import org.apache.doris.nereids.trees.expressions.functions.scalar.CountEqual; +import org.apache.doris.nereids.trees.expressions.functions.scalar.CountSubstring; import 
org.apache.doris.nereids.trees.expressions.functions.scalar.Crc32; import org.apache.doris.nereids.trees.expressions.functions.scalar.CreateMap; import org.apache.doris.nereids.trees.expressions.functions.scalar.CreateNamedStruct; @@ -131,6 +133,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentDate; import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentTime; import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentUser; import org.apache.doris.nereids.trees.expressions.functions.scalar.CutIpv6; +import org.apache.doris.nereids.trees.expressions.functions.scalar.CutToFirstSignificantSubdomain; import org.apache.doris.nereids.trees.expressions.functions.scalar.Database; import org.apache.doris.nereids.trees.expressions.functions.scalar.Date; import org.apache.doris.nereids.trees.expressions.functions.scalar.DateDiff; @@ -167,6 +170,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Exp; import org.apache.doris.nereids.trees.expressions.functions.scalar.ExtractUrlParameter; import org.apache.doris.nereids.trees.expressions.functions.scalar.Field; import org.apache.doris.nereids.trees.expressions.functions.scalar.FindInSet; +import org.apache.doris.nereids.trees.expressions.functions.scalar.FirstSignificantSubdomain; import org.apache.doris.nereids.trees.expressions.functions.scalar.Floor; import org.apache.doris.nereids.trees.expressions.functions.scalar.Fmod; import org.apache.doris.nereids.trees.expressions.functions.scalar.Fpow; @@ -422,6 +426,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.ToIso8601; import org.apache.doris.nereids.trees.expressions.functions.scalar.ToMonday; import org.apache.doris.nereids.trees.expressions.functions.scalar.ToQuantileState; import org.apache.doris.nereids.trees.expressions.functions.scalar.Tokenize; +import org.apache.doris.nereids.trees.expressions.functions.scalar.TopLevelDomain; import 
org.apache.doris.nereids.trees.expressions.functions.scalar.Translate; import org.apache.doris.nereids.trees.expressions.functions.scalar.Trim; import org.apache.doris.nereids.trees.expressions.functions.scalar.Truncate; @@ -554,6 +559,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(BitmapXorCount.class, "bitmap_xor_count"), scalar(BitShiftLeft.class, "bit_shift_left"), scalar(BitShiftRight.class, "bit_shift_right"), + scalar(BitTest.class, "bit_test", "bit_test_all"), scalar(Cardinality.class, "array_size", "cardinality", "size"), scalar(Cbrt.class, "cbrt"), scalar(Ceil.class, "ceil", "ceiling"), @@ -570,6 +576,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(Cosh.class, "cosh"), scalar(CosineDistance.class, "cosine_distance"), scalar(CountEqual.class, "countequal"), + scalar(CountSubstring.class, "count_substrings"), scalar(CreateMap.class, "map"), scalar(CreateStruct.class, "struct"), scalar(CreateNamedStruct.class, "named_struct"), @@ -578,6 +585,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(CurrentTime.class, "curtime", "current_time"), scalar(CurrentUser.class, "current_user"), scalar(CutIpv6.class, "cut_ipv6"), + scalar(CutToFirstSignificantSubdomain.class, "cut_to_first_significant_subdomain"), scalar(Database.class, "database", "schema"), scalar(Date.class, "date"), scalar(DateDiff.class, "datediff"), @@ -614,6 +622,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(ExtractUrlParameter.class, "extract_url_parameter"), scalar(Field.class, "field"), scalar(FindInSet.class, "find_in_set"), + scalar(FirstSignificantSubdomain.class, "first_significant_subdomain"), scalar(Floor.class, "floor"), scalar(Fmod.class, "fmod"), scalar(Fpow.class, "fpow"), @@ -889,6 +898,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(ToIso8601.class, "to_iso8601"), scalar(Tokenize.class, "tokenize"), scalar(ToMonday.class, "to_monday"), + 
scalar(TopLevelDomain.class, "top_level_domain"), scalar(ToQuantileState.class, "to_quantile_state"), scalar(Translate.class, "translate"), scalar(Trim.class, "trim"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/BitTest.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/BitTest.java new file mode 100644 index 00000000000..5c32005c126 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/BitTest.java @@ -0,0 +1,75 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
package org.apache.doris.nereids.trees.expressions.functions.scalar;

import org.apache.doris.catalog.FunctionSignature;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.nereids.types.BigIntType;
import org.apache.doris.nereids.types.IntegerType;
import org.apache.doris.nereids.types.LargeIntType;
import org.apache.doris.nereids.types.SmallIntType;
import org.apache.doris.nereids.types.TinyIntType;
import org.apache.doris.nereids.util.ExpressionUtils;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;

import java.util.List;

/**
 * Nereids expression for the scalar function {@code bit_test}.
 *
 * <p>The function is variadic: it always carries at least two children. Every
 * signature uses a single integer type (TINYINT, SMALLINT, INT, LARGEINT or
 * BIGINT) for all arguments and returns TINYINT.
 *
 * <p>No arity-specific shape interface is implemented. The class previously
 * declared {@code UnaryExpression}, which describes a one-child node and
 * therefore contradicted the constructor below.
 *
 * <p>NOTE(review): presumably the first argument is the value under test and the
 * remaining ones are bit positions; confirm against the BE implementation.
 */
public class BitTest extends ScalarFunction
        implements ExplicitlyCastableSignature, PropagateNullable {

    /** Supported signatures; the argument list is declared with {@code varArgs}. */
    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
            FunctionSignature.ret(TinyIntType.INSTANCE).varArgs(TinyIntType.INSTANCE, TinyIntType.INSTANCE),
            FunctionSignature.ret(TinyIntType.INSTANCE).varArgs(SmallIntType.INSTANCE, SmallIntType.INSTANCE),
            FunctionSignature.ret(TinyIntType.INSTANCE).varArgs(IntegerType.INSTANCE, IntegerType.INSTANCE),
            FunctionSignature.ret(TinyIntType.INSTANCE).varArgs(LargeIntType.INSTANCE, LargeIntType.INSTANCE),
            FunctionSignature.ret(TinyIntType.INSTANCE).varArgs(BigIntType.INSTANCE, BigIntType.INSTANCE));

    /**
     * Constructor with 2 or more arguments.
     *
     * @param arg0 first argument
     * @param arg1 second argument
     * @param varArgs any further arguments, appended after {@code arg1}
     */
    public BitTest(Expression arg0, Expression arg1, Expression... varArgs) {
        super("bit_test", ExpressionUtils.mergeArguments(arg0, arg1, varArgs));
    }

    /**
     * Rebuilds the expression from a replacement child list.
     *
     * @param children new children; must contain at least two elements
     * @return a new {@code BitTest} over {@code children}
     * @throws IllegalArgumentException if fewer than two children are supplied
     */
    @Override
    public BitTest withChildren(List<Expression> children) {
        Preconditions.checkArgument(children.size() >= 2);
        // The first two children map onto the fixed parameters, the rest onto the varargs tail.
        return new BitTest(children.get(0), children.get(1),
                children.subList(2, children.size()).toArray(new Expression[0]));
    }

    @Override
    public List<FunctionSignature> getSignatures() {
        return SIGNATURES;
    }

    @Override
    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
        return visitor.visitBitTest(this, context);
    }
}
package org.apache.doris.nereids.trees.expressions.functions.scalar;

import org.apache.doris.catalog.FunctionSignature;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.nereids.types.IntegerType;
import org.apache.doris.nereids.types.StringType;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;

import java.util.List;

/**
 * Nereids expression for the scalar function {@code count_substrings}:
 * two STRING arguments, INT result.
 */
public class CountSubstring extends ScalarFunction
        implements BinaryExpression, ExplicitlyCastableSignature, PropagateNullable {

    /** The single supported signature: (STRING, STRING) -> INT. */
    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
            FunctionSignature.ret(IntegerType.INSTANCE).args(StringType.INSTANCE, StringType.INSTANCE)
    );

    /**
     * Builds the expression from its two operands.
     *
     * @param arg0 first child
     * @param arg1 second child
     */
    public CountSubstring(Expression arg0, Expression arg1) {
        super("count_substrings", arg0, arg1);
    }

    @Override
    public List<FunctionSignature> getSignatures() {
        return SIGNATURES;
    }

    /**
     * Rebuilds the expression from a replacement child list.
     *
     * @param children new children; exactly two are required
     * @return a new {@code CountSubstring} over {@code children}
     * @throws IllegalArgumentException if the list does not hold exactly two elements
     */
    @Override
    public CountSubstring withChildren(List<Expression> children) {
        final int childCount = children.size();
        Preconditions.checkArgument(childCount == 2);
        final Expression first = children.get(0);
        final Expression second = children.get(1);
        return new CountSubstring(first, second);
    }

    @Override
    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
        return visitor.visitCountSubstring(this, context);
    }
}
package org.apache.doris.nereids.trees.expressions.functions.scalar;

import org.apache.doris.catalog.FunctionSignature;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.nereids.types.StringType;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;

import java.util.List;

/**
 * Nereids expression for the scalar function
 * {@code cut_to_first_significant_subdomain}: one STRING argument, STRING result.
 */
public class CutToFirstSignificantSubdomain extends ScalarFunction
        implements UnaryExpression, ExplicitlyCastableSignature, PropagateNullable {

    /** The single supported signature: (STRING) -> STRING. */
    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
            FunctionSignature.ret(StringType.INSTANCE).args(StringType.INSTANCE)
    );

    /**
     * Builds the expression from its only operand.
     *
     * @param arg the single child
     */
    public CutToFirstSignificantSubdomain(Expression arg) {
        super("cut_to_first_significant_subdomain", arg);
    }

    @Override
    public List<FunctionSignature> getSignatures() {
        return SIGNATURES;
    }

    /**
     * Rebuilds the expression from a replacement child list.
     *
     * @param children new children; exactly one is required
     * @return a new {@code CutToFirstSignificantSubdomain} over that child
     * @throws IllegalArgumentException if the list does not hold exactly one element
     */
    @Override
    public CutToFirstSignificantSubdomain withChildren(List<Expression> children) {
        final int childCount = children.size();
        Preconditions.checkArgument(childCount == 1);
        final Expression onlyChild = children.get(0);
        return new CutToFirstSignificantSubdomain(onlyChild);
    }

    @Override
    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
        return visitor.visitCutToFirstSignificantSubdomain(this, context);
    }
}
package org.apache.doris.nereids.trees.expressions.functions.scalar;

import org.apache.doris.catalog.FunctionSignature;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.nereids.types.StringType;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;

import java.util.List;

/**
 * Nereids expression for the scalar function
 * {@code first_significant_subdomain}: one STRING argument, STRING result.
 */
public class FirstSignificantSubdomain extends ScalarFunction
        implements UnaryExpression, ExplicitlyCastableSignature, PropagateNullable {

    /** The single supported signature: (STRING) -> STRING. */
    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
            FunctionSignature.ret(StringType.INSTANCE).args(StringType.INSTANCE)
    );

    /**
     * Builds the expression from its only operand.
     *
     * @param arg the single child
     */
    public FirstSignificantSubdomain(Expression arg) {
        super("first_significant_subdomain", arg);
    }

    @Override
    public List<FunctionSignature> getSignatures() {
        return SIGNATURES;
    }

    /**
     * Rebuilds the expression from a replacement child list.
     *
     * @param children new children; exactly one is required
     * @return a new {@code FirstSignificantSubdomain} over that child
     * @throws IllegalArgumentException if the list does not hold exactly one element
     */
    @Override
    public FirstSignificantSubdomain withChildren(List<Expression> children) {
        final int childCount = children.size();
        Preconditions.checkArgument(childCount == 1);
        final Expression onlyChild = children.get(0);
        return new FirstSignificantSubdomain(onlyChild);
    }

    @Override
    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
        return visitor.visitFirstSignificantSubdomain(this, context);
    }
}
+ +package org.apache.doris.nereids.trees.expressions.functions.scalar; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; +import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.StringType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * ScalarFunction 'TopLevelDomain'. This class is generated by GenerateFunction. + */ +public class TopLevelDomain extends ScalarFunction + implements UnaryExpression, ExplicitlyCastableSignature, PropagateNullable { + + public static final List<FunctionSignature> SIGNATURES = ImmutableList.of( + FunctionSignature.ret(StringType.INSTANCE).args(StringType.INSTANCE) + ); + + /** + * constructor with 1 argument. + */ + public TopLevelDomain(Expression arg) { + super("top_level_domain", arg); + } + + /** + * withChildren. 
+ */ + @Override + public TopLevelDomain withChildren(List<Expression> children) { + Preconditions.checkArgument(children.size() == 1); + return new TopLevelDomain(children.get(0)); + } + + @Override + public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) { + return visitor.visitTopLevelDomain(this, context); + } + + @Override + public List<FunctionSignature> getSignatures() { + return SIGNATURES; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java index 79b8452e1df..0192151ad78 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java @@ -84,6 +84,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.BitCount; import org.apache.doris.nereids.trees.expressions.functions.scalar.BitLength; import org.apache.doris.nereids.trees.expressions.functions.scalar.BitShiftLeft; import org.apache.doris.nereids.trees.expressions.functions.scalar.BitShiftRight; +import org.apache.doris.nereids.trees.expressions.functions.scalar.BitTest; import org.apache.doris.nereids.trees.expressions.functions.scalar.BitmapAnd; import org.apache.doris.nereids.trees.expressions.functions.scalar.BitmapAndCount; import org.apache.doris.nereids.trees.expressions.functions.scalar.BitmapAndNot; @@ -129,6 +130,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Cos; import org.apache.doris.nereids.trees.expressions.functions.scalar.Cosh; import org.apache.doris.nereids.trees.expressions.functions.scalar.CosineDistance; import org.apache.doris.nereids.trees.expressions.functions.scalar.CountEqual; +import org.apache.doris.nereids.trees.expressions.functions.scalar.CountSubstring; import 
org.apache.doris.nereids.trees.expressions.functions.scalar.Crc32; import org.apache.doris.nereids.trees.expressions.functions.scalar.CreateMap; import org.apache.doris.nereids.trees.expressions.functions.scalar.CreateNamedStruct; @@ -138,6 +140,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentDate; import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentTime; import org.apache.doris.nereids.trees.expressions.functions.scalar.CurrentUser; import org.apache.doris.nereids.trees.expressions.functions.scalar.CutIpv6; +import org.apache.doris.nereids.trees.expressions.functions.scalar.CutToFirstSignificantSubdomain; import org.apache.doris.nereids.trees.expressions.functions.scalar.Database; import org.apache.doris.nereids.trees.expressions.functions.scalar.Date; import org.apache.doris.nereids.trees.expressions.functions.scalar.DateDiff; @@ -175,6 +178,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Exp; import org.apache.doris.nereids.trees.expressions.functions.scalar.ExtractUrlParameter; import org.apache.doris.nereids.trees.expressions.functions.scalar.Field; import org.apache.doris.nereids.trees.expressions.functions.scalar.FindInSet; +import org.apache.doris.nereids.trees.expressions.functions.scalar.FirstSignificantSubdomain; import org.apache.doris.nereids.trees.expressions.functions.scalar.Floor; import org.apache.doris.nereids.trees.expressions.functions.scalar.Fmod; import org.apache.doris.nereids.trees.expressions.functions.scalar.Fpow; @@ -419,6 +423,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.ToIso8601; import org.apache.doris.nereids.trees.expressions.functions.scalar.ToMonday; import org.apache.doris.nereids.trees.expressions.functions.scalar.ToQuantileState; import org.apache.doris.nereids.trees.expressions.functions.scalar.Tokenize; +import org.apache.doris.nereids.trees.expressions.functions.scalar.TopLevelDomain; import 
org.apache.doris.nereids.trees.expressions.functions.scalar.Translate; import org.apache.doris.nereids.trees.expressions.functions.scalar.Trim; import org.apache.doris.nereids.trees.expressions.functions.scalar.Truncate; @@ -827,6 +832,10 @@ public interface ScalarFunctionVisitor<R, C> { return visitScalarFunction(bitShiftRight, context); } + default R visitBitTest(BitTest bitTest, C context) { + return visitScalarFunction(bitTest, context); + } + default R visitCardinality(Cardinality cardinality, C context) { return visitScalarFunction(cardinality, context); } @@ -855,6 +864,11 @@ public interface ScalarFunctionVisitor<R, C> { return visitScalarFunction(charFunc, context); } + default R visitCutToFirstSignificantSubdomain(CutToFirstSignificantSubdomain cutToFirstSignificantSubdomain, + C context) { + return visitScalarFunction(cutToFirstSignificantSubdomain, context); + } + default R visitConcatWs(ConcatWs concatWs, C context) { return visitScalarFunction(concatWs, context); } @@ -891,6 +905,10 @@ public interface ScalarFunctionVisitor<R, C> { return visitScalarFunction(countequal, context); } + default R visitCountSubstring(CountSubstring countSubstring, C context) { + return visitScalarFunction(countSubstring, context); + } + default R visitCurrentCatalog(CurrentCatalog currentCatalog, C context) { return visitScalarFunction(currentCatalog, context); } @@ -1115,6 +1133,10 @@ public interface ScalarFunctionVisitor<R, C> { return visitScalarFunction(findInSet, context); } + default R visitFirstSignificantSubdomain(FirstSignificantSubdomain firstSignificantSubdomain, C context) { + return visitScalarFunction(firstSignificantSubdomain, context); + } + default R visitFloor(Floor floor, C context) { return visitScalarFunction(floor, context); } @@ -2023,6 +2045,10 @@ public interface ScalarFunctionVisitor<R, C> { return visitScalarFunction(tokenize, context); } + default R visitTopLevelDomain(TopLevelDomain topLevelDomain, C context) { + return 
visitScalarFunction(topLevelDomain, context); + } + default R visitToQuantileState(ToQuantileState toQuantileState, C context) { return visitScalarFunction(toQuantileState, context); } diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index a05f6ac8abb..0da5f697100 100644 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -77,7 +77,12 @@ visible_functions = { [['bitnot'], 'LARGEINT', ['LARGEINT'], ''], [['bit_shift_left'], 'BIGINT', ['BIGINT', 'TINYINT'], ''], - [['bit_shift_right'], 'BIGINT', ['BIGINT', 'TINYINT'], ''] + [['bit_shift_right'], 'BIGINT', ['BIGINT', 'TINYINT'], ''], + [['bit_test','bit_test_all'], 'TINYINT', ['TINYINT','TINYINT','...'], ''], + [['bit_test','bit_test_all'], 'TINYINT', ['SMALLINT','SMALLINT','...'], ''], + [['bit_test','bit_test_all'], 'TINYINT', ['INT','INT','...'], ''], + [['bit_test','bit_test_all'], 'TINYINT', ['BIGINT','BIGINT','...'], ''], + [['bit_test','bit_test_all'], 'TINYINT', ['LARGEINT','LARGEINT','...'], ''] ], # map functions @@ -1625,7 +1630,7 @@ visible_functions = { [['char'], 'VARCHAR', ['VARCHAR', 'INT', '...'], 'ALWAYS_NULLABLE'], [['strcmp'], 'INT', ['VARCHAR', 'VARCHAR'], 'DEPEND_ON_ARGUMENT'], - + [['count_substrings'], 'INT', ['STRING', 'STRING'], 'DEPEND_ON_ARGUMENT'], [['substr', 'substring'], 'STRING', ['STRING', 'INT'], 'DEPEND_ON_ARGUMENT'], [['substr', 'substring'], 'STRING', ['STRING', 'INT', 'INT'], 'DEPEND_ON_ARGUMENT'], [['strleft', 'left'], 'STRING', ['STRING', 'INT'], 'DEPEND_ON_ARGUMENT'], @@ -2019,7 +2024,10 @@ visible_functions = { "Url": [ [['domain'], 'STRING', ['STRING'], ''], [['domain_without_www'], 'STRING', ['STRING'], ''], - [['protocol'], 'STRING', ['STRING'], ''] + [['protocol'], 'STRING', ['STRING'], ''], + [['top_level_domain'], 'STRING', ['STRING'], ''], + [['cut_to_first_significant_subdomain'], 'STRING', ['STRING'], ''], + [['first_significant_subdomain'], 'STRING', ['STRING'], ''] ], # 
search functions diff --git a/regression-test/data/correctness_p0/test_bit_test_function.out b/regression-test/data/correctness_p0/test_bit_test_function.out new file mode 100644 index 00000000000..365f4d95921 --- /dev/null +++ b/regression-test/data/correctness_p0/test_bit_test_function.out @@ -0,0 +1,191 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !bit_test1 -- +0 0 +1 1 +2 0 +3 1 +4 0 +5 1 +6 0 +7 1 +8 0 +9 1 + +-- !bit_test2 -- +0 0 +1 1 +2 0 +3 1 +4 0 +5 1 +6 0 +7 1 +8 0 +9 1 + +-- !bit_test3 -- +0 0 +1 0 +2 1 +3 1 +4 0 +5 0 +6 1 +7 1 +8 0 +9 0 + +-- !bit_test4 -- +0 0 +1 0 +2 0 +3 1 +4 0 +5 0 +6 0 +7 1 +8 0 +9 0 + +-- !bit_test5 -- +1 + +-- !bit_test6 -- +1 + +-- !bit_test7 -- +1 + +-- !bit_test8 -- +1 + +-- !bit_test9 -- +1 + +-- !bit_test10 -- +0 + +-- !bit_test11 -- +0 + +-- !bit_test12 -- +0 + +-- !bit_test13 -- +0 + +-- !bit_test14 -- +1 + +-- !bit_test_TINYINT_MAX -- +1 + +-- !bit_test_TINYINT_MIN -- +0 + +-- !bit_test_SMALLINT_MAX -- +1 + +-- !bit_test_SMALLINT_MIN -- +0 + +-- !bit_test_INT_MAX -- +1 + +-- !bit_test_INT_MIN -- +0 + +-- !bit_test_INT64_MAX -- +1 + +-- !bit_test_INT64_MIN -- +0 + +-- !bit_test_INT128_MAX -- +1 1 + +-- !bit_test_INT128_MIN -- +0 0 + +-- !select1_const -- +\N + +-- !select2_const -- +\N + +-- !select3_const -- +\N + +-- !select4_const -- +\N + +-- !select1_null_null -- +1 1 1 0 +2 \N 1 \N +3 \N \N \N +4 \N \N \N +5 \N \N \N + +-- !select2_null_not_null -- +1 1 1 0 +2 \N 1 \N +3 \N 1 \N +4 \N 1 \N +5 \N 2147483647 \N + +-- !select3_not_null_not_null -- +1 1 1 0 +2 1 1 0 +3 1 1 0 +4 2147483647 1 1 +5 2147483647 2147483647 0 + +-- !select4_not_null_null -- +1 1 1 0 +2 1 1 0 +3 1 \N \N +4 2147483647 \N \N +5 2147483647 \N \N + +-- !select5_null_const -- +1 1 1 0 +2 \N 1 \N +3 \N 1 \N +4 \N 1 \N +5 \N 1 \N + +-- !select6_not_null_const -- +1 1 1 0 +2 1 1 0 +3 1 1 0 +4 2147483647 1 1 +5 2147483647 1 1 + +-- !select7_const_null -- +1 6 1 1 +2 6 1 1 +3 6 \N \N +4 6 \N \N 
+5 6 \N \N + +-- !select7_const_not_null -- +1 6 1 1 +2 6 1 1 +3 6 1 1 +4 6 1 1 +5 6 2147483647 0 + +-- !select7_null_null -- +1 1 1 0 +2 \N 1 \N +3 \N \N \N +4 \N \N \N +5 \N \N \N + +-- !select7_not_null_not_null -- +1 1 1 0 +2 1 1 0 +3 1 1 0 +4 2147483647 1 1 +5 2147483647 2147483647 0 + diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_count_substrings.out b/regression-test/data/query_p0/sql_functions/string_functions/test_count_substrings.out new file mode 100644 index 00000000000..9bee1363c66 --- /dev/null +++ b/regression-test/data/query_p0/sql_functions/string_functions/test_count_substrings.out @@ -0,0 +1,147 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select1 -- +\N + +-- !select2 -- +\N + +-- !select3 -- +\N + +-- !select4 -- +2 + +-- !select5 -- +6 + +-- !select6 -- +1 + +-- !select4_empty -- + +-- !select5_empty -- + +-- !select6_empty -- + +-- !select7_empty -- + +-- !select5_null_null -- +abcde 0 + 0 + a 0 +\N \N \N +asdasd a 2 +a1b1c1d 1 3 +,,, # 0 +a,b,c v 0 +a,b,c, \N \N +\N asd \N +a,b,c,12345 5 1 +a,b,c,12345 a 1 +a,你,你,1我2你4我5 你 3 + +-- !select6_null_not -- +abcde 0 + 0 + a 0 +\N \N +asdasd a 2 +a1b1c1d 1 3 +,,, # 0 +a,b,c v 0 +a,b,c, 0 +\N asd \N +a,b,c,12345 5 1 +a,b,c,12345 a 1 +a,你,你,1我2你4我5 我 2 + +-- !select7_not_null -- +abcde 0 + 0 + a 0 + \N \N +asdasd a 2 +a1b1c1d 1 3 +,,, # 0 +a,b,c v 0 +a,b,c \N \N + asd 0 +a,b,c,12345 5 1 +a,b,c,12345 a 1 +a你,你,1我2你4我5 你 3 + +-- !select8_not_not -- +abcde 0 + 0 + a 0 + 0 +asdasd a 2 +a1b1c1d 1 3 +,,, # 0 +a,b,c v 0 +a,b,c 0 + asd 0 +a,b,c,12345 5 1 +a,b,c,12345 a 1 +a你,你,1我2你4我5 我 2 + +-- !select9_null_const -- +abcde a 1 + a 0 + a 0 +\N a \N +asdasd a 2 +a1b1c1d a 1 +,,, a 0 +a,b,c a 1 +a,b,c, a 1 +\N a \N +a,b,c,12345 a 1 +a,b,c,12345 a 1 +a,你,你,1我2你4我5 a 1 + +-- !select10_not_null_const -- +abcde a 1 + a 0 + a 0 + a 0 +asdasd a 2 +a1b1c1d a 1 +,,, a 0 +a,b,c a 1 +a,b,c a 1 + a 0 +a,b,c,12345 a 1 +a,b,c,12345 
a 1 +a你,你,1我2你4我5 a 1 + +-- !select11_const_null -- +a 0 +a 0 +a a 1 +a \N \N +a a 1 +a 1 0 +a # 0 +a v 0 +a \N \N +a asd 0 +a 5 0 +a a 1 +a 你 0 + +-- !select12_const_not_null -- +a 0 +a 0 +a a 1 +a 0 +a a 1 +a 1 0 +a # 0 +a v 0 +a 0 +a asd 0 +a 5 0 +a a 1 +a 我 0 + diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_url_functions.out b/regression-test/data/query_p0/sql_functions/string_functions/test_url_functions.out new file mode 100644 index 00000000000..ce1ef717975 --- /dev/null +++ b/regression-test/data/query_p0/sql_functions/string_functions/test_url_functions.out @@ -0,0 +1,121 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !empty_nullable1 -- + +-- !empty_nullable2 -- + +-- !empty_nullable3 -- + +-- !empty_not_nullable1 -- + +-- !empty_not_nullable2 -- + +-- !empty_not_nullable3 -- + +-- !empty_null1 -- +\N + +-- !empty_null2 -- +\N + +-- !empty_null3 -- +\N + +-- !empty_const1 -- +com + +-- !empty_const2 -- +baidu + +-- !empty_const3 -- +baidu.com + +-- !empty_const4 -- +cn + +-- !empty_const5 -- +google + +-- !empty_const6 -- +google.com.cn + +-- !empty_const7 -- + + +-- !empty_const8 -- + + +-- !empty_const9 -- + + +-- !nullable1 -- +1 www.baidu.com com +10 https://news.clickhouse.com.tr/ tr +2 www.google.com.cn cn +3 invalid url +4 +5 +6 \N \N +7 xxxxxxxx +8 http://www.example.com/a/b/c?a=b com +9 https://news.clickhouse.com/ com + +-- !nullable2 -- +1 www.baidu.com baidu +10 https://news.clickhouse.com.tr/ clickhouse +2 www.google.com.cn google +3 invalid url +4 +5 +6 \N \N +7 xxxxxxxx +8 http://www.example.com/a/b/c?a=b example +9 https://news.clickhouse.com/ clickhouse + +-- !nullable3 -- +1 www.baidu.com baidu.com +10 https://news.clickhouse.com.tr/ clickhouse.com.tr +2 www.google.com.cn google.com.cn +3 invalid url +4 +5 +6 \N \N +7 xxxxxxxx +8 http://www.example.com/a/b/c?a=b example.com +9 https://news.clickhouse.com/ clickhouse.com + +-- !not_nullable1 -- +1 
www.baidu.com com +10 https://news.clickhouse.com.tr/ tr +2 www.google.com.cn cn +3 invalid url +4 +5 +6 +7 xxxxxxxx +8 http://www.example.com/a/b/c?a=b com +9 https://news.clickhouse.com/ com + +-- !not_nullable2 -- +1 www.baidu.com baidu +10 https://news.clickhouse.com.tr/ clickhouse +2 www.google.com.cn google +3 invalid url +4 +5 +6 +7 xxxxxxxx +8 http://www.example.com/a/b/c?a=b example +9 https://news.clickhouse.com/ clickhouse + +-- !not_nullable3 -- +1 www.baidu.com baidu.com +10 https://news.clickhouse.com.tr/ clickhouse.com.tr +2 www.google.com.cn google.com.cn +3 invalid url +4 +5 +6 +7 xxxxxxxx +8 http://www.example.com/a/b/c?a=b example.com +9 https://news.clickhouse.com/ clickhouse.com + diff --git a/regression-test/suites/correctness_p0/test_bit_test_function.groovy b/regression-test/suites/correctness_p0/test_bit_test_function.groovy new file mode 100644 index 00000000000..6d2ab6da3a2 --- /dev/null +++ b/regression-test/suites/correctness_p0/test_bit_test_function.groovy @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_bit_test_function") { + qt_bit_test1 'select number,bit_test(number,0) from numbers("number"="10") order by 1;' + qt_bit_test2 'select number,bit_test_all(number,0) from numbers("number"="10") order by 1;' + qt_bit_test3 'select number,bit_test_all(number,1) from numbers("number"="10") order by 1;' + qt_bit_test4 'select number,bit_test(number,0,1) from numbers("number"="10") order by 1;' + qt_bit_test5 'select bit_test(cast (-1 as tinyint),0);' + qt_bit_test6 'select bit_test(cast (-1 as smallint),1);' + qt_bit_test7 'select bit_test(cast (-1 as int),2);' + qt_bit_test8 'select bit_test(cast (-1 as bigint),3);' + qt_bit_test9 'select bit_test(cast (-1 as largeint),4);' + qt_bit_test10 'select bit_test(10,-1);' + qt_bit_test11 'select bit_test(100,-2);' + qt_bit_test12 'select bit_test(100,1000);' + qt_bit_test13 'select bit_test(-43,1);' + qt_bit_test14 'select bit_test(-43,2);' + qt_bit_test_TINYINT_MAX 'select bit_test(cast (127 as tinyint),2);' // TINYINT_MAX + qt_bit_test_TINYINT_MIN 'select bit_test(cast (-128 as tinyint),4);' // TINYINT_MIN + qt_bit_test_SMALLINT_MAX 'select bit_test(cast (32767 as smallint),5);' // SMALLINT_MAX + qt_bit_test_SMALLINT_MIN 'select bit_test(cast (-32768 as smallint),10);' // SMALLINT_MIN + qt_bit_test_INT_MAX 'select bit_test(cast (2147483647 as int),12);' // INT_MAX + qt_bit_test_INT_MIN 'select bit_test(cast (-2147483648 as int),11);' // INT_MIN + qt_bit_test_INT64_MAX 'select bit_test(cast (9223372036854775807 as bigint),12);' // INT64_MAX + qt_bit_test_INT64_MIN 'select bit_test(cast (-9223372036854775808 as bigint),12);' // INT64_MIN + // INT128_MAX + qt_bit_test_INT128_MAX """ + select bit_test(170141183460469231731687303715884105727,13), + bit_test(cast (170141183460469231731687303715884105727 as largeint),13); + """ + // INT128_MIN + qt_bit_test_INT128_MIN """ + select bit_test(-170141183460469231731687303715884105728,11), + bit_test(cast (-170141183460469231731687303715884105728 as largeint),11); + """ 
+ // NULL + qt_select1_const "select bit_test(NULL,1);" + qt_select2_const "select bit_test(1,NULL);" + qt_select3_const "select bit_test(NULL,NULL);" + qt_select4_const "select bit_test(111,1,2,3,NULL);" + + sql """DROP TABLE IF EXISTS test_bit_test""" + sql """ + CREATE TABLE IF NOT EXISTS test_bit_test ( + `k1` int(11) NULL COMMENT "", + `s1` int(20) NULL COMMENT "", + `s2` int(20) NOT NULL COMMENT "", + `p1` int(20) NULL COMMENT "", + `p2` int(20) NOT NULL COMMENT "" + ) ENGINE=OLAP + DUPLICATE KEY(`k1`) + DISTRIBUTED BY HASH(`k1`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "storage_format" = "V2" + ) + """ + sql """ INSERT INTO test_bit_test VALUES(1, 1, 1, 1, 1); """ + sql """ INSERT INTO test_bit_test VALUES(2, NULL, 1, 1, 1); """ + sql """ INSERT INTO test_bit_test VALUES(3, NULL, 1, NULL, 1); """ + sql """ INSERT INTO test_bit_test VALUES(4, NULL, 2147483647, NULL, 1); """ + sql """ INSERT INTO test_bit_test VALUES(5, NULL, 2147483647, NULL, 2147483647); """ + + + // null and not_null combine + qt_select1_null_null "select k1,s1,p1,bit_test(s1, p1) from test_bit_test order by k1;" + qt_select2_null_not_null "select k1,s1,p2,bit_test(s1, p2) from test_bit_test order by k1;" + qt_select3_not_null_not_null "select k1,s2,p2,bit_test(s2, p2) from test_bit_test order by k1;" + qt_select4_not_null_null "select k1,s2,p1,bit_test(s2, p1) from test_bit_test order by k1;" + qt_select5_null_const "select k1,s1,1,bit_test(s1, 1) from test_bit_test order by k1;" + qt_select6_not_null_const "select k1,s2,1,bit_test(s2, 1) from test_bit_test order by k1;" + qt_select7_const_null "select k1,6,p1,bit_test(6, p1) from test_bit_test order by k1;" + qt_select7_const_not_null "select k1,6,p2,bit_test(6, p2) from test_bit_test order by k1;" + qt_select7_null_null "select k1,s1,p1,bit_test(s1, p1,1,2,3) from test_bit_test order by k1;" + qt_select7_not_null_not_null "select k1,s2,p2,bit_test(s2, p2,1,2,3) from test_bit_test order by k1;" +} 
diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_count_substrings.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_count_substrings.groovy new file mode 100644 index 00000000000..64051ec7afc --- /dev/null +++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_count_substrings.groovy @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_count_substrings") { + // const / NULL + qt_select1 "select count_substrings(NULL,NULL);" + qt_select2 "select count_substrings('a12bc23de345f',NULL);" + qt_select3 "select count_substrings(NULL, 'a12bc23de345f');" + qt_select4 "select count_substrings('a12bc23de345f','2');" + qt_select5 "select count_substrings('a1你你c我你3我d你3你5你','你');" + qt_select6 "select count_substrings('ccc','cc');" + + sql """DROP TABLE IF EXISTS test_count_substrings""" + sql """ + CREATE TABLE IF NOT EXISTS test_count_substrings ( + `k1` int(11) NULL COMMENT "", + `s1` varchar(30) NULL COMMENT "", + `s2` varchar(30) NOT NULL COMMENT "", + `p1` varchar(30) NULL COMMENT "", + `p2` varchar(30) NOT NULL COMMENT "" + ) ENGINE=OLAP + DUPLICATE KEY(`k1`) + DISTRIBUTED BY HASH(`k1`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "storage_format" = "V2" + ) + """ + // empty + qt_select4_empty "select count_substrings(s1,p1) from test_count_substrings;" + qt_select5_empty "select count_substrings(s2,p2) from test_count_substrings;" + qt_select6_empty "select count_substrings(s1,p2) from test_count_substrings;" + qt_select7_empty "select count_substrings(s2,p1) from test_count_substrings;" + + // some normal/special/null value + sql """ INSERT INTO test_count_substrings VALUES(1, 'abcde', 'abcde', '', '') """ + sql """ INSERT INTO test_count_substrings VALUES(2, '', '', '', '') """ + sql """ INSERT INTO test_count_substrings VALUES(3, '', '','a','a') """ + sql """ INSERT INTO test_count_substrings VALUES(4, NULL, '', NULL,'') """ + sql """ INSERT INTO test_count_substrings VALUES(5, 'asdasd', 'asdasd','a','a') """ + sql """ INSERT INTO test_count_substrings VALUES(6, 'a1b1c1d', 'a1b1c1d','1','1') """ + sql """ INSERT INTO test_count_substrings VALUES(7, ',,,', ',,,','#','#') """ + sql """ INSERT INTO test_count_substrings VALUES(8, 'a,b,c', 'a,b,c','v','v') """ + sql """ INSERT INTO test_count_substrings VALUES(9, 'a,b,c,', 'a,b,c',NULL,'') """ + sql 
""" INSERT INTO test_count_substrings VALUES(10, NULL, '','asd','asd') """ + sql """ INSERT INTO test_count_substrings VALUES(11, 'a,b,c,12345', 'a,b,c,12345','5','5') """ + sql """ INSERT INTO test_count_substrings VALUES(12, 'a,b,c,12345', 'a,b,c,12345','a','a') """ + sql """ INSERT INTO test_count_substrings VALUES(13, 'a,你,你,1我2你4我5', 'a你,你,1我2你4我5','你','我') """ + + // null and not_null combine + qt_select5_null_null "select s1,p1,count_substrings(s1, p1) from test_count_substrings order by k1;" + qt_select6_null_not "select s1, p2,count_substrings(s1, p2) from test_count_substrings order by k1;" + qt_select7_not_null "select s2, p1,count_substrings(s2, p1) from test_count_substrings order by k1;" + qt_select8_not_not "select s2, p2,count_substrings(s2, p2) from test_count_substrings order by k1;" + + // null const combine + qt_select9_null_const "select s1, 'a',count_substrings(s1, 'a') from test_count_substrings order by k1;" + qt_select10_not_null_const "select s2, 'a',count_substrings(s2, 'a') from test_count_substrings order by k1;" + qt_select11_const_null "select 'a',p1,count_substrings('a', p1) from test_count_substrings order by k1;" + qt_select12_const_not_null "select 'a',p2,count_substrings('a', p2) from test_count_substrings order by k1;" +} + diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_url_functions.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_url_functions.groovy new file mode 100644 index 00000000000..389020b63e2 --- /dev/null +++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_url_functions.groovy @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_url_functions") { + sql " drop table if exists test_url_functions" + sql """ + create table test_url_functions ( + id int, + s1 string not null, + s2 string null + ) + DISTRIBUTED BY HASH(id) + PROPERTIES + ( + "replication_num" = "1" + ); + """ + + //empty table + order_qt_empty_nullable1 "select top_level_domain(s2) from test_url_functions" + order_qt_empty_nullable2 "select first_significant_subdomain(s2) from test_url_functions" + order_qt_empty_nullable3 "select cut_to_first_significant_subdomain(s2) from test_url_functions" + order_qt_empty_not_nullable1 "select top_level_domain(s1) from test_url_functions" + order_qt_empty_not_nullable2 "select first_significant_subdomain(s1) from test_url_functions" + order_qt_empty_not_nullable3 "select cut_to_first_significant_subdomain(s1) from test_url_functions" + + //null / const + order_qt_empty_null1 "select top_level_domain(NULL)" + order_qt_empty_null2 "select first_significant_subdomain(NULL)" + order_qt_empty_null3 "select cut_to_first_significant_subdomain(NULL)" + + //valid url + order_qt_empty_const1 "select top_level_domain('www.baidu.com')" + order_qt_empty_const2 "select first_significant_subdomain('www.baidu.com')" + order_qt_empty_const3 "select cut_to_first_significant_subdomain('www.baidu.com')" + order_qt_empty_const4 "select top_level_domain('www.google.com.cn')" + order_qt_empty_const5 "select 
first_significant_subdomain('www.google.com.cn')" + order_qt_empty_const6 "select cut_to_first_significant_subdomain('www.google.com.cn')" + + //invalid url + order_qt_empty_const7 "select top_level_domain('I am invaild url')" + order_qt_empty_const8 "select first_significant_subdomain('I am invaild url')" + order_qt_empty_const9 "select cut_to_first_significant_subdomain('I am invaild url')" + + + sql """ insert into test_url_functions values (1, 'www.baidu.com', 'www.baidu.com'); """ + sql """ insert into test_url_functions values (2, 'www.google.com.cn', 'www.google.com.cn'); """ + sql """ insert into test_url_functions values (3, 'invalid url', 'invalid url'); """ + sql """ insert into test_url_functions values (4, '', ''); """ + sql """ insert into test_url_functions values (5, ' ', ' '); """ + sql """ insert into test_url_functions values (6, ' ', NULL); """ + sql """ insert into test_url_functions values (7, 'xxxxxxxx', 'xxxxxxxx'); """ + sql """ insert into test_url_functions values (8, 'http://www.example.com/a/b/c?a=b', 'http://www.example.com/a/b/c?a=b'); """ + sql """ insert into test_url_functions values (9, 'https://news.clickhouse.com/', 'https://news.clickhouse.com/'); """ + sql """ insert into test_url_functions values (10, 'https://news.clickhouse.com.tr/', 'https://news.clickhouse.com.tr/'); """ + + order_qt_nullable1 "select id,s2,top_level_domain(s2) from test_url_functions order by id" + order_qt_nullable2 "select id,s2,first_significant_subdomain(s2) from test_url_functions order by id" + order_qt_nullable3 "select id,s2,cut_to_first_significant_subdomain(s2) from test_url_functions order by id" + + order_qt_not_nullable1 "select id,s1,top_level_domain(s1) from test_url_functions order by id" + order_qt_not_nullable2 "select id,s1,first_significant_subdomain(s1) from test_url_functions order by id" + order_qt_not_nullable3 "select id,s1,cut_to_first_significant_subdomain(s1) from test_url_functions order by id" + +} 
--------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org