github-actions[bot] commented on code in PR #41590: URL: https://github.com/apache/doris/pull/41590#discussion_r1792933449
########## be/src/util/jsonb_document.h: ########## @@ -362,6 +378,19 @@ leg_vector.emplace_back(leg.release()); } + void pop_leg_from_leg_vector() { leg_vector.pop_back(); } + + bool to_string(std::string* res) const { + res->push_back(SCOPE); + for (const auto& leg : leg_vector) { + auto valid = leg->to_string(res); + if (!valid) { + return false; + } + } + return true; + } + size_t get_leg_vector_size() { return leg_vector.size(); } Review Comment: warning: use of undeclared identifier 'leg_vector' [clang-diagnostic-error] ```cpp size_t get_leg_vector_size() { return leg_vector.size(); } ^ ``` ########## be/src/util/jsonb_document.h: ########## @@ -362,6 +378,19 @@ class JsonbPath { leg_vector.emplace_back(leg.release()); Review Comment: warning: use of undeclared identifier 'leg_vector' [clang-diagnostic-error] ```cpp leg_vector.emplace_back(leg.release()); ^ ``` ########## be/src/vec/functions/function_jsonb.cpp: ########## @@ -1597,6 +1599,356 @@ } }; +class FunctionJsonSearch : public IFunction { +private: + using OneFun = std::function<Status(size_t, bool*)>; + static Status always_one(size_t i, bool* res) { + *res = true; + return Status::OK(); + } + static Status always_all(size_t i, bool* res) { + *res = false; + return Status::OK(); + } + + using CheckNullFun = std::function<bool(size_t)>; + static bool always_not_null(size_t) { return false; } + static bool always_null(size_t) { return true; } + + using GetJsonStringRefFun = std::function<StringRef(size_t)>; + + Status matched(const std::string_view& str, LikeState* state, unsigned char* res) const { + StringRef pattern; // not used + StringRef value_val(str.data(), str.size()); + return (state->scalar_function)(&state->search_state, value_val, pattern, res); + } + + /** + * Recursive search for matching string, if found, the result will be added to a vector + * @param element json element + * @param one_match + * @param search_str + * @param cur_path + * @param matches The path that has already been 
matched + * @return true if matched else false + */ + bool find_matches(const SimdJSONParser::Element& element, const bool& one_match, + LikeState* state, JsonbPath* cur_path, + std::unordered_set<std::string>* matches) const { + if (element.isString()) { + const std::string_view str = element.getString(); + unsigned char res; + RETURN_IF_ERROR(matched(str, state, &res)); + if (res) { + std::string str; + auto valid = cur_path->to_string(&str); + if (!valid) { + return false; + } + auto res = matches->insert(str); + return res.second; + } else { + return false; + } + } else if (element.isObject()) { + const SimdJSONParser::Object& object = element.getObject(); + bool find = false; + for (size_t i = 0; i < object.size(); ++i) { + const SimdJSONParser::KeyValuePair& item = object[i]; + const std::string_view& key = item.first; + const SimdJSONParser::Element& child_element = item.second; + // construct an object member path leg. + auto leg = std::make_unique<leg_info>(const_cast<char*>(key.data()), key.size(), 0, + MEMBER_CODE); + cur_path->add_leg_to_leg_vector(std::move(leg)); + find |= find_matches(child_element, one_match, state, cur_path, matches); + cur_path->pop_leg_from_leg_vector(); + if (one_match && find) { + return true; + } + } + return find; + } else if (element.isArray()) { + const SimdJSONParser::Array& array = element.getArray(); + bool find = false; + for (size_t i = 0; i < array.size(); ++i) { + auto leg = std::make_unique<leg_info>(nullptr, 0, i, ARRAY_CODE); + cur_path->add_leg_to_leg_vector(std::move(leg)); + const SimdJSONParser::Element& child_element = array[i]; + // construct an array cell path leg. 
+ find |= find_matches(child_element, one_match, state, cur_path, matches); + cur_path->pop_leg_from_leg_vector(); + if (one_match && find) { + return true; + } + } + return find; + } else { + return false; + } + } + + void make_result_str(std::unordered_set<std::string>& matches, ColumnString* result_col) const { + JsonbWriter writer; + if (matches.size() == 1) { + for (const auto& str_ref : matches) { + writer.writeStartString(); + writer.writeString(str_ref); + writer.writeEndString(); + } + } else { + writer.writeStartArray(); + for (const auto& str_ref : matches) { + writer.writeStartString(); + writer.writeString(str_ref); + writer.writeEndString(); + } + writer.writeEndArray(); + } + + result_col->insert_data(writer.getOutput()->getBuffer(), + (size_t)writer.getOutput()->getSize()); + } + + template <bool search_is_const> + Status execute_vector(Block& block, size_t input_rows_count, CheckNullFun json_null_check, + GetJsonStringRefFun col_json_string, CheckNullFun one_null_check, + OneFun one_check, CheckNullFun search_null_check, + const ColumnString* col_search_string, FunctionContext* context, + size_t result) const { + auto result_col = ColumnString::create(); + auto null_map = ColumnUInt8::create(input_rows_count, 0); + + std::shared_ptr<LikeState> state_ptr; + LikeState* state = nullptr; + if (search_is_const) { + state = reinterpret_cast<LikeState*>( + context->get_function_state(FunctionContext::THREAD_LOCAL)); + } + + SimdJSONParser parser; + SimdJSONParser::Element root_element; + bool is_one = false; + + for (size_t i = 0; i < input_rows_count; ++i) { + // an error occurs if the json_doc argument is not a valid json document. 
+ if (json_null_check(i)) { + null_map->get_data()[i] = 1; + result_col->insert_data("", 0); + continue; + } + const auto& json_doc = col_json_string(i); + if (!parser.parse(json_doc.data, json_doc.size, root_element)) { + return Status::InvalidArgument( + "the json_doc argument {} is not a valid json document", json_doc); + } + + if (!one_null_check(i)) { + RETURN_IF_ERROR(one_check(i, &is_one)); + } + + if (one_null_check(i) || search_null_check(i)) { + null_map->get_data()[i] = 1; + result_col->insert_data("", 0); + continue; + } + + // an error occurs if any path argument is not a valid path expression. + std::string root_path_str = "$"; + JsonbPath root_path; + root_path.seek(root_path_str.c_str(), root_path_str.size()); + std::vector<JsonbPath*> paths; + paths.push_back(&root_path); + + if (!search_is_const) { + state_ptr = std::make_shared<LikeState>(); + state_ptr->is_like_pattern = true; + const auto& search_str = col_search_string->get_data_at(i); + RETURN_IF_ERROR(FunctionLike::construct_like_const_state(context, search_str, + state_ptr, false)); + state = state_ptr.get(); + } + + // maintain a hashset to deduplicate matches. + std::unordered_set<std::string> matches; + for (const auto& item : paths) { + auto cur_path = item; + auto find = find_matches(root_element, is_one, state, cur_path, &matches); + if (is_one && find) { + break; + } + } + if (matches.empty()) { + // returns NULL if the search_str is not found in the document. 
+ null_map->get_data()[i] = 1; + result_col->insert_data("", 0); + continue; + } + make_result_str(matches, result_col.get()); + } + auto result_col_nullable = + ColumnNullable::create(std::move(result_col), std::move(null_map)); + block.replace_by_position(result, std::move(result_col_nullable)); + return Status::OK(); + } + + static constexpr auto one = "one"; + static constexpr auto all = "all"; + +public: + static constexpr auto name = "json_search"; + static FunctionPtr create() { return std::make_shared<FunctionJsonSearch>(); } + + String get_name() const override { return name; } + bool is_variadic() const override { return false; } + size_t get_number_of_arguments() const override { return 3; } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + return make_nullable(std::make_shared<DataTypeJsonb>()); + } + + bool use_default_implementation_for_nulls() const override { return false; } + + Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { + if (scope != FunctionContext::THREAD_LOCAL) { + return Status::OK(); + } + if (context->is_col_constant(2)) { + std::shared_ptr<LikeState> state = std::make_shared<LikeState>(); + state->is_like_pattern = true; + const auto pattern_col = context->get_constant_col(2)->column_ptr; + const auto& pattern = pattern_col->get_data_at(0); + RETURN_IF_ERROR( + FunctionLike::construct_like_const_state(context, pattern, state, false)); + context->set_function_state(scope, state); + } + return Status::OK(); + } + + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, Review Comment: warning: function 'execute_impl' exceeds recommended size/complexity thresholds [readability-function-size] ```cpp Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, ^ ``` <details> <summary>Additional context</summary> **be/src/vec/functions/function_jsonb.cpp:1827:** 120 lines including whitespace and 
comments (threshold 80) ```cpp Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, ^ ``` </details> ########## be/src/util/jsonb_document.h: ########## @@ -362,6 +378,19 @@ leg_vector.emplace_back(leg.release()); } + void pop_leg_from_leg_vector() { leg_vector.pop_back(); } Review Comment: warning: use of undeclared identifier 'leg_vector' [clang-diagnostic-error] ```cpp void pop_leg_from_leg_vector() { leg_vector.pop_back(); } ^ ``` ########## be/src/vec/functions/like.cpp: ########## @@ -816,119 +816,121 @@ void verbose_log_match(const std::string& str, const std::string& pattern_name, } } +Status FunctionLike::construct_like_const_state(FunctionContext* context, const StringRef& pattern, Review Comment: warning: function 'construct_like_const_state' exceeds recommended size/complexity thresholds [readability-function-size] ```cpp Status FunctionLike::construct_like_const_state(FunctionContext* context, const StringRef& pattern, ^ ``` <details> <summary>Additional context</summary> **be/src/vec/functions/like.cpp:818:** 96 lines including whitespace and comments (threshold 80) ```cpp Status FunctionLike::construct_like_const_state(FunctionContext* context, const StringRef& pattern, ^ ``` </details> ########## be/src/vec/functions/function_jsonb.cpp: ########## @@ -1597,6 +1599,356 @@ struct JsonbContainsAndPathImpl { } }; +class FunctionJsonSearch : public IFunction { +private: + using OneFun = std::function<Status(size_t, bool*)>; + static Status always_one(size_t i, bool* res) { + *res = true; + return Status::OK(); + } + static Status always_all(size_t i, bool* res) { + *res = false; + return Status::OK(); + } + + using CheckNullFun = std::function<bool(size_t)>; + static bool always_not_null(size_t) { return false; } + static bool always_null(size_t) { return true; } + + using GetJsonStringRefFun = std::function<StringRef(size_t)>; + + Status matched(const std::string_view& str, LikeState* state, unsigned char* res) 
const { + StringRef pattern; // not used + StringRef value_val(str.data(), str.size()); + return (state->scalar_function)(&state->search_state, value_val, pattern, res); + } + + /** + * Recursive search for matching string, if found, the result will be added to a vector + * @param element json element + * @param one_match + * @param search_str + * @param cur_path + * @param matches The path that has already been matched + * @return true if matched else false + */ + bool find_matches(const SimdJSONParser::Element& element, const bool& one_match, + LikeState* state, JsonbPath* cur_path, + std::unordered_set<std::string>* matches) const { + if (element.isString()) { + const std::string_view str = element.getString(); + unsigned char res; + RETURN_IF_ERROR(matched(str, state, &res)); + if (res) { + std::string str; + auto valid = cur_path->to_string(&str); + if (!valid) { + return false; + } + auto res = matches->insert(str); + return res.second; + } else { + return false; + } + } else if (element.isObject()) { + const SimdJSONParser::Object& object = element.getObject(); + bool find = false; + for (size_t i = 0; i < object.size(); ++i) { + const SimdJSONParser::KeyValuePair& item = object[i]; + const std::string_view& key = item.first; + const SimdJSONParser::Element& child_element = item.second; + // construct an object member path leg. 
+ auto leg = std::make_unique<leg_info>(const_cast<char*>(key.data()), key.size(), 0, + MEMBER_CODE); + cur_path->add_leg_to_leg_vector(std::move(leg)); + find |= find_matches(child_element, one_match, state, cur_path, matches); + cur_path->pop_leg_from_leg_vector(); + if (one_match && find) { + return true; + } + } + return find; + } else if (element.isArray()) { + const SimdJSONParser::Array& array = element.getArray(); + bool find = false; + for (size_t i = 0; i < array.size(); ++i) { Review Comment: warning: use range-based for loop instead [modernize-loop-convert] ```suggestion for (auto child_element : array) { ``` be/src/vec/functions/function_jsonb.cpp:1676: ```diff - const SimdJSONParser::Element& child_element = array[i]; - // construct an array cell path leg. + // construct an array cell path leg. ``` ########## be/src/util/jsonb_document.h: ########## @@ -362,6 +378,19 @@ leg_vector.emplace_back(leg.release()); } + void pop_leg_from_leg_vector() { leg_vector.pop_back(); } + + bool to_string(std::string* res) const { + res->push_back(SCOPE); + for (const auto& leg : leg_vector) { + auto valid = leg->to_string(res); + if (!valid) { + return false; + } + } + return true; + } + size_t get_leg_vector_size() { return leg_vector.size(); } leg_info* get_leg_from_leg_vector(size_t i) { return leg_vector[i].get(); } Review Comment: warning: use of undeclared identifier 'leg_vector' [clang-diagnostic-error] ```cpp leg_info* get_leg_from_leg_vector(size_t i) { return leg_vector[i].get(); } ^ ``` ########## be/src/util/jsonb_document.h: ########## @@ -362,6 +378,19 @@ leg_vector.emplace_back(leg.release()); } + void pop_leg_from_leg_vector() { leg_vector.pop_back(); } + + bool to_string(std::string* res) const { + res->push_back(SCOPE); + for (const auto& leg : leg_vector) { Review Comment: warning: use of undeclared identifier 'leg_vector' [clang-diagnostic-error] ```cpp for (const auto& leg : leg_vector) { ^ ``` -- This is an automated message from the Apache Git 
Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at: us...@infra.apache.org

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org