This is an automated email from the ASF dual-hosted git repository. zhangstar333 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new e9500b3e9b2 [opt](fucntion) improve json_extract function handle const column (#36927) e9500b3e9b2 is described below commit e9500b3e9b2be42434bb0fb970e4bcb89774bd60 Author: zhangstar333 <87313068+zhangstar...@users.noreply.github.com> AuthorDate: Wed Jul 3 20:19:16 2024 +0800 [opt](fucntion) improve json_extract function handle const column (#36927) ## Proposed changes VARCHAR json_extract(VARCHAR json_str, VARCHAR path[, VARCHAR path] ...) ``` for (int i = 0; i < arguments.size(); i++) { column_ptrs.push_back( block.get_by_position(arguments[i]).column->convert_to_full_column_if_const()); data_columns.push_back(assert_cast<const ColumnString*>(column_ptrs.back().get())); } Previously, const columns were not handled specially: because the input arguments are variadic, every argument was expanded to a full column (as shown above). The most common usage is: json_extract(column, '$.fparam.nested_2') so the two-argument case with a const path is now special-cased: the path is parsed once and the JSON document is reused across rows. ``` ``` mysql [test]>select count(json_extract(a, '$.fparam.nested_2')) from json_table_2; +---------------------------------------------+ | count(json_extract(a, '$.fparam.nested_2')) | +---------------------------------------------+ | 10000001 | +---------------------------------------------+ 1 row in set (1.06 sec) mysql [test]>select count(json_extract(a, '$.fparam.nested_2')) from json_table_2; +---------------------------------------------+ | count(json_extract(a, '$.fparam.nested_2')) | +---------------------------------------------+ | 10000001 | +---------------------------------------------+ 1 row in set (1.02 sec) mysql [test]> mysql [test]> mysql [test]>select count(json_extract(a, '$.fparam.nested_2')) from json_table_2; +---------------------------------------------+ | count(json_extract(a, '$.fparam.nested_2')) | +---------------------------------------------+ | 10000001 | +---------------------------------------------+ 1 row in set (44.22 sec) mysql [test]>select count(json_extract(a, 
'$.fparam.nested_2')) from json_table_2; +---------------------------------------------+ | count(json_extract(a, '$.fparam.nested_2')) | +---------------------------------------------+ | 10000001 | +---------------------------------------------+ 1 row in set (42.80 sec) ``` <!--Describe your changes.--> --- be/src/vec/functions/function_json.cpp | 125 ++++++++++++++++++++++++++------- 1 file changed, 98 insertions(+), 27 deletions(-) diff --git a/be/src/vec/functions/function_json.cpp b/be/src/vec/functions/function_json.cpp index e7c2fc1781d..2faeb24d514 100644 --- a/be/src/vec/functions/function_json.cpp +++ b/be/src/vec/functions/function_json.cpp @@ -826,44 +826,61 @@ struct FunctionJsonExtractImpl { static constexpr auto name = "json_extract"; static rapidjson::Value parse_json(const ColumnString* json_col, const ColumnString* path_col, - rapidjson::Document::AllocatorType& allocator, - const int row) { + rapidjson::Document::AllocatorType& allocator, const int row, + const int col, std::vector<bool>& column_is_consts) { rapidjson::Value value; rapidjson::Document document; - const auto obj = json_col->get_data_at(row); + const auto obj = json_col->get_data_at(index_check_const(row, column_is_consts[0])); std::string_view json_string(obj.data, obj.size); - const auto path = path_col->get_data_at(row); + const auto path = path_col->get_data_at(index_check_const(row, column_is_consts[col])); std::string_view path_string(path.data, path.size); - - auto root = get_json_object<JSON_FUN_STRING>(json_string, path_string, &document); + auto* root = get_json_object<JSON_FUN_STRING>(json_string, path_string, &document); if (root != nullptr) { value.CopyFrom(*root, allocator); } return value; } + static rapidjson::Value* get_document(const ColumnString* path_col, + rapidjson::Document* document, + std::vector<JsonPath>& parsed_paths, const int row, + bool is_const_column) { + const auto path = path_col->get_data_at(index_check_const(row, is_const_column)); + 
std::string_view path_string(path.data, path.size); + //Cannot use '\' as the last character, return NULL + if (path_string.back() == '\\') { + document->SetNull(); + return nullptr; + } + +#ifdef USE_LIBCPP + std::string s(path_string); + auto tok = get_json_token(s); +#else + auto tok = get_json_token(path_string); +#endif + // TODO: here maybe could use std::vector<std::string_view> or std::span + std::vector<std::string> paths(tok.begin(), tok.end()); + get_parsed_paths(paths, &parsed_paths); + if (parsed_paths.empty()) { + return nullptr; + } + if (!(parsed_paths)[0].is_valid) { + return nullptr; + } + return document; + } + static void execute(const std::vector<const ColumnString*>& data_columns, - ColumnString& result_column, NullMap& null_map, size_t input_rows_count) { + ColumnString& result_column, NullMap& null_map, size_t input_rows_count, + std::vector<bool>& column_is_consts) { rapidjson::Document document; rapidjson::Document::AllocatorType& allocator = document.GetAllocator(); rapidjson::StringBuffer buf; rapidjson::Writer<rapidjson::StringBuffer> writer(buf); - - const auto json_col = data_columns[0]; - for (size_t row = 0; row < input_rows_count; row++) { - rapidjson::Value value; - if (data_columns.size() == 2) { - value = parse_json(json_col, data_columns[1], allocator, row); - } else { - value.SetArray(); - value.Reserve(data_columns.size() - 1, allocator); - for (size_t col = 1; col < data_columns.size(); ++col) { - value.PushBack(parse_json(json_col, data_columns[col], allocator, row), - allocator); - } - } - + const auto* json_col = data_columns[0]; + auto insert_result_lambda = [&](rapidjson::Value& value, int row) { if (value.IsNull()) { null_map[row] = 1; result_column.insert_default(); @@ -874,6 +891,57 @@ struct FunctionJsonExtractImpl { value.Accept(writer); result_column.insert_data(buf.GetString(), buf.GetSize()); } + }; + if (data_columns.size() == 2) { + rapidjson::Value value; + if (column_is_consts[1]) { + std::vector<JsonPath> 
parsed_paths; + auto* root = get_document(data_columns[1], &document, parsed_paths, 0, + column_is_consts[1]); + for (size_t row = 0; row < input_rows_count; row++) { + if (root != nullptr) { + const auto& obj = json_col->get_data_at(row); + std::string_view json_string(obj.data, obj.size); + if (UNLIKELY((parsed_paths).size() == 1)) { + document.SetString(json_string.data(), json_string.size(), allocator); + } + document.Parse(json_string.data(), json_string.size()); + if (UNLIKELY(document.HasParseError())) { + null_map[row] = 1; + result_column.insert_default(); + continue; + } + auto* root_val = match_value(parsed_paths, &document, allocator); + if (root_val != nullptr) { + value.CopyFrom(*root_val, allocator); + } else { + rapidjson::Value tmp; + value.Swap(tmp); + } + } + insert_result_lambda(value, row); + } + } else { + for (size_t row = 0; row < input_rows_count; row++) { + value = parse_json(json_col, data_columns[1], allocator, row, 1, + column_is_consts); + insert_result_lambda(value, row); + } + } + + } else { + rapidjson::Value value; + value.SetArray(); + value.Reserve(data_columns.size() - 1, allocator); + for (size_t row = 0; row < input_rows_count; row++) { + value.Clear(); + for (size_t col = 1; col < data_columns.size(); ++col) { + value.PushBack(parse_json(json_col, data_columns[col], allocator, row, col, + column_is_consts), + allocator); + } + insert_result_lambda(value, row); + } } } }; @@ -929,15 +997,18 @@ public: size_t result, size_t input_rows_count) const override { auto result_column = ColumnString::create(); auto null_map = ColumnUInt8::create(input_rows_count, 0); - std::vector<ColumnPtr> column_ptrs; // prevent converted column destruct std::vector<const ColumnString*> data_columns; + std::vector<bool> column_is_consts; for (int i = 0; i < arguments.size(); i++) { - column_ptrs.push_back( - block.get_by_position(arguments[i]).column->convert_to_full_column_if_const()); - data_columns.push_back(assert_cast<const 
ColumnString*>(column_ptrs.back().get())); + ColumnPtr arg_col; + bool arg_const; + std::tie(arg_col, arg_const) = + unpack_if_const(block.get_by_position(arguments[i]).column); + column_is_consts.push_back(arg_const); + data_columns.push_back(assert_cast<const ColumnString*>(arg_col.get())); } Impl::execute(data_columns, *assert_cast<ColumnString*>(result_column.get()), - null_map->get_data(), input_rows_count); + null_map->get_data(), input_rows_count, column_is_consts); block.replace_by_position( result, ColumnNullable::create(std::move(result_column), std::move(null_map))); return Status::OK(); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org