This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new be55cb8dfc [Improve](jsonb_extract) support jsonb_extract multi parse path (#21555) be55cb8dfc is described below commit be55cb8dfc4da4fefc25eb0f538a71454580b5e9 Author: amory <wangqian...@selectdb.com> AuthorDate: Wed Jul 12 21:37:36 2023 +0800 [Improve](jsonb_extract) support jsonb_extract multi parse path (#21555) support jsonb_extract multi parse path --- be/src/vec/functions/function_jsonb.cpp | 154 +++++++++++++++------ gensrc/script/doris_builtins_functions.py | 8 +- .../data/jsonb_p0/test_jsonb_load_and_function.out | 23 +++ .../test_jsonb_load_unique_key_and_function.out | 23 +++ .../jsonb_p0/test_jsonb_load_and_function.groovy | 3 + .../test_jsonb_load_unique_key_and_function.groovy | 4 + 6 files changed, 171 insertions(+), 44 deletions(-) diff --git a/be/src/vec/functions/function_jsonb.cpp b/be/src/vec/functions/function_jsonb.cpp index 42bdbe4b85..d9e498b4cf 100644 --- a/be/src/vec/functions/function_jsonb.cpp +++ b/be/src/vec/functions/function_jsonb.cpp @@ -339,7 +339,7 @@ using FunctionJsonbParseNotnullErrorValue = using FunctionJsonbParseNotnullErrorInvalid = FunctionJsonbParseBase<NullalbeMode::NOT_NULL, JsonbParseErrorMode::RETURN_INVALID>; -// func(json,string) -> nullable(type) +// func(jsonb, [varchar, varchar, ...]) -> nullable(type) template <typename Impl> class FunctionJsonbExtract : public IFunction { public: @@ -347,7 +347,8 @@ public: static constexpr auto alias = Impl::alias; static FunctionPtr create() { return std::make_shared<FunctionJsonbExtract>(); } String get_name() const override { return name; } - size_t get_number_of_arguments() const override { return 2; } + bool is_variadic() const override { return true; } + size_t get_number_of_arguments() const override { return 0; } DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { return make_nullable(std::make_shared<typename Impl::ReturnType>()); } @@ -355,26 +356,32 @@ public: Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, size_t result, size_t input_rows_count) override { auto null_map = ColumnUInt8::create(input_rows_count, 0); - DCHECK_EQ(arguments.size(), 2); - ColumnPtr argument_columns[2]; - bool col_const[2]; - for (int i = 0; i < 2; ++i) { - std::tie(argument_columns[i], col_const[i]) = - unpack_if_const(block.get_by_position(arguments[i]).column); - check_set_nullable(argument_columns[i], null_map, col_const[i]); + DCHECK_GE(arguments.size(), 2); + + ColumnPtr jsonb_data_column; + bool jsonb_data_const = false; + // prepare jsonb data column + std::tie(jsonb_data_column, jsonb_data_const) = + unpack_if_const(block.get_by_position(arguments[0]).column); + check_set_nullable(jsonb_data_column, null_map, jsonb_data_const); + auto& ldata = assert_cast<const ColumnString*>(jsonb_data_column.get())->get_chars(); + auto& loffsets = assert_cast<const ColumnString*>(jsonb_data_column.get())->get_offsets(); + + // prepare parse path column prepare + std::vector<const ColumnString*> jsonb_path_columns; + std::vector<bool> path_const(arguments.size() - 1); + for (int i = 0; i < arguments.size() - 1; ++i) { + ColumnPtr path_column; + bool is_const = false; + std::tie(path_column, is_const) = + unpack_if_const(block.get_by_position(arguments[i + 1]).column); + path_const[i] = is_const; + check_set_nullable(path_column, null_map, path_const[i]); + jsonb_path_columns.push_back(assert_cast<const ColumnString*>(path_column.get())); } auto res = Impl::ColumnType::create(); - auto jsonb_data_column = assert_cast<const ColumnString*>(argument_columns[0].get()); - auto jsonb_path_column = assert_cast<const ColumnString*>(argument_columns[1].get()); - - auto& ldata = jsonb_data_column->get_chars(); - auto& loffsets = jsonb_data_column->get_offsets(); - - auto& rdata = jsonb_path_column->get_chars(); - auto& roffsets = jsonb_path_column->get_offsets(); - bool is_invalid_json_path = false; // execute Impl @@ -382,36 +389,34 @@ public: std::is_same_v<typename Impl::ReturnType, DataTypeJsonb>) { auto& res_data = res->get_chars(); auto& res_offsets = res->get_offsets(); - if (col_const[0]) { - Impl::scalar_vector(context, jsonb_data_column->get_data_at(0), rdata, roffsets, - res_data, res_offsets, null_map->get_data(), - is_invalid_json_path); - } else if (col_const[1]) { - Impl::vector_scalar(context, ldata, loffsets, jsonb_path_column->get_data_at(0), - res_data, res_offsets, null_map->get_data(), - is_invalid_json_path); - } else { - Impl::vector_vector(context, ldata, loffsets, rdata, roffsets, res_data, - res_offsets, null_map->get_data(), is_invalid_json_path); + Status st = Impl::vector_vector_v2( + context, ldata, loffsets, jsonb_data_const, jsonb_path_columns, path_const, + res_data, res_offsets, null_map->get_data(), is_invalid_json_path); + if (st != Status::OK()) { + return st; } } else { - if (col_const[0]) { + // not support other extract type for now (e.g. int, double, ...) + DCHECK_EQ(jsonb_path_columns.size(), 1); + auto& rdata = jsonb_path_columns[0]->get_chars(); + auto& roffsets = jsonb_path_columns[0]->get_offsets(); + if (jsonb_data_const) { Impl::scalar_vector(context, jsonb_data_column->get_data_at(0), rdata, roffsets, res->get_data(), null_map->get_data(), is_invalid_json_path); - } else if (col_const[1]) { - Impl::vector_scalar(context, ldata, loffsets, jsonb_path_column->get_data_at(0), + } else if (path_const[0]) { + Impl::vector_scalar(context, ldata, loffsets, jsonb_path_columns[0]->get_data_at(0), res->get_data(), null_map->get_data(), is_invalid_json_path); } else { Impl::vector_vector(context, ldata, loffsets, rdata, roffsets, res->get_data(), null_map->get_data(), is_invalid_json_path); } - } - - if (is_invalid_json_path) { - return Status::InvalidArgument( - "Json path error: {} for value: {}", - JsonbErrMsg::getErrMsg(JsonbErrType::E_INVALID_JSON_PATH), - std::string_view(reinterpret_cast<const char*>(rdata.data()), rdata.size())); + if (is_invalid_json_path) { + return Status::InvalidArgument( + "Json path error: {} for value: {}", + JsonbErrMsg::getErrMsg(JsonbErrType::E_INVALID_JSON_PATH), + std::string_view(reinterpret_cast<const char*>(rdata.data()), + rdata.size())); + } } block.get_by_position(result).column = @@ -503,6 +508,77 @@ private: public: // for jsonb_extract_string + static Status vector_vector_v2( + FunctionContext* context, const ColumnString::Chars& ldata, + const ColumnString::Offsets& loffsets, const bool& json_data_const, + const std::vector<const ColumnString*>& rdata_columns, // here we can support more paths + const std::vector<bool>& path_const, ColumnString::Chars& res_data, + ColumnString::Offsets& res_offsets, NullMap& null_map, bool& is_invalid_json_path) { + size_t input_rows_count = json_data_const ? rdata_columns.size() : loffsets.size(); + res_offsets.resize(input_rows_count); + + auto writer = std::make_unique<JsonbWriter>(); + std::unique_ptr<JsonbToJson> formater; + + for (size_t i = 0; i < input_rows_count; ++i) { + if (null_map[i]) { + StringOP::push_null_string(i, res_data, res_offsets, null_map); + continue; + } + size_t l_off = loffsets[index_check_const(i, json_data_const) - 1]; + size_t l_size = loffsets[index_check_const(i, json_data_const)] - l_off; + const char* l_raw = reinterpret_cast<const char*>(&ldata[l_off]); + if (rdata_columns.size() == 1) { // just return origin value + const ColumnString* path_col = rdata_columns[0]; + const ColumnString::Chars& rdata = path_col->get_chars(); + const ColumnString::Offsets& roffsets = path_col->get_offsets(); + size_t r_off = roffsets[index_check_const(i, path_const[0]) - 1]; + size_t r_size = roffsets[index_check_const(i, path_const[0])] - r_off; + const char* r_raw = reinterpret_cast<const char*>(&rdata[r_off]); + inner_loop_impl(i, res_data, res_offsets, null_map, writer, formater, l_raw, l_size, + r_raw, r_size, is_invalid_json_path); + } else { // will make array string to user + writer->reset(); + writer->writeStartArray(); + for (size_t pi = 0; pi < rdata_columns.size(); ++pi) { + const ColumnString* path_col = rdata_columns[pi]; + const ColumnString::Chars& rdata = path_col->get_chars(); + const ColumnString::Offsets& roffsets = path_col->get_offsets(); + size_t r_off = roffsets[index_check_const(i, path_const[pi]) - 1]; + size_t r_size = roffsets[index_check_const(i, path_const[pi])] - r_off; + const char* r_raw = reinterpret_cast<const char*>(&rdata[r_off]); + // doc is NOT necessary to be deleted since JsonbDocument will not allocate memory + JsonbDocument* doc = JsonbDocument::createDocument(l_raw, l_size); + if (UNLIKELY(!doc || !doc->getValue())) { + writer->writeNull(); + continue; + } + // value is NOT necessary to be deleted since JsonbValue will not allocate memory + JsonbValue* value = + doc->getValue()->findPath(r_raw, r_size, is_invalid_json_path, nullptr); + // if not valid json path , should return error message to user + if (is_invalid_json_path) { + return Status::InvalidArgument( + "Json path error: {} for value: {}", + JsonbErrMsg::getErrMsg(JsonbErrType::E_INVALID_JSON_PATH), + std::string_view(reinterpret_cast<const char*>(rdata.data()), + rdata.size())); + } + if (UNLIKELY(!value)) { + writer->writeNull(); + } else { + writer->writeValue(value); + } + } + writer->writeEndArray(); + StringOP::push_value_string(std::string_view(writer->getOutput()->getBuffer(), + writer->getOutput()->getSize()), + i, res_data, res_offsets); + } + } //for + return Status::OK(); + } + static void vector_vector(FunctionContext* context, const ColumnString::Chars& ldata, const ColumnString::Offsets& loffsets, const ColumnString::Chars& rdata, diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index ed17bf524a..bf34a7cf52 100644 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -1690,11 +1690,11 @@ visible_functions = { [['jsonb_exists_path'], 'BOOLEAN', ['JSONB', 'VARCHAR'], ''], [['jsonb_exists_path'], 'BOOLEAN', ['JSONB', 'STRING'], ''], - [['jsonb_type'], 'STRING', ['JSONB', 'STRING'], 'ALWAYS_NULLABLE'], + [['jsonb_type'], 'STRING', ['JSONB', 'VARCHAR'], 'ALWAYS_NULLABLE'], [['jsonb_type'], 'STRING', ['JSONB', 'STRING'], 'ALWAYS_NULLABLE'], - [['jsonb_extract'], 'JSONB', ['JSONB', 'VARCHAR'], 'ALWAYS_NULLABLE'], - [['jsonb_extract'], 'JSONB', ['JSONB', 'STRING'], 'ALWAYS_NULLABLE'], + [['jsonb_extract'], 'JSONB', ['JSONB', 'VARCHAR', '...'], 'ALWAYS_NULLABLE'], + [['jsonb_extract'], 'JSONB', ['JSONB', 'STRING', '...'], 'ALWAYS_NULLABLE'], [['jsonb_extract_isnull'], 'BOOLEAN', ['JSONB', 'VARCHAR'], 'ALWAYS_NULLABLE'], [['jsonb_extract_isnull'], 'BOOLEAN', ['JSONB', 'STRING'], 'ALWAYS_NULLABLE'], [['jsonb_extract_bool'], 'BOOLEAN', ['JSONB', 'VARCHAR'], 'ALWAYS_NULLABLE'], @@ -1725,8 +1725,6 @@ visible_functions = { [['json_type'], 'STRING', ['JSONB', 'STRING'], 'ALWAYS_NULLABLE'], [['json_type'], 'STRING', ['JSONB', 'STRING'], 'ALWAYS_NULLABLE'], - [['jsonb_extract'], 'JSONB', ['JSONB', 'VARCHAR'], 'ALWAYS_NULLABLE'], - [['jsonb_extract'], 'JSONB', ['JSONB', 'STRING'], 'ALWAYS_NULLABLE'], [['json_extract_isnull'], 'BOOLEAN', ['JSONB', 'VARCHAR'], 'ALWAYS_NULLABLE'], [['json_extract_isnull'], 'BOOLEAN', ['JSONB', 'STRING'], 'ALWAYS_NULLABLE'], [['json_extract_bool'], 'BOOLEAN', ['JSONB', 'VARCHAR'], 'ALWAYS_NULLABLE'], diff --git a/regression-test/data/jsonb_p0/test_jsonb_load_and_function.out b/regression-test/data/jsonb_p0/test_jsonb_load_and_function.out index 00944828bd..e1c176273c 100644 --- a/regression-test/data/jsonb_p0/test_jsonb_load_and_function.out +++ b/regression-test/data/jsonb_p0/test_jsonb_load_and_function.out @@ -709,6 +709,29 @@ 27 {"k1":"v1","k2":200} \N 28 {"a.b.c":{"k1.a1":"v31","k2":300},"a":"niu"} \N +-- !jsonb_extract_multipath -- +1 \N \N +2 null [null,null,null,null] +3 true [true,null,null,null] +4 false [false,null,null,null] +5 100 [100,null,null,null] +6 10000 [10000,null,null,null] +7 1000000000 [1000000000,null,null,null] +8 1152921504606846976 [1152921504606846976,null,null,null] +9 6.18 [6.18,null,null,null] +10 "abcd" ["abcd",null,null,null] +11 {} [{},{},null,{}] +12 {"k1":"v31","k2":300} [{"k1":"v31","k2":300},{"k1":"v31","k2":300},"v31",{"k1":"v31","k2":300}] +13 [] [[],null,null,null] +14 [123,456] [[123,456],null,null,123] +15 ["abc","def"] [["abc","def"],null,null,"abc"] +16 [null,true,false,100,6.18,"abc"] [[null,true,false,100,6.18,"abc"],null,null,null] +17 [{"k1":"v41","k2":400},1,"a",3.14] [[{"k1":"v41","k2":400},1,"a",3.14],null,null,{"k1":"v41","k2":400}] +18 {"k1":"v31","k2":300,"a1":[{"k1":"v41","k2":400},1,"a",3.14]} [{"k1":"v31","k2":300,"a1":[{"k1":"v41","k2":400},1,"a",3.14]},{"k1":"v31","k2":300,"a1":[{"k1":"v41","k2":400},1,"a",3.14]},"v31",{"k1":"v31","k2":300,"a1":[{"k1":"v41","k2":400},1,"a",3.14]}] +26 \N \N +27 {"k1":"v1","k2":200} [{"k1":"v1","k2":200},{"k1":"v1","k2":200},"v1",{"k1":"v1","k2":200}] +28 {"a.b.c":{"k1.a1":"v31","k2":300},"a":"niu"} [{"a.b.c":{"k1.a1":"v31","k2":300},"a":"niu"},{"a.b.c":{"k1.a1":"v31","k2":300},"a":"niu"},null,{"a.b.c":{"k1.a1":"v31","k2":300},"a":"niu"}] + -- !jsonb_extract_string_select -- 1 \N \N 2 null null diff --git a/regression-test/data/jsonb_p0/test_jsonb_load_unique_key_and_function.out b/regression-test/data/jsonb_p0/test_jsonb_load_unique_key_and_function.out index 04946beaa1..632badf32e 100644 --- a/regression-test/data/jsonb_p0/test_jsonb_load_unique_key_and_function.out +++ b/regression-test/data/jsonb_p0/test_jsonb_load_unique_key_and_function.out @@ -709,6 +709,29 @@ 27 {"k1":"v1","k2":200} \N 28 {"a.b.c":{"k1.a1":"v31","k2":300},"a":"niu"} \N +-- !jsonb_extract_multipath -- +1 \N \N +2 null [null,null,null,null] +3 true [true,null,null,null] +4 false [false,null,null,null] +5 100 [100,null,null,null] +6 10000 [10000,null,null,null] +7 1000000000 [1000000000,null,null,null] +8 1152921504606846976 [1152921504606846976,null,null,null] +9 6.18 [6.18,null,null,null] +10 "abcd" ["abcd",null,null,null] +11 {} [{},{},null,{}] +12 {"k1":"v31","k2":300} [{"k1":"v31","k2":300},{"k1":"v31","k2":300},"v31",{"k1":"v31","k2":300}] +13 [] [[],null,null,null] +14 [123,456] [[123,456],null,null,123] +15 ["abc","def"] [["abc","def"],null,null,"abc"] +16 [null,true,false,100,6.18,"abc"] [[null,true,false,100,6.18,"abc"],null,null,null] +17 [{"k1":"v41","k2":400},1,"a",3.14] [[{"k1":"v41","k2":400},1,"a",3.14],null,null,{"k1":"v41","k2":400}] +18 {"k1":"v31","k2":300,"a1":[{"k1":"v41","k2":400},1,"a",3.14]} [{"k1":"v31","k2":300,"a1":[{"k1":"v41","k2":400},1,"a",3.14]},{"k1":"v31","k2":300,"a1":[{"k1":"v41","k2":400},1,"a",3.14]},"v31",{"k1":"v31","k2":300,"a1":[{"k1":"v41","k2":400},1,"a",3.14]}] +26 \N \N +27 {"k1":"v1","k2":200} [{"k1":"v1","k2":200},{"k1":"v1","k2":200},"v1",{"k1":"v1","k2":200}] +28 {"a.b.c":{"k1.a1":"v31","k2":300},"a":"niu"} [{"a.b.c":{"k1.a1":"v31","k2":300},"a":"niu"},{"a.b.c":{"k1.a1":"v31","k2":300},"a":"niu"},null,{"a.b.c":{"k1.a1":"v31","k2":300},"a":"niu"}] + -- !jsonb_extract_string_select -- 1 \N \N 2 null null diff --git a/regression-test/suites/jsonb_p0/test_jsonb_load_and_function.groovy b/regression-test/suites/jsonb_p0/test_jsonb_load_and_function.groovy index 6941f335c2..d58b991ef2 100644 --- a/regression-test/suites/jsonb_p0/test_jsonb_load_and_function.groovy +++ b/regression-test/suites/jsonb_p0/test_jsonb_load_and_function.groovy @@ -178,6 +178,9 @@ suite("test_jsonb_load_and_function", "p0") { qt_select "SELECT id, j, jsonb_extract(j, '\$.a1[-1]') FROM ${testTable} ORDER BY id" qt_select "SELECT id, j, jsonb_extract(j, '\$.a1[-10]') FROM ${testTable} ORDER BY id" + // jsonb_extract_multipath + qt_jsonb_extract_multipath "SELECT id, j, jsonb_extract(j, '\$', '\$.*', '\$.k1', '\$[0]') FROM ${testTable} ORDER BY id" + // jsonb_extract_string qt_jsonb_extract_string_select "SELECT id, j, jsonb_extract_string(j, '\$') FROM ${testTable} ORDER BY id" diff --git a/regression-test/suites/jsonb_p0/test_jsonb_load_unique_key_and_function.groovy b/regression-test/suites/jsonb_p0/test_jsonb_load_unique_key_and_function.groovy index eee42e3a69..5423ad33fc 100644 --- a/regression-test/suites/jsonb_p0/test_jsonb_load_unique_key_and_function.groovy +++ b/regression-test/suites/jsonb_p0/test_jsonb_load_unique_key_and_function.groovy @@ -171,6 +171,10 @@ suite("test_jsonb_unique_load_and_function", "p0") { qt_select "SELECT id, j, jsonb_extract(j, '\$.a1[-1]') FROM ${testTable} ORDER BY id" qt_select "SELECT id, j, jsonb_extract(j, '\$.a1[-10]') FROM ${testTable} ORDER BY id" + + // jsonb_extract_multipath + qt_jsonb_extract_multipath "SELECT id, j, jsonb_extract(j, '\$', '\$.*', '\$.k1', '\$[0]') FROM ${testTable} ORDER BY id" + // jsonb_extract_string qt_jsonb_extract_string_select "SELECT id, j, jsonb_extract_string(j, '\$') FROM ${testTable} ORDER BY id" --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org