This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new e486dd387b9 [fix](json) Add . after $ in JSON path to support correct token parsing (#52543) (#52744)
e486dd387b9 is described below
commit e486dd387b929e94174bb7903e8f58b1d4aaae42
Author: Jerry Hu <[email protected]>
AuthorDate: Fri Jul 4 15:13:43 2025 +0800
[fix](json) Add . after $ in JSON path to support correct token parsing (#52543) (#52744)
The Boost tokenizer requires an explicit "." after "$" to extract JSON path
tokens correctly. Without it, expressions like "$[0].key" cannot be split
properly, causing issues in downstream logic. This commit ensures a "." is
automatically added after "$" to keep token parsing consistent.
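For illustration only (not part of the patch), below is a minimal standalone sketch of the tokenization difference. It assumes the path parser splits on "." with "\" as the escape character and double quotes as the quote character, mirroring the `boost::escaped_list_separator` usage visible in the diff; the program itself is hypothetical.

```cpp
#include <boost/tokenizer.hpp>

#include <iostream>
#include <string>

// Standalone sketch (assumed separator configuration): split a JSON path on "."
// with "\" as the escape character and '"' as the quote character.
int main() {
    const boost::escaped_list_separator<char> sep("\\", ".", "\"");
    for (const std::string path : {"$[0].key", "$.[0].key"}) {
        boost::tokenizer<boost::escaped_list_separator<char>> tokens(path, sep);
        std::cout << path << " ->";
        for (const auto& token : tokens) {
            std::cout << " [" << token << "]";
        }
        std::cout << '\n';
        // "$[0].key"  -> [$[0]] [key]      (no standalone "$" root token)
        // "$.[0].key" -> [$] [[0]] [key]   (root token parsed as expected)
    }
    return 0;
}
```

With the first spelling the root marker never appears as its own token, which is why the fix rewrites "$[0]..." into "$.[0]..." before tokenizing.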
### What problem does this PR solve?
pick #52543
Issue Number: close #xxx
Related PR: #52543
Problem Summary:
### Release note
None
### Check List (For Author)
- Test <!-- At least one of them must be included. -->
- [ ] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [ ] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [ ] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [ ] No.
- [ ] Yes. <!-- Add document PR link here. eg: https://github.com/apache/doris-website/pull/1214 -->
### Check List (For Reviewer who merge this PR)
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR should merge into -->
---
be/src/vec/functions/function_json.cpp | 106 +++++++++------------
be/test/vec/function/function_json_test.cpp | 16 ++--
.../data/json_p0/test_json_load_and_function.out | Bin 261681 -> 260175 bytes
.../data/jsonb_p0/test_jsonb_load_and_function.out | Bin 189687 -> 188934 bytes
.../data/nereids_function_p0/scalar_function/J.out | Bin 160933 -> 160276 bytes
.../json_functions/test_json_extract.out | Bin 286 -> 332 bytes
.../json_functions/test_json_extract.groovy | 8 ++
7 files changed, 59 insertions(+), 71 deletions(-)
diff --git a/be/src/vec/functions/function_json.cpp b/be/src/vec/functions/function_json.cpp
index 6901dfb2b69..ab1ff616f2b 100644
--- a/be/src/vec/functions/function_json.cpp
+++ b/be/src/vec/functions/function_json.cpp
@@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.
+#include <glog/logging.h>
#include <rapidjson/allocators.h>
#include <rapidjson/document.h>
#include <rapidjson/encodings.h>
@@ -144,45 +145,7 @@ rapidjson::Value* match_value(const std::vector<JsonPath>& parsed_paths, rapidjs
const std::string& col = parsed_paths[i].key;
int index = parsed_paths[i].idx;
if (LIKELY(!col.empty())) {
- if (root->IsArray()) {
- array_obj = static_cast<rapidjson::Value*>(
- mem_allocator.Malloc(sizeof(rapidjson::Value)));
- array_obj->SetArray();
- bool is_null = true;
-
- // if array ,loop the array,find out all Objects,then find the results from the objects
- for (int j = 0; j < root->Size(); j++) {
- rapidjson::Value* json_elem = &((*root)[j]);
-
- if (json_elem->IsArray() || json_elem->IsNull()) {
- continue;
- } else {
- if (!json_elem->IsObject()) {
- continue;
- }
- if (!json_elem->HasMember(col.c_str())) {
- if (is_insert_null) { // not found item, then insert a null object.
- is_null = false;
- rapidjson::Value nullObject(rapidjson::kNullType);
- array_obj->PushBack(nullObject, mem_allocator);
- }
- continue;
- }
- rapidjson::Value* obj = &((*json_elem)[col.c_str()]);
- if (obj->IsArray()) {
- is_null = false;
- for (int k = 0; k < obj->Size(); k++) {
- array_obj->PushBack((*obj)[k], mem_allocator);
- }
- } else if (!obj->IsNull()) {
- is_null = false;
- array_obj->PushBack(*obj, mem_allocator);
- }
- }
- }
-
- root = is_null ? &(array_obj->SetNull()) : array_obj;
- } else if (root->IsObject()) {
+ if (root->IsObject()) {
if (!root->HasMember(col.c_str())) {
return nullptr;
} else {
@@ -233,8 +196,17 @@ rapidjson::Value* get_json_object(std::string_view json_string, std::string_view
//Cannot use '\' as the last character, return NULL
if (path_string.back() == '\\') {
- document->SetNull();
- return document;
+ return nullptr;
+ }
+
+ std::string fixed_string;
+ if (path_string.size() >= 2 && path_string[0] == '$' && path_string[1] != '.') {
+ // Boost tokenizer requires explicit "." after "$" to correctly extract JSON path tokens.
+ // Without this, expressions like "$[0].key" cannot be properly split.
+ // This commit ensures a "." is automatically added after "$" to maintain consistent token parsing behavior.
+ fixed_string = "$.";
+ fixed_string += path_string.substr(1);
+ path_string = fixed_string;
}
try {
@@ -251,13 +223,13 @@ rapidjson::Value* get_json_object(std::string_view json_string, std::string_view
}
} catch (boost::escaped_list_error&) {
// meet unknown escape sequence, example '$.name\k'
- return document;
+ return nullptr;
}
parsed_paths = &tmp_parsed_paths;
if (!(*parsed_paths)[0].is_valid) {
- return document;
+ return nullptr;
}
if (UNLIKELY((*parsed_paths).size() == 1)) {
@@ -272,8 +244,7 @@ rapidjson::Value* get_json_object(std::string_view json_string, std::string_view
if (UNLIKELY(document->HasParseError())) {
// VLOG_CRITICAL << "Error at offset " << document->GetErrorOffset() << ": "
// << GetParseError_En(document->GetParseError());
- document->SetNull();
- return document;
+ return nullptr;
}
return match_value(*parsed_paths, document, document->GetAllocator());
@@ -858,9 +829,10 @@ template <typename Name, bool remove_quotes>
struct FunctionJsonExtractImpl {
static constexpr auto name = Name::name;
- static rapidjson::Value parse_json(const ColumnString* json_col, const ColumnString* path_col,
- rapidjson::Document::AllocatorType& allocator, const int row,
- const int col, std::vector<bool>& column_is_consts) {
+ static std::pair<bool, rapidjson::Value> parse_json(
+ const ColumnString* json_col, const ColumnString* path_col,
+ rapidjson::Document::AllocatorType& allocator, const int row, const int col,
+ std::vector<bool>& column_is_consts) {
rapidjson::Value value;
rapidjson::Document document;
@@ -869,10 +841,13 @@ struct FunctionJsonExtractImpl {
const auto path = path_col->get_data_at(index_check_const(row, column_is_consts[col]));
std::string_view path_string(path.data, path.size);
auto* root = get_json_object<JSON_FUN_STRING>(json_string, path_string, &document);
+ bool found = false;
if (root != nullptr) {
+ found = true;
value.CopyFrom(*root, allocator);
}
- return value;
+
+ return {found, std::move(value)};
}
static rapidjson::Value* get_document(const ColumnString* path_col,
@@ -913,8 +888,9 @@ struct FunctionJsonExtractImpl {
rapidjson::StringBuffer buf;
rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
const auto* json_col = data_columns[0];
- auto insert_result_lambda = [&](rapidjson::Value& value, int row) {
- if (value.IsNull()) {
+
+ auto insert_result_lambda = [&](rapidjson::Value& value, bool is_null, int row) {
+ if (is_null) {
null_map[row] = 1;
result_column.insert_default();
} else {
@@ -935,12 +911,13 @@ struct FunctionJsonExtractImpl {
}
};
if (data_columns.size() == 2) {
- rapidjson::Value value;
if (column_is_consts[1]) {
std::vector<JsonPath> parsed_paths;
auto* root = get_document(data_columns[1], &document, parsed_paths, 0,
column_is_consts[1]);
for (size_t row = 0; row < input_rows_count; row++) {
+ bool is_null = false;
+ rapidjson::Value value;
if (root != nullptr) {
const auto& obj = json_col->get_data_at(row);
std::string_view json_string(obj.data, obj.size);
@@ -957,17 +934,18 @@ struct FunctionJsonExtractImpl {
if (root_val != nullptr) {
value.CopyFrom(*root_val, allocator);
} else {
- rapidjson::Value tmp;
- value.Swap(tmp);
+ is_null = true;
}
+ } else {
+ is_null = true;
}
- insert_result_lambda(value, row);
+ insert_result_lambda(value, is_null, row);
}
} else {
for (size_t row = 0; row < input_rows_count; row++) {
- value = parse_json(json_col, data_columns[1], allocator, row, 1,
- column_is_consts);
- insert_result_lambda(value, row);
+ auto result = parse_json(json_col, data_columns[1], allocator, row, 1,
+ column_is_consts);
+ insert_result_lambda(result.second, !result.first, row);
}
}
@@ -977,12 +955,16 @@ struct FunctionJsonExtractImpl {
value.Reserve(data_columns.size() - 1, allocator);
for (size_t row = 0; row < input_rows_count; row++) {
value.Clear();
+ bool found_any = false;
for (size_t col = 1; col < data_columns.size(); ++col) {
- value.PushBack(parse_json(json_col, data_columns[col], allocator, row, col,
- column_is_consts),
- allocator);
+ auto result = parse_json(json_col, data_columns[col], allocator, row, col,
+ column_is_consts);
+ if (result.first) {
+ found_any = true;
+ value.PushBack(std::move(result.second), allocator);
+ }
}
- insert_result_lambda(value, row);
+ insert_result_lambda(value, !found_any, row);
}
}
}
diff --git a/be/test/vec/function/function_json_test.cpp b/be/test/vec/function/function_json_test.cpp
index 988d4ca731a..8f28a474958 100644
--- a/be/test/vec/function/function_json_test.cpp
+++ b/be/test/vec/function/function_json_test.cpp
@@ -72,15 +72,13 @@ TEST(FunctionJsonTEST, GetJsonStringTest) {
std::string func_name = "get_json_string";
InputTypeSet input_types = {TypeIndex::String, TypeIndex::String};
DataSet data_set = {
- {{VARCHAR("{\"k1\":\"v1\", \"k2\":\"v2\"}"), VARCHAR("$.k1")},
VARCHAR("v1")},
- {{VARCHAR("{\"k1\":\"v1\", \"my.key\":[\"e1\", \"e2\", \"e3\"]}"),
- VARCHAR("$.\"my.key\"[1]")},
+ {{VARCHAR(R"({"k1":"v1", "k2":"v2"})"), VARCHAR("$.k1")},
VARCHAR("v1")},
+ {{VARCHAR(R"({"k1":"v1", "my.key":["e1", "e2", "e3"]})"),
VARCHAR("$.\"my.key\"[1]")},
VARCHAR("e2")},
- {{VARCHAR("{\"k1.key\":{\"k2\":[\"v1\", \"v2\"]}}"),
VARCHAR("$.\"k1.key\".k2[0]")},
+ {{VARCHAR(R"({"k1.key":{"k2":["v1", "v2"]}})"),
VARCHAR("$.\"k1.key\".k2[0]")},
VARCHAR("v1")},
- {{VARCHAR("[{\"k1\":\"v1\"}, {\"k2\":\"v2\"}, {\"k1\":\"v3\"},
{\"k1\":\"v4\"}]"),
- VARCHAR("$.k1")},
- VARCHAR("[\"v1\",\"v3\",\"v4\"]")}};
+ {{VARCHAR(R"([{"k1":"v1"}, {"k2":"v2"}, {"k1":"v3"},
{"k1":"v4"}])"), VARCHAR("$.k1")},
+ Null()}};
static_cast<void>(check_function<DataTypeString, true>(func_name, input_types, data_set));
}
@@ -93,7 +91,7 @@ TEST(FunctionJsonTEST, JsonExtractTest) {
// json_extract root
DataSet data_set = {
{{Null(), STRING("$")}, Null()},
- {{STRING("null"), STRING("$")}, Null()},
+ {{STRING("null"), STRING("$")}, STRING("null")},
{{STRING("true"), STRING("$")}, STRING("true")},
{{STRING("false"), STRING("$")}, STRING("false")},
{{STRING("100"), STRING("$")}, STRING("100")},
//int8
@@ -127,7 +125,7 @@ TEST(FunctionJsonTEST, JsonExtractTest) {
data_set = {
{{Null(), STRING("$")}, Null()},
- {{STRING("null"), STRING("$")}, Null()},
+ {{STRING("null"), STRING("$")}, STRING("null")},
{{STRING("true"), STRING("$")}, STRING("true")},
{{STRING("false"), STRING("$")}, STRING("false")},
{{STRING("100"), STRING("$")}, STRING("100")},
//int8
diff --git a/regression-test/data/json_p0/test_json_load_and_function.out b/regression-test/data/json_p0/test_json_load_and_function.out
index de25adfef2e..b6098cfbf64 100644
Binary files a/regression-test/data/json_p0/test_json_load_and_function.out and b/regression-test/data/json_p0/test_json_load_and_function.out differ
diff --git a/regression-test/data/jsonb_p0/test_jsonb_load_and_function.out b/regression-test/data/jsonb_p0/test_jsonb_load_and_function.out
index dbd756dab9d..d7d7611931f 100644
Binary files a/regression-test/data/jsonb_p0/test_jsonb_load_and_function.out and b/regression-test/data/jsonb_p0/test_jsonb_load_and_function.out differ
diff --git a/regression-test/data/nereids_function_p0/scalar_function/J.out b/regression-test/data/nereids_function_p0/scalar_function/J.out
index 64a76e24826..dc1c34eafe6 100644
Binary files a/regression-test/data/nereids_function_p0/scalar_function/J.out and b/regression-test/data/nereids_function_p0/scalar_function/J.out differ
diff --git a/regression-test/data/query_p0/sql_functions/json_functions/test_json_extract.out b/regression-test/data/query_p0/sql_functions/json_functions/test_json_extract.out
index 2b64e92fd3e..822470fb273 100644
Binary files a/regression-test/data/query_p0/sql_functions/json_functions/test_json_extract.out and b/regression-test/data/query_p0/sql_functions/json_functions/test_json_extract.out differ
diff --git a/regression-test/suites/query_p0/sql_functions/json_functions/test_json_extract.groovy b/regression-test/suites/query_p0/sql_functions/json_functions/test_json_extract.groovy
index 41e68111d79..b728f7ee517 100644
--- a/regression-test/suites/query_p0/sql_functions/json_functions/test_json_extract.groovy
+++ b/regression-test/suites/query_p0/sql_functions/json_functions/test_json_extract.groovy
@@ -28,4 +28,12 @@ suite("test_json_extract") {
sql """ SELECT JSON_EXTRACT_STRING('{"id": 123, "name": "doris"}',
'\$.'); """
exception "Invalid Json Path for value: \$."
}
+
+ qt_fix_array_path """
+ select
+ JSON_EXTRACT('[{"key": [123]}]', '\$[0].key') v1
+ , JSON_EXTRACT('[{"key": [123]}]', '\$.[0].key') v2
+ , JSONB_EXTRACT('[{"key": [123]}]', '\$[0].key') v3
+ , JSONB_EXTRACT('[{"key": [123]}]', '\$.[0].key') v4;
+ """
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]