This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new ef2f765e432 [Bug](json reader) object should stop processing when encounter error (#31159) ef2f765e432 is described below commit ef2f765e4324035242525deaa8321bfb71fa7f33 Author: lihangyu <15605149...@163.com> AuthorDate: Wed Feb 21 10:13:43 2024 +0800 [Bug](json reader) object should stop processing when encounter error (#31159) If a DATA_QUALITY_ERROR is encountered, we should stop processing the current document. Otherwise there will be undefined behavior (UB) in simdjson. --- be/src/exprs/json_functions.cpp | 2 +- be/src/vec/exec/format/json/new_json_reader.cpp | 6 ++-- .../data/load_p0/stream_load/test_json_load.out | 4 +++ .../stream_load/test_malformed_json_with_path.json | 3 ++ .../load_p0/stream_load/test_json_load.groovy | 33 ++++++++++++++++++++-- 5 files changed, 42 insertions(+), 6 deletions(-) diff --git a/be/src/exprs/json_functions.cpp b/be/src/exprs/json_functions.cpp index ff432c4655a..29c1596ed8f 100644 --- a/be/src/exprs/json_functions.cpp +++ b/be/src/exprs/json_functions.cpp @@ -261,7 +261,7 @@ Status JsonFunctions::extract_from_object(simdjson::ondemand::object& obj, const std::string& _msg = msg; \ if (UNLIKELY(_err)) { \ if (_err == simdjson::NO_SUCH_FIELD || _err == simdjson::INDEX_OUT_OF_BOUNDS) { \ - return Status::DataQualityError( \ + return Status::NotFound<false>( \ fmt::format("Not found target filed, err: {}, msg: {}", \ simdjson::error_message(_err), _msg)); \ } \ diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp index 97affdcd0bb..514a925cba4 100644 --- a/be/src/vec/exec/format/json/new_json_reader.cpp +++ b/be/src/vec/exec/format/json/new_json_reader.cpp @@ -1199,7 +1199,7 @@ Status NewJsonReader::_simdjson_handle_flat_array_complex_json_write_columns( simdjson::ondemand::value val; Status st = JsonFunctions::extract_from_object(cur, _parsed_json_root, &val); if (UNLIKELY(!st.ok())) { - if (st.is<DATA_QUALITY_ERROR>()) { + 
if (st.is<NOT_FOUND>()) { RETURN_IF_ERROR(_append_error_msg(nullptr, st.to_string(), "", nullptr)); ADVANCE_ROW(); continue; @@ -1630,11 +1630,11 @@ Status NewJsonReader::_simdjson_write_columns_by_jsonpath( Status st; if (i < _parsed_jsonpaths.size()) { st = JsonFunctions::extract_from_object(*value, _parsed_jsonpaths[i], &json_value); - if (!st.ok() && !st.is<DATA_QUALITY_ERROR>()) { + if (!st.ok() && !st.is<NOT_FOUND>()) { return st; } } - if (i >= _parsed_jsonpaths.size() || st.is<DATA_QUALITY_ERROR>()) { + if (i >= _parsed_jsonpaths.size() || st.is<NOT_FOUND>()) { // not match in jsondata, filling with default value RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid)); if (!(*valid)) { diff --git a/regression-test/data/load_p0/stream_load/test_json_load.out b/regression-test/data/load_p0/stream_load/test_json_load.out index b9250608475..7351891633d 100644 --- a/regression-test/data/load_p0/stream_load/test_json_load.out +++ b/regression-test/data/load_p0/stream_load/test_json_load.out @@ -241,3 +241,7 @@ John 30 New York {"email":"j...@example.com","phone":"+1-123-456-7890"} 100 2345676 200 755 +-- !select26 -- +android \N \N \N \N \N +android \N \N \N \N \N + diff --git a/regression-test/data/load_p0/stream_load/test_malformed_json_with_path.json b/regression-test/data/load_p0/stream_load/test_malformed_json_with_path.json new file mode 100644 index 00000000000..f87ebaa5d3b --- /dev/null +++ b/regression-test/data/load_p0/stream_load/test_malformed_json_with_path.json @@ -0,0 +1,3 @@ +{"app_version":"v1.0.0","app_package":"com.fdf.listen","subject":"USER","ip":"45334","platform":"android","app_name":"听听","pro_brand":"图书","report_time":0,"user_id":"unknown","platform_ID":"1","action":"CLICK","event_name":"section_play","phone_num":"45645642692","pro_code":"unknown","event_value":"device_id":"gikj78675678","media_id":"67867","album_id":"1734","duration":"60","event_time":1706841911773,"object":"play_content"} 
+{"app_version":"v1.0.0","app_package":"com.fdf.listen","subject":"USER","ip":"45334","platform":"android","app_name":"听听","pro_brand":"图书","report_time":0,"user_id":"unknown","platform_ID":"1","action":"CLICK","event_name":"section_play","phone_num":"45645642692","pro_code":"unknown","device_id":"gikj78675678","media_id":"67867","album_id":"1734","duration":"60","event_time":1706841911773,"object":"play_content"} +{"app_version":"v1.0.0","app_package":"com.fdf.listen","subject":"USER","ip":"45334","platform":"android","app_name":"听听","pro_brand":"图书","report_time":0,"user_id":"unknown","platform_ID":"1","action":"CLICK","event_name":"section_play","phone_num":"45645642692","pro_code":"unknown","device_id":"gikj78675678","syscode":123, "media_id":"67867","album_id":"1734","duration":"60","event_time":1706841911773,"object":"play_content"} \ No newline at end of file diff --git a/regression-test/suites/load_p0/stream_load/test_json_load.groovy b/regression-test/suites/load_p0/stream_load/test_json_load.groovy index 816765b1233..f41610f3ba2 100644 --- a/regression-test/suites/load_p0/stream_load/test_json_load.groovy +++ b/regression-test/suites/load_p0/stream_load/test_json_load.groovy @@ -727,6 +727,35 @@ suite("test_json_load", "p0") { try_sql("DROP TABLE IF EXISTS ${testTable}") } + // case27: import json with malformed json along with json path + try { + sql "DROP TABLE IF EXISTS ${testTable}" + + sql """CREATE TABLE IF NOT EXISTS ${testTable} + ( + `syscode` VARCHAR(20) NOT NULL COMMENT "", + `event_dt` DateTime NULL COMMENT "", + `pro_brand` VARCHAR(20) COMMENT "", + `app_package` VARCHAR(50) COMMENT "", + `platform` VARCHAR(20) COMMENT "", + `log_num` BIGINT DEFAULT "0" COMMENT "" + ) + DUPLICATE KEY(`syscode`, `event_dt`,`pro_brand`,`app_package`,`platform`) + COMMENT '' + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + );""" + + load_json_data.call("${testTable}", "${testTable}_case27_1", 'false', 
'true', 'json', 'id= id * 10', '[\"$.platform\",\"$.app_package\",\"$.sysCode\",\"$.sys_code\",\"$.proBrand\",\"$.pro_brand\",\"$.event_time\"]', + '', '', '', 'test_malformed_json_with_path.json', false, 2) + sql "sync" + qt_select26 "select * from ${testTable}" + + } finally { + try_sql("DROP TABLE IF EXISTS ${testTable}") + } + // test jsonpaths error try { sql "DROP TABLE IF EXISTS ${testTable}" @@ -734,7 +763,7 @@ suite("test_json_load", "p0") { create_json_test_table.call(testTable) streamLoad { table "${testTable}" - set 'jsonpaths', '[\"Name\", \"Age\", \"Agent_id\"]' + set 'jsonpaths', '[\"$.Name\", \"$.Age\", \"$.Agent_id\"]' set 'format', 'json' file 'test_json_error.json' // import json file time 10000 // limit inflight 10s @@ -754,7 +783,7 @@ suite("test_json_load", "p0") { def code = process.waitFor() def out = process.text log.info("result: ${out}".toString()) - def reason = "Reason: There is no column matching jsonpaths in the json file, columns:[name, age, agent_id, ], please check columns and jsonpaths:[\"Name\", \"Age\", \"Agent_id\"]. src line [{\"name\":\"Name1\",\"age\":21,\"agent_id\":\"1\"}]; \n" + def reason = "Reason: There is no column matching jsonpaths in the json file, columns:[name, age, agent_id, ], please check columns and jsonpaths:[\"\$.Name\", \"\$.Age\", \"\$.Agent_id\"]. src line [{\"name\":\"Name1\",\"age\":21,\"agent_id\":\"1\"}]; \n" assertEquals("${reason}", "${out}") } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org