xy720 commented on code in PR #8599:
URL: https://github.com/apache/incubator-doris/pull/8599#discussion_r858660713


##########
be/src/exec/json_scanner.cpp:
##########
@@ -360,6 +365,79 @@ void JsonReader::_close() {
     _closed = true;
 }
 
+Status JsonReader::_simdjson_parse_json_doc(size_t* size, bool* eof) {
+    // read a whole message
+    SCOPED_TIMER(_file_read_timer);
+    const uint8_t* json_str = nullptr;
+    std::unique_ptr<uint8_t[]> json_str_ptr;
+    if (_line_reader != nullptr) {
+        RETURN_IF_ERROR(_line_reader->read_line(&json_str, size, eof));
+    } else {
+        int64_t length = 0;
+        RETURN_IF_ERROR(_file_reader->read_one_message(&json_str_ptr, 
&length));
+        json_str = json_str_ptr.get();
+        *size = length;
+        if (length == 0) {
+            *eof = true;
+        }
+    }
+
+    _bytes_read_counter += *size;
+    if (*eof) {
+        return Status::OK();
+    }
+    _json_doc = nullptr;
+
+    auto document = _parser.parse((char*)json_str, *size);
+    if (document.error() != 0) {
+        LOG(INFO) << "json_str: " << std::string((char*)json_str, *size) << ", 
error code: " << document.error();
+        fmt::memory_buffer error_msg;
+        fmt::format_to(error_msg, "Simdjson Parse json data for JsonDoc 
failed. code: {}", document.error());
+        RETURN_IF_ERROR(_state->append_error_msg_to_file([&]() -> std::string 
{ return std::string((char*)json_str, *size); },
+                [&]() -> std::string { return error_msg.data(); }, 
_scanner_eof));
+        _counter->num_rows_filtered++;
+        if (*_scanner_eof) {
+            // Case A: if _scanner_eof is set to true in 
"append_error_msg_to_file", which means
+            // we meet enough invalid rows and the scanner should be stopped.
+            // So we set eof to true and return OK, the caller will stop the 
process as we meet the end of file.
+            *eof = true;
+            return Status::OK();
+        }
+        return Status::DataQualityError(error_msg.data());
+    }
+    _element = document.value_unsafe();
+
+    if (_element.is_array() && !_strip_outer_array) {
+        fmt::memory_buffer error_msg;
+        fmt::format_to(error_msg, "{}", "JSON data is array-object, 
`strip_outer_array` must be TRUE.");
+        RETURN_IF_ERROR(_state->append_error_msg_to_file([&]() -> std::string 
{ return _print_json_value(_origin_json_doc); },

Review Comment:
   replace with simdjson's _print_json_value method.



##########
be/src/exec/json_scanner.cpp:
##########
@@ -360,6 +365,79 @@ void JsonReader::_close() {
     _closed = true;
 }
 
+Status JsonReader::_simdjson_parse_json_doc(size_t* size, bool* eof) {
+    // read a whole message
+    SCOPED_TIMER(_file_read_timer);
+    const uint8_t* json_str = nullptr;
+    std::unique_ptr<uint8_t[]> json_str_ptr;
+    if (_line_reader != nullptr) {
+        RETURN_IF_ERROR(_line_reader->read_line(&json_str, size, eof));
+    } else {
+        int64_t length = 0;
+        RETURN_IF_ERROR(_file_reader->read_one_message(&json_str_ptr, 
&length));
+        json_str = json_str_ptr.get();
+        *size = length;
+        if (length == 0) {
+            *eof = true;
+        }
+    }
+
+    _bytes_read_counter += *size;
+    if (*eof) {
+        return Status::OK();
+    }
+    _json_doc = nullptr;

Review Comment:
   `_json_doc` is a rapidjson object; it is unused here.



##########
be/src/exec/json_scanner.cpp:
##########
@@ -360,6 +365,79 @@ void JsonReader::_close() {
     _closed = true;
 }
 
+Status JsonReader::_simdjson_parse_json_doc(size_t* size, bool* eof) {
+    // read a whole message
+    SCOPED_TIMER(_file_read_timer);
+    const uint8_t* json_str = nullptr;
+    std::unique_ptr<uint8_t[]> json_str_ptr;
+    if (_line_reader != nullptr) {
+        RETURN_IF_ERROR(_line_reader->read_line(&json_str, size, eof));
+    } else {
+        int64_t length = 0;
+        RETURN_IF_ERROR(_file_reader->read_one_message(&json_str_ptr, 
&length));
+        json_str = json_str_ptr.get();
+        *size = length;
+        if (length == 0) {
+            *eof = true;
+        }
+    }
+
+    _bytes_read_counter += *size;
+    if (*eof) {
+        return Status::OK();
+    }
+    _json_doc = nullptr;
+
+    auto document = _parser.parse((char*)json_str, *size);
+    if (document.error() != 0) {
+        LOG(INFO) << "json_str: " << std::string((char*)json_str, *size) << ", 
error code: " << document.error();
+        fmt::memory_buffer error_msg;
+        fmt::format_to(error_msg, "Simdjson Parse json data for JsonDoc 
failed. code: {}", document.error());
+        RETURN_IF_ERROR(_state->append_error_msg_to_file([&]() -> std::string 
{ return std::string((char*)json_str, *size); },
+                [&]() -> std::string { return error_msg.data(); }, 
_scanner_eof));
+        _counter->num_rows_filtered++;
+        if (*_scanner_eof) {
+            // Case A: if _scanner_eof is set to true in 
"append_error_msg_to_file", which means
+            // we meet enough invalid rows and the scanner should be stopped.
+            // So we set eof to true and return OK, the caller will stop the 
process as we meet the end of file.
+            *eof = true;
+            return Status::OK();
+        }
+        return Status::DataQualityError(error_msg.data());
+    }
+    _element = document.value_unsafe();

Review Comment:
   Why is there no need to set the json root first, in case the user has 
specified a json root?



##########
be/src/exec/json_scanner.cpp:
##########
@@ -360,6 +365,79 @@ void JsonReader::_close() {
     _closed = true;
 }
 
+Status JsonReader::_simdjson_parse_json_doc(size_t* size, bool* eof) {
+    // read a whole message
+    SCOPED_TIMER(_file_read_timer);
+    const uint8_t* json_str = nullptr;
+    std::unique_ptr<uint8_t[]> json_str_ptr;
+    if (_line_reader != nullptr) {
+        RETURN_IF_ERROR(_line_reader->read_line(&json_str, size, eof));
+    } else {
+        int64_t length = 0;
+        RETURN_IF_ERROR(_file_reader->read_one_message(&json_str_ptr, 
&length));
+        json_str = json_str_ptr.get();
+        *size = length;
+        if (length == 0) {
+            *eof = true;
+        }
+    }
+
+    _bytes_read_counter += *size;
+    if (*eof) {
+        return Status::OK();
+    }
+    _json_doc = nullptr;
+
+    auto document = _parser.parse((char*)json_str, *size);
+    if (document.error() != 0) {
+        LOG(INFO) << "json_str: " << std::string((char*)json_str, *size) << ", 
error code: " << document.error();
+        fmt::memory_buffer error_msg;
+        fmt::format_to(error_msg, "Simdjson Parse json data for JsonDoc 
failed. code: {}", document.error());
+        RETURN_IF_ERROR(_state->append_error_msg_to_file([&]() -> std::string 
{ return std::string((char*)json_str, *size); },
+                [&]() -> std::string { return error_msg.data(); }, 
_scanner_eof));
+        _counter->num_rows_filtered++;
+        if (*_scanner_eof) {
+            // Case A: if _scanner_eof is set to true in 
"append_error_msg_to_file", which means
+            // we meet enough invalid rows and the scanner should be stopped.
+            // So we set eof to true and return OK, the caller will stop the 
process as we meet the end of file.
+            *eof = true;
+            return Status::OK();
+        }
+        return Status::DataQualityError(error_msg.data());
+    }
+    _element = document.value_unsafe();
+
+    if (_element.is_array() && !_strip_outer_array) {
+        fmt::memory_buffer error_msg;
+        fmt::format_to(error_msg, "{}", "JSON data is array-object, 
`strip_outer_array` must be TRUE.");
+        RETURN_IF_ERROR(_state->append_error_msg_to_file([&]() -> std::string 
{ return _print_json_value(_origin_json_doc); },
+                [&]() -> std::string { return error_msg.data(); }, 
_scanner_eof));
+        _counter->num_rows_filtered++;
+        if (*_scanner_eof) {
+            // Same as Case A
+            *eof = true;
+            return Status::OK();
+        }
+        return Status::DataQualityError(error_msg.data());
+    }
+
+    if (!_element.is_array() && _strip_outer_array) {
+        fmt::memory_buffer error_msg;
+        fmt::format_to(error_msg, "{}", "JSON data is not an array-object, 
`strip_outer_array` must be FALSE.");
+        RETURN_IF_ERROR(_state->append_error_msg_to_file([&]() -> std::string 
{ return _print_json_value(_origin_json_doc); },

Review Comment:
   replace with simdjson's _print_json_value method.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to