HappenLee commented on code in PR #51300: URL: https://github.com/apache/doris/pull/51300#discussion_r2115202502
########## be/src/util/jsonb_parser_simd.h: ########## @@ -56,219 +56,171 @@ * and modified by Doris */ -#ifndef JSONB_JSONBJSONPARSERSIMD_H -#define JSONB_JSONBJSONPARSERSIMD_H - +#pragma once #include <simdjson.h> #include <cmath> #include <limits> +#include "common/status.h" #include "jsonb_document.h" -#include "jsonb_error.h" #include "jsonb_writer.h" #include "string_parser.hpp" namespace doris { - +#include "common/compile_check_begin.h" using int128_t = __int128; -class JsonbParserTSIMD { -public: - JsonbParserTSIMD() : err_(JsonbErrType::E_NONE) {} - - explicit JsonbParserTSIMD(JsonbOutStream& os) : writer_(os), err_(JsonbErrType::E_NONE) {} - - // parse a UTF-8 JSON string - bool parse(const std::string& str) { return parse(str.c_str(), str.size()); } - - // parse a UTF-8 JSON c-style string (NULL terminated) - bool parse(const char* c_str) { return parse(c_str, strlen(c_str)); } - +struct JsonbParser { // parse a UTF-8 JSON string with length - bool parse(const char* pch, size_t len) { - // reset state before parse - reset(); - + // will reset writer before parse + static Status parse(const char* pch, size_t len, JsonbWriter& writer) { if (!pch || len == 0) { - err_ = JsonbErrType::E_EMPTY_DOCUMENT; - VLOG_DEBUG << "empty json string"; - return false; + return Status::InternalError("Empty JSON document"); } - - // parse json using simdjson, return false on exception + writer.reset(); try { + simdjson::ondemand::parser simdjson_parser; simdjson::padded_string json_str {pch, len}; - simdjson::ondemand::document doc = parser_.iterate(json_str); + simdjson::ondemand::document doc = simdjson_parser.iterate(json_str); // simdjson process top level primitive types specially // so some repeated code here switch (doc.type()) { case simdjson::ondemand::json_type::object: case simdjson::ondemand::json_type::array: { - parse(doc.get_value()); + RETURN_IF_ERROR(parse(doc.get_value(), writer)); break; } case simdjson::ondemand::json_type::null: { - if (writer_.writeNull() == 0) { - err_ = JsonbErrType::E_OUTPUT_FAIL; - LOG(WARNING) << "writeNull failed"; + if (writer.writeNull() == 0) { + return Status::InternalError("writeNull failed"); } break; } case simdjson::ondemand::json_type::boolean: { - if (writer_.writeBool(doc.get_bool()) == 0) { - err_ = JsonbErrType::E_OUTPUT_FAIL; - LOG(WARNING) << "writeBool failed"; + if (writer.writeBool(doc.get_bool()) == 0) { + return Status::InternalError("writeBool failed"); } break; } case simdjson::ondemand::json_type::string: { - write_string(doc.get_string()); + RETURN_IF_ERROR(write_string(doc.get_string(), writer)); break; } case simdjson::ondemand::json_type::number: { - write_number(doc.get_number(), doc.raw_json_token()); + RETURN_IF_ERROR(write_number(doc.get_number(), doc.raw_json_token(), writer)); break; } } - - return err_ == JsonbErrType::E_NONE; } catch (simdjson::simdjson_error& e) { - err_ = JsonbErrType::E_EXCEPTION; - VLOG_DEBUG << "simdjson parse exception: " << e.what(); - return false; + return Status::InternalError(fmt::format("simdjson parse exception: {}", e.what())); } + return Status::OK(); } +private: // parse json, recursively if necessary, by simdjson // and serialize to binary format by writer - void parse(simdjson::ondemand::value value) { + static Status parse(simdjson::ondemand::value value, JsonbWriter& writer) { switch (value.type()) { case simdjson::ondemand::json_type::null: { - if (writer_.writeNull() == 0) { - err_ = JsonbErrType::E_OUTPUT_FAIL; - LOG(WARNING) << "writeNull failed"; + if (writer.writeNull() == 0) { + return Status::InternalError("writeNull failed"); } break; } case simdjson::ondemand::json_type::boolean: { - if (writer_.writeBool(value.get_bool()) == 0) { - err_ = JsonbErrType::E_OUTPUT_FAIL; - LOG(WARNING) << "writeBool failed"; + if (writer.writeBool(value.get_bool()) == 0) { + return Status::InternalError("writeBool failed"); } break; } case simdjson::ondemand::json_type::string: { - write_string(value.get_string()); + RETURN_IF_ERROR(write_string(value.get_string(), writer)); break; } case simdjson::ondemand::json_type::number: { - write_number(value.get_number(), value.raw_json_token()); + RETURN_IF_ERROR(write_number(value.get_number(), value.raw_json_token(), writer)); break; } case simdjson::ondemand::json_type::object: { - if (!writer_.writeStartObject()) { - err_ = JsonbErrType::E_OUTPUT_FAIL; - LOG(WARNING) << "writeStartObject failed"; - break; + if (!writer.writeStartObject()) { + return Status::InternalError("writeStartObject failed"); } for (auto kv : value.get_object()) { std::string_view key; simdjson::error_code e = kv.unescaped_key().get(key); if (e != simdjson::SUCCESS) { - err_ = JsonbErrType::E_INVALID_KEY_STRING; - LOG(WARNING) << "simdjson get key failed: " << e; - break; + return Status::InternalError(fmt::format("simdjson get key failed: {}", e)); } - if (writer_.writeKey(key.data(), key.size()) == 0) { - err_ = JsonbErrType::E_OUTPUT_FAIL; - LOG(WARNING) << "writeKey failed key: " << key; + // write key + if (key.size() > std::numeric_limits<uint8_t>::max()) { + return Status::InternalError("key size exceeds max limit: {} , {}", key.size(), + std::numeric_limits<uint8_t>::max()); + } + if (writer.writeKey(key.data(), (uint8_t)key.size()) == 0) { + return Status::InternalError("writeKey failed : {}", key); break; Review Comment: unless break -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org