This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch branch-1.2-lts in repository https://gitbox.apache.org/repos/asf/doris.git
commit b204d46615608116e645a5f1c060a6528a2a005f Author: Kang <kxiao.ti...@gmail.com> AuthorDate: Thu Dec 29 09:29:09 2022 +0800 [Improvement](JSONB) improve performance JSONB initial json parsing using simdjson (#15219) test data: https://data.gharchive.org/2020-11-13-18.json.gz, 2GB, 197696 lines before: String 13s vs. JSONB 28s after: String 13s vs. JSONB 16s **NOTICE: simdjson need to be patched since BOOL is conflicted with a macro BOOL defined in odbc sqltypes.h** --- .licenserc.yaml | 1 + be/CMakeLists.txt | 2 + be/src/runtime/jsonb_value.h | 4 +- be/src/util/jsonb_error.h | 10 +- be/src/util/jsonb_parser_simd.h | 350 ++++++++++++++++++++++++++++++++ be/src/vec/functions/function_jsonb.cpp | 9 +- 6 files changed, 369 insertions(+), 7 deletions(-) diff --git a/.licenserc.yaml b/.licenserc.yaml index d458e45269..020ee7b4e8 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -52,6 +52,7 @@ header: - "be/src/util/jsonb_document.h" - "be/src/util/jsonb_error.h" - "be/src/util/jsonb_parser.h" + - "be/src/util/jsonb_parser_simd.h" - "be/src/util/jsonb_stream.h" - "be/src/util/jsonb_updater.h" - "be/src/util/jsonb_utils.h" diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index cf51ddc492..915c207189 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -492,6 +492,8 @@ if ("${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86" OR "${CMAKE_BUILD_TARGET_ARCH}" if (USE_AVX2) set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mavx2") endif() + # set -mlzcnt for leading zero count used by simdjson + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -msse4.2") endif() set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-attributes -DS2_USE_GFLAGS -DS2_USE_GLOG") diff --git a/be/src/runtime/jsonb_value.h b/be/src/runtime/jsonb_value.h index bdb5ba4976..1e03518d9d 100644 --- a/be/src/runtime/jsonb_value.h +++ b/be/src/runtime/jsonb_value.h @@ -22,7 +22,7 @@ #include "util/cpu_info.h" #include "util/hash_util.hpp" #include "util/jsonb_error.h" -#include "util/jsonb_parser.h" +#include "util/jsonb_parser_simd.h" #include "util/jsonb_utils.h" #include "vec/common/string_ref.h" @@ -38,7 +38,7 @@ struct JsonBinaryValue { // default nullprt and size 0 for invalid or NULL value const char* ptr = nullptr; size_t len = 0; - JsonbParser parser; + JsonbParserSIMD parser; JsonBinaryValue() : ptr(nullptr), len(0) {} JsonBinaryValue(char* ptr, int len) { from_json_string(const_cast<const char*>(ptr), len); } diff --git a/be/src/util/jsonb_error.h b/be/src/util/jsonb_error.h index 77d6fa16d0..2ad632fb8b 100644 --- a/be/src/util/jsonb_error.h +++ b/be/src/util/jsonb_error.h @@ -30,12 +30,14 @@ enum class JsonbErrType { E_EMPTY_DOCUMENT, E_OUTPUT_FAIL, E_INVALID_DOCU, + E_INVALID_TYPE, E_INVALID_SCALAR_VALUE, E_INVALID_KEY_STRING, E_INVALID_KEY_LENGTH, E_INVALID_STR, E_INVALID_OBJ, E_INVALID_ARR, + E_INVALID_NUMBER, E_INVALID_HEX, E_INVALID_OCTAL, E_INVALID_DECIMAL, @@ -53,6 +55,7 @@ enum class JsonbErrType { E_INVALID_JSONB_OBJ, E_NESTING_LVL_OVERFLOW, E_INVALID_DOCU_COMPAT, + E_EXCEPTION, // new error code should always be added above E_NUM_ERRORS @@ -77,13 +80,15 @@ private: "Invalid document version", "Empty document", "Fatal error in writing JSONB", - "Invalid document: document must be an object or an array", + "Invalid document", + "Invalid json value type", "Invalid scalar value", "Invalid key string", "Key length exceeds maximum size allowed (64 bytes)", "Invalid string value", "Invalid JSON object", "Invalid JSON array", + "Invalid number", "Invalid HEX number", "Invalid octal number", "Invalid decimal number", @@ -100,7 +105,8 @@ private: "Invalid update operation", "Invalid JSONB object (internal)", "Object or array has too many nesting levels", - "Invalid document: document must be an object or an array", + "Invalid document", + "Exception throwed", nullptr /* E_NUM_ERRORS */ }; diff --git a/be/src/util/jsonb_parser_simd.h b/be/src/util/jsonb_parser_simd.h new file mode 100644 index 0000000000..10d19a3f57 --- /dev/null +++ b/be/src/util/jsonb_parser_simd.h @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2014, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. An additional grant + * of patent rights can be found in the PATENTS file in the same directory. + * + */ + +/* + * This file defines JsonbParserTSIMD (template) and JsonbParser. + * + * JsonbParserTSIMD is a template class which implements a JSON parser. + * JsonbParserTSIMD parses JSON text, and serialize it to JSONB binary format + * by using JsonbWriterT object. By default, JsonbParserTSIMD creates a new + * JsonbWriterT object with an output stream object. However, you can also + * pass in your JsonbWriterT or any stream object that implements some basic + * interface of std::ostream (see JsonbStream.h). + * + * JsonbParser specializes JsonbParserTSIMD with JsonbOutStream type (see + * JsonbStream.h). So unless you want to provide own a different output stream + * type, use JsonbParser object. + * + * ** Parsing JSON ** + * JsonbParserTSIMD parses JSON string, and directly serializes into JSONB + * packed bytes. There are three ways to parse a JSON string: (1) using + * c-string, (2) using string with len, (3) using std::istream object. You can + * use custom streambuf to redirect output. JsonbOutBuffer is a streambuf used + * internally if the input is raw character buffer. + * + * You can reuse an JsonbParserTSIMD object to parse/serialize multiple JSON + * strings, and the previous JSONB will be overwritten. + * + * If parsing fails (returned false), the error code will be set to one of + * JsonbErrType, and can be retrieved by calling getErrorCode(). + * + * ** External dictionary ** + * During parsing a JSON string, you can pass a call-back function to map a key + * string to an id, and store the dictionary id in JSONB to save space. The + * purpose of using an external dictionary is more towards a collection of + * documents (which has common keys) rather than a single document, so that + * space saving will be significant. + * + * ** Endianness ** + * Note: JSONB serialization doesn't assume endianness of the server. However + * you will need to ensure that the endianness at the reader side is the same + * as that at the writer side (if they are on different machines). Otherwise, + * proper conversion is needed when a number value is returned to the + * caller/writer. + * + * @author Tian Xia <ti...@fb.com> + * + * this file is copied from + * https://github.com/facebook/mysql-5.6/blob/fb-mysql-5.6.35/fbson/FbsonJsonParser.h + * and modified by Doris + */ + +#ifndef JSONB_JSONBJSONPARSERSIMD_H +#define JSONB_JSONBJSONPARSERSIMD_H + +#include <simdjson.h> + +#include <cmath> +#include <limits> + +#include "jsonb_document.h" +#include "jsonb_error.h" +#include "jsonb_writer.h" +#include "string_parser.hpp" + +namespace doris { + +/* + * Template JsonbParserTSIMD + */ +template <class OS_TYPE> +class JsonbParserTSIMD { +public: + JsonbParserTSIMD() : err_(JsonbErrType::E_NONE) {} + + explicit JsonbParserTSIMD(OS_TYPE& os) : writer_(os), err_(JsonbErrType::E_NONE) {} + + // parse a UTF-8 JSON string + bool parse(const std::string& str, hDictInsert handler = nullptr) { + return parse(str.c_str(), (unsigned int)str.size(), handler); + } + + // parse a UTF-8 JSON c-style string (NULL terminated) + bool parse(const char* c_str, hDictInsert handler = nullptr) { + return parse(c_str, (unsigned int)strlen(c_str), handler); + } + + // parse a UTF-8 JSON string with length + bool parse(const char* pch, unsigned int len, hDictInsert handler = nullptr) { + // reset state before parse + reset(); + + if (!pch || len == 0) { + err_ = JsonbErrType::E_EMPTY_DOCUMENT; + LOG(WARNING) << "empty json string"; + return false; + } + + // parse json using simdjson, return false on exception + try { + simdjson::padded_string json_str {pch, len}; + simdjson::ondemand::document doc = parser_.iterate(json_str); + + // simdjson process top level primitive types specially + // so some repeated code here + switch (doc.type()) { + case simdjson::ondemand::json_type::object: + case simdjson::ondemand::json_type::array: { + parse(doc.get_value(), handler); + break; + } + case simdjson::ondemand::json_type::null: { + if (writer_.writeNull() == 0) { + err_ = JsonbErrType::E_OUTPUT_FAIL; + LOG(WARNING) << "writeNull failed"; + } + break; + } + case simdjson::ondemand::json_type::boolean: { + if (writer_.writeBool(doc.get_bool()) == 0) { + err_ = JsonbErrType::E_OUTPUT_FAIL; + LOG(WARNING) << "writeBool failed"; + } + break; + } + case simdjson::ondemand::json_type::string: { + write_string(doc.get_string()); + break; + } + case simdjson::ondemand::json_type::number: { + write_number(doc.get_number()); + break; + } + } + + return err_ == JsonbErrType::E_NONE; + } catch (simdjson::simdjson_error& e) { + err_ = JsonbErrType::E_EXCEPTION; + LOG(WARNING) << "simdjson parse exception: " << e.what(); + return false; + } + } + + // parse json, recursively if necessary, by simdjson + // and serialize to binary format by writer + void parse(simdjson::ondemand::value value, hDictInsert handler = nullptr) { + switch (value.type()) { + case simdjson::ondemand::json_type::null: { + if (writer_.writeNull() == 0) { + err_ = JsonbErrType::E_OUTPUT_FAIL; + LOG(WARNING) << "writeNull failed"; + } + break; + } + case simdjson::ondemand::json_type::boolean: { + if (writer_.writeBool(value.get_bool()) == 0) { + err_ = JsonbErrType::E_OUTPUT_FAIL; + LOG(WARNING) << "writeBool failed"; + } + break; + } + case simdjson::ondemand::json_type::string: { + write_string(value.get_string()); + break; + } + case simdjson::ondemand::json_type::number: { + write_number(value.get_number()); + break; + } + case simdjson::ondemand::json_type::object: { + if (!writer_.writeStartObject()) { + err_ = JsonbErrType::E_OUTPUT_FAIL; + LOG(WARNING) << "writeStartObject failed"; + break; + } + + for (auto kv : value.get_object()) { + std::string_view key; + simdjson::error_code e = kv.unescaped_key().get(key); + if (e != simdjson::SUCCESS) { + err_ = JsonbErrType::E_INVALID_KEY_STRING; + LOG(WARNING) << "simdjson get key failed: " << e; + break; + } + + int key_id = -1; + if (handler) { + key_id = handler(key.data(), key.size()); + } + + if (key_id < 0) { + if (writer_.writeKey(key.data(), key.size()) == 0) { + err_ = JsonbErrType::E_OUTPUT_FAIL; + LOG(WARNING) << "writeKey failed key: " << key; + break; + } + } else { + if (writer_.writeKey(key_id) == 0) { + err_ = JsonbErrType::E_OUTPUT_FAIL; + LOG(WARNING) << "writeKey failed key_id: " << key_id; + break; + } + } + + // parse object value + parse(kv.value(), handler); + if (err_ != JsonbErrType::E_NONE) { + LOG(WARNING) << "parse object value failed"; + break; + } + } + if (err_ != JsonbErrType::E_NONE) { + break; + } + + if (!writer_.writeEndObject()) { + err_ = JsonbErrType::E_OUTPUT_FAIL; + LOG(WARNING) << "writeEndObject failed"; + break; + } + + break; + } + case simdjson::ondemand::json_type::array: { + if (!writer_.writeStartArray()) { + err_ = JsonbErrType::E_OUTPUT_FAIL; + LOG(WARNING) << "writeStartArray failed"; + break; + } + + for (auto elem : value.get_array()) { + // parse array element + parse(elem.value(), handler); + if (err_ != JsonbErrType::E_NONE) { + LOG(WARNING) << "parse array element failed"; + break; + } + } + if (err_ != JsonbErrType::E_NONE) { + break; + } + + if (!writer_.writeEndArray()) { + err_ = JsonbErrType::E_OUTPUT_FAIL; + LOG(WARNING) << "writeEndArray failed"; + break; + } + + break; + } + default: { + err_ = JsonbErrType::E_INVALID_TYPE; + LOG(WARNING) << "unknown value type: "; // << value; + break; + } + + } // end of switch + } + + void write_string(std::string_view str) { + // start writing string + if (!writer_.writeStartString()) { + err_ = JsonbErrType::E_OUTPUT_FAIL; + LOG(WARNING) << "writeStartString failed"; + return; + } + + // write string + if (str.size() > 0) { + if (writer_.writeString(str.data(), str.size()) == 0) { + err_ = JsonbErrType::E_OUTPUT_FAIL; + LOG(WARNING) << "writeString failed"; + return; + } + } + + // end writing string + if (!writer_.writeEndString()) { + err_ = JsonbErrType::E_OUTPUT_FAIL; + LOG(WARNING) << "writeEndString failed"; + return; + } + } + + void write_number(simdjson::ondemand::number num) { + if (num.is_double()) { + if (writer_.writeDouble(num.get_double()) == 0) { + err_ = JsonbErrType::E_OUTPUT_FAIL; + LOG(WARNING) << "writeDouble failed"; + return; + } + } else if (num.is_int64() || num.is_uint64()) { + if (num.is_uint64() && num.get_uint64() > std::numeric_limits<int64_t>::max()) { + err_ = JsonbErrType::E_OCTAL_OVERFLOW; + LOG(WARNING) << "overflow number: " << num.get_uint64(); + return; + } + int64_t val = num.is_int64() ? num.get_int64() : num.get_uint64(); + int size = 0; + if (val <= std::numeric_limits<int8_t>::max()) { + size = writer_.writeInt8((int8_t)val); + } else if (val <= std::numeric_limits<int16_t>::max()) { + size = writer_.writeInt16((int16_t)val); + } else if (val <= std::numeric_limits<int32_t>::max()) { + size = writer_.writeInt32((int32_t)val); + } else { // val <= INT64_MAX + size = writer_.writeInt64(val); + } + + if (size == 0) { + err_ = JsonbErrType::E_OUTPUT_FAIL; + LOG(WARNING) << "writeInt failed"; + return; + } + } else { + err_ = JsonbErrType::E_INVALID_NUMBER; + LOG(WARNING) << "invalid number: " << num.as_double(); + return; + } + } + + JsonbWriterT<OS_TYPE>& getWriter() { return writer_; } + + JsonbErrType getErrorCode() { return err_; } + + // clear error code + void clearErr() { err_ = JsonbErrType::E_NONE; } + + void reset() { + writer_.reset(); + clearErr(); + } + +private: + simdjson::ondemand::parser parser_; + JsonbWriterT<OS_TYPE> writer_; + JsonbErrType err_; +}; + +using JsonbParserSIMD = JsonbParserTSIMD<JsonbOutStream>; + +} // namespace doris + +#endif // JSONB_JSONBJSONPARSERSIMD_H diff --git a/be/src/vec/functions/function_jsonb.cpp b/be/src/vec/functions/function_jsonb.cpp index ea84ddf3ae..02f352fb57 100644 --- a/be/src/vec/functions/function_jsonb.cpp +++ b/be/src/vec/functions/function_jsonb.cpp @@ -18,6 +18,7 @@ #include <boost/token_functions.hpp> #include <vector> +// #include "util/jsonb_parser_simd.h" #include "util/string_parser.hpp" #include "util/string_util.h" #include "vec/columns/column.h" @@ -47,7 +48,7 @@ enum class JsonbParseErrorMode { FAIL = 0, RETURN_NULL, RETURN_VALUE, RETURN_INV template <NullalbeMode nullable_mode, JsonbParseErrorMode parse_error_handle_mode> class FunctionJsonbParseBase : public IFunction { private: - JsonbParser default_value_parser; + JsonbParserSIMD default_value_parser; bool has_const_default_value = false; public: @@ -193,6 +194,10 @@ public: size_t size = col_from.size(); col_to->reserve(size); + // parser can be reused for performance + JsonbParserSIMD parser; + JsonbErrType error = JsonbErrType::E_NONE; + for (size_t i = 0; i < input_rows_count; ++i) { if (col_from.is_null_at(i)) { null_map->get_data()[i] = 1; @@ -201,8 +206,6 @@ public: } const auto& val = col_from_string->get_data_at(i); - JsonbParser parser; - JsonbErrType error = JsonbErrType::E_NONE; if (parser.parse(val.data, val.size)) { // insert jsonb format data col_to->insert_data(parser.getWriter().getOutput()->getBuffer(), --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org