This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch variant-sparse
in repository https://gitbox.apache.org/repos/asf/doris.git

The following commit(s) were added to refs/heads/variant-sparse by this push:
     new 04304c71f8e [Fix](Variant) fix serialize with json key contains `.` as name (#50102)
04304c71f8e is described below

commit 04304c71f8e2944e9d94d561d1cfe35a75cb1168
Author: lihangyu <lihan...@selectdb.com>
AuthorDate: Wed Apr 16 18:30:33 2025 +0800

    [Fix](Variant) fix serialize with json key contains `.` as name (#50102)

    1. get_path lost the object nesting level information when ColumnObject::get
       was called while VariantMap was std::map<std::string, Field>, so change
       VariantMap to std::map<PathInData, Field> to maintain the nesting level
    2. serialize/deserialize should also serialize PathInData into ColumnPathInfo
       to maintain the nesting level

    cherry-pick from #49594
---
 .../rowset/segment_v2/hierarchical_data_reader.cpp    |  5 ++---
 .../rowset/segment_v2/hierarchical_data_reader.h      | 13 ++++++++++---
 be/src/vec/columns/column_object.cpp                  | 10 +++-------
 be/src/vec/core/field.h                               |  3 ++-
 be/src/vec/data_types/data_type_object.cpp            | 17 ++++++++++-------
 be/src/vec/functions/function_variant_type.cpp        |  2 +-
 be/src/vec/json/json_parser.h                         |  7 +++++++
 be/src/vec/json/path_in_data.h                        | 19 -------------------
 be/test/vec/columns/column_object_test.cpp            |  6 ++++--
 gensrc/proto/data.proto                               |  1 +
 regression-test/data/variant_p0/column_name.out       | Bin 469 -> 545 bytes
 regression-test/suites/variant_p0/column_name.groovy  | 15 +++++++++++++--
 12 files changed, 53 insertions(+), 45 deletions(-)
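The core ambiguity this patch removes: once a path is flattened to a dot-joined
string, a literal key named "a.b" and a nested object {"a": {"b": ...}} collapse
onto the same map key. Below is a minimal standalone sketch of that point only;
it is not code from this patch, and PartPath is a hypothetical stand-in for
PathInData.

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Hypothetical stand-in for PathInData: keeps the individual key parts
    // instead of a single dot-joined string.
    struct PartPath {
        std::vector<std::string> parts;
        bool operator<(const PartPath& other) const { return parts < other.parts; }
    };

    int main() {
        // Keyed by a dot-joined string, {"a.b": 1} and {"a": {"b": 2}} collide.
        std::map<std::string, int> by_string;
        by_string["a.b"] = 1; // literal key whose name contains '.'
        by_string["a.b"] = 2; // nested a -> b, flattened to the same string
        std::cout << by_string.size() << "\n"; // 1: nesting information is lost

        // Keyed by the structured path, the two entries stay distinct.
        std::map<PartPath, int> by_path;
        by_path[PartPath{{"a.b"}}] = 1;    // one part, literally named "a.b"
        by_path[PartPath{{"a", "b"}}] = 2; // two nested parts
        std::cout << by_path.size() << "\n"; // 2: nesting level is preserved
    }

This is the same distinction the regression test below exercises by inserting
both '{"a.b": ...}' and '{"a": {"b": ...}}' and counting Tags['a.b'] and
Tags['a']['b'] separately.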
diff --git a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp
index f0af8f77894..7e806ae9665 100644
--- a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp
@@ -173,7 +173,7 @@ ordinal_t HierarchicalDataReader::get_current_ordinal() const {
 
 Status HierarchicalDataReader::_process_sub_columns(
         vectorized::ColumnObject& container_variant,
-        const vectorized::PathsWithColumnAndType& non_nested_subcolumns) {
+        const PathsWithColumnAndType& non_nested_subcolumns) {
     for (const auto& entry : non_nested_subcolumns) {
         DCHECK(!entry.path.has_nested_part());
         bool add = container_variant.add_sub_column(entry.path, entry.column->assume_mutable(),
@@ -188,8 +188,7 @@ Status HierarchicalDataReader::_process_sub_columns(
 
 Status HierarchicalDataReader::_process_nested_columns(
         vectorized::ColumnObject& container_variant,
-        const std::map<vectorized::PathInData, vectorized::PathsWithColumnAndType>&
-                nested_subcolumns,
+        const std::map<vectorized::PathInData, PathsWithColumnAndType>& nested_subcolumns,
         size_t nrows) {
     using namespace vectorized;
     // Iterate nested subcolumns and flatten them, the entry contains the nested subcolumns of the same nested parent
diff --git a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h
index 83e9fbcc0e7..3df24176a5e 100644
--- a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h
+++ b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.h
@@ -51,6 +51,14 @@
 
 namespace doris::segment_v2 {
 
+struct PathWithColumnAndType {
+    vectorized::PathInData path;
+    vectorized::ColumnPtr column;
+    vectorized::DataTypePtr type;
+};
+
+using PathsWithColumnAndType = std::vector<PathWithColumnAndType>;
+
 // Reader for hierarchical data for variant, merge with root(sparse encoded columns)
 class HierarchicalDataReader : public ColumnIterator {
 public:
@@ -97,12 +105,11 @@ private:
     }
 
     Status _process_sub_columns(vectorized::ColumnObject& container_variant,
-                                const vectorized::PathsWithColumnAndType& non_nested_subcolumns);
+                                const PathsWithColumnAndType& non_nested_subcolumns);
 
     Status _process_nested_columns(
             vectorized::ColumnObject& container_variant,
-            const std::map<vectorized::PathInData, vectorized::PathsWithColumnAndType>&
-                    nested_subcolumns,
+            const std::map<vectorized::PathInData, PathsWithColumnAndType>& nested_subcolumns,
             size_t nrows);
 
     Status _process_sparse_column(vectorized::ColumnObject& container_variant, size_t nrows);
diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp
index b495679d783..c8b0091dbc2 100644
--- a/be/src/vec/columns/column_object.cpp
+++ b/be/src/vec/columns/column_object.cpp
@@ -770,11 +770,7 @@ void ColumnObject::try_insert(const Field& field) {
         root->insert(field);
     } else {
         const auto& object = field.get<const VariantMap&>();
-        for (const auto& [key_str, value] : object) {
-            PathInData key;
-            if (!key_str.empty()) {
-                key = PathInData(key_str);
-            }
+        for (const auto& [key, value] : object) {
             if (!has_subcolumn(key)) {
                 bool succ = add_sub_column(key, old_size);
                 if (!succ) {
@@ -1036,7 +1032,7 @@ void ColumnObject::get(size_t n, Field& res) const {
         entry->data.get(n, field);
         // Notice: we treat null as empty field, since we do not distinguish null and empty for Variant type.
         if (field.get_type() != Field::Types::Null) {
-            object.try_emplace(entry->path.get_path(), field);
+            object.try_emplace(entry->path, field);
         }
     }
 
@@ -1048,7 +1044,7 @@
     for (size_t i = offset; i != end; ++i) {
         const StringRef path_data = path->get_data_at(i);
         const auto& data = ColumnObject::deserialize_from_sparse_column(value, i);
-        object.try_emplace(std::string(path_data.data, path_data.size), data.first);
+        object.try_emplace(PathInData(path_data), data.first);
     }
 
     if (object.empty()) {
diff --git a/be/src/vec/core/field.h b/be/src/vec/core/field.h
index 246bf551469..d2f941d2483 100644
--- a/be/src/vec/core/field.h
+++ b/be/src/vec/core/field.h
@@ -44,6 +44,7 @@
 #include "util/quantile_state.h"
 #include "vec/common/uint128.h"
 #include "vec/core/types.h"
+#include "vec/json/path_in_data.h"
 
 namespace doris {
 namespace vectorized {
@@ -155,7 +156,7 @@ DEFINE_FIELD_VECTOR(Tuple);
 DEFINE_FIELD_VECTOR(Map);
 #undef DEFINE_FIELD_VECTOR
 
-using FieldMap = std::map<String, Field, std::less<String>>;
+using FieldMap = std::map<PathInData, Field>;
 #define DEFINE_FIELD_MAP(X) \
     struct X : public FieldMap { \
         using FieldMap::FieldMap; \
diff --git a/be/src/vec/data_types/data_type_object.cpp b/be/src/vec/data_types/data_type_object.cpp
index 457d66adc64..9003c711f89 100644
--- a/be/src/vec/data_types/data_type_object.cpp
+++ b/be/src/vec/data_types/data_type_object.cpp
@@ -80,6 +80,7 @@ int64_t DataTypeObject::get_uncompressed_serialized_bytes(const IColumn& column,
         }
         PColumnMeta column_meta_pb;
         column_meta_pb.set_name(entry->path.get_path());
+        entry->path.to_protobuf(column_meta_pb.mutable_column_path(), -1 /*not used here*/);
         type->to_pb_column_meta(&column_meta_pb);
         std::string meta_binary;
         column_meta_pb.SerializeToString(&meta_binary);
@@ -130,6 +131,7 @@ char* DataTypeObject::serialize(const IColumn& column, char* buf, int be_exec_ve
         ++num_of_columns;
         PColumnMeta column_meta_pb;
         column_meta_pb.set_name(entry->path.get_path());
+        entry->path.to_protobuf(column_meta_pb.mutable_column_path(), -1 /*not used here*/);
        type->to_pb_column_meta(&column_meta_pb);
         std::string meta_binary;
         column_meta_pb.SerializeToString(&meta_binary);
@@ -168,7 +170,6 @@ const char* DataTypeObject::deserialize(const char* buf, MutableColumnPtr* colum
     // 1. deserialize num of subcolumns
     uint32_t num_subcolumns = *reinterpret_cast<const uint32_t*>(buf);
     buf += sizeof(uint32_t);
-    bool root_added = false;
     // 2. deserialize each subcolumn in a loop
     for (uint32_t i = 0; i < num_subcolumns; i++) {
         // 2.1 deserialize subcolumn column path (str size + str data)
@@ -184,13 +185,15 @@ const char* DataTypeObject::deserialize(const char* buf, MutableColumnPtr* colum
         MutableColumnPtr sub_column = type->create_column();
         buf = type->deserialize(buf, &sub_column, be_exec_version);
 
-        // add subcolumn to column_object
         PathInData key;
-        if (!column_meta_pb.name().empty()) {
+        if (column_meta_pb.has_column_path()) {
+            // init from path pb
+            key.from_protobuf(column_meta_pb.column_path());
+        } else if (!column_meta_pb.name().empty()) {
+            // init from name for compatible
             key = PathInData {column_meta_pb.name()};
-        } else {
-            root_added = true;
         }
+        // add subcolumn to column_object
        column_object->add_sub_column(key, std::move(sub_column), type);
     }
     size_t num_rows = 0;
@@ -211,8 +214,8 @@ const char* DataTypeObject::deserialize(const char* buf, MutableColumnPtr* colum
                 column_object->get_sparse_column()->size() + num_rows);
     }
 
-    if (!root_added && column_object->get_subcolumn({})) {
-        column_object->get_subcolumn({})->insert_many_defaults(num_rows);
+    if (column_object->get_subcolumn({})) {
+        column_object->get_subcolumn({})->resize(num_rows);
     }
 
     column_object->set_num_rows(num_rows);
diff --git a/be/src/vec/functions/function_variant_type.cpp b/be/src/vec/functions/function_variant_type.cpp
index 8e541a6958f..f0f443546f6 100644
--- a/be/src/vec/functions/function_variant_type.cpp
+++ b/be/src/vec/functions/function_variant_type.cpp
@@ -52,7 +52,7 @@ public:
             }
             FieldInfo info;
             schema_util::get_field_info(value, &info);
-            result[key] = getTypeName(info.scalar_type_id);
+            result[key.get_path()] = getTypeName(info.scalar_type_id);
         }
         return result;
     }
diff --git a/be/src/vec/json/json_parser.h b/be/src/vec/json/json_parser.h
index af2e452dddc..52336d455c0 100644
--- a/be/src/vec/json/json_parser.h
+++ b/be/src/vec/json/json_parser.h
@@ -124,6 +124,13 @@ enum class ExtractType {
 struct ParseConfig {
     bool enable_flatten_nested = false;
 };
+/// Result of parsing of a document.
+/// Contains all paths extracted from document
+/// and values which are related to them.
+struct ParseResult {
+    std::vector<PathInData> paths;
+    std::vector<Field> values;
+};
 template <typename ParserImpl>
 class JSONDataParser {
 public:
diff --git a/be/src/vec/json/path_in_data.h b/be/src/vec/json/path_in_data.h
index d4a84323231..d44b7c93784 100644
--- a/be/src/vec/json/path_in_data.h
+++ b/be/src/vec/json/path_in_data.h
@@ -29,11 +29,7 @@
 #include <vector>
 
 #include "gen_cpp/segment_v2.pb.h"
-#include "vec/columns/column.h"
 #include "vec/common/uint128.h"
-#include "vec/core/field.h"
-#include "vec/core/types.h"
-#include "vec/data_types/data_type.h"
 
 namespace doris::vectorized {
 
@@ -134,13 +130,6 @@ private:
     size_t current_anonymous_array_level = 0;
 };
 using PathsInData = std::vector<PathInData>;
-/// Result of parsing of a document.
-/// Contains all paths extracted from document
-/// and values which are related to them.
-struct ParseResult {
-    std::vector<PathInData> paths;
-    std::vector<Field> values;
-};
 
 struct PathInDataRef {
     const PathInData* ref;
@@ -153,12 +142,4 @@ struct PathInDataRef {
     bool operator==(const PathInDataRef& other) const { return *this->ref == *other.ref; }
 };
 
-struct PathWithColumnAndType {
-    PathInData path;
-    ColumnPtr column;
-    DataTypePtr type;
-};
-
-using PathsWithColumnAndType = std::vector<PathWithColumnAndType>;
-
 } // namespace doris::vectorized
diff --git a/be/test/vec/columns/column_object_test.cpp b/be/test/vec/columns/column_object_test.cpp
index 99130c739a5..4d37ab02242 100644
--- a/be/test/vec/columns/column_object_test.cpp
+++ b/be/test/vec/columns/column_object_test.cpp
@@ -77,7 +77,8 @@ void convert_field_to_rapidjson(const vectorized::Field& field, rapidjson::Value
             continue;
         }
         rapidjson::Value key;
-        key.SetString(item.first.data(), cast_set<rapidjson::SizeType>(item.first.size()));
+        key.SetString(item.first.get_path().data(),
+                      cast_set<rapidjson::SizeType>(item.first.get_path().size()));
         rapidjson::Value val;
         convert_field_to_rapidjson(item.second, val, allocator);
         if (val.IsNull() && item.first.empty()) {
@@ -104,7 +105,8 @@ void convert_variant_map_to_rapidjson(const vectorized::VariantMap& map, rapidjs
             continue;
         }
         rapidjson::Value key;
-        key.SetString(item.first.data(), cast_set<rapidjson::SizeType>(item.first.size()));
+        key.SetString(item.first.get_path().data(),
+                      cast_set<rapidjson::SizeType>(item.first.get_path().size()));
         rapidjson::Value val;
         convert_field_to_rapidjson(item.second, val, allocator);
         if (val.IsNull() && item.first.empty()) {
diff --git a/gensrc/proto/data.proto b/gensrc/proto/data.proto
index 96bd7ece7e3..4c4376ee151 100644
--- a/gensrc/proto/data.proto
+++ b/gensrc/proto/data.proto
@@ -66,6 +66,7 @@ message PColumnMeta {
     optional string function_name = 7;
     optional int32 be_exec_version = 8;
     optional int32 variant_max_subcolumns_count = 9 [default = 0];
+    optional segment_v2.ColumnPathInfo column_path = 10;
 }
 
 message PBlock {
diff --git a/regression-test/data/variant_p0/column_name.out b/regression-test/data/variant_p0/column_name.out
index 6ac882d2922..0f54df05d91 100644
Binary files a/regression-test/data/variant_p0/column_name.out and b/regression-test/data/variant_p0/column_name.out differ
diff --git a/regression-test/suites/variant_p0/column_name.groovy b/regression-test/suites/variant_p0/column_name.groovy
index 7962112ff75..7cf7fe198b1 100644
--- a/regression-test/suites/variant_p0/column_name.groovy
+++ b/regression-test/suites/variant_p0/column_name.groovy
@@ -25,7 +25,7 @@ suite("regression_test_variant_column_name", "variant_type"){
         )
         DUPLICATE KEY(`k`)
         DISTRIBUTED BY HASH(k) BUCKETS 1
-        properties("replication_num" = "1", "disable_auto_compaction" = "true");
+        properties("replication_num" = "1", "disable_auto_compaction" = "false");
     """
 
     sql """insert into ${table_name} values (1, '{"中文" : "中文", "\\\u4E2C\\\u6587": "unicode"}')"""
@@ -61,7 +61,18 @@ suite("regression_test_variant_column_name", "variant_type"){
 
     sql """insert into var_column_name values (7, '{"": 1234566}')"""
     sql """insert into var_column_name values (7, '{"": 8888888}')"""
-    qt_sql "select Tags[''] from var_column_name order by cast(Tags[''] as string)"
+    qt_sql "select cast(Tags[''] as text) from var_column_name order by cast(Tags[''] as string)"
+
+    // name with `.`
+    sql "truncate table var_column_name"
+    sql """insert into var_column_name values (7, '{"a.b": "UPPER CASE", "a.c": "lower case", "a" : {"b" : 123}, "a" : {"c" : 456}}')"""
+    for (int i = 0; i < 7; i++) {
+        sql """insert into var_column_name select * from var_column_name"""
+    }
+    qt_sql_cnt_1 "select count(Tags['a.b']) from var_column_name"
+    qt_sql_cnt_2 "select count(Tags['a.c']) from var_column_name"
+    qt_sql_cnt_3 "select count(Tags['a']['b']) from var_column_name"
+    qt_sql_cnt_4 "select count(Tags['a']['c']) from var_column_name"
 
     try {
         sql """insert into var_column_name values (7, '{"": "UPPER CASE", "": "lower case"}')"""
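A note on compatibility: the deserialize change in be/src/vec/data_types/data_type_object.cpp
prefers the new structured column_path when the writer provided it and falls back to the
legacy dot-joined name otherwise, so data written before this patch stays readable. Below is
a rough standalone sketch of that read-side pattern; PathMeta and decode_path are hypothetical
names used only for illustration, not the Doris API.

    #include <iostream>
    #include <optional>
    #include <sstream>
    #include <string>
    #include <vector>

    // Hypothetical wire metadata for one subcolumn: older writers only set the
    // dot-joined name, newer writers also set the structured path parts.
    struct PathMeta {
        std::string name;
        std::optional<std::vector<std::string>> path_parts;
    };

    // Prefer the structured path when present; otherwise split the legacy name
    // on '.', which is ambiguous but keeps previously written data readable.
    std::vector<std::string> decode_path(const PathMeta& meta) {
        if (meta.path_parts) {
            return *meta.path_parts;
        }
        std::vector<std::string> parts;
        std::string part;
        std::istringstream in(meta.name);
        while (std::getline(in, part, '.')) {
            parts.push_back(part);
        }
        return parts;
    }

    int main() {
        PathMeta legacy;
        legacy.name = "a.b"; // no structured path: read back as nested a -> b
        PathMeta modern;
        modern.name = "a.b";
        modern.path_parts = std::vector<std::string>{"a.b"}; // one key literally named "a.b"
        std::cout << decode_path(legacy).size() << "\n"; // 2
        std::cout << decode_path(modern).size() << "\n"; // 1
    }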

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org