This is an automated email from the ASF dual-hosted git repository. eldenmoon pushed a commit to branch variant-sparse in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/variant-sparse by this push: new f64ddfebad1 [Improvement](Variant) optimize performance while loading variant data (#49316) f64ddfebad1 is described below commit f64ddfebad1daffa8cad9add91596c8c1f3961b5 Author: lihangyu <lihan...@selectdb.com> AuthorDate: Thu Mar 20 17:23:39 2025 +0800 [Improvement](Variant) optimize performance while loading variant data (#49316) --- .../rowset/segment_v2/vertical_segment_writer.cpp | 2 +- be/src/vec/columns/column_object.cpp | 32 ++++++++++++++++------ be/src/vec/columns/column_object.h | 15 ++++++++++ be/src/vec/json/parse2column.cpp | 12 ++++++-- be/src/vec/olap/olap_data_convertor.cpp | 4 +++ be/src/vec/olap/olap_data_convertor.h | 1 + 6 files changed, 53 insertions(+), 13 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp index 8e8c1a5d209..1f559ae6217 100644 --- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp @@ -1097,7 +1097,7 @@ Status VerticalSegmentWriter::_append_block_with_variant_subcolumns(RowsInBlock& RETURN_IF_ERROR(_column_writers[current_column_id]->append( column->get_nullmap(), column->get_data(), data.num_rows)); _flush_schema->append_column(tablet_column); - _olap_data_convertor->clear_source_content(); + _olap_data_convertor->clear_source_content(current_column_id); } } diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp index 90720c57235..15b24589d46 100644 --- a/be/src/vec/columns/column_object.cpp +++ b/be/src/vec/columns/column_object.cpp @@ -368,7 +368,10 @@ DataTypeSerDeSPtr ColumnObject::Subcolumn::generate_data_serdes(DataTypePtr type ColumnObject::Subcolumn::Subcolumn(MutableColumnPtr&& data_, DataTypePtr type, bool is_nullable_, bool is_root_) - : least_common_type(type), is_nullable(is_nullable_), is_root(is_root_) { + : least_common_type(type), + is_nullable(is_nullable_), + is_root(is_root_), + num_rows(data_->size()) { data.push_back(std::move(data_)); data_types.push_back(type); data_serdes.push_back(generate_data_serdes(type, is_root)); @@ -380,14 +383,11 @@ ColumnObject::Subcolumn::Subcolumn(size_t size_, bool is_nullable_, bool is_root : least_common_type(std::make_shared<DataTypeNothing>()), is_nullable(is_nullable_), num_of_defaults_in_prefix(size_), - is_root(is_root_) {} + is_root(is_root_), + num_rows(size_) {} size_t ColumnObject::Subcolumn::Subcolumn::size() const { - size_t res = num_of_defaults_in_prefix; - for (const auto& part : data) { - res += part->size(); - } - return res; + return num_rows + current_num_of_defaults; } size_t ColumnObject::Subcolumn::Subcolumn::byteSize() const { @@ -423,6 +423,7 @@ void ColumnObject::Subcolumn::add_new_column_part(DataTypePtr type) { void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) { auto base_type = WhichDataType(info.scalar_type_id); + ++num_rows; if (base_type.is_nothing() && info.num_dimensions == 0) { insert_default(); return; @@ -554,6 +555,7 @@ void ColumnObject::Subcolumn::insert_range_from(const Subcolumn& src, size_t sta length, src.size()); } size_t end = start + length; + num_rows += length; // num_rows += length; if (data.empty()) { add_new_column_part(src.get_least_common_type()); @@ -634,7 +636,7 @@ void ColumnObject::Subcolumn::insert_range_from(const Subcolumn& src, size_t sta } bool ColumnObject::Subcolumn::is_finalized() const { - return num_of_defaults_in_prefix == 0 && (data.empty() || (data.size() == 1)); + return current_num_of_defaults == 0 && num_of_defaults_in_prefix == 0 && (data.empty() || (data.size() == 1)); } template <typename Func> @@ -681,6 +683,10 @@ void ColumnObject::resize(size_t n) { } void ColumnObject::Subcolumn::finalize(FinalizeMode mode) { + if (current_num_of_defaults) { + insert_many_defaults(current_num_of_defaults); + current_num_of_defaults = 0; + } if (!is_root && data.size() == 1 && num_of_defaults_in_prefix == 0) { data[0] = data[0]->convert_to_full_column_if_const(); return; @@ -722,11 +728,12 @@ void ColumnObject::Subcolumn::finalize(FinalizeMode mode) { } void ColumnObject::Subcolumn::insert_default() { - if (data.empty()) { + if (UNLIKELY(data.empty())) { ++num_of_defaults_in_prefix; } else { data.back()->insert_default(); } + ++num_rows; } void ColumnObject::Subcolumn::insert_many_defaults(size_t length) { @@ -735,6 +742,7 @@ void ColumnObject::Subcolumn::insert_many_defaults(size_t length) { } else { data.back()->insert_many_defaults(length); } + num_rows += length; } void ColumnObject::Subcolumn::pop_back(size_t n) { @@ -742,6 +750,7 @@ void ColumnObject::Subcolumn::pop_back(size_t n) { throw doris::Exception(ErrorCode::OUT_OF_BOUND, "Invalid number of elements to pop: {}, size: {}", n, size()); } + num_rows -= n; size_t num_removed = 0; for (auto it = data.rbegin(); it != data.rend(); ++it) { if (n == 0) { @@ -2127,6 +2136,7 @@ void ColumnObject::ensure_root_node_type(const DataTypePtr& expected_root_type) root.data[0] = casted_column; root.data_types[0] = expected_root_type; root.least_common_type = Subcolumn::LeastCommonType {expected_root_type, true}; + root.num_rows = casted_column->size(); } } @@ -2181,6 +2191,9 @@ size_t ColumnObject::filter(const Filter& filter) { if (count == 0) { clear(); } else { + for (auto& subcolumn : subcolumns) { + subcolumn->data.num_rows = count; + } for_each_subcolumn([&](auto& part) { if (part->size() != count) { if (part->is_exclusive()) { @@ -2210,6 +2223,7 @@ void ColumnObject::clear_column_data() { (*std::move(part)).clear(); } entry->data.num_of_defaults_in_prefix = 0; + entry->data.num_rows = 0; } serialized_sparse_column->clear(); num_rows = 0; diff --git a/be/src/vec/columns/column_object.h b/be/src/vec/columns/column_object.h index c367da89da4..422bb38a704 100644 --- a/be/src/vec/columns/column_object.h +++ b/be/src/vec/columns/column_object.h @@ -154,6 +154,18 @@ public: void insert_default(); + void increment_default_counter() { + ++current_num_of_defaults; + } + + void reset_current_num_of_defaults() { + current_num_of_defaults = 0; + } + + size_t cur_num_of_defaults() { + return current_num_of_defaults; + } + void insert_many_defaults(size_t length); void insert_range_from(const Subcolumn& src, size_t start, size_t length); @@ -252,6 +264,9 @@ public: // If it is the root subcolumn of SubcolumnsTree, // the root Node should be JSONB type when finalize bool is_root = false; + size_t num_rows = 0; + // distinguish from num_of_defaults_in_prefix when data is not empty + size_t current_num_of_defaults = 0; }; using Subcolumns = SubcolumnsTree<Subcolumn>; diff --git a/be/src/vec/json/parse2column.cpp b/be/src/vec/json/parse2column.cpp index 0e8472928a5..4ef416dc865 100644 --- a/be/src/vec/json/parse2column.cpp +++ b/be/src/vec/json/parse2column.cpp @@ -154,7 +154,7 @@ void parse_json_to_variant(IColumn& column, const char* src, size_t length, } auto& [paths, values] = *result; assert(paths.size() == values.size()); - size_t old_num_rows = column_object.size(); + size_t old_num_rows = column_object.rows(); for (size_t i = 0; i < paths.size(); ++i) { FieldInfo field_info; get_field_info(values[i], &field_info); @@ -173,6 +173,10 @@ void parse_json_to_variant(IColumn& column, const char* src, size_t length, throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to find sub column {}", paths[i].get_path()); } + if (subcolumn->cur_num_of_defaults() > 0) { + subcolumn->insert_many_defaults(subcolumn->cur_num_of_defaults()); + subcolumn->reset_current_num_of_defaults(); + } if (subcolumn->size() != old_num_rows) { throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "subcolumn {} size missmatched, may contains duplicated entry", @@ -184,9 +188,10 @@ void parse_json_to_variant(IColumn& column, const char* src, size_t length, const auto& subcolumns = column_object.get_subcolumns(); for (const auto& entry : subcolumns) { if (entry->data.size() == old_num_rows) { - bool inserted = column_object.try_insert_default_from_nested(entry); + bool inserted = UNLIKELY(entry->path.has_nested_part() && + column_object.try_insert_default_from_nested(entry)); if (!inserted) { - entry->data.insert_default(); + entry->data.increment_default_counter(); } } } @@ -213,6 +218,7 @@ void parse_json_to_variant(IColumn& column, const ColumnString& raw_json_column, StringRef raw_json = raw_json_column.get_data_at(i); parse_json_to_variant(column, raw_json.data, raw_json.size, parser.get(), config); } + column.finalize(); } } // namespace doris::vectorized diff --git a/be/src/vec/olap/olap_data_convertor.cpp b/be/src/vec/olap/olap_data_convertor.cpp index 47e85633488..c2887c41934 100644 --- a/be/src/vec/olap/olap_data_convertor.cpp +++ b/be/src/vec/olap/olap_data_convertor.cpp @@ -285,6 +285,10 @@ void OlapBlockDataConvertor::clear_source_content() { } } +void OlapBlockDataConvertor::clear_source_content(size_t cid) { + _convertors[cid]->clear_source_column(); +} + std::pair<Status, IOlapColumnDataAccessor*> OlapBlockDataConvertor::convert_column_data( size_t cid) { assert(cid < _convertors.size()); diff --git a/be/src/vec/olap/olap_data_convertor.h b/be/src/vec/olap/olap_data_convertor.h index 936179356bb..c2a8e6ace47 100644 --- a/be/src/vec/olap/olap_data_convertor.h +++ b/be/src/vec/olap/olap_data_convertor.h @@ -84,6 +84,7 @@ public: size_t row_pos, size_t num_rows, uint32_t cid); void clear_source_content(); + void clear_source_content(size_t cid); std::pair<Status, IOlapColumnDataAccessor*> convert_column_data(size_t cid); void add_column_data_convertor(const TabletColumn& column); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org