This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch variant-sparse
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/variant-sparse by this push:
     new f64ddfebad1 [Improvement](Variant) optimize performance while loading variant data  (#49316)
f64ddfebad1 is described below

commit f64ddfebad1daffa8cad9add91596c8c1f3961b5
Author: lihangyu <lihan...@selectdb.com>
AuthorDate: Thu Mar 20 17:23:39 2025 +0800

    [Improvement](Variant) optimize performance while loading variant data  (#49316)
---
 .../rowset/segment_v2/vertical_segment_writer.cpp  |  2 +-
 be/src/vec/columns/column_object.cpp               | 32 ++++++++++++++++------
 be/src/vec/columns/column_object.h                 | 15 ++++++++++
 be/src/vec/json/parse2column.cpp                   | 12 ++++++--
 be/src/vec/olap/olap_data_convertor.cpp            |  4 +++
 be/src/vec/olap/olap_data_convertor.h              |  1 +
 6 files changed, 53 insertions(+), 13 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp
index 8e8c1a5d209..1f559ae6217 100644
--- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp
@@ -1097,7 +1097,7 @@ Status VerticalSegmentWriter::_append_block_with_variant_subcolumns(RowsInBlock&
             RETURN_IF_ERROR(_column_writers[current_column_id]->append(
                     column->get_nullmap(), column->get_data(), data.num_rows));
             _flush_schema->append_column(tablet_column);
-            _olap_data_convertor->clear_source_content();
+            _olap_data_convertor->clear_source_content(current_column_id);
         }
     }
 
diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp
index 90720c57235..15b24589d46 100644
--- a/be/src/vec/columns/column_object.cpp
+++ b/be/src/vec/columns/column_object.cpp
@@ -368,7 +368,10 @@ DataTypeSerDeSPtr ColumnObject::Subcolumn::generate_data_serdes(DataTypePtr type
 
 ColumnObject::Subcolumn::Subcolumn(MutableColumnPtr&& data_, DataTypePtr type, 
bool is_nullable_,
                                    bool is_root_)
-        : least_common_type(type), is_nullable(is_nullable_), 
is_root(is_root_) {
+        : least_common_type(type),
+          is_nullable(is_nullable_),
+          is_root(is_root_),
+          num_rows(data_->size()) {
     data.push_back(std::move(data_));
     data_types.push_back(type);
     data_serdes.push_back(generate_data_serdes(type, is_root));
@@ -380,14 +383,11 @@ ColumnObject::Subcolumn::Subcolumn(size_t size_, bool is_nullable_, bool is_root
         : least_common_type(std::make_shared<DataTypeNothing>()),
           is_nullable(is_nullable_),
           num_of_defaults_in_prefix(size_),
-          is_root(is_root_) {}
+          is_root(is_root_),
+          num_rows(size_) {}
 
 size_t ColumnObject::Subcolumn::Subcolumn::size() const {
-    size_t res = num_of_defaults_in_prefix;
-    for (const auto& part : data) {
-        res += part->size();
-    }
-    return res;
+    return num_rows + current_num_of_defaults;
 }
 
 size_t ColumnObject::Subcolumn::Subcolumn::byteSize() const {
@@ -423,6 +423,7 @@ void ColumnObject::Subcolumn::add_new_column_part(DataTypePtr type) {
 
 void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) {
     auto base_type = WhichDataType(info.scalar_type_id);
+    ++num_rows;
     if (base_type.is_nothing() && info.num_dimensions == 0) {
         insert_default();
         return;
@@ -554,6 +555,7 @@ void ColumnObject::Subcolumn::insert_range_from(const Subcolumn& src, size_t sta
                 length, src.size());
     }
     size_t end = start + length;
+    num_rows += length;
     // num_rows += length;
     if (data.empty()) {
         add_new_column_part(src.get_least_common_type());
@@ -634,7 +636,7 @@ void ColumnObject::Subcolumn::insert_range_from(const Subcolumn& src, size_t sta
 }
 
 bool ColumnObject::Subcolumn::is_finalized() const {
-    return num_of_defaults_in_prefix == 0 && (data.empty() || (data.size() == 
1));
+    return  current_num_of_defaults == 0 && num_of_defaults_in_prefix == 0 && 
(data.empty() || (data.size() == 1));
 }
 
 template <typename Func>
@@ -681,6 +683,10 @@ void ColumnObject::resize(size_t n) {
 }
 
 void ColumnObject::Subcolumn::finalize(FinalizeMode mode) {
+    if (current_num_of_defaults) {
+        insert_many_defaults(current_num_of_defaults);
+        current_num_of_defaults = 0;
+    }
     if (!is_root && data.size() == 1 && num_of_defaults_in_prefix == 0) {
         data[0] = data[0]->convert_to_full_column_if_const();
         return;
@@ -722,11 +728,12 @@ void ColumnObject::Subcolumn::finalize(FinalizeMode mode) {
 }
 
 void ColumnObject::Subcolumn::insert_default() {
-    if (data.empty()) {
+    if (UNLIKELY(data.empty())) {
         ++num_of_defaults_in_prefix;
     } else {
         data.back()->insert_default();
     }
+    ++num_rows;
 }
 
 void ColumnObject::Subcolumn::insert_many_defaults(size_t length) {
@@ -735,6 +742,7 @@ void ColumnObject::Subcolumn::insert_many_defaults(size_t length) {
     } else {
         data.back()->insert_many_defaults(length);
     }
+    num_rows += length;
 }
 
 void ColumnObject::Subcolumn::pop_back(size_t n) {
@@ -742,6 +750,7 @@ void ColumnObject::Subcolumn::pop_back(size_t n) {
         throw doris::Exception(ErrorCode::OUT_OF_BOUND,
                                "Invalid number of elements to pop: {}, size: 
{}", n, size());
     }
+    num_rows -= n;
     size_t num_removed = 0;
     for (auto it = data.rbegin(); it != data.rend(); ++it) {
         if (n == 0) {
@@ -2127,6 +2136,7 @@ void ColumnObject::ensure_root_node_type(const DataTypePtr& expected_root_type)
         root.data[0] = casted_column;
         root.data_types[0] = expected_root_type;
         root.least_common_type = Subcolumn::LeastCommonType 
{expected_root_type, true};
+        root.num_rows = casted_column->size();
     }
 }
 
@@ -2181,6 +2191,9 @@ size_t ColumnObject::filter(const Filter& filter) {
     if (count == 0) {
         clear();
     } else {
+        for (auto& subcolumn : subcolumns) {
+            subcolumn->data.num_rows = count;
+        }
         for_each_subcolumn([&](auto& part) {
             if (part->size() != count) {
                 if (part->is_exclusive()) {
@@ -2210,6 +2223,7 @@ void ColumnObject::clear_column_data() {
             (*std::move(part)).clear();
         }
         entry->data.num_of_defaults_in_prefix = 0;
+        entry->data.num_rows = 0;
     }
     serialized_sparse_column->clear();
     num_rows = 0;
diff --git a/be/src/vec/columns/column_object.h b/be/src/vec/columns/column_object.h
index c367da89da4..422bb38a704 100644
--- a/be/src/vec/columns/column_object.h
+++ b/be/src/vec/columns/column_object.h
@@ -154,6 +154,18 @@ public:
 
         void insert_default();
 
+        void increment_default_counter() {
+            ++current_num_of_defaults;
+        }
+
+        void reset_current_num_of_defaults() {
+            current_num_of_defaults = 0;
+        }
+
+        size_t cur_num_of_defaults() {
+            return current_num_of_defaults;
+        }
+
         void insert_many_defaults(size_t length);
 
         void insert_range_from(const Subcolumn& src, size_t start, size_t 
length);
@@ -252,6 +264,9 @@ public:
         // If it is the root subcolumn of SubcolumnsTree,
         // the root Node should be JSONB type when finalize
         bool is_root = false;
+        size_t num_rows = 0;
+        // distinguish from num_of_defaults_in_prefix when data is not empty
+        size_t current_num_of_defaults = 0;
     };
     using Subcolumns = SubcolumnsTree<Subcolumn>;
 
diff --git a/be/src/vec/json/parse2column.cpp b/be/src/vec/json/parse2column.cpp
index 0e8472928a5..4ef416dc865 100644
--- a/be/src/vec/json/parse2column.cpp
+++ b/be/src/vec/json/parse2column.cpp
@@ -154,7 +154,7 @@ void parse_json_to_variant(IColumn& column, const char* src, size_t length,
     }
     auto& [paths, values] = *result;
     assert(paths.size() == values.size());
-    size_t old_num_rows = column_object.size();
+    size_t old_num_rows = column_object.rows();
     for (size_t i = 0; i < paths.size(); ++i) {
         FieldInfo field_info;
         get_field_info(values[i], &field_info);
@@ -173,6 +173,10 @@ void parse_json_to_variant(IColumn& column, const char* src, size_t length,
             throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to 
find sub column {}",
                                    paths[i].get_path());
         }
+        if (subcolumn->cur_num_of_defaults() > 0) {
+            subcolumn->insert_many_defaults(subcolumn->cur_num_of_defaults());
+            subcolumn->reset_current_num_of_defaults();
+        }
         if (subcolumn->size() != old_num_rows) {
             throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
                                    "subcolumn {} size missmatched, may 
contains duplicated entry",
@@ -184,9 +188,10 @@ void parse_json_to_variant(IColumn& column, const char* src, size_t length,
     const auto& subcolumns = column_object.get_subcolumns();
     for (const auto& entry : subcolumns) {
         if (entry->data.size() == old_num_rows) {
-            bool inserted = 
column_object.try_insert_default_from_nested(entry);
+            bool inserted = UNLIKELY(entry->path.has_nested_part() &&
+                                     
column_object.try_insert_default_from_nested(entry));
             if (!inserted) {
-                entry->data.insert_default();
+                entry->data.increment_default_counter();
             }
         }
     }
@@ -213,6 +218,7 @@ void parse_json_to_variant(IColumn& column, const ColumnString& raw_json_column,
         StringRef raw_json = raw_json_column.get_data_at(i);
         parse_json_to_variant(column, raw_json.data, raw_json.size, 
parser.get(), config);
     }
+    column.finalize();
 }
 
 } // namespace doris::vectorized
diff --git a/be/src/vec/olap/olap_data_convertor.cpp b/be/src/vec/olap/olap_data_convertor.cpp
index 47e85633488..c2887c41934 100644
--- a/be/src/vec/olap/olap_data_convertor.cpp
+++ b/be/src/vec/olap/olap_data_convertor.cpp
@@ -285,6 +285,10 @@ void OlapBlockDataConvertor::clear_source_content() {
     }
 }
 
+void OlapBlockDataConvertor::clear_source_content(size_t cid) {
+    _convertors[cid]->clear_source_column();
+}
+
 std::pair<Status, IOlapColumnDataAccessor*> 
OlapBlockDataConvertor::convert_column_data(
         size_t cid) {
     assert(cid < _convertors.size());
diff --git a/be/src/vec/olap/olap_data_convertor.h b/be/src/vec/olap/olap_data_convertor.h
index 936179356bb..c2a8e6ace47 100644
--- a/be/src/vec/olap/olap_data_convertor.h
+++ b/be/src/vec/olap/olap_data_convertor.h
@@ -84,6 +84,7 @@ public:
                                                    size_t row_pos, size_t 
num_rows, uint32_t cid);
 
     void clear_source_content();
+    void clear_source_content(size_t cid);
     std::pair<Status, IOlapColumnDataAccessor*> convert_column_data(size_t 
cid);
     void add_column_data_convertor(const TabletColumn& column);
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to