This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch variant-sparse
in repository https://gitbox.apache.org/repos/asf/doris.git

commit e9bb4e8db152f58e411058247ece6f4dae4905e4
Author: lihangyu <lihan...@selectdb.com>
AuthorDate: Wed Mar 5 10:44:23 2025 +0800

    fix sparse calculate wrong paths and iterator init with wrong column_id (#48656)
---
 be/src/olap/rowset/segment_v2/column_reader.cpp    |  1 +
 be/src/olap/rowset/segment_v2/segment.cpp          |  1 +
 .../segment_v2/variant_column_writer_impl.cpp      |  4 +-
 be/src/vec/common/schema_util.cpp                  | 64 +++++++++++++---------
 4 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp 
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 7b68de904c5..d193216a7a2 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -321,6 +321,7 @@ Status 
VariantColumnReader::_create_sparse_merge_reader(ColumnIterator** iterato
             return Status::InternalError("Failed to add node path {}", path);
         }
     }
+    VLOG_DEBUG << "subcolumns to merge " << src_subcolumns_for_sparse.size();
 
     // Create sparse column merge reader
     *iterator = new SparseColumnMergeReader(
diff --git a/be/src/olap/rowset/segment_v2/segment.cpp 
b/be/src/olap/rowset/segment_v2/segment.cpp
index 579d94c0eb7..9a4dcbf27d4 100644
--- a/be/src/olap/rowset/segment_v2/segment.cpp
+++ b/be/src/olap/rowset/segment_v2/segment.cpp
@@ -603,6 +603,7 @@ Status Segment::_create_column_readers(const 
SegmentFooterPB& footer) {
     for (const auto& column_meta : _footer_pb->columns()) {
         // no need to create column reader for variant's subcolumn
         if (column_meta.unique_id() == -1) {
+            ordinal++;
             continue;
         }
         column_id_to_footer_ordinal.try_emplace(column_meta.unique_id(), 
ordinal++);
diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp 
b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
index 087de2b5e02..56d09224e48 100644
--- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
+++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
@@ -311,7 +311,7 @@ Status 
VariantColumnWriterImpl::_process_subcolumns(vectorized::ColumnObject* pt
         // set unique_id and parent_unique_id, will use unique_id to get 
iterator correct
         auto column = vectorized::schema_util::get_column_by_type(
                 final_data_type_from_object, column_name,
-                vectorized::schema_util::ExtraInfo {.unique_id = 
_tablet_column->unique_id(),
+                vectorized::schema_util::ExtraInfo {.unique_id = -1,
                                                     .parent_unique_id = 
_tablet_column->unique_id(),
                                                     .path_info = full_path});
         return column;
@@ -649,7 +649,7 @@ Status VariantSubcolumnWriter::finalize() {
     TabletColumn flush_column = vectorized::schema_util::get_column_by_type(
             ptr->get_root_type(), _tablet_column->name(),
             vectorized::schema_util::ExtraInfo {
-                    .unique_id = _tablet_column->unique_id(),
+                    .unique_id = -1,
                     .parent_unique_id = _tablet_column->parent_unique_id(),
                     .path_info = *_tablet_column->path_info_ptr()});
     ColumnWriterOptions opts = _opts;
diff --git a/be/src/vec/common/schema_util.cpp 
b/be/src/vec/common/schema_util.cpp
index 7f03a4f70aa..181500d2bd6 100644
--- a/be/src/vec/common/schema_util.cpp
+++ b/be/src/vec/common/schema_util.cpp
@@ -687,34 +687,43 @@ Status collect_path_stats(const RowsetSharedPtr& rs,
     return Status::OK();
 }
 
+// get the subpaths and sparse paths for the variant column
 void get_subpaths(const TabletColumn& variant,
                   const std::unordered_map<int32_t, PathToNoneNullValues>& 
path_stats,
                   std::unordered_map<int32_t, TabletSchema::PathsSetInfo>& 
uid_to_paths_set_info) {
-    for (const auto& [uid, stats] : path_stats) {
-        if (stats.size() > variant.variant_max_subcolumns_count()) {
-            // 按非空值数量排序
-            std::vector<std::pair<size_t, std::string_view>> paths_with_sizes;
-            paths_with_sizes.reserve(stats.size());
-            for (const auto& [path, size] : stats) {
-                paths_with_sizes.emplace_back(size, path);
-            }
-            std::sort(paths_with_sizes.begin(), paths_with_sizes.end(), 
std::greater());
-
-            // 选取前N个路径作为子列,其余路径作为稀疏列
-            for (const auto& [size, path] : paths_with_sizes) {
-                if (uid_to_paths_set_info[uid].sub_path_set.size() <
-                    variant.variant_max_subcolumns_count()) {
-                    uid_to_paths_set_info[uid].sub_path_set.emplace(path);
-                } else {
-                    uid_to_paths_set_info[uid].sparse_path_set.emplace(path);
-                }
-            }
-        } else {
-            // 使用所有路径
-            for (const auto& [path, _] : stats) {
+    if (path_stats.find(variant.unique_id()) == path_stats.end()) {
+        return;
+    }
+    // get the stats for the variant column
+    const auto& stats = path_stats.at(variant.unique_id());
+    int32_t uid = variant.unique_id();
+    if (stats.size() > variant.variant_max_subcolumns_count()) {
+        // 按非空值数量排序
+        std::vector<std::pair<size_t, std::string_view>> paths_with_sizes;
+        paths_with_sizes.reserve(stats.size());
+        for (const auto& [path, size] : stats) {
+            paths_with_sizes.emplace_back(size, path);
+        }
+        std::sort(paths_with_sizes.begin(), paths_with_sizes.end(), 
std::greater());
+
+        // Select top N paths as subcolumns, remaining paths as sparse columns
+        for (const auto& [size, path] : paths_with_sizes) {
+            if (uid_to_paths_set_info[uid].sub_path_set.size() <
+                variant.variant_max_subcolumns_count()) {
                 uid_to_paths_set_info[uid].sub_path_set.emplace(path);
+            } else {
+                uid_to_paths_set_info[uid].sparse_path_set.emplace(path);
             }
         }
+        LOG(INFO) << "subpaths " << 
uid_to_paths_set_info[uid].sub_path_set.size()
+                  << " sparse paths " << 
uid_to_paths_set_info[uid].sparse_path_set.size()
+                  << " variant max subcolumns count " << 
variant.variant_max_subcolumns_count()
+                  << " stats size " << paths_with_sizes.size();
+    } else {
+        // Apply all paths as subcolumns
+        for (const auto& [path, _] : stats) {
+            uid_to_paths_set_info[uid].sub_path_set.emplace(path);
+        }
     }
 }
 
@@ -729,12 +738,12 @@ Status get_compaction_schema(const 
std::vector<RowsetSharedPtr>& rowsets,
                              TabletSchemaSPtr& target) {
     std::unordered_map<int32_t, PathToNoneNullValues> uid_to_path_stats;
 
-    // 收集统计信息
+    // collect path stats from all rowsets and segments
     for (const auto& rs : rowsets) {
         RETURN_IF_ERROR(collect_path_stats(rs, uid_to_path_stats));
     }
 
-    // 构建输出schema
+    // build the output schema
     TabletSchemaSPtr output_schema = std::make_shared<TabletSchema>();
     output_schema->shawdow_copy_without_columns(*target);
     std::unordered_map<int32_t, TabletSchema::PathsSetInfo> 
uid_to_paths_set_info;
@@ -743,14 +752,15 @@ Status get_compaction_schema(const 
std::vector<RowsetSharedPtr>& rowsets,
         if (!column->is_variant_type()) {
             continue;
         }
+        VLOG_DEBUG << "column " << column->name() << " unique id " << 
column->unique_id();
 
-        // 获取子路径
+        // get the subpaths
         get_subpaths(*column, uid_to_path_stats, uid_to_paths_set_info);
         std::vector<StringRef> sorted_subpaths(
                 
uid_to_paths_set_info[column->unique_id()].sub_path_set.begin(),
                 uid_to_paths_set_info[column->unique_id()].sub_path_set.end());
         std::sort(sorted_subpaths.begin(), sorted_subpaths.end());
-        // 添加子列
+        // append subcolumns
         for (const auto& subpath : sorted_subpaths) {
             TabletColumn subcolumn;
             subcolumn.set_name(column->name() + "." + subpath.to_string());
@@ -762,7 +772,7 @@ Status get_compaction_schema(const 
std::vector<RowsetSharedPtr>& rowsets,
             subcolumn.set_is_nullable(true);
             output_schema->append_column(subcolumn);
         }
-        // 添加稀疏列
+        // append sparse column
         TabletColumn sparse_column = create_sparse_column(*column);
         output_schema->append_column(sparse_column);
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to