This is an automated email from the ASF dual-hosted git repository. eldenmoon pushed a commit to branch variant-sparse in repository https://gitbox.apache.org/repos/asf/doris.git
commit e9bb4e8db152f58e411058247ece6f4dae4905e4 Author: lihangyu <lihan...@selectdb.com> AuthorDate: Wed Mar 5 10:44:23 2025 +0800 fix sparse calculate wrong paths and iterator init with wrong column_id (#48656) --- be/src/olap/rowset/segment_v2/column_reader.cpp | 1 + be/src/olap/rowset/segment_v2/segment.cpp | 1 + .../segment_v2/variant_column_writer_impl.cpp | 4 +- be/src/vec/common/schema_util.cpp | 64 +++++++++++++--------- 4 files changed, 41 insertions(+), 29 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index 7b68de904c5..d193216a7a2 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -321,6 +321,7 @@ Status VariantColumnReader::_create_sparse_merge_reader(ColumnIterator** iterato return Status::InternalError("Failed to add node path {}", path); } } + VLOG_DEBUG << "subcolumns to merge " << src_subcolumns_for_sparse.size(); // Create sparse column merge reader *iterator = new SparseColumnMergeReader( diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 579d94c0eb7..9a4dcbf27d4 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -603,6 +603,7 @@ Status Segment::_create_column_readers(const SegmentFooterPB& footer) { for (const auto& column_meta : _footer_pb->columns()) { // no need to create column reader for variant's subcolumn if (column_meta.unique_id() == -1) { + ordinal++; continue; } column_id_to_footer_ordinal.try_emplace(column_meta.unique_id(), ordinal++); diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp index 087de2b5e02..56d09224e48 100644 --- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp +++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp @@ -311,7 +311,7 @@ Status 
VariantColumnWriterImpl::_process_subcolumns(vectorized::ColumnObject* pt // set unique_id and parent_unique_id, will use unique_id to get iterator correct auto column = vectorized::schema_util::get_column_by_type( final_data_type_from_object, column_name, - vectorized::schema_util::ExtraInfo {.unique_id = _tablet_column->unique_id(), + vectorized::schema_util::ExtraInfo {.unique_id = -1, .parent_unique_id = _tablet_column->unique_id(), .path_info = full_path}); return column; @@ -649,7 +649,7 @@ Status VariantSubcolumnWriter::finalize() { TabletColumn flush_column = vectorized::schema_util::get_column_by_type( ptr->get_root_type(), _tablet_column->name(), vectorized::schema_util::ExtraInfo { - .unique_id = _tablet_column->unique_id(), + .unique_id = -1, .parent_unique_id = _tablet_column->parent_unique_id(), .path_info = *_tablet_column->path_info_ptr()}); ColumnWriterOptions opts = _opts; diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp index 7f03a4f70aa..181500d2bd6 100644 --- a/be/src/vec/common/schema_util.cpp +++ b/be/src/vec/common/schema_util.cpp @@ -687,34 +687,43 @@ Status collect_path_stats(const RowsetSharedPtr& rs, return Status::OK(); } +// get the subpaths and sparse paths for the variant column void get_subpaths(const TabletColumn& variant, const std::unordered_map<int32_t, PathToNoneNullValues>& path_stats, std::unordered_map<int32_t, TabletSchema::PathsSetInfo>& uid_to_paths_set_info) { - for (const auto& [uid, stats] : path_stats) { - if (stats.size() > variant.variant_max_subcolumns_count()) { - // 按非空值数量排序 - std::vector<std::pair<size_t, std::string_view>> paths_with_sizes; - paths_with_sizes.reserve(stats.size()); - for (const auto& [path, size] : stats) { - paths_with_sizes.emplace_back(size, path); - } - std::sort(paths_with_sizes.begin(), paths_with_sizes.end(), std::greater()); - - // 选取前N个路径作为子列,其余路径作为稀疏列 - for (const auto& [size, path] : paths_with_sizes) { - if 
(uid_to_paths_set_info[uid].sub_path_set.size() < - variant.variant_max_subcolumns_count()) { - uid_to_paths_set_info[uid].sub_path_set.emplace(path); - } else { - uid_to_paths_set_info[uid].sparse_path_set.emplace(path); - } - } - } else { - // 使用所有路径 - for (const auto& [path, _] : stats) { + if (path_stats.find(variant.unique_id()) == path_stats.end()) { + return; + } + // get the stats for the variant column + const auto& stats = path_stats.at(variant.unique_id()); + int32_t uid = variant.unique_id(); + if (stats.size() > variant.variant_max_subcolumns_count()) { + // 按非空值数量排序 + std::vector<std::pair<size_t, std::string_view>> paths_with_sizes; + paths_with_sizes.reserve(stats.size()); + for (const auto& [path, size] : stats) { + paths_with_sizes.emplace_back(size, path); + } + std::sort(paths_with_sizes.begin(), paths_with_sizes.end(), std::greater()); + + // Select top N paths as subcolumns, remaining paths as sparse columns + for (const auto& [size, path] : paths_with_sizes) { + if (uid_to_paths_set_info[uid].sub_path_set.size() < + variant.variant_max_subcolumns_count()) { uid_to_paths_set_info[uid].sub_path_set.emplace(path); + } else { + uid_to_paths_set_info[uid].sparse_path_set.emplace(path); } } + LOG(INFO) << "subpaths " << uid_to_paths_set_info[uid].sub_path_set.size() + << " sparse paths " << uid_to_paths_set_info[uid].sparse_path_set.size() + << " variant max subcolumns count " << variant.variant_max_subcolumns_count() + << " stats size " << paths_with_sizes.size(); + } else { + // Apply all paths as subcolumns + for (const auto& [path, _] : stats) { + uid_to_paths_set_info[uid].sub_path_set.emplace(path); + } } } @@ -729,12 +738,12 @@ Status get_compaction_schema(const std::vector<RowsetSharedPtr>& rowsets, TabletSchemaSPtr& target) { std::unordered_map<int32_t, PathToNoneNullValues> uid_to_path_stats; - // 收集统计信息 + // collect path stats from all rowsets and segments for (const auto& rs : rowsets) { RETURN_IF_ERROR(collect_path_stats(rs, 
uid_to_path_stats)); } - // 构建输出schema + // build the output schema TabletSchemaSPtr output_schema = std::make_shared<TabletSchema>(); output_schema->shawdow_copy_without_columns(*target); std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info; @@ -743,14 +752,15 @@ Status get_compaction_schema(const std::vector<RowsetSharedPtr>& rowsets, if (!column->is_variant_type()) { continue; } + VLOG_DEBUG << "column " << column->name() << " unique id " << column->unique_id(); - // 获取子路径 + // get the subpaths get_subpaths(*column, uid_to_path_stats, uid_to_paths_set_info); std::vector<StringRef> sorted_subpaths( uid_to_paths_set_info[column->unique_id()].sub_path_set.begin(), uid_to_paths_set_info[column->unique_id()].sub_path_set.end()); std::sort(sorted_subpaths.begin(), sorted_subpaths.end()); - // 添加子列 + // append subcolumns for (const auto& subpath : sorted_subpaths) { TabletColumn subcolumn; subcolumn.set_name(column->name() + "." + subpath.to_string()); @@ -762,7 +772,7 @@ Status get_compaction_schema(const std::vector<RowsetSharedPtr>& rowsets, subcolumn.set_is_nullable(true); output_schema->append_column(subcolumn); } - // 添加稀疏列 + // append sparse column TabletColumn sparse_column = create_sparse_column(*column); output_schema->append_column(sparse_column); } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org