This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch variant-sparse
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/variant-sparse by this push: new b2127eb305e [fix](variant) fix sparse column reader (#49211) b2127eb305e is described below commit b2127eb305e84d49a3667bddf160cdb2168e9da4 Author: Sun Chenyang <suncheny...@selectdb.com> AuthorDate: Tue Mar 18 21:15:38 2025 +0800 [fix](variant) fix sparse column reader (#49211) --- be/src/olap/rowset/segment_v2/column_reader.cpp | 18 +++++++++--------- be/src/vec/common/schema_util.cpp | 13 +++++++++++-- be/src/vec/data_types/data_type_object.h | 3 ++- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index 7d9e375891c..2edbb3a1350 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -378,6 +378,14 @@ Status VariantColumnReader::_new_iterator_with_flat_leaves(ColumnIterator** iter const auto* node = target_col.has_path_info() ? _subcolumn_readers->find_leaf(relative_path) : nullptr; if (!node) { + if (relative_path.get_path() == SPARSE_COLUMN_PATH) { + // read sparse column and filter extracted columns in subcolumn_path_map + ColumnIterator* inner_iter; + RETURN_IF_ERROR(_sparse_column_reader->new_iterator(&inner_iter)); + // get subcolumns in sparse path set which will be merged into sparse column + RETURN_IF_ERROR(_create_sparse_merge_reader(iterator, opts, target_col, inner_iter)); + return Status::OK(); + } if (existed_in_sparse_column || exceeded_sparse_column_limit) { // Sparse column exists or reached sparse size limit, read sparse column ColumnIterator* inner_iter; @@ -389,14 +397,6 @@ Status VariantColumnReader::_new_iterator_with_flat_leaves(ColumnIterator** iter const_cast<StorageReadOptions*>(opts), target_col); return Status::OK(); } - if (relative_path.get_path() == SPARSE_COLUMN_PATH) { - // read sparse column and filter extracted columns in subcolumn_path_map - ColumnIterator* inner_iter; - 
RETURN_IF_ERROR(_sparse_column_reader->new_iterator(&inner_iter)); - // get subcolumns in sparse path set which will be merged into sparse column - RETURN_IF_ERROR(_create_sparse_merge_reader(iterator, opts, target_col, inner_iter)); - return Status::OK(); - } if (target_col.is_nested_subcolumn()) { // using the sibling of the nested column to fill the target nested column RETURN_IF_ERROR(_new_default_iter_with_same_nested(iterator, target_col)); @@ -434,7 +434,7 @@ Status VariantColumnReader::new_iterator(ColumnIterator** iterator, const Tablet // Otherwise the prefix is not exist and the sparse column size is reached limit // which means the path maybe exist in sparse_column bool exceeded_sparse_column_limit = !_statistics->sparse_column_non_null_size.empty() && - _statistics->sparse_column_non_null_size.size() > + _statistics->sparse_column_non_null_size.size() == VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE; // For compaction operations, read flat leaves, otherwise read hierarchical data diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp index 047d488e5ad..a7d34e80961 100644 --- a/be/src/vec/common/schema_util.cpp +++ b/be/src/vec/common/schema_util.cpp @@ -820,6 +820,8 @@ Status get_compaction_schema(const std::vector<RowsetSharedPtr>& rowsets, void calculate_variant_stats(const IColumn& encoded_sparse_column, segment_v2::VariantStatisticsPB* stats, size_t row_pos, size_t num_rows) { + size_t limit = VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE - + stats->sparse_column_non_null_size().size(); // Cast input column to ColumnMap type since sparse column is stored as a map const auto& map_column = assert_cast<const ColumnMap&>(encoded_sparse_column); @@ -844,8 +846,7 @@ void calculate_variant_stats(const IColumn& encoded_sparse_column, } // If path doesn't exist and we haven't hit the max statistics size limit, // add it with count 1 - else if (sparse_data_paths_statistics.size() < - 
VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE) { + else if (sparse_data_paths_statistics.size() < limit) { sparse_data_paths_statistics.emplace(path, 1); } } @@ -862,6 +863,14 @@ void calculate_variant_stats(const IColumn& encoded_sparse_column, count_map.emplace(sparse_path, size); } } + if (stats->sparse_column_non_null_size().size() > + VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE) { + throw doris::Exception( + ErrorCode::INTERNAL_ERROR, + "Sparse column non null size: {} is greater than max statistics size: {}", + stats->sparse_column_non_null_size().size(), + VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE); + } } #include "common/compile_check_end.h" diff --git a/be/src/vec/data_types/data_type_object.h b/be/src/vec/data_types/data_type_object.h index ad27c57ec68..24f23fb9000 100644 --- a/be/src/vec/data_types/data_type_object.h +++ b/be/src/vec/data_types/data_type_object.h @@ -55,7 +55,8 @@ private: public: DataTypeObject() {} DataTypeObject(int32_t max_subcolumns_count); - const char* get_family_name() const override { return name.c_str(); } + String do_get_name() const override { return name; } + const char* get_family_name() const override { return "Variant"; } TypeIndex get_type_id() const override { return TypeIndex::VARIANT; } TypeDescriptor get_type_as_type_descriptor() const override { return TypeDescriptor(TYPE_VARIANT, _max_subcolumns_count); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org