This is an automated email from the ASF dual-hosted git repository.
eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 9952dfefe61 [feat](Variant) support variant sparse feature and schema
template with multi indexes (part 5) (#54328)
9952dfefe61 is described below
commit 9952dfefe6194fcee95f14ee9d4b1c0de94a5a41
Author: lihangyu <[email protected]>
AuthorDate: Tue Aug 5 22:23:39 2025 +0800
[feat](Variant) support variant sparse feature and schema template with
multi indexes (part 5) (#54328)
Add VariantStatsCaculator to calculate variant stats info
---
be/src/olap/rowset/segment_v2/segment_writer.cpp | 11 +-
be/src/olap/rowset/segment_v2/segment_writer.h | 5 +
.../rowset/segment_v2/variant_stats_calculator.cpp | 107 +++++
.../rowset/segment_v2/variant_stats_calculator.h | 55 +++
be/src/vec/common/schema_util.cpp | 84 ++--
be/src/vec/common/schema_util.h | 11 +-
.../segment_v2/variant_stats_calculator_test.cpp | 448 +++++++++++++++++++++
7 files changed, 671 insertions(+), 50 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp
b/be/src/olap/rowset/segment_v2/segment_writer.cpp
index 39562d60ab2..fed429af04c 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp
@@ -50,7 +50,7 @@
#include "olap/rowset/segment_v2/inverted_index_writer.h"
#include "olap/rowset/segment_v2/page_io.h"
#include "olap/rowset/segment_v2/page_pointer.h"
-// #include "olap/rowset/segment_v2/variant/variant_stats_calculator.h"
+#include "olap/rowset/segment_v2/variant_stats_calculator.h"
#include "olap/segment_loader.h"
#include "olap/short_key_index.h"
#include "olap/storage_engine.h"
@@ -324,6 +324,10 @@ Status SegmentWriter::init(const std::vector<uint32_t>&
col_ids, bool has_key) {
RETURN_IF_ERROR(_create_writers(_tablet_schema, col_ids));
+ // Initialize variant statistics calculator
+ _variant_stats_calculator =
+ std::make_unique<VariantStatsCaculator>(&_footer, _tablet_schema,
col_ids);
+
// we don't need the short key index for unique key merge on write table.
if (_has_key) {
if (_is_mow()) {
@@ -731,7 +735,10 @@ Status SegmentWriter::append_block(const
vectorized::Block* block, size_t row_po
RETURN_IF_ERROR(_column_writers[id]->append(converted_result.second->get_nullmap(),
converted_result.second->get_data(), num_rows));
}
-
+ if (_opts.write_type == DataWriteType::TYPE_COMPACTION) {
+ RETURN_IF_ERROR(
+ _variant_stats_calculator->calculate_variant_stats(block,
row_pos, num_rows));
+ }
if (_has_key) {
if (_is_mow_with_cluster_key()) {
// for now we don't need to query short key index for CLUSTER BY
feature,
diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h
b/be/src/olap/rowset/segment_v2/segment_writer.h
index 76ba9b2ab21..c58ee417864 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.h
+++ b/be/src/olap/rowset/segment_v2/segment_writer.h
@@ -65,6 +65,8 @@ namespace segment_v2 {
extern const char* k_segment_magic;
extern const uint32_t k_segment_magic_length;
+class VariantStatsCaculator;
+
struct SegmentWriterOptions {
uint32_t num_rows_per_block = 1024;
uint32_t max_rows_per_segment = UINT32_MAX;
@@ -106,6 +108,7 @@ public:
Status partial_update_preconditions_check(size_t row_pos);
Status append_block_with_partial_content(const vectorized::Block* block,
size_t row_pos,
size_t num_rows);
+
int64_t max_row_to_add(size_t row_avg_size_in_bytes);
uint64_t estimate_segment_size();
@@ -261,6 +264,8 @@ private:
TabletSchemaSPtr _flush_schema = nullptr;
std::vector<std::string> _primary_keys;
uint64_t _primary_keys_size = 0;
+ // variant statistics calculator for efficient stats collection
+ std::unique_ptr<VariantStatsCaculator> _variant_stats_calculator;
};
} // namespace segment_v2
diff --git a/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp
b/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp
new file mode 100644
index 00000000000..f1de6260304
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp
@@ -0,0 +1,107 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/variant_stats_calculator.h"
+
+#include "common/logging.h"
+#include "util/simd/bits.h"
+#include "vec/columns/column_nullable.h"
+#include "vec/common/schema_util.h"
+
+namespace doris::segment_v2 {
+
+VariantStatsCaculator::VariantStatsCaculator(SegmentFooterPB* footer,
+ TabletSchemaSPtr tablet_schema,
+ const std::vector<uint32_t>&
column_ids)
+ : _footer(footer), _tablet_schema(tablet_schema),
_column_ids(column_ids) {
+ // Build the path to footer index mapping during initialization
+ for (size_t i = 0; i < _footer->columns_size(); ++i) {
+ const auto& column = _footer->columns(i);
+ // path that need to record stats
+ if (column.has_column_path_info() &&
+ column.column_path_info().parrent_column_unique_id() > 0) {
+
_path_to_footer_index[column.column_path_info().parrent_column_unique_id()]
+ [column.column_path_info().path()] = i;
+ }
+ }
+}
+
+// Updates the footer statistics for every variant sub column and sparse column
+// carried by `block`.
+//
+// Column i of `block` is described by _tablet_schema->column(_column_ids[i]).
+// Columns that have no path info, do not need stats, or have no positive parent
+// unique id are skipped. For the remaining ones the footer entry is looked up by
+// (parent unique id, column path) and updated from rows
+// [row_pos, row_pos + num_rows).
+//
+// Returns NotFound if the footer has no entry for the parent column or for the
+// column path; OK otherwise.
+Status VariantStatsCaculator::calculate_variant_stats(const vectorized::Block* block,
+                                                      size_t row_pos, size_t num_rows) {
+    for (size_t i = 0; i < block->columns(); ++i) {
+        const TabletColumn& tablet_column = _tablet_schema->column(_column_ids[i]);
+        // Only process sub columns and sparse columns during compaction
+        if (!tablet_column.has_path_info() ||
+            !tablet_column.path_info_ptr()->need_record_stats() ||
+            tablet_column.parent_unique_id() <= 0) {
+            continue;
+        }
+        const std::string& column_path = tablet_column.path_info_ptr()->get_path();
+
+        // Find the parent column in footer
+        const auto parent_it = _path_to_footer_index.find(tablet_column.parent_unique_id());
+        if (parent_it == _path_to_footer_index.end()) {
+            return Status::NotFound("Column path not found in footer: {}", column_path);
+        }
+        // Look the path up with find(): operator[] would default-insert index 0 for an
+        // unknown path and the stats would silently be written into footer column 0.
+        const auto path_it = parent_it->second.find(column_path);
+        if (path_it == parent_it->second.end()) {
+            return Status::NotFound("Column path not found in footer: {}", column_path);
+        }
+        ColumnMetaPB* column_meta = _footer->mutable_columns(static_cast<int>(path_it->second));
+
+        // Get the column from the block
+        const auto& column = block->get_by_position(i).column;
+
+        // Check if this is a sparse column or sub column
+        if (column_path.ends_with("__DORIS_VARIANT_SPARSE__")) {
+            // This is a sparse column from variant column
+            _calculate_sparse_column_stats(*column, column_meta, row_pos, num_rows);
+        } else {
+            // This is a sub column from variant column
+            _calculate_sub_column_stats(*column, column_meta, row_pos, num_rows);
+        }
+    }
+    return Status::OK();
+}
+
+void VariantStatsCaculator::_calculate_sparse_column_stats(const
vectorized::IColumn& column,
+ ColumnMetaPB*
column_meta,
+ size_t row_pos,
size_t num_rows) {
+ // Get or create variant statistics
+ VariantStatisticsPB* stats = column_meta->mutable_variant_statistics();
+
+ // Use the same logic as the original calculate_variant_stats function
+ vectorized::schema_util::calculate_variant_stats(column, stats, row_pos,
num_rows);
+
+ VLOG_DEBUG << "Sparse column stats updated, non-null size count: "
+ << stats->sparse_column_non_null_size_size();
+}
+
+void VariantStatsCaculator::_calculate_sub_column_stats(const
vectorized::IColumn& column,
+ ColumnMetaPB*
column_meta, size_t row_pos,
+ size_t num_rows) {
+ // For sub columns, we need to calculate the non-null count
+ const auto& nullable_column = assert_cast<const
vectorized::ColumnNullable&>(column);
+ const auto& null_data = nullable_column.get_null_map_data();
+ const int8_t* start = reinterpret_cast<const int8_t*>(null_data.data()) +
row_pos;
+
+ // Count non-null values in the current block
+ size_t current_non_null_count = simd::count_zero_num(start, num_rows);
+
+ // Add to existing non-null count
+ column_meta->set_none_null_size(current_non_null_count +
column_meta->none_null_size());
+
+ VLOG_DEBUG << "Sub column non-null count updated: " <<
column_meta->none_null_size()
+ << " (added " << current_non_null_count << " from current
block)";
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/variant_stats_calculator.h
b/be/src/olap/rowset/segment_v2/variant_stats_calculator.h
new file mode 100644
index 00000000000..6ffd74036cb
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/variant_stats_calculator.h
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+#include "gen_cpp/segment_v2.pb.h"
+#include "olap/tablet_schema.h"
+#include "vec/core/block.h"
+
+namespace doris::segment_v2 {
+
+class VariantStatsCaculator {
+public:
+ explicit VariantStatsCaculator(SegmentFooterPB* footer, TabletSchemaSPtr
tablet_schema,
+ const std::vector<uint32_t>& column_ids);
+
+ // Update the footer stats of each variant sub/sparse column in block for rows [row_pos, row_pos + num_rows)
+ Status calculate_variant_stats(const vectorized::Block* block, size_t
row_pos, size_t num_rows);
+
+private:
+ // Map from parent column unique id to (column path -> footer column index) for fast lookup
+ std::unordered_map<int32_t, std::unordered_map<std::string, size_t>>
_path_to_footer_index;
+
+ // Raw pointer to the footer where we store the statistics
+ SegmentFooterPB* _footer;
+ TabletSchemaSPtr _tablet_schema;
+ std::vector<uint32_t> _column_ids;
+
+ // Helper: record the per-path counts of a sparse column into column_meta's variant_statistics
+ void _calculate_sparse_column_stats(const vectorized::IColumn& column,
+ ColumnMetaPB* column_meta, size_t
row_pos, size_t num_rows);
+
+ // Helper: add the non-null row count of a nullable sub column to column_meta's none_null_size
+ void _calculate_sub_column_stats(const vectorized::IColumn& column,
ColumnMetaPB* column_meta,
+ size_t row_pos, size_t num_rows);
+};
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/vec/common/schema_util.cpp
b/be/src/vec/common/schema_util.cpp
index 79a21f638a5..edc016d5138 100644
--- a/be/src/vec/common/schema_util.cpp
+++ b/be/src/vec/common/schema_util.cpp
@@ -1031,48 +1031,48 @@ Status aggregate_variant_extended_info(
// return Status::OK();
// }
//
-// // Calculate statistics about variant data paths from the encoded sparse
column
-// void calculate_variant_stats(const IColumn& encoded_sparse_column,
-// segment_v2::VariantStatisticsPB* stats, size_t
row_pos,
-// size_t num_rows) {
-// // Cast input column to ColumnMap type since sparse column is stored as
a map
-// const auto& map_column = assert_cast<const
ColumnMap&>(encoded_sparse_column);
-//
-// // Get the keys column which contains the paths as strings
-// const auto& sparse_data_paths =
-// assert_cast<const
ColumnString*>(map_column.get_keys_ptr().get());
-// const auto& serialized_sparse_column_offsets =
-// assert_cast<const
ColumnArray::Offsets64&>(map_column.get_offsets());
-// auto& count_map = *stats->mutable_sparse_column_non_null_size();
-// // Iterate through all paths in the sparse column
-// for (size_t i = row_pos; i != row_pos + num_rows; ++i) {
-// size_t offset = serialized_sparse_column_offsets[i - 1];
-// size_t end = serialized_sparse_column_offsets[i];
-// for (size_t j = offset; j != end; ++j) {
-// auto path = sparse_data_paths->get_data_at(j);
-//
-// const auto& sparse_path = path.to_string();
-// // If path already exists in statistics, increment its count
-// if (auto it = count_map.find(sparse_path); it !=
count_map.end()) {
-// ++it->second;
-// }
-// // If path doesn't exist and we haven't hit the max statistics
size limit,
-// // add it with count 1
-// else if (count_map.size() <
config::variant_max_sparse_column_statistics_size) {
-// count_map.emplace(sparse_path, 1);
-// }
-// }
-// }
-//
-// if (stats->sparse_column_non_null_size().size() >
-// config::variant_max_sparse_column_statistics_size) {
-// throw doris::Exception(
-// ErrorCode::INTERNAL_ERROR,
-// "Sparse column non null size: {} is greater than max
statistics size: {}",
-// stats->sparse_column_non_null_size().size(),
-// config::variant_max_sparse_column_statistics_size);
-// }
-// }
+// For rows [row_pos, row_pos + num_rows) of the encoded sparse column, count how often each key path occurs and add the counts to stats->sparse_column_non_null_size
+void calculate_variant_stats(const IColumn& encoded_sparse_column,
+ segment_v2::VariantStatisticsPB* stats, size_t
row_pos,
+ size_t num_rows) {
+ // Cast input column to ColumnMap type since sparse column is stored as a
map
+ const auto& map_column = assert_cast<const
ColumnMap&>(encoded_sparse_column);
+
+ // Get the keys column which contains the paths as strings
+ const auto& sparse_data_paths =
+ assert_cast<const ColumnString*>(map_column.get_keys_ptr().get());
+ const auto& serialized_sparse_column_offsets =
+ assert_cast<const
ColumnArray::Offsets64&>(map_column.get_offsets());
+ auto& count_map = *stats->mutable_sparse_column_non_null_size();
+ // Iterate through all paths in the sparse column. NOTE(review): for i == 0 this reads offsets[i - 1]; presumably the offsets array is left-padded so that it reads as 0 - confirm
+ for (size_t i = row_pos; i != row_pos + num_rows; ++i) {
+ size_t offset = serialized_sparse_column_offsets[i - 1];
+ size_t end = serialized_sparse_column_offsets[i];
+ for (size_t j = offset; j != end; ++j) {
+ auto path = sparse_data_paths->get_data_at(j);
+
+ const auto& sparse_path = path.to_string();
+ // If path already exists in statistics, increment its count
+ if (auto it = count_map.find(sparse_path); it != count_map.end()) {
+ ++it->second;
+ }
+ // If path doesn't exist and we haven't hit the max statistics
size limit,
+ // add it with count 1 (a new path seen after the limit is reached is not counted)
+ else if (count_map.size() <
config::variant_max_sparse_column_statistics_size) {
+ count_map.emplace(sparse_path, 1);
+ }
+ }
+ }
+
+ if (stats->sparse_column_non_null_size().size() >
+ config::variant_max_sparse_column_statistics_size) {
+ throw doris::Exception(
+ ErrorCode::INTERNAL_ERROR,
+ "Sparse column non null size: {} is greater than max
statistics size: {}",
+ stats->sparse_column_non_null_size().size(),
+ config::variant_max_sparse_column_statistics_size);
+ }
+}
/// Calculates number of dimensions in array field.
/// Returns 0 for scalar fields.
diff --git a/be/src/vec/common/schema_util.h b/be/src/vec/common/schema_util.h
index 863f25be8fd..840b8bc4307 100644
--- a/be/src/vec/common/schema_util.h
+++ b/be/src/vec/common/schema_util.h
@@ -179,12 +179,11 @@ bool inherit_index(const std::vector<const TabletIndex*>&
parent_indexes,
// Status check_path_stats(const std::vector<RowsetSharedPtr>& intputs,
RowsetSharedPtr output,
// BaseTabletSPtr tablet);
//
-// // Calculate statistics about variant data paths from the encoded sparse
column
-// void calculate_variant_stats(const IColumn& encoded_sparse_column,
-// segment_v2::VariantStatisticsPB* stats, size_t
row_pos,
-// size_t num_rows);
-//
-//
+// Calculate statistics about variant data paths from the encoded sparse column
+void calculate_variant_stats(const IColumn& encoded_sparse_column,
+ segment_v2::VariantStatisticsPB* stats, size_t
row_pos,
+ size_t num_rows);
+
// bool generate_sub_column_info(const TabletSchema& schema, int32_t
col_unique_id,
// const std::string& path,
// TabletSchema::SubColumnInfo* sub_column_info);
diff --git a/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp
b/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp
new file mode 100644
index 00000000000..6591c799945
--- /dev/null
+++ b/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp
@@ -0,0 +1,448 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/variant_stats_calculator.h"
+
+#include <gtest/gtest.h>
+
+#include "gen_cpp/segment_v2.pb.h"
+#include "olap/tablet_schema.h"
+#include "vec/columns/column_array.h"
+#include "vec/columns/column_map.h"
+#include "vec/columns/column_nullable.h"
+#include "vec/columns/column_string.h"
+#include "vec/columns/column_vector.h"
+#include "vec/core/block.h"
+#include "vec/core/column_with_type_and_name.h"
+#include "vec/data_types/data_type_map.h"
+#include "vec/data_types/data_type_nullable.h"
+#include "vec/data_types/data_type_number.h"
+#include "vec/data_types/data_type_string.h"
+#include "vec/json/path_in_data.h"
+
+namespace doris::segment_v2 {
+
+class VariantStatsCalculatorTest : public ::testing::Test {
+protected:
+ void SetUp() override {
+ // Create a mock tablet schema
+ _tablet_schema = std::make_shared<TabletSchema>();
+
+ // Create a segment footer
+ _footer = std::make_unique<SegmentFooterPB>();
+ }
+
+ void TearDown() override {
+ _footer.reset();
+ _tablet_schema.reset();
+ }
+
+ // Helper method to create a mock column with path info
+ TabletColumn create_variant_column(int32_t unique_id, const std::string&
name,
+ int32_t parent_unique_id = -1,
+ const std::string& path = "") {
+ TabletColumn column;
+ column.set_unique_id(unique_id);
+ column.set_name(name);
+ column.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+
+ if (parent_unique_id > 0 && !path.empty()) {
+ vectorized::PathInData path_info(path);
+ column.set_path_info(path_info);
+ column.set_parent_unique_id(parent_unique_id);
+ }
+ column.set_variant_max_subcolumns_count(1);
+
+ return column;
+ }
+
+ // Helper method to create a footer column with path info
+ void add_footer_column_with_path(int32_t parent_unique_id, const
std::string& path) {
+ auto* column_meta = _footer->add_columns();
+ column_meta->set_unique_id(100 + _footer->columns_size());
+
+ auto* path_info = column_meta->mutable_column_path_info();
+ path_info->set_path(path);
+ path_info->set_parrent_column_unique_id(parent_unique_id);
+ }
+
+ // Helper method to create a nullable column for testing
+ vectorized::ColumnPtr create_nullable_column(const std::vector<bool>&
null_map,
+ const
std::vector<std::string>& values) {
+ auto string_column = vectorized::ColumnString::create();
+ auto null_column = vectorized::ColumnUInt8::create();
+
+ for (size_t i = 0; i < values.size(); ++i) {
+ if (null_map[i]) {
+ string_column->insert_default();
+ null_column->insert_value(1);
+ } else {
+ string_column->insert_data(values[i].data(),
values[i].length());
+ null_column->insert_value(0);
+ }
+ }
+
+ return vectorized::ColumnNullable::create(std::move(string_column),
std::move(null_column));
+ }
+
+ // Helper method to create a map column (sparse column)
+ vectorized::ColumnPtr create_map_column() {
+ auto keys = vectorized::ColumnString::create();
+ auto values = vectorized::ColumnString::create();
+ auto offsets = vectorized::ColumnArray::ColumnOffsets::create();
+
+ // Add some sample data
+ keys->insert_data("key1", 4);
+ values->insert_data("value1", 6);
+ keys->insert_data("key2", 4);
+ values->insert_data("value2", 6);
+
+ offsets->insert_value(0);
+ offsets->insert_value(2);
+
+ return vectorized::ColumnMap::create(std::move(keys),
std::move(values),
+ std::move(offsets));
+ }
+
+ TabletSchemaSPtr _tablet_schema;
+ std::unique_ptr<SegmentFooterPB> _footer;
+};
+
+TEST_F(VariantStatsCalculatorTest, ConstructorWithEmptyFooter) {
+ std::vector<uint32_t> column_ids = {0, 1, 2};
+
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Test with empty footer - should not crash
+ vectorized::Block block;
+ auto status = calculator.calculate_variant_stats(&block, 0, 0);
+ EXPECT_TRUE(status.ok());
+}
+
+TEST_F(VariantStatsCalculatorTest, ConstructorWithValidFooter) {
+ // Add some columns with path info to footer
+ add_footer_column_with_path(1, "sub_column_1");
+ add_footer_column_with_path(1, "sub_column_2.__DORIS_VARIANT_SPARSE__");
+ add_footer_column_with_path(2, "another_sub_column");
+
+ std::vector<uint32_t> column_ids = {0, 1, 2};
+
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Constructor should have built the path mapping
+ vectorized::Block block;
+ auto status = calculator.calculate_variant_stats(&block, 0, 0);
+ EXPECT_TRUE(status.ok());
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithNoVariantColumns) {
+ // Create tablet schema with regular columns (no variant columns)
+ TabletColumn regular_column;
+ regular_column.set_unique_id(1);
+ regular_column.set_name("regular_col");
+ regular_column.set_type(FieldType::OLAP_FIELD_TYPE_INT);
+
+ _tablet_schema->append_column(regular_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Create a simple block
+ vectorized::Block block;
+ auto int_column =
vectorized::ColumnVector<PrimitiveType::TYPE_INT>::create();
+ int_column->insert_value(123);
+ block.insert(
+ {std::move(int_column),
std::make_shared<vectorized::DataTypeInt32>(), "regular_col"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 1);
+ EXPECT_TRUE(status.ok());
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithSubColumn) {
+ // Setup footer with sub column
+ add_footer_column_with_path(1, "sub_column_1");
+
+ // Create variant sub column
+ TabletColumn sub_column =
+ create_variant_column(2, "variant_col.sub_column_1", 1,
"sub_column_1");
+ _tablet_schema->append_column(sub_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Create block with nullable column
+ vectorized::Block block;
+ auto nullable_column = create_nullable_column({false, true, false},
{"val1", "", "val3"});
+ block.insert({std::move(nullable_column),
+ std::make_shared<vectorized::DataTypeNullable>(
+ std::make_shared<vectorized::DataTypeString>()),
+ "sub_column_1"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 3);
+ EXPECT_TRUE(status.ok());
+
+ // Check that non-null size was updated
+ auto& column_meta = _footer->columns(0);
+ EXPECT_EQ(column_meta.none_null_size(), 2); // 2 non-null values
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithSparseColumn) {
+ // Setup footer with sparse column
+ add_footer_column_with_path(1, "sparse_col.__DORIS_VARIANT_SPARSE__");
+
+ // Create variant sparse column
+ TabletColumn sparse_column = create_variant_column(2,
"variant_col.__DORIS_VARIANT_SPARSE__", 1,
+
"sparse_col.__DORIS_VARIANT_SPARSE__");
+ _tablet_schema->append_column(sparse_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Create block with map column (sparse column)
+ vectorized::Block block;
+ auto map_column = create_map_column();
+ block.insert({std::move(map_column),
+ std::make_shared<vectorized::DataTypeMap>(
+ std::make_shared<vectorized::DataTypeString>(),
+ std::make_shared<vectorized::DataTypeString>()),
+ "sparse_column"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 1);
+ EXPECT_TRUE(status.ok());
+
+ // Check that variant statistics were updated
+ auto& column_meta = _footer->columns(0);
+ EXPECT_TRUE(column_meta.has_variant_statistics());
+}
+
+TEST_F(VariantStatsCalculatorTest,
CalculateVariantStatsWithMissingFooterEntry) {
+ // Create variant sub column but don't add corresponding footer entry
+ TabletColumn sub_column = create_variant_column(2,
"variant_col.missing_sub", 1, "missing_sub");
+ _tablet_schema->append_column(sub_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Create block with nullable column
+ vectorized::Block block;
+ auto nullable_column = create_nullable_column({false, true}, {"val1", ""});
+ block.insert({std::move(nullable_column),
+ std::make_shared<vectorized::DataTypeNullable>(
+ std::make_shared<vectorized::DataTypeString>()),
+ "missing_sub"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 2);
+ EXPECT_FALSE(status.ok());
+ EXPECT_TRUE(status.is<ErrorCode::NOT_FOUND>());
+}
+
+TEST_F(VariantStatsCalculatorTest,
CalculateVariantStatsWithMissingPathInFooter) {
+ // Setup footer with different path than what tablet schema has
+ add_footer_column_with_path(1, "different_path");
+
+ // Create variant sub column with non-matching path
+ TabletColumn sub_column =
+ create_variant_column(2, "variant_col.sub_column", 1111,
"sub_column");
+ _tablet_schema->append_column(sub_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Create block with nullable column
+ vectorized::Block block;
+ auto nullable_column = create_nullable_column({false}, {"val1"});
+ block.insert({std::move(nullable_column),
+ std::make_shared<vectorized::DataTypeNullable>(
+ std::make_shared<vectorized::DataTypeString>()),
+ "sub_column"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 1);
+ EXPECT_FALSE(status.ok()) << status.to_string();
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithMultipleColumns) {
+ // Setup footer with multiple columns
+ add_footer_column_with_path(1, "sub1");
+ add_footer_column_with_path(1, "sub2.__DORIS_VARIANT_SPARSE__");
+ add_footer_column_with_path(2, "another_sub");
+
+ // Create multiple variant columns
+ TabletColumn sub1 = create_variant_column(2, "variant.sub1", 1, "sub1");
+ TabletColumn sparse = create_variant_column(3,
"variant.__DORIS_VARIANT_SPARSE__", 1,
+
"sub2.__DORIS_VARIANT_SPARSE__");
+ TabletColumn sub2 = create_variant_column(4, "variant2.another_sub", 2,
"another_sub");
+
+ _tablet_schema->append_column(sub1);
+ _tablet_schema->append_column(sparse);
+ _tablet_schema->append_column(sub2);
+
+ std::vector<uint32_t> column_ids = {0, 1, 2};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Create block with multiple columns
+ vectorized::Block block;
+
+ auto nullable_col1 = create_nullable_column({false, true, false}, {"a",
"", "c"});
+ block.insert({std::move(nullable_col1),
+ std::make_shared<vectorized::DataTypeNullable>(
+ std::make_shared<vectorized::DataTypeString>()),
+ "sub1"});
+
+ auto map_col = create_map_column();
+ map_col->assume_mutable()->insert_many_defaults(3);
+ block.insert({std::move(map_col),
+ std::make_shared<vectorized::DataTypeMap>(
+ std::make_shared<vectorized::DataTypeString>(),
+ std::make_shared<vectorized::DataTypeString>()),
+ "sparse"});
+
+ auto nullable_col2 = create_nullable_column({true, false, true}, {"", "x",
""});
+ block.insert({std::move(nullable_col2),
+ std::make_shared<vectorized::DataTypeNullable>(
+ std::make_shared<vectorized::DataTypeString>()),
+ "another_sub"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 3);
+ EXPECT_TRUE(status.ok());
+
+ // Check that statistics were updated for sub columns
+ EXPECT_EQ(_footer->columns(0).none_null_size(), 2); // sub1: 2
non-null
+ EXPECT_TRUE(_footer->columns(1).has_variant_statistics()); // sparse column
+ EXPECT_EQ(_footer->columns(2).none_null_size(), 1); // another_sub:
1 non-null
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithEmptyBlock) {
+ add_footer_column_with_path(1, "sub_column");
+
+ TabletColumn sub_column = create_variant_column(2, "variant.sub_column",
1, "sub_column");
+ _tablet_schema->append_column(sub_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Create empty block
+ vectorized::Block block;
+ auto empty_column = create_nullable_column({}, {});
+ block.insert({std::move(empty_column),
+ std::make_shared<vectorized::DataTypeNullable>(
+ std::make_shared<vectorized::DataTypeString>()),
+ "sub_column"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 0);
+ EXPECT_TRUE(status.ok());
+
+ // No change in statistics for empty block
+ EXPECT_EQ(_footer->columns(0).none_null_size(), 0);
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithAllNullValues) {
+ add_footer_column_with_path(1, "sub_column");
+
+ TabletColumn sub_column = create_variant_column(2, "variant.sub_column",
1, "sub_column");
+ _tablet_schema->append_column(sub_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ // Create block with all null values
+ vectorized::Block block;
+ auto nullable_column = create_nullable_column({true, true, true}, {"", "",
""});
+ block.insert({std::move(nullable_column),
+ std::make_shared<vectorized::DataTypeNullable>(
+ std::make_shared<vectorized::DataTypeString>()),
+ "sub_column"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 3);
+ EXPECT_TRUE(status.ok());
+
+ // All null values should result in 0 non-null count
+ EXPECT_EQ(_footer->columns(0).none_null_size(), 0);
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithNoPathInfo) {
+ // Create regular column without path info
+ TabletColumn regular_column;
+ regular_column.set_unique_id(1);
+ regular_column.set_name("regular");
+ regular_column.set_type(FieldType::OLAP_FIELD_TYPE_STRING);
+ // No path info set
+
+ _tablet_schema->append_column(regular_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ vectorized::Block block;
+ auto string_column = vectorized::ColumnString::create();
+ string_column->insert_data("test", 4);
+ block.insert(
+ {std::move(string_column),
std::make_shared<vectorized::DataTypeString>(), "regular"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 1);
+ EXPECT_TRUE(status.ok()); // Should skip columns without path info
+}
+
+TEST_F(VariantStatsCalculatorTest,
CalculateVariantStatsAccumulatesNonNullCount) {
+ add_footer_column_with_path(1, "sub_column");
+
+ // Set initial non-null count in footer
+ _footer->mutable_columns(0)->set_none_null_size(5);
+
+ TabletColumn sub_column = create_variant_column(2, "variant.sub_column",
1, "sub_column");
+ _tablet_schema->append_column(sub_column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ vectorized::Block block;
+ auto nullable_column = create_nullable_column({false, true, false}, {"a",
"", "c"});
+ block.insert({std::move(nullable_column),
+ std::make_shared<vectorized::DataTypeNullable>(
+ std::make_shared<vectorized::DataTypeString>()),
+ "sub_column"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 3);
+ EXPECT_TRUE(status.ok());
+
+ // Should accumulate: initial 5 + new 2 = 7
+ EXPECT_EQ(_footer->columns(0).none_null_size(), 7);
+}
+
+TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithExtendedSchema) {
+ add_footer_column_with_path(1, "sub_column");
+ TabletColumn column;
+ column.set_unique_id(1);
+ column.set_name("variant");
+ column.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
+ column.set_variant_max_subcolumns_count(0);
+ _tablet_schema->append_column(column);
+
+ std::vector<uint32_t> column_ids = {0};
+ VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
+
+ vectorized::Block block;
+ auto nullable_column = create_nullable_column({false, true, false}, {"a",
"", "c"});
+ block.insert({std::move(nullable_column),
+ std::make_shared<vectorized::DataTypeNullable>(
+ std::make_shared<vectorized::DataTypeString>()),
+ "sub_column"});
+
+ auto status = calculator.calculate_variant_stats(&block, 0, 3);
+ EXPECT_TRUE(status.ok());
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]