This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 980ad60a66b branch-3.1: [fix](variant) enhance max_sparse_column_statistics_size for variant #55124 (#55752)
980ad60a66b is described below
commit 980ad60a66baee21801a93b7e50523bf0967b470
Author: amory <[email protected]>
AuthorDate: Wed Sep 10 16:47:49 2025 +0800
branch-3.1: [fix](variant) enhance max_sparse_column_statistics_size for variant #55124 (#55752)
picked from #55124
---
be/src/common/config.cpp | 2 -
be/src/common/config.h | 3 -
be/src/common/consts.h | 1 +
.../segment_v2/variant/variant_column_reader.cpp | 26 ++++++--
.../segment_v2/variant/variant_column_reader.h | 3 +
.../segment_v2/variant_column_writer_impl.cpp | 2 +-
.../rowset/segment_v2/variant_stats_calculator.cpp | 16 ++++-
.../rowset/segment_v2/variant_stats_calculator.h | 4 +-
be/src/olap/tablet_meta.cpp | 4 ++
be/src/olap/tablet_schema.cpp | 6 ++
be/src/olap/tablet_schema.h | 13 ++++
be/src/vec/common/schema_util.cpp | 18 +++---
be/src/vec/common/schema_util.h | 3 +-
be/test/olap/rowset/segment_v2/mock/mock_segment.h | 3 +
.../variant_column_writer_reader_test.cpp | 8 +--
.../segment_v2/variant_stats_calculator_test.cpp | 41 +++++++++----
be/test/testutil/schema_utils.h | 1 +
be/test/vec/common/schema_util_test.cpp | 23 ++++---
.../java/org/apache/doris/catalog/ScalarType.java | 9 +++
.../java/org/apache/doris/catalog/VariantType.java | 24 +++++++-
.../main/java/org/apache/doris/catalog/Column.java | 6 ++
.../apache/doris/common/util/PropertyAnalyzer.java | 23 +++++++
.../doris/nereids/parser/LogicalPlanBuilder.java | 10 ++-
.../org/apache/doris/nereids/types/DataType.java | 3 +-
.../apache/doris/nereids/types/VariantType.java | 27 ++++++--
.../java/org/apache/doris/qe/SessionVariable.java | 14 +++++
.../apache/doris/common/PropertyAnalyzerTest.java | 28 +++++++++
.../org/apache/doris/persist/ScalarTypeTest.java | 1 +
gensrc/proto/olap_file.proto | 2 +
gensrc/thrift/Descriptors.thrift | 1 +
.../cloud_p0/conf/regression-conf-custom.groovy | 1 +
.../pipeline/p0/conf/regression-conf.groovy | 3 +-
...est_variant_compaction_with_sparse_limit.groovy | 71 ++++++++++++++++++----
...est_variant_compaction_with_sparse_limit.groovy | 21 +++----
34 files changed, 337 insertions(+), 84 deletions(-)
diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index c9e63c54de6..c2f69661741 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1338,8 +1338,6 @@ DEFINE_Bool(enable_snapshot_action, "false");
DEFINE_mInt32(variant_max_merged_tablet_schema_size, "2048");
-DEFINE_mInt32(variant_max_sparse_column_statistics_size, "10000");
-
DEFINE_mBool(enable_column_type_check, "true");
// 128 MB
DEFINE_mInt64(local_exchange_buffer_mem_limit, "134217728");
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 6f214361524..8784bd71f1d 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1410,9 +1410,6 @@ DECLARE_Bool(enable_snapshot_action);
// The max columns size for a tablet schema
DECLARE_mInt32(variant_max_merged_tablet_schema_size);
-// The max sparse column statistics size for a variant column
-DECLARE_mInt32(variant_max_sparse_column_statistics_size);
-
DECLARE_mInt64(local_exchange_buffer_mem_limit);
DECLARE_mInt64(enable_debug_log_timeout_secs);
diff --git a/be/src/common/consts.h b/be/src/common/consts.h
index 2ec9ae12679..32b4b1e7fa4 100644
--- a/be/src/common/consts.h
+++ b/be/src/common/consts.h
@@ -46,5 +46,6 @@ static constexpr int MAX_DECIMALV2_SCALE = 9;
static constexpr int MAX_DECIMALV3_PRECISION = MAX_DECIMAL256_PRECISION;
static constexpr int MAX_DECIMALV3_SCALE = MAX_DECIMALV3_PRECISION;
+static constexpr int DEFAULT_VARIANT_MAX_SPARSE_COLUMN_STATS_SIZE = 10000;
} // namespace BeConsts
} // namespace doris
diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp
b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp
index ec0079edc64..f9a2b21958e 100644
--- a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.cpp
@@ -74,9 +74,20 @@ bool VariantColumnReader::exist_in_sparse_column(
}
bool VariantColumnReader::is_exceeded_sparse_column_limit() const {
- return !_statistics->sparse_column_non_null_size.empty() &&
- _statistics->sparse_column_non_null_size.size() >=
- config::variant_max_sparse_column_statistics_size;
+ bool exceeded_sparse_column_limit =
!_statistics->sparse_column_non_null_size.empty() &&
+
_statistics->sparse_column_non_null_size.size() >=
+
_variant_sparse_column_statistics_size;
+ DBUG_EXECUTE_IF("exceeded_sparse_column_limit_must_be_false", {
+ if (exceeded_sparse_column_limit) {
+ throw doris::Exception(
+ ErrorCode::INTERNAL_ERROR,
+ "exceeded_sparse_column_limit_must_be_false,
sparse_column_non_null_size: {} : "
+ " _variant_sparse_column_statistics_size: {}",
+ _statistics->sparse_column_non_null_size.size(),
+ _variant_sparse_column_statistics_size);
+ }
+ })
+ return exceeded_sparse_column_limit;
}
int64_t VariantColumnReader::get_metadata_size() const {
@@ -308,9 +319,7 @@ Status
VariantColumnReader::new_iterator(ColumnIteratorUPtr* iterator,
// Otherwise the prefix is not exist and the sparse column size is reached
limit
// which means the path maybe exist in sparse_column
- bool exceeded_sparse_column_limit =
!_statistics->sparse_column_non_null_size.empty() &&
-
_statistics->sparse_column_non_null_size.size() >=
-
config::variant_max_sparse_column_statistics_size;
+ bool exceeded_sparse_column_limit = is_exceeded_sparse_column_limit();
// If the variant column has extracted columns and is a compaction reader,
then read flat leaves
// Otherwise read hierarchical data, since the variant subcolumns are
flattened in schema_util::get_compaction_schema
@@ -391,6 +400,11 @@ Status VariantColumnReader::init(const
ColumnReaderOptions& opts, const SegmentF
_statistics = std::make_unique<VariantStatistics>();
const ColumnMetaPB& self_column_pb = footer.columns(column_id);
const auto& parent_index =
opts.tablet_schema->inverted_indexs(self_column_pb.unique_id());
+ // record variant_sparse_column_statistics_size from parent column
+ _variant_sparse_column_statistics_size =
+ opts.tablet_schema->column_by_uid(self_column_pb.unique_id())
+ .variant_max_sparse_column_statistics_size();
+
for (int32_t ordinal = 0; ordinal < footer.columns_size(); ++ordinal) {
const ColumnMetaPB& column_pb = footer.columns(ordinal);
// Find all columns belonging to the current variant column
diff --git a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h
b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h
index 21edf5c50bd..f22809eed52 100644
--- a/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h
+++ b/be/src/olap/rowset/segment_v2/variant/variant_column_reader.h
@@ -116,6 +116,9 @@ private:
std::unique_ptr<VariantStatistics> _statistics;
// key: subcolumn path, value: subcolumn indexes
std::unordered_map<std::string, TabletIndexes> _variant_subcolumns_indexes;
+ // variant_sparse_column_statistics_size
+ size_t _variant_sparse_column_statistics_size =
+ BeConsts::DEFAULT_VARIANT_MAX_SPARSE_COLUMN_STATS_SIZE;
};
class VariantRootColumnIterator : public ColumnIterator {
diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
index 47890f75d04..6cb59d186da 100644
--- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
+++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
@@ -349,7 +349,7 @@ Status VariantColumnWriterImpl::_process_sparse_column(
it != sparse_data_paths_statistics.end()) {
++it->second;
} else if (sparse_data_paths_statistics.size() <
- config::variant_max_sparse_column_statistics_size) {
+
_tablet_column->variant_max_sparse_column_statistics_size()) {
sparse_data_paths_statistics.emplace(path, 1);
}
}
diff --git a/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp
b/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp
index aef71372666..168efa547ec 100644
--- a/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp
+++ b/be/src/olap/rowset/segment_v2/variant_stats_calculator.cpp
@@ -17,6 +17,8 @@
#include "olap/rowset/segment_v2/variant_stats_calculator.h"
+#include <gen_cpp/segment_v2.pb.h>
+
#include "common/logging.h"
#include "util/simd/bits.h"
#include "vec/columns/column_nullable.h"
@@ -67,7 +69,13 @@ Status VariantStatsCaculator::calculate_variant_stats(const
vectorized::Block* b
// Check if this is a sparse column or sub column
if (column_path.ends_with("__DORIS_VARIANT_SPARSE__")) {
// This is a sparse column from variant column
- _calculate_sparse_column_stats(*column, column_meta, row_pos,
num_rows);
+ // get variant_max_sparse_column_statistics_size from
tablet_schema
+ size_t variant_max_sparse_column_statistics_size =
+
_tablet_schema->column_by_uid(tablet_column.parent_unique_id())
+ .variant_max_sparse_column_statistics_size();
+ _calculate_sparse_column_stats(*column, column_meta,
+
variant_max_sparse_column_statistics_size, row_pos,
+ num_rows);
} else {
// This is a sub column from variant column
_calculate_sub_column_stats(*column, column_meta, row_pos,
num_rows);
@@ -79,12 +87,14 @@ Status VariantStatsCaculator::calculate_variant_stats(const
vectorized::Block* b
void VariantStatsCaculator::_calculate_sparse_column_stats(const
vectorized::IColumn& column,
ColumnMetaPB*
column_meta,
+ size_t
max_sparse_column_statistics_size,
size_t row_pos,
size_t num_rows) {
// Get or create variant statistics
VariantStatisticsPB* stats = column_meta->mutable_variant_statistics();
// Use the same logic as the original calculate_variant_stats function
- vectorized::schema_util::calculate_variant_stats(column, stats, row_pos,
num_rows);
+ vectorized::schema_util::calculate_variant_stats(
+ column, stats, max_sparse_column_statistics_size, row_pos,
num_rows);
VLOG_DEBUG << "Sparse column stats updated, non-null size count: "
<< stats->sparse_column_non_null_size_size();
@@ -108,4 +118,4 @@ void
VariantStatsCaculator::_calculate_sub_column_stats(const vectorized::IColum
<< " (added " << current_non_null_count << " from current
block)";
}
-} // namespace doris::segment_v2
\ No newline at end of file
+} // namespace doris::segment_v2
diff --git a/be/src/olap/rowset/segment_v2/variant_stats_calculator.h
b/be/src/olap/rowset/segment_v2/variant_stats_calculator.h
index 6ffd74036cb..221c45b781d 100644
--- a/be/src/olap/rowset/segment_v2/variant_stats_calculator.h
+++ b/be/src/olap/rowset/segment_v2/variant_stats_calculator.h
@@ -45,7 +45,9 @@ private:
// Helper method to calculate sparse column statistics
void _calculate_sparse_column_stats(const vectorized::IColumn& column,
- ColumnMetaPB* column_meta, size_t
row_pos, size_t num_rows);
+ ColumnMetaPB* column_meta,
+ size_t
max_sparse_column_statistics_size, size_t row_pos,
+ size_t num_rows);
// Helper method to calculate sub column statistics
void _calculate_sub_column_stats(const vectorized::IColumn& column,
ColumnMetaPB* column_meta,
diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp
index 3352460fddb..0a246482788 100644
--- a/be/src/olap/tablet_meta.cpp
+++ b/be/src/olap/tablet_meta.cpp
@@ -491,6 +491,10 @@ void TabletMeta::init_column_from_tcolumn(uint32_t
unique_id, const TColumn& tco
column->set_variant_enable_typed_paths_to_sparse(
tcolumn.variant_enable_typed_paths_to_sparse);
}
+ if (tcolumn.__isset.variant_max_sparse_column_statistics_size) {
+ column->set_variant_max_sparse_column_statistics_size(
+ tcolumn.variant_max_sparse_column_statistics_size);
+ }
}
void TabletMeta::remove_rowset_delete_bitmap(const RowsetId& rowset_id, const
Version& version) {
diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp
index 0aa95b1bbeb..255e15546af 100644
--- a/be/src/olap/tablet_schema.cpp
+++ b/be/src/olap/tablet_schema.cpp
@@ -613,6 +613,10 @@ void TabletColumn::init_from_pb(const ColumnPB& column) {
if (column.has_variant_max_subcolumns_count()) {
_variant_max_subcolumns_count = column.variant_max_subcolumns_count();
}
+ if (column.has_variant_max_sparse_column_statistics_size()) {
+ _variant_max_sparse_column_statistics_size =
+ column.variant_max_sparse_column_statistics_size();
+ }
if (column.has_pattern_type()) {
_pattern_type = column.pattern_type();
}
@@ -704,6 +708,8 @@ void TabletColumn::to_schema_pb(ColumnPB* column) const {
column->set_variant_max_subcolumns_count(_variant_max_subcolumns_count);
column->set_pattern_type(_pattern_type);
column->set_variant_enable_typed_paths_to_sparse(_variant_enable_typed_paths_to_sparse);
+ column->set_variant_max_sparse_column_statistics_size(
+ _variant_max_sparse_column_statistics_size);
}
void TabletColumn::add_sub_column(TabletColumn& sub_column) {
diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h
index ac8da118734..ad4897adc9b 100644
--- a/be/src/olap/tablet_schema.h
+++ b/be/src/olap/tablet_schema.h
@@ -224,6 +224,12 @@ public:
void set_variant_max_subcolumns_count(int32_t
variant_max_subcolumns_count) {
_variant_max_subcolumns_count = variant_max_subcolumns_count;
}
+
+ void set_variant_max_sparse_column_statistics_size(
+ int32_t variant_max_sparse_column_statistics_size) {
+ _variant_max_sparse_column_statistics_size =
variant_max_sparse_column_statistics_size;
+ }
+
int32_t variant_max_subcolumns_count() const { return
_variant_max_subcolumns_count; }
void set_variant_enable_typed_paths_to_sparse(bool
variant_enable_typed_paths_to_sparse) {
@@ -234,6 +240,10 @@ public:
return _variant_enable_typed_paths_to_sparse;
}
+ int32_t variant_max_sparse_column_statistics_size() const {
+ return _variant_max_sparse_column_statistics_size;
+ }
+
private:
int32_t _unique_id = -1;
std::string _col_name;
@@ -286,6 +296,9 @@ private:
int32_t _variant_max_subcolumns_count = 0;
PatternTypePB _pattern_type = PatternTypePB::MATCH_NAME_GLOB;
bool _variant_enable_typed_paths_to_sparse = false;
+ // set variant_max_sparse_column_statistics_size
+ int32_t _variant_max_sparse_column_statistics_size =
+ BeConsts::DEFAULT_VARIANT_MAX_SPARSE_COLUMN_STATS_SIZE;
};
bool operator==(const TabletColumn& a, const TabletColumn& b);
diff --git a/be/src/vec/common/schema_util.cpp
b/be/src/vec/common/schema_util.cpp
index 68066cc840d..aa21db80d29 100644
--- a/be/src/vec/common/schema_util.cpp
+++ b/be/src/vec/common/schema_util.cpp
@@ -889,7 +889,9 @@ Status check_path_stats(const std::vector<RowsetSharedPtr>&
intputs, RowsetShare
// In input rowsets, some rowsets may have statistics values exceeding
the maximum limit,
// which leads to inaccurate statistics
- if (stats.size() > config::variant_max_sparse_column_statistics_size) {
+ if (stats.size() > output->tablet_schema()
+ ->column_by_uid(uid)
+
.variant_max_sparse_column_statistics_size()) {
// When there is only one segment, we can ensure that the size of
each path in output stats is accurate
if (output->num_segments() == 1) {
for (const auto& [path, size] : stats) {
@@ -1014,7 +1016,8 @@ void
get_compaction_subcolumns(TabletSchema::PathsSetInfo& paths_set_info,
VLOG_DEBUG << "append typed column " << subpath;
} else if (find_data_types == path_to_data_types.end() ||
find_data_types->second.empty() ||
sparse_paths.find(std::string(subpath)) !=
sparse_paths.end() ||
- sparse_paths.size() >=
config::variant_max_sparse_column_statistics_size) {
+ sparse_paths.size() >=
+
parent_column->variant_max_sparse_column_statistics_size()) {
TabletColumn subcolumn;
subcolumn.set_name(column_name);
subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT);
@@ -1111,7 +1114,8 @@ Status get_compaction_schema(const
std::vector<RowsetSharedPtr>& rowsets,
// Calculate statistics about variant data paths from the encoded sparse column
void calculate_variant_stats(const IColumn& encoded_sparse_column,
- segment_v2::VariantStatisticsPB* stats, size_t
row_pos,
+ segment_v2::VariantStatisticsPB* stats,
+ size_t max_sparse_column_statistics_size, size_t
row_pos,
size_t num_rows) {
// Cast input column to ColumnMap type since sparse column is stored as a
map
const auto& map_column = assert_cast<const
ColumnMap&>(encoded_sparse_column);
@@ -1136,19 +1140,17 @@ void calculate_variant_stats(const IColumn&
encoded_sparse_column,
}
// If path doesn't exist and we haven't hit the max statistics
size limit,
// add it with count 1
- else if (count_map.size() <
config::variant_max_sparse_column_statistics_size) {
+ else if (count_map.size() < max_sparse_column_statistics_size) {
count_map.emplace(sparse_path, 1);
}
}
}
- if (stats->sparse_column_non_null_size().size() >
- config::variant_max_sparse_column_statistics_size) {
+ if (stats->sparse_column_non_null_size().size() >
max_sparse_column_statistics_size) {
throw doris::Exception(
ErrorCode::INTERNAL_ERROR,
"Sparse column non null size: {} is greater than max
statistics size: {}",
- stats->sparse_column_non_null_size().size(),
- config::variant_max_sparse_column_statistics_size);
+ stats->sparse_column_non_null_size().size(),
max_sparse_column_statistics_size);
}
}
diff --git a/be/src/vec/common/schema_util.h b/be/src/vec/common/schema_util.h
index fc5698bf966..ab7fcec2b15 100644
--- a/be/src/vec/common/schema_util.h
+++ b/be/src/vec/common/schema_util.h
@@ -172,7 +172,8 @@ Status check_path_stats(const std::vector<RowsetSharedPtr>&
intputs, RowsetShare
// Calculate statistics about variant data paths from the encoded sparse column
void calculate_variant_stats(const IColumn& encoded_sparse_column,
- segment_v2::VariantStatisticsPB* stats, size_t
row_pos,
+ segment_v2::VariantStatisticsPB* stats,
+ size_t max_sparse_column_statistics_size, size_t
row_pos,
size_t num_rows);
void get_field_info(const Field& field, FieldInfo* info);
diff --git a/be/test/olap/rowset/segment_v2/mock/mock_segment.h
b/be/test/olap/rowset/segment_v2/mock/mock_segment.h
index 9cf443b2df0..7efe16e7586 100644
--- a/be/test/olap/rowset/segment_v2/mock/mock_segment.h
+++ b/be/test/olap/rowset/segment_v2/mock/mock_segment.h
@@ -49,6 +49,9 @@ public:
// Helper methods for test setup
void add_column_uid_mapping(int32_t col_uid, int32_t footer_ordinal) {
+ _tablet_schema->_cols.push_back(std::make_shared<TabletColumn>());
+ _tablet_schema->_cols.back()->set_unique_id(col_uid);
+ _tablet_schema->_field_id_to_index[col_uid] = footer_ordinal;
_column_uid_to_footer_ordinal[col_uid] = footer_ordinal;
}
diff --git
a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
index 8495e7c4e0b..13ca9ebf4ab 100644
--- a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
+++ b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp
@@ -483,15 +483,15 @@ TEST_F(VariantColumnWriterReaderTest,
test_write_data_normal) {
// 13. check statistics size == limit
auto& variant_stats = variant_column_reader->_statistics;
EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() <
- config::variant_max_sparse_column_statistics_size);
- auto limit = config::variant_max_sparse_column_statistics_size -
+ variant_column_reader->_variant_sparse_column_statistics_size);
+ auto limit = variant_column_reader->_variant_sparse_column_statistics_size
-
variant_stats->sparse_column_non_null_size.size();
for (int i = 0; i < limit; ++i) {
std::string key = parent_column.name_lower_case() + ".key10" +
std::to_string(i);
variant_stats->sparse_column_non_null_size[key] = 10000;
}
EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() ==
- config::variant_max_sparse_column_statistics_size);
+ variant_column_reader->_variant_sparse_column_statistics_size);
EXPECT_TRUE(variant_column_reader->is_exceeded_sparse_column_limit());
ColumnIteratorUPtr it2;
@@ -2500,4 +2500,4 @@ TEST_F(VariantColumnWriterReaderTest,
test_read_with_checksum) {
}
}
-} // namespace doris
\ No newline at end of file
+} // namespace doris
diff --git a/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp
b/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp
index edbda054825..5fbb2ed514d 100644
--- a/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp
+++ b/be/test/olap/rowset/segment_v2/variant_stats_calculator_test.cpp
@@ -71,8 +71,10 @@ protected:
}
// Helper method to create a footer column with path info
- void add_footer_column_with_path(int32_t parent_unique_id, const
std::string& path) {
+ void add_footer_column_with_path(int32_t parent_unique_id, const
std::string& path,
+ uint32_t column_id = 0) {
auto* column_meta = _footer->add_columns();
+ column_meta->set_column_id(column_id);
column_meta->set_unique_id(100 + _footer->columns_size());
auto* path_info = column_meta->mutable_column_path_info();
@@ -202,19 +204,26 @@ TEST_F(VariantStatsCalculatorTest,
CalculateVariantStatsWithSubColumn) {
TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithSparseColumn) {
// Setup footer with sparse column
- add_footer_column_with_path(1, "sparse_col.__DORIS_VARIANT_SPARSE__");
+ add_footer_column_with_path(-1, "sparse_col");
+ add_footer_column_with_path(1, "sparse_col.__DORIS_VARIANT_SPARSE__", 1);
// Create variant sparse column
+ TabletColumn parent_column = create_variant_column(1, "variant_col", -1,
"sparse_col");
TabletColumn sparse_column = create_variant_column(2,
"variant_col.__DORIS_VARIANT_SPARSE__", 1,
"sparse_col.__DORIS_VARIANT_SPARSE__");
+ _tablet_schema->append_column(parent_column);
_tablet_schema->append_column(sparse_column);
- std::vector<uint32_t> column_ids = {0};
+ std::vector<uint32_t> column_ids = {0, 1};
VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
// Create block with map column (sparse column)
vectorized::Block block;
auto map_column = create_map_column();
+ auto string_column = vectorized::ColumnString::create();
+ // add parant column to block
+ block.insert({std::move(string_column),
std::make_shared<vectorized::DataTypeString>(),
+ "variant_column"});
block.insert({std::move(map_column),
std::make_shared<vectorized::DataTypeMap>(
std::make_shared<vectorized::DataTypeString>(),
@@ -225,7 +234,7 @@ TEST_F(VariantStatsCalculatorTest,
CalculateVariantStatsWithSparseColumn) {
EXPECT_TRUE(status.ok());
// Check that variant statistics were updated
- auto& column_meta = _footer->columns(0);
+ auto& column_meta = _footer->columns(1);
EXPECT_TRUE(column_meta.has_variant_statistics());
}
@@ -275,10 +284,15 @@ TEST_F(VariantStatsCalculatorTest,
CalculateVariantStatsWithMissingPathInFooter)
}
TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithMultipleColumns) {
+ // parent column
+ add_footer_column_with_path(-1, "variant");
+ TabletColumn parent_column = create_variant_column(1, "variant", -1,
"variant");
+ _tablet_schema->append_column(parent_column);
+
// Setup footer with multiple columns
- add_footer_column_with_path(1, "sub1");
- add_footer_column_with_path(1, "sub2.__DORIS_VARIANT_SPARSE__");
- add_footer_column_with_path(2, "another_sub");
+ add_footer_column_with_path(1, "sub1", 1);
+ add_footer_column_with_path(1, "sub2.__DORIS_VARIANT_SPARSE__", 2);
+ add_footer_column_with_path(2, "another_sub", 3);
// Create multiple variant columns
TabletColumn sub1 = create_variant_column(2, "variant.sub1", 1, "sub1");
@@ -290,12 +304,17 @@ TEST_F(VariantStatsCalculatorTest,
CalculateVariantStatsWithMultipleColumns) {
_tablet_schema->append_column(sparse);
_tablet_schema->append_column(sub2);
- std::vector<uint32_t> column_ids = {0, 1, 2};
+ std::vector<uint32_t> column_ids = {0, 1, 2, 3};
VariantStatsCaculator calculator(_footer.get(), _tablet_schema,
column_ids);
// Create block with multiple columns
vectorized::Block block;
+ // parent column
+ auto string_column = vectorized::ColumnString::create();
+ string_column->insert_data("test", 4);
+ block.insert({std::move(string_column),
std::make_shared<vectorized::DataTypeString>(),
+ "variant_column"});
auto nullable_col1 = create_nullable_column({false, true, false}, {"a",
"", "c"});
block.insert({std::move(nullable_col1),
std::make_shared<vectorized::DataTypeNullable>(
@@ -320,9 +339,9 @@ TEST_F(VariantStatsCalculatorTest,
CalculateVariantStatsWithMultipleColumns) {
EXPECT_TRUE(status.ok());
// Check that statistics were updated for sub columns
- EXPECT_EQ(_footer->columns(0).none_null_size(), 2); // sub1: 2
non-null
- EXPECT_TRUE(_footer->columns(1).has_variant_statistics()); // sparse column
- EXPECT_EQ(_footer->columns(2).none_null_size(), 1); // another_sub:
2 non-null
+ EXPECT_EQ(_footer->columns(1).none_null_size(), 2); // sub1: 2
non-null
+ EXPECT_TRUE(_footer->columns(2).has_variant_statistics()); // sparse column
+ EXPECT_EQ(_footer->columns(3).none_null_size(), 1); // another_sub:
2 non-null
}
TEST_F(VariantStatsCalculatorTest, CalculateVariantStatsWithEmptyBlock) {
diff --git a/be/test/testutil/schema_utils.h b/be/test/testutil/schema_utils.h
index 400d3fcd652..f294a86c462 100644
--- a/be/test/testutil/schema_utils.h
+++ b/be/test/testutil/schema_utils.h
@@ -34,6 +34,7 @@ public:
column_pb->set_is_nullable(is_nullable);
if (column_type == "VARIANT") {
column_pb->set_variant_max_subcolumns_count(variant_max_subcolumns_count);
+ column_pb->set_variant_max_sparse_column_statistics_size(10000);
}
}
diff --git a/be/test/vec/common/schema_util_test.cpp
b/be/test/vec/common/schema_util_test.cpp
index 5dcdf53df06..cb6e38cf007 100644
--- a/be/test/vec/common/schema_util_test.cpp
+++ b/be/test/vec/common/schema_util_test.cpp
@@ -343,7 +343,9 @@ TEST_F(SchemaUtilTest, calculate_variant_stats) {
construct_column_map_with_random_values(column_map, 200, 100,
"key_");
// calculate stats
- schema_util::calculate_variant_stats(*column_map, &stats, 0, 200);
+ size_t max_sparse_column_statistics_size = 10000;
+ schema_util::calculate_variant_stats(*column_map, &stats,
max_sparse_column_statistics_size, 0,
+ 200);
EXPECT_EQ(stats.sparse_column_non_null_size_size(),
key_value_counts.size());
for (const auto& kv : key_value_counts) {
@@ -356,7 +358,8 @@ TEST_F(SchemaUtilTest, calculate_variant_stats) {
column_map->clear();
const auto& key_value_counts2 =
construct_column_map_with_random_values(column_map, 3000, 100,
"key_");
- schema_util::calculate_variant_stats(*column_map, &stats, 0, 3000);
+ schema_util::calculate_variant_stats(*column_map, &stats,
max_sparse_column_statistics_size, 0,
+ 3000);
EXPECT_EQ(stats.sparse_column_non_null_size_size(), 3000);
for (const auto& [path, size] : stats.sparse_column_non_null_size()) {
@@ -372,11 +375,10 @@ TEST_F(SchemaUtilTest, calculate_variant_stats) {
// test with max size
column_map->clear();
const auto& key_value_counts3 = construct_column_map_with_random_values(
- column_map, config::variant_max_sparse_column_statistics_size, 5,
"key2_");
- schema_util::calculate_variant_stats(*column_map, &stats, 0,
-
config::variant_max_sparse_column_statistics_size);
- EXPECT_EQ(config::variant_max_sparse_column_statistics_size,
- stats.sparse_column_non_null_size_size());
+ column_map, max_sparse_column_statistics_size, 5, "key2_");
+ schema_util::calculate_variant_stats(*column_map, &stats,
max_sparse_column_statistics_size, 0,
+ max_sparse_column_statistics_size);
+ EXPECT_EQ(max_sparse_column_statistics_size,
stats.sparse_column_non_null_size_size());
for (const auto& [path, size] : stats.sparse_column_non_null_size()) {
auto first_size = key_value_counts.find(path) == key_value_counts.end()
@@ -1686,7 +1688,7 @@ TEST_F(SchemaUtilTest, get_compaction_subcolumns) {
variant.set_unique_id(30);
variant.set_variant_max_subcolumns_count(3);
variant.set_aggregation_method(FieldAggregationMethod::OLAP_FIELD_AGGREGATION_NONE);
-
+ variant.set_variant_max_sparse_column_statistics_size(10000);
TabletSchemaSPtr schema = std::make_shared<TabletSchema>();
schema->append_column(variant);
@@ -1743,7 +1745,7 @@ TEST_F(SchemaUtilTest, get_compaction_subcolumns) {
output_schema = std::make_shared<TabletSchema>();
sparse_paths.clear();
- for (int i = 0; i < config::variant_max_sparse_column_statistics_size + 1;
++i) {
+ for (int i = 0; i < variant.variant_max_sparse_column_statistics_size() +
1; ++i) {
sparse_paths.insert("dummy" + std::to_string(i));
}
schema_util::get_compaction_subcolumns(paths_set_info, parent_column,
schema,
@@ -1760,6 +1762,7 @@ TEST_F(SchemaUtilTest,
get_compaction_subcolumns_advanced) {
variant.set_variant_max_subcolumns_count(3);
variant.set_aggregation_method(FieldAggregationMethod::OLAP_FIELD_AGGREGATION_NONE);
variant.set_variant_enable_typed_paths_to_sparse(true);
+ variant.set_variant_max_sparse_column_statistics_size(10000);
TabletColumn subcolumn;
subcolumn.set_name("c");
subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_DATEV2);
@@ -1835,7 +1838,7 @@ TEST_F(SchemaUtilTest,
get_compaction_subcolumns_advanced) {
output_schema = std::make_shared<TabletSchema>();
sparse_paths.clear();
- for (int i = 0; i < config::variant_max_sparse_column_statistics_size + 1;
++i) {
+ for (int i = 0; i < variant.variant_max_sparse_column_statistics_size() +
1; ++i) {
sparse_paths.insert("dummy" + std::to_string(i));
}
schema_util::get_compaction_subcolumns(paths_set_info, parent_column,
schema,
diff --git
a/fe/fe-common/src/main/java/org/apache/doris/catalog/ScalarType.java
b/fe/fe-common/src/main/java/org/apache/doris/catalog/ScalarType.java
index 53790fbaa4f..e04c3c99b1a 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/catalog/ScalarType.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/catalog/ScalarType.java
@@ -1235,4 +1235,13 @@ public class ScalarType extends Type {
}
return false;
}
+
+ public int getVariantMaxSparseColumnStatisticsSize() {
+ // In the past, variant metadata used the ScalarType type.
+ // Now, we use VariantType, which inherits from ScalarType, as the new
metadata storage.
+ if (this instanceof VariantType) {
+ return ((VariantType)
this).getVariantMaxSparseColumnStatisticsSize();
+ }
+ return 0; // The old variant type had a default value of 0.
+ }
}
diff --git
a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java
b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java
index 29342d73ca7..e2a3d76da7f 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/catalog/VariantType.java
@@ -46,6 +46,9 @@ public class VariantType extends ScalarType {
@SerializedName(value = "enableTypedPathsToSparse")
private boolean enableTypedPathsToSparse = false;
+ @SerializedName(value = "variantMaxSparseColumnStatisticsSize")
+ private int variantMaxSparseColumnStatisticsSize = 10000;
+
private Map<String, String> properties = Maps.newHashMap();
public VariantType() {
@@ -53,6 +56,7 @@ public class VariantType extends ScalarType {
this.predefinedFields = Lists.newArrayList();
this.variantMaxSubcolumnsCount = 0;
this.enableTypedPathsToSparse = false;
+ this.variantMaxSparseColumnStatisticsSize = 10000;
}
public VariantType(ArrayList<VariantField> fields) {
@@ -81,7 +85,8 @@ public class VariantType extends ScalarType {
}
public VariantType(ArrayList<VariantField> fields, int
variantMaxSubcolumnsCount,
- boolean
enableTypedPathsToSparse) {
+ boolean
enableTypedPathsToSparse,
+ int
variantMaxSparseColumnStatisticsSize) {
super(PrimitiveType.VARIANT);
Preconditions.checkNotNull(fields);
this.predefinedFields = fields;
@@ -90,6 +95,7 @@ public class VariantType extends ScalarType {
}
this.variantMaxSubcolumnsCount = variantMaxSubcolumnsCount;
this.enableTypedPathsToSparse = enableTypedPathsToSparse;
+ this.variantMaxSparseColumnStatisticsSize =
variantMaxSparseColumnStatisticsSize;
}
@Override
@@ -103,7 +109,8 @@ public class VariantType extends ScalarType {
if (!predefinedFields.isEmpty()) {
sb.append(predefinedFields.stream()
.map(variantField ->
variantField.toSql(depth)).collect(Collectors.joining(",")));
- if (variantMaxSubcolumnsCount == 0 && !enableTypedPathsToSparse) {
+ if (variantMaxSubcolumnsCount == 0 && !enableTypedPathsToSparse
+ && variantMaxSparseColumnStatisticsSize == 10000) {
sb.append(">");
return sb.toString();
} else {
@@ -123,6 +130,11 @@ public class VariantType extends ScalarType {
sb.append("\"variant_enable_typed_paths_to_sparse\" = \"")
.append(String.valueOf(enableTypedPathsToSparse)).append("\"");
}
+ if (variantMaxSparseColumnStatisticsSize != 10000) {
+ sb.append(",");
+ sb.append("\"variant_max_sparse_column_statistics_size\" = \"")
+
.append(String.valueOf(variantMaxSparseColumnStatisticsSize)).append("\"");
+ }
sb.append(")>");
return sb.toString();
}
@@ -188,4 +200,12 @@ public class VariantType extends ScalarType {
public void setEnableTypedPathsToSparse(boolean enableTypedPathsToSparse) {
this.enableTypedPathsToSparse = enableTypedPathsToSparse;
}
+
+ public int getVariantMaxSparseColumnStatisticsSize() {
+ return variantMaxSparseColumnStatisticsSize;
+ }
+
+ public void setVariantMaxSparseColumnStatisticsSize(int
variantMaxSparseColumnStatisticsSize) {
+ this.variantMaxSparseColumnStatisticsSize =
variantMaxSparseColumnStatisticsSize;
+ }
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java
index ab6b423d976..01993b3714e 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Column.java
@@ -683,6 +683,7 @@ public class Column implements GsonPostProcessable {
}
tColumn.setClusterKeyId(this.clusterKeyId);
tColumn.setVariantEnableTypedPathsToSparse(this.getVariantEnableTypedPathsToSparse());
+
tColumn.setVariantMaxSparseColumnStatisticsSize(this.getVariantMaxSparseColumnStatisticsSize());
// ATTN:
// Currently, this `toThrift()` method is only used from
CreateReplicaTask.
// And CreateReplicaTask does not need `defineExpr` field.
@@ -899,6 +900,7 @@ public class Column implements GsonPostProcessable {
} else if (this.type.isVariantType()) {
builder.setVariantMaxSubcolumnsCount(this.getVariantMaxSubcolumnsCount());
builder.setVariantEnableTypedPathsToSparse(this.getVariantEnableTypedPathsToSparse());
+
builder.setVariantMaxSparseColumnStatisticsSize(this.getVariantMaxSparseColumnStatisticsSize());
// variant may contain predefined structured fields
addChildren(builder);
}
@@ -1290,6 +1292,10 @@ public class Column implements GsonPostProcessable {
this.realDefaultValue = refColumn.realDefaultValue;
}
+ public int getVariantMaxSparseColumnStatisticsSize() {
+ return type.isVariantType() ? ((ScalarType)
type).getVariantMaxSparseColumnStatisticsSize() : -1;
+ }
+
public String getExtraInfo() {
return extraInfo;
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java
b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java
index 35664e768e0..e50a40304be 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java
@@ -258,6 +258,9 @@ public class PropertyAnalyzer {
public static final String SM4 = "SM4";
public static final String PLAINTEXT = "PLAINTEXT";
+ public static final String
PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE =
+ "variant_max_sparse_column_statistics_size";
+
public enum RewriteType {
PUT, // always put property
REPLACE, // replace if exists property
@@ -1876,6 +1879,26 @@ public class PropertyAnalyzer {
return enableTypedPathsToSparse;
}
+ public static int analyzeVariantMaxSparseColumnStatisticsSize(Map<String,
String> properties, int defuatValue)
+
throws AnalysisException {
+ int maxSparseColumnStatisticsSize = defuatValue;
+ if (properties != null &&
properties.containsKey(PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE)) {
+ String maxSparseColumnStatisticsSizeStr =
+
properties.get(PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE);
+ try {
+ maxSparseColumnStatisticsSize =
Integer.parseInt(maxSparseColumnStatisticsSizeStr);
+ if (maxSparseColumnStatisticsSize < 0 ||
maxSparseColumnStatisticsSize > 50000) {
+ throw new
AnalysisException("variant_max_sparse_column_statistics_size must between 0 and
50000 ");
+ }
+ } catch (Exception e) {
+ throw new
AnalysisException("variant_max_sparse_column_statistics_size format error:" +
e.getMessage());
+ }
+
+
properties.remove(PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE);
+ }
+ return maxSparseColumnStatisticsSize;
+ }
+
public static TEncryptionAlgorithm analyzeTDEAlgorithm(Map<String, String>
properties) throws AnalysisException {
String name;
//if (properties == null ||
!properties.containsKey(PROPERTIES_TDE_ALGORITHM)) {
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
index 4bff83fb40f..1f3ff990004 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
@@ -3595,12 +3595,16 @@ public class LogicalPlanBuilder extends
DorisParserBaseVisitor<Object> {
ConnectContext.get().getSessionVariable().getDefaultVariantMaxSubcolumnsCount();
boolean enableTypedPathsToSparse = ConnectContext.get() == null ?
false :
ConnectContext.get().getSessionVariable().getDefaultEnableTypedPathsToSparse();
+ int variantMaxSparseColumnStatisticsSize = ConnectContext.get() ==
null ? 0 :
+
ConnectContext.get().getSessionVariable().getDefaultVariantMaxSparseColumnStatisticsSize();
try {
variantMaxSubcolumnsCount = PropertyAnalyzer
.analyzeVariantMaxSubcolumnsCount(properties, variantMaxSubcolumnsCount);
enableTypedPathsToSparse = PropertyAnalyzer
.analyzeEnableTypedPathsToSparse(properties, enableTypedPathsToSparse);
+ variantMaxSparseColumnStatisticsSize =
PropertyAnalyzer.analyzeVariantMaxSparseColumnStatisticsSize(
+ properties,
variantMaxSparseColumnStatisticsSize);
} catch (org.apache.doris.common.AnalysisException e) {
throw new NotSupportedException(e.getMessage());
}
@@ -3608,7 +3612,8 @@ public class LogicalPlanBuilder extends
DorisParserBaseVisitor<Object> {
if (!properties.isEmpty()) {
throw new NotSupportedException("only support for "
+
PropertyAnalyzer.PROPERTIES_VARIANT_ENABLE_TYPED_PATHS_TO_SPARSE
- + " and " +
PropertyAnalyzer.PROPERTIES_VARIANT_MAX_SUBCOLUMNS_COUNT);
+ + " and " +
PropertyAnalyzer.PROPERTIES_VARIANT_MAX_SUBCOLUMNS_COUNT
+ + " and " +
PropertyAnalyzer.PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE);
}
if (variantMaxSubcolumnsCount == 0 && !fields.isEmpty()) {
@@ -3616,7 +3621,8 @@ public class LogicalPlanBuilder extends
DorisParserBaseVisitor<Object> {
+ "when variant has fields, but got " +
variantMaxSubcolumnsCount);
}
- return new VariantType(fields, variantMaxSubcolumnsCount,
enableTypedPathsToSparse);
+ return new VariantType(fields, variantMaxSubcolumnsCount,
enableTypedPathsToSparse,
+ variantMaxSparseColumnStatisticsSize);
}
@Override
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java
index 0bc1fa127c5..6d1f87340af 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java
@@ -400,7 +400,8 @@ public abstract class DataType {
.collect(ImmutableList.toImmutableList());
return new VariantType(variantFields,
((org.apache.doris.catalog.VariantType)
type).getVariantMaxSubcolumnsCount(),
- ((org.apache.doris.catalog.VariantType)
type).getEnableTypedPathsToSparse());
+ ((org.apache.doris.catalog.VariantType)
type).getEnableTypedPathsToSparse(),
+ ((org.apache.doris.catalog.VariantType)
type).getVariantMaxSparseColumnStatisticsSize());
}
return new VariantType(0);
} else {
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java
index f30a328b5db..0d7ee104e2f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/VariantType.java
@@ -46,6 +46,8 @@ public class VariantType extends PrimitiveType {
private boolean enableTypedPathsToSparse = false;
+ private int variantMaxSparseColumnStatisticsSize = 10000;
+
private final List<VariantField> predefinedFields;
// No predefined fields
@@ -61,24 +63,27 @@ public class VariantType extends PrimitiveType {
this.predefinedFields =
ImmutableList.copyOf(Objects.requireNonNull(fields, "fields should not be
null"));
}
- public VariantType(List<VariantField> fields, int
variantMaxSubcolumnsCount, boolean enableTypedPathsToSparse) {
+ public VariantType(List<VariantField> fields, int
variantMaxSubcolumnsCount, boolean enableTypedPathsToSparse,
+ int variantMaxSparseColumnStatisticsSize) {
this.predefinedFields =
ImmutableList.copyOf(Objects.requireNonNull(fields, "fields should not be
null"));
this.variantMaxSubcolumnsCount = variantMaxSubcolumnsCount;
this.enableTypedPathsToSparse = enableTypedPathsToSparse;
+ this.variantMaxSparseColumnStatisticsSize =
variantMaxSparseColumnStatisticsSize;
}
@Override
public DataType conversion() {
return new
VariantType(predefinedFields.stream().map(VariantField::conversion)
- .collect(Collectors.toList()),
variantMaxSubcolumnsCount,
-
enableTypedPathsToSparse);
+ .collect(Collectors.toList()),
variantMaxSubcolumnsCount, enableTypedPathsToSparse,
+ variantMaxSparseColumnStatisticsSize);
}
@Override
public Type toCatalogDataType() {
org.apache.doris.catalog.VariantType type = new
org.apache.doris.catalog.VariantType(predefinedFields.stream()
.map(VariantField::toCatalogDataType)
- .collect(Collectors.toCollection(ArrayList::new)),
variantMaxSubcolumnsCount, enableTypedPathsToSparse);
+ .collect(Collectors.toCollection(ArrayList::new)),
variantMaxSubcolumnsCount, enableTypedPathsToSparse,
+ variantMaxSparseColumnStatisticsSize);
return type;
}
@@ -97,7 +102,8 @@ public class VariantType extends PrimitiveType {
sb.append("<");
if (!predefinedFields.isEmpty()) {
sb.append(predefinedFields.stream().map(VariantField::toSql).collect(Collectors.joining(",")));
- if (variantMaxSubcolumnsCount == 0 && !enableTypedPathsToSparse) {
+ if (variantMaxSubcolumnsCount == 0 && !enableTypedPathsToSparse
+ && variantMaxSparseColumnStatisticsSize == 10000) {
sb.append(">");
return sb.toString();
} else {
@@ -117,6 +123,12 @@ public class VariantType extends PrimitiveType {
sb.append("\"variant_enable_typed_paths_to_sparse\" = \"")
.append(String.valueOf(enableTypedPathsToSparse)).append("\"");
}
+ if (variantMaxSparseColumnStatisticsSize != 10000) {
+ sb.append(",");
+ sb.append("\"variant_max_sparse_column_statistics_size\" = \"")
+
.append(String.valueOf(variantMaxSparseColumnStatisticsSize))
+ .append("\"");
+ }
sb.append(")>");
return sb.toString();
}
@@ -132,6 +144,7 @@ public class VariantType extends PrimitiveType {
VariantType other = (VariantType) o;
return this.variantMaxSubcolumnsCount ==
other.variantMaxSubcolumnsCount
&& this.enableTypedPathsToSparse ==
other.enableTypedPathsToSparse
+ && this.variantMaxSparseColumnStatisticsSize ==
other.variantMaxSparseColumnStatisticsSize
&& Objects.equals(predefinedFields,
other.predefinedFields);
}
@@ -157,4 +170,8 @@ public class VariantType extends PrimitiveType {
public int getVariantMaxSubcolumnsCount() {
return variantMaxSubcolumnsCount;
}
+
+ public int getVariantMaxSparseColumnStatisticsSize() {
+ return variantMaxSparseColumnStatisticsSize;
+ }
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index c625573a4ab..5b959c8b981 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -748,6 +748,9 @@ public class SessionVariable implements Serializable,
Writable {
public static final String PREFER_UDF_OVER_BUILTIN =
"prefer_udf_over_builtin";
public static final String ENABLE_ADD_INDEX_FOR_NEW_DATA =
"enable_add_index_for_new_data";
+ public static final String
DEFAULT_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE =
+
"default_variant_max_sparse_column_statistics_size";
+
/**
* If set false, user couldn't submit analyze SQL and FE won't allocate
any related resources.
*/
@@ -2625,6 +2628,13 @@ public class SessionVariable implements Serializable,
Writable {
})
public boolean enableAddIndexForNewData = false;
+ @VariableMgr.VarAttr(
+ name = DEFAULT_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE,
+ needForward = true,
+ fuzzy = true
+ )
+ public int defaultVariantMaxSparseColumnStatisticsSize = 10000;
+
// If this fe is in fuzzy mode, then will use initFuzzyModeVariables to
generate some variables,
// not the default value set in the code.
@SuppressWarnings("checkstyle:Indentation")
@@ -4998,5 +5008,9 @@ public class SessionVariable implements Serializable,
Writable {
public boolean getDefaultEnableTypedPathsToSparse() {
return defaultEnableTypedPathsToSparse;
}
+
+ public int getDefaultVariantMaxSparseColumnStatisticsSize() {
+ return defaultVariantMaxSparseColumnStatisticsSize;
+ }
}
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java
b/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java
index ab7291eaf16..ddd813df376 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java
@@ -334,4 +334,32 @@ public class PropertyAnalyzerTest {
e.getMessage());
}
}
+
+ @Test
+ public void testAnalyzeVariantMaxSparseColumnStatisticsSize() throws
AnalysisException {
+ Map<String, String> properties = Maps.newHashMap();
+
properties.put(PropertyAnalyzer.PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE,
"-1");
+ try {
+
PropertyAnalyzer.analyzeVariantMaxSparseColumnStatisticsSize(properties, 0);
+ Assertions.fail("Expected AnalysisException was not thrown");
+ } catch (AnalysisException e) {
+ Assertions.assertNotNull(e.getMessage());
+ }
+ properties.clear();
+
properties.put(PropertyAnalyzer.PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE,
"50001");
+ try {
+
PropertyAnalyzer.analyzeVariantMaxSparseColumnStatisticsSize(properties, 0);
+ Assertions.fail("Expected AnalysisException was not thrown");
+ } catch (AnalysisException e) {
+ Assertions.assertNotNull(e.getMessage());
+ }
+ properties.clear();
+
properties.put(PropertyAnalyzer.PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE,
"invalid");
+ try {
+
PropertyAnalyzer.analyzeVariantMaxSparseColumnStatisticsSize(properties, 0);
+ Assertions.fail("Expected AnalysisException was not thrown");
+ } catch (AnalysisException e) {
+ Assertions.assertNotNull(e.getMessage());
+ }
+ }
}
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/persist/ScalarTypeTest.java
b/fe/fe-core/src/test/java/org/apache/doris/persist/ScalarTypeTest.java
index b1f2039e356..3fac71bfc33 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/persist/ScalarTypeTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/persist/ScalarTypeTest.java
@@ -36,5 +36,6 @@ public class ScalarTypeTest {
Assert.assertEquals(scalarType.getPrimitiveType(),
scalarType2.getPrimitiveType());
Assert.assertEquals(scalarType.getVariantMaxSubcolumnsCount(), 0);
Assert.assertEquals(scalarType.getVariantEnableTypedPathsToSparse(),
false);
+
Assert.assertEquals(scalarType.getVariantMaxSparseColumnStatisticsSize(), 0);
}
}
diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto
index 8136500491c..1e97d5ad476 100644
--- a/gensrc/proto/olap_file.proto
+++ b/gensrc/proto/olap_file.proto
@@ -344,6 +344,8 @@ message ColumnPB {
optional bool variant_enable_typed_paths_to_sparse = 27 [default = false];
// this field is only used during flexible partial update load
optional bool is_on_update_current_timestamp = 28 [default = false];
+ // variant_max_sparse_column_statistics_size
+ optional int32 variant_max_sparse_column_statistics_size = 29 [default =
10000];
}
// Dictionary of Schema info, to reduce TabletSchemaCloudPB fdb kv size
diff --git a/gensrc/thrift/Descriptors.thrift b/gensrc/thrift/Descriptors.thrift
index f1ef06103ef..f14a8db707c 100644
--- a/gensrc/thrift/Descriptors.thrift
+++ b/gensrc/thrift/Descriptors.thrift
@@ -51,6 +51,7 @@ struct TColumn {
21: optional TPatternType pattern_type
22: optional bool variant_enable_typed_paths_to_sparse = false;
23: optional bool is_on_update_current_timestamp = false
+ 24: optional i32 variant_max_sparse_column_statistics_size = 10000
}
struct TSlotDescriptor {
diff --git
a/regression-test/pipeline/cloud_p0/conf/regression-conf-custom.groovy
b/regression-test/pipeline/cloud_p0/conf/regression-conf-custom.groovy
index 2b5b4007635..daeb83f3d20 100644
--- a/regression-test/pipeline/cloud_p0/conf/regression-conf-custom.groovy
+++ b/regression-test/pipeline/cloud_p0/conf/regression-conf-custom.groovy
@@ -64,6 +64,7 @@ excludeDirectories = "000_the_start_sentinel_do_not_touch," +
// keep this line
"ccr_mow_syncer_p0," +
"hdfs_vault_p2," +
"inject_hdfs_vault_p0," +
+ "variant_p0/nested," +
"plsql_p0," + // plsql is not developped any more, add by sk.
"zzz_the_end_sentinel_do_not_touch" // keep this line as the last line
diff --git a/regression-test/pipeline/p0/conf/regression-conf.groovy
b/regression-test/pipeline/p0/conf/regression-conf.groovy
index efe158f0887..52ee29f0926 100644
--- a/regression-test/pipeline/p0/conf/regression-conf.groovy
+++ b/regression-test/pipeline/p0/conf/regression-conf.groovy
@@ -87,7 +87,8 @@ excludeDirectories = "000_the_start_sentinel_do_not_touch," +
// keep this line
"nereids_rules_p0/subquery," +
"unique_with_mow_c_p0," +
"workload_manager_p1," +
- "plsql_p0," + // plsql is not developped any more
+ "plsql_p0," + // plsql is not developped any more, add by sk
+ "variant_p0/nested," +
"zzz_the_end_sentinel_do_not_touch"// keep this line as the last line
customConf1 = "test_custom_conf_value"
diff --git
a/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy
b/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy
index 6e3d6a12ce8..67dd512e0ed 100644
---
a/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy
+++
b/regression-test/suites/fault_injection_p0/test_variant_compaction_with_sparse_limit.groovy
@@ -23,12 +23,6 @@ suite("test_compaction_variant_with_sparse_limit",
"nonConcurrent") {
def backendId_to_backendHttpPort = [:]
getBackendIpHttpPort(backendId_to_backendIP, backendId_to_backendHttpPort);
- def set_be_config = { key, value ->
- for (String backend_id: backendId_to_backendIP.keySet()) {
- def (code, out, err) =
update_be_config(backendId_to_backendIP.get(backend_id),
backendId_to_backendHttpPort.get(backend_id), key, value)
- logger.info("update config: code=" + code + ", out=" + out + ",
err=" + err)
- }
- }
try {
String backend_id = backendId_to_backendIP.keySet()[0]
def (code, out, err) =
show_be_config(backendId_to_backendIP.get(backend_id),
backendId_to_backendHttpPort.get(backend_id))
@@ -45,16 +39,17 @@ suite("test_compaction_variant_with_sparse_limit",
"nonConcurrent") {
}
}
- set_be_config("variant_max_sparse_column_statistics_size", "2")
- int max_subcolumns_count = Math.floor(Math.random() * 5)
+
+ int max_subcolumns_count = Math.floor(Math.random() * 5)
+ int max_sparse_column_statistics_size = 2
if (max_subcolumns_count == 1) {
max_subcolumns_count = 0
}
def create_table = { tableName, buckets="auto", key_type="DUPLICATE" ->
sql "DROP TABLE IF EXISTS ${tableName}"
- def var_def = "variant
<properties(\"variant_max_subcolumns_count\" = \"${max_subcolumns_count}\")>"
+ def var_def = "variant
<properties(\"variant_max_subcolumns_count\" = \"${max_subcolumns_count}\",
\"variant_max_sparse_column_statistics_size\" =
\"${max_sparse_column_statistics_size}\")>"
if (key_type == "AGGREGATE") {
- var_def = "variant
<properties(\"variant_max_subcolumns_count\" = \"${max_subcolumns_count}\")>
replace"
+ var_def = "variant
<properties(\"variant_max_subcolumns_count\" = \"${max_subcolumns_count}\",
\"variant_max_sparse_column_statistics_size\" =
\"${max_sparse_column_statistics_size}\")> replace"
}
sql """
CREATE TABLE IF NOT EXISTS ${tableName} (
@@ -66,6 +61,25 @@ suite("test_compaction_variant_with_sparse_limit",
"nonConcurrent") {
properties("replication_num" = "1", "disable_auto_compaction"
= "true");
"""
}
+ // check the sparse column must not be read if max_subcolumns_count is 0
+ def check_sparse_column_must_not_be_read = { tableName ->
+ if (max_subcolumns_count == 0) {
+ try {
+
GetDebugPoint().enableDebugPointForAllBEs("exist_in_sparse_column_must_be_false")
+ sql """ select v['a'], v['b'], v['c'], v['x'], v['y'],
v['z'], v['m'], v['l'], v['g'], v['z'], v['sala'], v['dddd'] from
${tableName}"""
+ } finally {
+
GetDebugPoint().disableDebugPointForAllBEs("exist_in_sparse_column_must_be_false")
+ }
+ } else if (max_subcolumns_count > 1) {
+ // here will always be false
+ try {
+
GetDebugPoint().enableDebugPointForAllBEs("exceeded_sparse_column_limit_must_be_false")
+ sql """ select v['mmm'] from ${tableName} where k = 30"""
+ } finally {
+
GetDebugPoint().disableDebugPointForAllBEs("exceeded_sparse_column_limit_must_be_false")
+ }
+ }
+ }
def key_types = ["DUPLICATE", "UNIQUE", "AGGREGATE"]
// def key_types = ["AGGREGATE"]
for (int i = 0; i < key_types.size(); i++) {
@@ -119,8 +133,41 @@ suite("test_compaction_variant_with_sparse_limit",
"nonConcurrent") {
qt_sql_55 "select cast(v['b'] as string), cast(v['b']['c'] as
string) from ${tableName} where cast(v['b'] as string) != 'null' and
cast(v['b'] as string) != '{}' order by k desc limit 10;"
}
+ } catch (e) {
+ logger.info("catch exception: ${e}")
} finally {
- // set back to default
- set_be_config("variant_max_sparse_column_statistics_size", "10000")
+ sql "DROP TABLE IF EXISTS simple_variant_DUPLICATE"
+ sql "DROP TABLE IF EXISTS simple_variant_UNIQUE"
+ sql "DROP TABLE IF EXISTS simple_variant_AGGREGATE"
+ }
+
+ // test variant_max_sparse_column_statistics_size debug error case
+ sql "DROP TABLE IF EXISTS tn_simple_variant_DUPLICATE"
+ sql """
+ CREATE TABLE IF NOT EXISTS tn_simple_variant_DUPLICATE (
+ k bigint,
+ v variant <properties(\"variant_max_subcolumns_count\" = \"2\",
\"variant_max_sparse_column_statistics_size\" = \"1\")>
+ )
+ DUPLICATE KEY(`k`)
+ DISTRIBUTED BY HASH(k) BUCKETS 1
+ properties("replication_num" = "1", "disable_auto_compaction" =
"true");
+ """
+ // here will always be true
+ sql """insert into tn_simple_variant_DUPLICATE values (1, '{"a" : 1, "b" :
2}');"""
+
GetDebugPoint().enableDebugPointForAllBEs("exceeded_sparse_column_limit_must_be_false")
+ test {
+ sql """ select v['a'] from tn_simple_variant_DUPLICATE where k = 1"""
+ exception null
}
+
+ // here will always be false
+ sql """ truncate table tn_simple_variant_DUPLICATE --force ; """
+ sql """insert into tn_simple_variant_DUPLICATE values (1, '{"d" : "ddd",
"s" : "fff", "da": "ddd", "m": 111}');"""
+ test {
+ sql """ select v['m'] from tn_simple_variant_DUPLICATE"""
+ exception "exceeded_sparse_column_limit_must_be_false"
+ }
+
+
GetDebugPoint().disableDebugPointForAllBEs("exceeded_sparse_column_limit_must_be_false")
+
}
diff --git
a/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy
b/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy
index 0ab363d5671..d47c486047e 100644
---
a/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy
+++
b/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy
@@ -23,12 +23,6 @@ suite("test_compaction_variant_predefine_with_sparse_limit",
"nonConcurrent") {
def backendId_to_backendHttpPort = [:]
getBackendIpHttpPort(backendId_to_backendIP, backendId_to_backendHttpPort);
- def set_be_config = { key, value ->
- for (String backend_id: backendId_to_backendIP.keySet()) {
- def (code, out, err) =
update_be_config(backendId_to_backendIP.get(backend_id),
backendId_to_backendHttpPort.get(backend_id), key, value)
- logger.info("update config: code=" + code + ", out=" + out + ",
err=" + err)
- }
- }
try {
String backend_id = backendId_to_backendIP.keySet()[0]
def (code, out, err) =
show_be_config(backendId_to_backendIP.get(backend_id),
backendId_to_backendHttpPort.get(backend_id))
@@ -45,13 +39,14 @@
suite("test_compaction_variant_predefine_with_sparse_limit", "nonConcurrent") {
}
}
- set_be_config("variant_max_sparse_column_statistics_size", "2")
+ int max_sparse_column_statistics_size = 2
def create_table = { tableName, buckets="auto", key_type="DUPLICATE" ->
sql "DROP TABLE IF EXISTS ${tableName}"
- def var_def = "variant <'sala' : int, 'ddd' : double, 'z' :
double>"
+ def var_def = "variant <MATCH_NAME 'sala' : int, MATCH_NAME 'ddd'
: double, MATCH_NAME 'z' : double,
properties(\"variant_max_sparse_column_statistics_size\" =
\"${max_sparse_column_statistics_size}\")>"
if (key_type == "AGGREGATE") {
- var_def = "variant <'sala' : int, 'ddd' : double, 'z' :
double> replace"
+ var_def = "variant <MATCH_NAME 'sala' : int, MATCH_NAME 'ddd'
: double, MATCH_NAME 'z' : double,
properties(\"variant_max_sparse_column_statistics_size\" =
\"${max_sparse_column_statistics_size}\")> replace"
}
+
sql """
CREATE TABLE IF NOT EXISTS ${tableName} (
k bigint,
@@ -61,6 +56,9 @@ suite("test_compaction_variant_predefine_with_sparse_limit",
"nonConcurrent") {
DISTRIBUTED BY HASH(k) BUCKETS ${buckets}
properties("replication_num" = "1", "disable_auto_compaction"
= "true");
"""
+ def create_tbl_res = sql """ show create table ${tableName} """
+ logger.info("${create_tbl_res}")
+
assertTrue(create_tbl_res.toString().contains("variant_max_sparse_column_statistics_size"))
}
def key_types = ["DUPLICATE", "UNIQUE", "AGGREGATE"]
// def key_types = ["AGGREGATE"]
@@ -132,7 +130,8 @@
suite("test_compaction_variant_predefine_with_sparse_limit", "nonConcurrent") {
order_qt_select "select * from ${tableName} order by k, cast(v as
string) limit 5;"
}
} finally {
- // set back to default
- set_be_config("variant_max_sparse_column_statistics_size", "10000")
+ sql "DROP TABLE IF EXISTS simple_variant_DUPLICATE"
+ sql "DROP TABLE IF EXISTS simple_variant_UNIQUE"
+ sql "DROP TABLE IF EXISTS simple_variant_AGGREGATE"
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]