This is an automated email from the ASF dual-hosted git repository. eldenmoon pushed a commit to branch variant-sparse in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/variant-sparse by this push: new 00b4042dab1 [enhance](variant)Add ut writer var (#49624) 00b4042dab1 is described below commit 00b4042dab1103f96c9546b2fc101423be0a7739 Author: Sun Chenyang <suncheny...@selectdb.com> AuthorDate: Fri Mar 28 16:02:40 2025 +0800 [enhance](variant)Add ut writer var (#49624) --- .../rowset/segment_v2/hierarchical_data_reader.cpp | 2 +- .../segment_v2/variant_column_writer_impl.cpp | 17 +- .../rowset/segment_v2/variant_column_writer_impl.h | 12 +- be/src/vec/columns/column_object.cpp | 16 +- be/src/vec/columns/column_object.h | 2 + be/src/vec/common/schema_util.cpp | 42 +- be/src/vec/common/schema_util.h | 16 +- be/src/vec/olap/olap_data_convertor.cpp | 8 +- be/src/vec/olap/olap_data_convertor.h | 6 + be/test/common/schema_util_test.cpp | 121 ---- .../compaction/util/index_compaction_utils.cpp | 2 +- .../variant_column_writer_reader_test.cpp | 663 +++++++++++++++++++++ .../olap/rowset/variant_with_compaction_test.cpp | 0 be/test/testutil/schema_utils.h | 48 ++ be/test/testutil/variant_util.h | 137 +++++ be/test/vec/common/schema_util_rowset_test.cpp | 265 ++++++++ be/test/vec/common/schema_util_test.cpp | 357 +++++++++++ 17 files changed, 1532 insertions(+), 182 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp index 2367a38821b..185a6d82422 100644 --- a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp +++ b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp @@ -382,7 +382,7 @@ Status HierarchicalDataReader::_process_sparse_column(vectorized::ColumnObject& // from "" which is empty path and root if (container_variant.is_null_root()) { // root was created with nrows with Nothing type, resize it to fit the size of sparse column - container_variant.get_root()->resize(sparse_data_offsets.size()); + container_variant.get_subcolumn({})->resize(sparse_data_offsets.size()); // bool 
added = container_variant.add_sub_column({}, sparse_data_offsets.size()); // if (!added) { // return Status::InternalError("Failed to add subcolumn for sparse column"); diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp index 34fe6e085ec..ae1f2a8de61 100644 --- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp +++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp @@ -403,9 +403,9 @@ Status VariantColumnWriterImpl::_process_sparse_column( return status; } VLOG_DEBUG << "dump sparse " - << vectorized::schema_util::dump_column( - vectorized::ColumnObject::get_sparse_column_type(), - ptr->get_sparse_column()); + << vectorized::Block::dump_column( + ptr->get_sparse_column(), + vectorized::ColumnObject::get_sparse_column_type()); RETURN_IF_ERROR( _sparse_column_writer->append(column->get_nullmap(), column->get_data(), num_rows)); ++column_id; @@ -498,10 +498,11 @@ bool VariantColumnWriterImpl::is_finalized() const { Status VariantColumnWriterImpl::append_data(const uint8_t** ptr, size_t num_rows) { DCHECK(!is_finalized()); - const auto& src = *reinterpret_cast<const vectorized::ColumnObject*>(*ptr); + const auto* column = reinterpret_cast<const vectorized::VariantColumnData*>(*ptr); + const auto& src = *reinterpret_cast<const vectorized::ColumnObject*>(column->column_data); auto* dst_ptr = assert_cast<vectorized::ColumnObject*>(_column.get()); // TODO: if direct write we could avoid copy - dst_ptr->insert_range_from(src, 0, num_rows); + dst_ptr->insert_range_from(src, column->row_pos, num_rows); return Status::OK(); } @@ -629,9 +630,11 @@ Status VariantSubcolumnWriter::init() { } Status VariantSubcolumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { - const auto& src = *reinterpret_cast<const vectorized::ColumnObject*>(*ptr); + const auto* column = reinterpret_cast<const vectorized::VariantColumnData*>(*ptr); + const auto& src = 
*reinterpret_cast<const vectorized::ColumnObject*>(column->column_data); auto* dst_ptr = assert_cast<vectorized::ColumnObject*>(_column.get()); - dst_ptr->insert_range_from(src, 0, num_rows); + // TODO: if direct write we could avoid copy + dst_ptr->insert_range_from(src, column->row_pos, num_rows); return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h index d9974dd6f2e..9f67cf04505 100644 --- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h +++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h @@ -36,8 +36,13 @@ class ColumnWriter; class ScalarColumnWriter; struct VariantStatistics { - // If reached the size of this, we should stop writing statistics for sparse data - constexpr static size_t MAX_SPARSE_DATA_STATISTICS_SIZE = 10000; + // #ifdef BE_TEST + // static constexpr size_t MAX_SPARSE_DATA_STATISTICS_SIZE = 10; + // #else + // // If reached the size of this, we should stop writing statistics for sparse data + // static constexpr size_t MAX_SPARSE_DATA_STATISTICS_SIZE = 10000; + // #endif + static constexpr size_t MAX_SPARSE_DATA_STATISTICS_SIZE = 10000; std::map<std::string, size_t> subcolumns_non_null_size; std::map<std::string, size_t> sparse_column_non_null_size; @@ -96,5 +101,8 @@ private: // hold the references of subcolumns indexes std::vector<std::unique_ptr<TabletIndex>> _subcolumns_indexes; }; + +void _init_column_meta(ColumnMetaPB* meta, uint32_t column_id, const TabletColumn& column, + CompressionTypePB compression_type); } // namespace segment_v2 } // namespace doris \ No newline at end of file diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp index 896fad0795d..448ff7df4c1 100644 --- a/be/src/vec/columns/column_object.cpp +++ b/be/src/vec/columns/column_object.cpp @@ -640,6 +640,17 @@ bool ColumnObject::Subcolumn::is_finalized() const { (data.empty() || (data.size() == 1)); } +void 
ColumnObject::Subcolumn::resize(size_t n) { + if (n == num_rows) { + return; + } + if (n > num_rows) { + insert_many_defaults(n - num_rows); + } else { + pop_back(num_rows - n); + } +} + template <typename Func> MutableColumnPtr ColumnObject::apply_for_columns(Func&& func) const { if (!is_finalized()) { @@ -1107,7 +1118,8 @@ void ColumnObject::Subcolumn::serialize_to_sparse_column(ColumnString* key, std: row -= num_of_defaults_in_prefix; for (size_t i = 0; i < data.size(); ++i) { const auto& part = data[i]; - if (row < part->size()) { + size_t current_column_size = part->size(); + if (row < current_column_size) { // no need null in sparse column if (!assert_cast<const ColumnNullable&, TypeCheckOnRelease::DISABLE>(*part).is_null_at( row)) { @@ -1129,7 +1141,7 @@ void ColumnObject::Subcolumn::serialize_to_sparse_column(ColumnString* key, std: return; } - row -= part->size(); + row -= current_column_size; } throw doris::Exception(ErrorCode::OUT_OF_BOUND, diff --git a/be/src/vec/columns/column_object.h b/be/src/vec/columns/column_object.h index 10078c0ede5..f7c42843f7b 100644 --- a/be/src/vec/columns/column_object.h +++ b/be/src/vec/columns/column_object.h @@ -204,6 +204,8 @@ public: bool is_empty_nested(size_t row) const; + void resize(size_t n); + private: class LeastCommonType { public: diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp index 2f8a4de72e7..6298daffd8f 100644 --- a/be/src/vec/common/schema_util.cpp +++ b/be/src/vec/common/schema_util.cpp @@ -534,7 +534,8 @@ Status _parse_variant_columns(Block& block, const std::vector<int>& variant_pos, } if (scalar_root_column->is_column_string()) { - variant_column = ColumnObject::create(var.max_subcolumns_count()); + // now, subcolumns have not been set, so we set it to 0 + variant_column = ColumnObject::create(0); parse_json_to_variant(*variant_column.get(), assert_cast<const ColumnString&>(*scalar_root_column), config); } else { @@ -577,43 +578,6 @@ 
vectorized::ColumnObject::Subcolumns get_sorted_subcolumns( return sorted; } -// --------------------------- - -std::string dump_column(DataTypePtr type, const ColumnPtr& col) { - Block tmp; - tmp.insert(ColumnWithTypeAndName {col, type, col->get_name()}); - return tmp.dump_data(0, tmp.rows()); -} - -// --------------------------- -Status extract(ColumnPtr source, const PathInData& path, MutableColumnPtr& dst) { - auto type_string = std::make_shared<DataTypeString>(); - std::string jsonpath = path.to_jsonpath(); - bool is_nullable = source->is_nullable(); - auto json_type = is_nullable ? make_nullable(std::make_shared<DataTypeJsonb>()) - : std::make_shared<DataTypeJsonb>(); - ColumnsWithTypeAndName arguments { - {source, json_type, ""}, - {type_string->create_column_const(1, Field(String(jsonpath.data(), jsonpath.size()))), - type_string, ""}}; - auto function = - SimpleFunctionFactory::instance().get_function("jsonb_extract", arguments, json_type); - if (!function) { - return Status::InternalError("Not found function jsonb_extract"); - } - Block tmp_block {arguments}; - vectorized::ColumnNumbers argnum; - argnum.emplace_back(0); - argnum.emplace_back(1); - uint32_t result_column = cast_set<uint32_t>(tmp_block.columns()); - tmp_block.insert({nullptr, json_type, ""}); - RETURN_IF_ERROR(function->execute(nullptr, tmp_block, argnum, result_column, source->size())); - dst = tmp_block.get_by_position(result_column) - .column->convert_to_full_column_if_const() - ->assume_mutable(); - return Status::OK(); -} - bool has_schema_index_diff(const TabletSchema* new_schema, const TabletSchema* old_schema, int32_t new_col_idx, int32_t old_col_idx) { const auto& column_new = new_schema->column(new_col_idx); @@ -645,8 +609,6 @@ TabletColumn create_sparse_column(const TabletColumn& variant) { return res; } -using PathToNoneNullValues = std::unordered_map<std::string, size_t>; - Status collect_path_stats(const RowsetSharedPtr& rs, std::unordered_map<int32_t, PathToNoneNullValues>& 
uid_to_path_stats) { SegmentCacheHandle segment_cache; diff --git a/be/src/vec/common/schema_util.h b/be/src/vec/common/schema_util.h index 4c1ee876254..8281cdec7b6 100644 --- a/be/src/vec/common/schema_util.h +++ b/be/src/vec/common/schema_util.h @@ -54,6 +54,8 @@ struct ColumnWithTypeAndName; const std::string SPARSE_COLUMN_PATH = "__DORIS_VARIANT_SPARSE__"; namespace doris::vectorized::schema_util { +using PathToNoneNullValues = std::unordered_map<std::string, size_t>; + /// Returns number of dimensions in Array type. 0 if type is not array. size_t get_number_of_dimensions(const IDataType& type); @@ -122,17 +124,21 @@ void inherit_column_attributes(const TabletColumn& source, TabletColumn& target, vectorized::ColumnObject::Subcolumns get_sorted_subcolumns( const vectorized::ColumnObject::Subcolumns& subcolumns); -// Extract json data from source with path -Status extract(ColumnPtr source, const PathInData& path, MutableColumnPtr& dst); - -std::string dump_column(DataTypePtr type, const ColumnPtr& col); - bool has_schema_index_diff(const TabletSchema* new_schema, const TabletSchema* old_schema, int32_t new_col_idx, int32_t old_col_idx); // create ColumnMap<String, String> TabletColumn create_sparse_column(const TabletColumn& variant); +// get the subpaths and sparse paths for the variant column +void get_subpaths(const TabletColumn& variant, + const std::unordered_map<int32_t, PathToNoneNullValues>& path_stats, + std::unordered_map<int32_t, TabletSchema::PathsSetInfo>& uid_to_paths_set_info); + +// collect path stats from the rowset +Status collect_path_stats(const RowsetSharedPtr& rs, + std::unordered_map<int32_t, PathToNoneNullValues>& uid_to_path_stats); + // Build the temporary schema for compaction, this will reduce the memory usage of compacting variant columns Status get_compaction_schema(const std::vector<RowsetSharedPtr>& rowsets, TabletSchemaSPtr& target); diff --git a/be/src/vec/olap/olap_data_convertor.cpp b/be/src/vec/olap/olap_data_convertor.cpp 
index c2887c41934..c286127d54e 100644 --- a/be/src/vec/olap/olap_data_convertor.cpp +++ b/be/src/vec/olap/olap_data_convertor.cpp @@ -1205,6 +1205,8 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorVariant::convert_to_olap() } // Do nothing, the column writer will finally do finalize and write subcolumns one by one // since we are not sure the final column(type and columns) until the end of the last block + // need to return the position of the column data + _variant_column_data = std::make_unique<VariantColumnData>(_value_ptr, _row_pos); return Status::OK(); } @@ -1212,9 +1214,9 @@ const void* OlapBlockDataConvertor::OlapColumnDataConvertorVariant::get_data() c if (!_value_ptr) { return _root_data_convertor->get_data(); } - // return the ptr of original column, see VariantColumnWriterImpl::append_data - // which will cast to ColumnObject - return _value_ptr; + // return the ptr of VariantColumnData, see VariantColumnWriterImpl::append_data + // which will cast to VariantColumnData + return _variant_column_data.get(); } const void* OlapBlockDataConvertor::OlapColumnDataConvertorVariant::get_data_at( size_t offset) const { diff --git a/be/src/vec/olap/olap_data_convertor.h b/be/src/vec/olap/olap_data_convertor.h index c2a8e6ace47..e0f1184021e 100644 --- a/be/src/vec/olap/olap_data_convertor.h +++ b/be/src/vec/olap/olap_data_convertor.h @@ -72,6 +72,11 @@ public: virtual ~IOlapColumnDataAccessor() = default; }; +struct VariantColumnData { + const void* column_data; + size_t row_pos; +}; + class OlapBlockDataConvertor { public: OlapBlockDataConvertor() = default; @@ -536,6 +541,7 @@ private: private: const void* _value_ptr; std::unique_ptr<OlapColumnDataConvertorVarChar> _root_data_convertor; + std::unique_ptr<VariantColumnData> _variant_column_data; }; private: diff --git a/be/test/common/schema_util_test.cpp b/be/test/common/schema_util_test.cpp deleted file mode 100644 index fb8b23c10cb..00000000000 --- a/be/test/common/schema_util_test.cpp +++ /dev/null @@ 
-1,121 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "vec/common/schema_util.h" - -#include <gtest/gtest.h> - -namespace doris { - -class SchemaUtilTest : public testing::Test {}; - -void construct_column(ColumnPB* column_pb, TabletIndexPB* tablet_index, int64_t index_id, - const std::string& index_name, int32_t col_unique_id, - const std::string& column_type, const std::string& column_name, - const IndexType& index_type) { - column_pb->set_unique_id(col_unique_id); - column_pb->set_name(column_name); - column_pb->set_type(column_type); - column_pb->set_is_nullable(true); - column_pb->set_is_bf_column(true); - tablet_index->set_index_id(index_id); - tablet_index->set_index_name(index_name); - tablet_index->set_index_type(index_type); - tablet_index->add_col_unique_id(col_unique_id); -} - -void construct_subcolumn(TabletSchemaSPtr schema, const FieldType& type, int32_t col_unique_id, - std::string_view path, std::vector<TabletColumn>* subcolumns) { - TabletColumn subcol; - subcol.set_type(type); - subcol.set_is_nullable(true); - subcol.set_unique_id(-1); - subcol.set_parent_unique_id(col_unique_id); - vectorized::PathInData col_path(path); - subcol.set_path_info(col_path); - 
subcol.set_name(col_path.get_path()); - - if (type == FieldType::OLAP_FIELD_TYPE_ARRAY) { - TabletColumn array_item_col; - // double not support inverted index - array_item_col.set_type(FieldType::OLAP_FIELD_TYPE_DOUBLE); - array_item_col.set_is_nullable(true); - array_item_col.set_unique_id(-1); - array_item_col.set_parent_unique_id(col_unique_id); - - subcol.add_sub_column(array_item_col); - } - - schema->append_column(subcol); - subcolumns->emplace_back(std::move(subcol)); -} - -TEST_F(SchemaUtilTest, inherit_column_attributes) { - TabletSchemaPB schema_pb; - schema_pb.set_keys_type(KeysType::DUP_KEYS); - schema_pb.set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2); - - construct_column(schema_pb.add_column(), schema_pb.add_index(), 10000, "key_index", 0, "INT", - "key", IndexType::INVERTED); - construct_column(schema_pb.add_column(), schema_pb.add_index(), 10001, "v1_index", 1, "VARIANT", - "v1", IndexType::INVERTED); - construct_column(schema_pb.add_column(), schema_pb.add_index(), 10003, "v3_index", 3, "VARIANT", - "v3", IndexType::INVERTED); - - TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>(); - tablet_schema->init_from_pb(schema_pb); - std::vector<TabletColumn> subcolumns; - - construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_STRING, 1, "v1.b", &subcolumns); - construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_INT, 1, "v1.c", &subcolumns); - - construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_ARRAY, 3, "v3.d", &subcolumns); - construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_FLOAT, 3, "v3.a", &subcolumns); - - vectorized::schema_util::inherit_column_attributes(tablet_schema); - for (const auto& col : subcolumns) { - switch (col._parent_col_unique_id) { - case 1: - EXPECT_TRUE(tablet_schema->inverted_index(col) != nullptr); - break; - case 3: - EXPECT_TRUE(tablet_schema->inverted_index(col) == nullptr); - break; - default: - EXPECT_TRUE(false); - } - } - 
EXPECT_EQ(tablet_schema->inverted_indexes().size(), 7); - - for (const auto& col : tablet_schema->_cols) { - if (!col->is_extracted_column()) { - continue; - } - switch (col->_parent_col_unique_id) { - case 1: - EXPECT_TRUE(col->is_bf_column()); - break; - case 3: - EXPECT_TRUE(!col->is_bf_column()); - break; - default: - EXPECT_TRUE(false); - } - } -} - -} // namespace doris diff --git a/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp b/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp index 02353fc5441..4f4a601e63c 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp @@ -416,7 +416,7 @@ class IndexCompactionUtils { // only base compaction can handle delete predicate BaseCompaction compaction(*engine_ref, tablet); compaction._input_rowsets = std::move(rowsets); - compaction.build_basic_info(); + RETURN_IF_ERROR(compaction.build_basic_info()); std::vector<RowsetReaderSharedPtr> input_rs_readers; create_input_rowsets_readers(compaction, input_rs_readers); diff --git a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp new file mode 100644 index 00000000000..df8428ac877 --- /dev/null +++ b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp @@ -0,0 +1,663 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gtest/gtest.h" +#include "olap/rowset/segment_v2/column_reader.h" +#include "olap/rowset/segment_v2/hierarchical_data_reader.h" +#include "olap/rowset/segment_v2/variant_column_writer_impl.h" +#include "olap/storage_engine.h" +#include "testutil/schema_utils.h" +#include "testutil/variant_util.h" + +using namespace doris::vectorized; + +namespace doris { + +constexpr static uint32_t MAX_PATH_LEN = 1024; +constexpr static std::string_view dest_dir = "/ut_dir/variant_column_writer_test"; +constexpr static std::string_view tmp_dir = "./ut_dir/tmp"; + +class VariantColumnWriterReaderTest : public testing::Test { +public: + void SetUp() override { + // absolute dir + char buffer[MAX_PATH_LEN]; + EXPECT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr); + _current_dir = std::string(buffer); + _absolute_dir = _current_dir + std::string(dest_dir); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(_absolute_dir).ok()); + + // tmp dir + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(tmp_dir).ok()); + std::vector<StorePath> paths; + paths.emplace_back(std::string(tmp_dir), 1024000000); + auto tmp_file_dirs = std::make_unique<segment_v2::TmpFileDirs>(paths); + Status st = tmp_file_dirs->init(); + EXPECT_TRUE(st.ok()) << st.to_json(); + ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs)); + + // storage engine + doris::EngineOptions options; + auto engine = 
std::make_unique<StorageEngine>(options); + _engine_ref = engine.get(); + _data_dir = std::make_unique<DataDir>(*_engine_ref, _absolute_dir); + static_cast<void>(_data_dir->update_capacity()); + ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); + } + + void TearDown() override { + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); + _engine_ref = nullptr; + ExecEnv::GetInstance()->set_storage_engine(nullptr); + } + + VariantColumnWriterReaderTest() = default; + ~VariantColumnWriterReaderTest() override = default; + +private: + TabletSchemaSPtr _tablet_schema = nullptr; + StorageEngine* _engine_ref = nullptr; + std::unique_ptr<DataDir> _data_dir = nullptr; + TabletSharedPtr _tablet = nullptr; + std::string _absolute_dir; + std::string _current_dir; +}; + +void check_column_meta(const ColumnMetaPB& column_meta, auto& path_with_size) { + EXPECT_TRUE(column_meta.has_column_path_info()); + auto path = std::make_shared<vectorized::PathInData>(); + path->from_protobuf(column_meta.column_path_info()); + EXPECT_EQ(column_meta.column_path_info().parrent_column_unique_id(), 1); + EXPECT_EQ(column_meta.none_null_size(), path_with_size[path->copy_pop_front().get_path()]); +} + +void check_sparse_column_meta(const ColumnMetaPB& column_meta, auto& path_with_size) { + EXPECT_TRUE(column_meta.has_column_path_info()); + auto path = std::make_shared<vectorized::PathInData>(); + path->from_protobuf(column_meta.column_path_info()); + EXPECT_EQ(column_meta.column_path_info().parrent_column_unique_id(), 1); + for (const auto& [path, size] : + column_meta.variant_statistics().sparse_column_non_null_size()) { + EXPECT_EQ(size, path_with_size[path]); + } + EXPECT_EQ(path->copy_pop_front().get_path(), "__DORIS_VARIANT_SPARSE__"); +} + +TEST_F(VariantColumnWriterReaderTest, test_write_data_normal) { + // 1. 
create tablet_schema + TabletSchemaPB schema_pb; + schema_pb.set_keys_type(KeysType::DUP_KEYS); + SchemaUtils::construct_column(schema_pb.add_column(), 1, "VARIANT", "V1"); + _tablet_schema = std::make_shared<TabletSchema>(); + _tablet_schema->init_from_pb(schema_pb); + + // 2. create tablet + TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema)); + tablet_meta->_tablet_id = 10000; + _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get()); + + EXPECT_TRUE(_tablet->init().ok()); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); + + // 3. create file_writer + io::FileWriterPtr file_writer; + auto file_path = local_segment_path(_tablet->tablet_path(), "0", 0); + auto st = io::global_local_filesystem()->create_file(file_path, &file_writer); + EXPECT_TRUE(st.ok()) << st.msg(); + + // 4. create column_writer + SegmentFooterPB footer; + ColumnWriterOptions opts; + opts.meta = footer.add_columns(); + opts.compression_type = CompressionTypePB::LZ4; + opts.file_writer = file_writer.get(); + opts.footer = &footer; + RowsetWriterContext rowset_ctx; + rowset_ctx.write_type = DataWriteType::TYPE_DIRECT; + opts.rowset_ctx = &rowset_ctx; + opts.rowset_ctx->tablet_schema = _tablet_schema; + TabletColumn column = _tablet_schema->column(0); + _init_column_meta(opts.meta, 0, column, CompressionTypePB::LZ4); + + std::unique_ptr<ColumnWriter> writer; + EXPECT_TRUE(ColumnWriter::create(opts, &column, file_writer.get(), &writer).ok()); + EXPECT_TRUE(writer->init().ok()); + EXPECT_TRUE(assert_cast<VariantColumnWriter*>(writer.get()) != nullptr); + + // 5. 
write data + auto olap_data_convertor = std::make_unique<vectorized::OlapBlockDataConvertor>(); + auto block = _tablet_schema->create_block(); + auto column_object = (*std::move(block.get_by_position(0).column)).mutate(); + std::unordered_map<int, std::string> inserted_jsonstr; + auto path_with_size = + VariantUtil::fill_object_column_with_test_data(column_object, 1000, &inserted_jsonstr); + olap_data_convertor->add_column_data_convertor(column); + olap_data_convertor->set_source_content(&block, 0, 1000); + auto [result, accessor] = olap_data_convertor->convert_column_data(0); + EXPECT_TRUE(result.ok()); + EXPECT_TRUE(accessor != nullptr); + EXPECT_TRUE(writer->append(accessor->get_nullmap(), accessor->get_data(), 1000).ok()); + st = writer->finish(); + EXPECT_TRUE(st.ok()) << st.msg(); + st = writer->write_data(); + EXPECT_TRUE(st.ok()) << st.msg(); + st = writer->write_ordinal_index(); + EXPECT_TRUE(st.ok()) << st.msg(); + st = writer->write_zone_map(); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(file_writer->close().ok()); + footer.set_num_rows(1000); + + // 6. check footer + EXPECT_EQ(footer.columns_size(), 5); + auto column_meta = footer.columns(0); + EXPECT_EQ(column_meta.type(), (int)FieldType::OLAP_FIELD_TYPE_VARIANT); + + for (int i = 1; i < footer.columns_size() - 1; ++i) { + auto column_meta = footer.columns(i); + check_column_meta(column_meta, path_with_size); + } + check_sparse_column_meta(footer.columns(footer.columns_size() - 1), path_with_size); + + // 7. 
check variant reader + io::FileReaderSPtr file_reader; + st = io::global_local_filesystem()->open_file(file_path, &file_reader); + EXPECT_TRUE(st.ok()) << st.msg(); + ColumnReaderOptions read_opts; + std::unique_ptr<ColumnReader> column_reader; + st = ColumnReader::create(read_opts, footer, 0, 1000, file_reader, &column_reader); + EXPECT_TRUE(st.ok()) << st.msg(); + + auto variant_column_reader = assert_cast<VariantColumnReader*>(column_reader.get()); + EXPECT_TRUE(variant_column_reader != nullptr); + + auto subcolumn_reader = variant_column_reader->get_reader_by_path(PathInData("key0")); + EXPECT_TRUE(subcolumn_reader != nullptr); + subcolumn_reader = variant_column_reader->get_reader_by_path(PathInData("key1")); + EXPECT_TRUE(subcolumn_reader != nullptr); + subcolumn_reader = variant_column_reader->get_reader_by_path(PathInData("key2")); + EXPECT_TRUE(subcolumn_reader != nullptr); + EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key3"))); + EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key4"))); + EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key5"))); + EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key6"))); + EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key7"))); + EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key8"))); + EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key9"))); + auto size = variant_column_reader->get_metadata_size(); + EXPECT_GT(size, 0); + + // 8. check statistics + auto statistics = variant_column_reader->get_stats(); + for (const auto& [path, size] : statistics->subcolumns_non_null_size) { + EXPECT_EQ(path_with_size[path], size); + } + for (const auto& [path, size] : statistics->sparse_column_non_null_size) { + EXPECT_EQ(path_with_size[path], size); + } + + // 9. 
check hier reader + ColumnIterator* it; + TabletColumn parent_column = _tablet_schema->column(0); + StorageReadOptions storage_read_opts; + storage_read_opts.io_ctx.reader_type = ReaderType::READER_QUERY; + st = variant_column_reader->new_iterator(&it, parent_column, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); + ColumnIteratorOptions column_iter_opts; + OlapReaderStatistics stats; + column_iter_opts.stats = &stats; + column_iter_opts.file_reader = file_reader.get(); + st = it->init(column_iter_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + + MutableColumnPtr new_column_object = ColumnObject::create(3); + size_t nrows = 1000; + st = it->seek_to_ordinal(0); + EXPECT_TRUE(st.ok()) << st.msg(); + st = it->next_batch(&nrows, new_column_object); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(stats.bytes_read > 0); + + for (int i = 0; i < 1000; ++i) { + std::string value; + st = assert_cast<ColumnObject*>(new_column_object.get()) + ->serialize_one_row_to_string(i, &value); + + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_EQ(value, inserted_jsonstr[i]); + } + + std::vector<rowid_t> row_ids; + for (int i = 0; i < 1000; ++i) { + if (i % 7 == 0) { + row_ids.push_back(i); + } + } + new_column_object = ColumnObject::create(3); + st = it->read_by_rowids(row_ids.data(), row_ids.size(), new_column_object); + EXPECT_TRUE(st.ok()) << st.msg(); + for (int i = 0; i < row_ids.size(); ++i) { + std::string value; + st = assert_cast<ColumnObject*>(new_column_object.get()) + ->serialize_one_row_to_string(i, &value); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_EQ(value, inserted_jsonstr[row_ids[i]]); + } + + auto read_to_column_object = [&]() { + new_column_object = ColumnObject::create(3); + nrows = 1000; + st = it->seek_to_ordinal(0); + EXPECT_TRUE(st.ok()) << st.msg(); + st = it->next_batch(&nrows, new_column_object); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(stats.bytes_read > 0); + EXPECT_EQ(nrows, 
1000); + }; + + // 10. check sparse extract reader + for (int i = 3; i < 10; ++i) { + std::string key = ".key" + std::to_string(i); + TabletColumn subcolumn_in_sparse; + subcolumn_in_sparse.set_name(parent_column.name_lower_case() + key); + subcolumn_in_sparse.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + subcolumn_in_sparse.set_parent_unique_id(parent_column.unique_id()); + subcolumn_in_sparse.set_path_info(PathInData(parent_column.name_lower_case() + key)); + subcolumn_in_sparse.set_variant_max_subcolumns_count( + parent_column.variant_max_subcolumns_count()); + subcolumn_in_sparse.set_is_nullable(true); + + st = variant_column_reader->new_iterator(&it, subcolumn_in_sparse, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr); + st = it->init(column_iter_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + + read_to_column_object(); + + for (int row = 0; row < 1000; ++row) { + std::string value; + st = assert_cast<ColumnObject*>(new_column_object.get()) + ->serialize_one_row_to_string(row, &value); + EXPECT_TRUE(st.ok()) << st.msg(); + if (inserted_jsonstr[row].find(key) != std::string::npos) { + if (i % 2 == 0) { + EXPECT_EQ(value, "88"); + } else { + EXPECT_EQ(value, "str99"); + } + } + } + } + + // 11. 
check leaf reader + auto check_leaf_reader = [&]() { + for (int i = 0; i < 3; ++i) { + std::string key = ".key" + std::to_string(i); + TabletColumn subcolumn; + subcolumn.set_name(parent_column.name_lower_case() + key); + subcolumn.set_type((FieldType)(int)footer.columns(i + 1).type()); + subcolumn.set_parent_unique_id(parent_column.unique_id()); + subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + key)); + subcolumn.set_variant_max_subcolumns_count( + parent_column.variant_max_subcolumns_count()); + subcolumn.set_is_nullable(true); + + st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<FileColumnIterator*>(it) != nullptr); + st = it->init(column_iter_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + + auto column_type = DataTypeFactory::instance().create_data_type(subcolumn, false); + auto read_column = column_type->create_column(); + nrows = 1000; + st = it->seek_to_ordinal(0); + EXPECT_TRUE(st.ok()) << st.msg(); + st = it->next_batch(&nrows, read_column); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(stats.bytes_read > 0); + + for (int row = 0; row < 1000; ++row) { + const std::string& value = column_type->to_string(*read_column, row); + if (inserted_jsonstr[row].find(key) != std::string::npos) { + if (i % 2 == 0) { + EXPECT_EQ(value, "88"); + } else { + EXPECT_EQ(value, "str99"); + } + } + } + } + }; + check_leaf_reader(); + + // 12. 
check empty + TabletColumn subcolumn; + subcolumn.set_name(parent_column.name_lower_case() + ".key10"); + subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + subcolumn.set_parent_unique_id(parent_column.unique_id()); + subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key10")); + subcolumn.set_is_nullable(true); + st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<DefaultValueColumnIterator*>(it) != nullptr); + + // 13. check statistics size == limit + auto& variant_stats = variant_column_reader->_statistics; + EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() < + VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE); + auto limit = VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE - + variant_stats->sparse_column_non_null_size.size(); + for (int i = 0; i < limit; ++i) { + std::string key = parent_column.name_lower_case() + ".key10" + std::to_string(i); + variant_stats->sparse_column_non_null_size[key] = 10000; + } + EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() == + VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE); + + st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); + st = it->init(column_iter_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + + auto check_empty_column = [&]() { + for (int row = 0; row < 1000; ++row) { + std::string value; + st = assert_cast<ColumnObject*>(new_column_object.get()) + ->serialize_one_row_to_string(row, &value); + + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_EQ(value, "{}"); + } + }; + + read_to_column_object(); + check_empty_column(); + + // construct tablet schema for compaction + storage_read_opts.io_ctx.reader_type = ReaderType::READER_BASE_COMPACTION; + storage_read_opts.tablet_schema = _tablet_schema; + std::unordered_map<int32_t, TabletSchema::PathsSetInfo> 
uid_to_paths_set_info; + TabletSchema::PathsSetInfo paths_set_info; + paths_set_info.sub_path_set.insert("key0"); + paths_set_info.sub_path_set.insert("key3"); + paths_set_info.sub_path_set.insert("key4"); + paths_set_info.sparse_path_set.insert("key1"); + paths_set_info.sparse_path_set.insert("key2"); + paths_set_info.sparse_path_set.insert("key5"); + paths_set_info.sparse_path_set.insert("key6"); + paths_set_info.sparse_path_set.insert("key7"); + paths_set_info.sparse_path_set.insert("key8"); + paths_set_info.sparse_path_set.insert("key9"); + uid_to_paths_set_info[parent_column.unique_id()] = paths_set_info; + _tablet_schema->set_path_set_info(uid_to_paths_set_info); + + // 14. check compaction subcolumn reader + check_leaf_reader(); + + // 15. check compaction root reader + st = variant_column_reader->new_iterator(&it, parent_column, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<VariantRootColumnIterator*>(it) != nullptr); + st = it->init(column_iter_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + + // 16. 
check compaction sparse column + TabletColumn sparse_column = schema_util::create_sparse_column(parent_column); + st = variant_column_reader->new_iterator(&it, sparse_column, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<SparseColumnMergeReader*>(it) != nullptr); + st = it->init(column_iter_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + auto column_type = DataTypeFactory::instance().create_data_type(sparse_column, false); + auto read_column = column_type->create_column(); + nrows = 1000; + st = it->seek_to_ordinal(0); + EXPECT_TRUE(st.ok()) << st.msg(); + st = it->next_batch(&nrows, read_column); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(stats.bytes_read > 0); + + for (int row = 0; row < 1000; ++row) { + const std::string& value = column_type->to_string(*read_column, row); + EXPECT_TRUE(value.find("key0") == std::string::npos) + << "row: " << row << ", value: " << value; + EXPECT_TRUE(value.find("key3") == std::string::npos) + << "row: " << row << ", value: " << value; + EXPECT_TRUE(value.find("key4") == std::string::npos) + << "row: " << row << ", value: " << value; + } + + // 17. check limit = 10000 + subcolumn.set_name(parent_column.name_lower_case() + ".key10"); + subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key10")); + st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr); + + for (int i = 0; i < limit; ++i) { + std::string key = parent_column.name_lower_case() + ".key10" + std::to_string(i); + variant_stats->sparse_column_non_null_size.erase(key); + } + + // 18. 
check compaction sparse extract column + subcolumn.set_name(parent_column.name_lower_case() + ".key3"); + subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key3")); + st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr); + + // 19. check compaction default column + subcolumn.set_name(parent_column.name_lower_case() + ".key10"); + subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key10")); + st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<DefaultValueColumnIterator*>(it) != nullptr); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); +} + +TEST_F(VariantColumnWriterReaderTest, test_write_data_advanced) { + // 1. create tablet_schema + TabletSchemaPB schema_pb; + schema_pb.set_keys_type(KeysType::DUP_KEYS); + SchemaUtils::construct_column(schema_pb.add_column(), 1, "VARIANT", "V1", 10); + _tablet_schema = std::make_shared<TabletSchema>(); + _tablet_schema->init_from_pb(schema_pb); + + // 2. create tablet + TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema)); + tablet_meta->_tablet_id = 10000; + _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get()); + EXPECT_TRUE(_tablet->init().ok()); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); + + // 3. create file_writer + io::FileWriterPtr file_writer; + auto file_path = local_segment_path(_tablet->tablet_path(), "0", 0); + auto st = io::global_local_filesystem()->create_file(file_path, &file_writer); + EXPECT_TRUE(st.ok()) << st.msg(); + + // 4. 
create column_writer + SegmentFooterPB footer; + ColumnWriterOptions opts; + opts.meta = footer.add_columns(); + opts.compression_type = CompressionTypePB::LZ4; + opts.file_writer = file_writer.get(); + opts.footer = &footer; + RowsetWriterContext rowset_ctx; + rowset_ctx.write_type = DataWriteType::TYPE_DIRECT; + opts.rowset_ctx = &rowset_ctx; + opts.rowset_ctx->tablet_schema = _tablet_schema; + TabletColumn column = _tablet_schema->column(0); + _init_column_meta(opts.meta, 0, column, CompressionTypePB::LZ4); + + std::unique_ptr<ColumnWriter> writer; + EXPECT_TRUE(ColumnWriter::create(opts, &column, file_writer.get(), &writer).ok()); + EXPECT_TRUE(writer->init().ok()); + EXPECT_TRUE(assert_cast<VariantColumnWriter*>(writer.get()) != nullptr); + + // 5. write data + auto olap_data_convertor = std::make_unique<vectorized::OlapBlockDataConvertor>(); + auto block = _tablet_schema->create_block(); + auto column_object = (*std::move(block.get_by_position(0).column)).mutate(); + std::unordered_map<int, std::string> inserted_jsonstr; + auto path_with_size = VariantUtil::fill_object_column_with_nested_test_data(column_object, 1000, + &inserted_jsonstr); + olap_data_convertor->add_column_data_convertor(column); + olap_data_convertor->set_source_content(&block, 0, 1000); + auto [result, accessor] = olap_data_convertor->convert_column_data(0); + EXPECT_TRUE(result.ok()); + EXPECT_TRUE(accessor != nullptr); + EXPECT_TRUE(writer->append(accessor->get_nullmap(), accessor->get_data(), 1000).ok()); + st = writer->finish(); + EXPECT_TRUE(st.ok()) << st.msg(); + st = writer->write_data(); + EXPECT_TRUE(st.ok()) << st.msg(); + st = writer->write_ordinal_index(); + EXPECT_TRUE(st.ok()) << st.msg(); + st = writer->write_zone_map(); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(file_writer->close().ok()); + footer.set_num_rows(1000); + + // 6. 
check footer + EXPECT_EQ(footer.columns_size(), 12); + auto column_meta = footer.columns(0); + EXPECT_EQ(column_meta.type(), (int)FieldType::OLAP_FIELD_TYPE_VARIANT); + + for (int i = 1; i < footer.columns_size() - 1; ++i) { + auto column_meta = footer.columns(i); + check_column_meta(column_meta, path_with_size); + } + check_sparse_column_meta(footer.columns(footer.columns_size() - 1), path_with_size); + + // 7. check variant reader + io::FileReaderSPtr file_reader; + st = io::global_local_filesystem()->open_file(file_path, &file_reader); + EXPECT_TRUE(st.ok()) << st.msg(); + ColumnReaderOptions read_opts; + std::unique_ptr<ColumnReader> column_reader; + st = ColumnReader::create(read_opts, footer, 0, 1000, file_reader, &column_reader); + EXPECT_TRUE(st.ok()) << st.msg(); + + auto variant_column_reader = assert_cast<VariantColumnReader*>(column_reader.get()); + EXPECT_TRUE(variant_column_reader != nullptr); + + // 8. check statistics + auto statistics = variant_column_reader->get_stats(); + for (const auto& [path, size] : statistics->subcolumns_non_null_size) { + std::cout << "path: " << path << ", size: " << size << std::endl; + EXPECT_EQ(path_with_size[path], size); + } + for (const auto& [path, size] : statistics->sparse_column_non_null_size) { + std::cout << "sparse path: " << path << ", size: " << size << std::endl; + EXPECT_EQ(path_with_size[path], size); + } + + // 9. 
check root + ColumnIterator* it; + TabletColumn parent_column = _tablet_schema->column(0); + StorageReadOptions storage_read_opts; + storage_read_opts.io_ctx.reader_type = ReaderType::READER_QUERY; + st = variant_column_reader->new_iterator(&it, parent_column, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); + ColumnIteratorOptions column_iter_opts; + OlapReaderStatistics stats; + column_iter_opts.stats = &stats; + column_iter_opts.file_reader = file_reader.get(); + st = it->init(column_iter_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + + MutableColumnPtr new_column_object = ColumnObject::create(3); + size_t nrows = 1000; + st = it->seek_to_ordinal(0); + EXPECT_TRUE(st.ok()) << st.msg(); + st = it->next_batch(&nrows, new_column_object); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(stats.bytes_read > 0); + + for (int i = 0; i < 1000; ++i) { + std::string value; + st = assert_cast<ColumnObject*>(new_column_object.get()) + ->serialize_one_row_to_string(i, &value); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_EQ(value, inserted_jsonstr[i]); + } + + auto read_to_column_object = [&]() { + new_column_object = ColumnObject::create(10); + nrows = 1000; + st = it->seek_to_ordinal(0); + EXPECT_TRUE(st.ok()) << st.msg(); + st = it->next_batch(&nrows, new_column_object); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(stats.bytes_read > 0); + EXPECT_EQ(nrows, 1000); + }; + + auto check_key_stats = [&](const std::string& key_num) { + std::string key = ".key" + key_num; + TabletColumn subcolumn_in_nested; + subcolumn_in_nested.set_name(parent_column.name_lower_case() + key); + subcolumn_in_nested.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + subcolumn_in_nested.set_parent_unique_id(parent_column.unique_id()); + subcolumn_in_nested.set_path_info(PathInData(parent_column.name_lower_case() + key)); + subcolumn_in_nested.set_variant_max_subcolumns_count( + 
parent_column.variant_max_subcolumns_count()); + subcolumn_in_nested.set_is_nullable(true); + + st = variant_column_reader->new_iterator(&it, subcolumn_in_nested, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); + st = it->init(column_iter_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + read_to_column_object(); + + size_t key_count = 0; + size_t key_nested_count = 0; + for (int row = 0; row < 1000; ++row) { + std::string value; + st = assert_cast<ColumnObject*>(new_column_object.get()) + ->serialize_one_row_to_string(row, &value); + EXPECT_TRUE(st.ok()) << st.msg(); + if (value.find("nested" + key_num) != std::string::npos) { + key_nested_count++; + } else if (value.find("88") != std::string::npos) { + key_count++; + } + } + EXPECT_EQ(key_count, path_with_size["key" + key_num]); + EXPECT_EQ(key_nested_count, path_with_size["key" + key_num + ".nested" + key_num]); + }; + + for (int i = 3; i < 10; ++i) { + check_key_stats(std::to_string(i)); + } + + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); +} + +} // namespace doris \ No newline at end of file diff --git a/be/test/olap/rowset/variant_with_compaction_test.cpp b/be/test/olap/rowset/variant_with_compaction_test.cpp new file mode 100644 index 00000000000..e69de29bb2d diff --git a/be/test/testutil/schema_utils.h b/be/test/testutil/schema_utils.h new file mode 100644 index 00000000000..fdb8354f975 --- /dev/null +++ b/be/test/testutil/schema_utils.h @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "vec/common/schema_util.h" + +namespace doris { + +class SchemaUtils { +public: + static void construct_column(ColumnPB* column_pb, int32_t col_unique_id, + const std::string& column_type, const std::string& column_name, + int variant_max_subcolumns_count = 3, bool is_key = false) { + column_pb->set_unique_id(col_unique_id); + column_pb->set_name(column_name); + column_pb->set_type(column_type); + column_pb->set_is_key(is_key); + column_pb->set_is_nullable(false); + if (column_type == "VARIANT") { + column_pb->set_variant_max_subcolumns_count(variant_max_subcolumns_count); + } + } + + static void construct_tablet_index(TabletIndexPB* tablet_index, int64_t index_id, + const std::string& index_name, int32_t col_unique_id) { + tablet_index->set_index_id(index_id); + tablet_index->set_index_name(index_name); + tablet_index->set_index_type(IndexType::INVERTED); + tablet_index->add_col_unique_id(col_unique_id); + } +}; + +} // namespace doris diff --git a/be/test/testutil/variant_util.h b/be/test/testutil/variant_util.h new file mode 100644 index 00000000000..27b9000ce55 --- /dev/null +++ b/be/test/testutil/variant_util.h @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "vec/columns/column_object.h" +#include "vec/columns/column_string.h" +#include "vec/common/schema_util.h" +#include "vec/data_types/data_type_string.h" +#include "vec/json/parse2column.h" + +namespace doris { + +using namespace vectorized; + +class VariantUtil { +public: + static schema_util::PathToNoneNullValues fill_string_column_with_test_data( + auto& column_string, int size, std::unordered_map<int, std::string>* inserted_jsonstr) { + schema_util::PathToNoneNullValues all_path_stats; + std::srand(42); + for (int i = 0; i < size; i++) { + std::string json_str = "{"; + int num_pairs = std::rand() % 10 + 1; + for (int j = 0; j < num_pairs; j++) { + std::string key = "key" + std::to_string(j); + if (j % 2 == 0) { + int value = 88; + json_str += "\"" + key + "\":" + std::to_string(value); + } else { + std::string value = "str" + std::to_string(99); + json_str += "\"" + key + "\":\"" + value + "\""; + } + if (j < num_pairs - 1) { + json_str += ","; + } + all_path_stats[key] += 1; + } + json_str += "}"; + vectorized::Field str(json_str); + column_string->insert_data(json_str.data(), json_str.size()); + (*inserted_jsonstr)[i] = json_str; + } + return all_path_stats; + } + + static schema_util::PathToNoneNullValues fill_string_column_with_nested_test_data( + auto& column_string, int size, std::unordered_map<int, std::string>* inserted_jsonstr) { + schema_util::PathToNoneNullValues all_path_stats; + std::srand(42); + for (int i = 0; i < size; i++) { + std::string json_str = "{"; + + int num_paths = std::rand() % 9 + 2; + int 
current_path = 0; + + json_str += "\"key0\":{"; + + json_str += "\"key1\":{"; + + json_str += "\"key2\":" + std::to_string(88) + ","; + json_str += "\"key3\":\"" + std::to_string(88) + "\""; + json_str += "},"; + json_str += "\"key4\":" + std::to_string(88); + json_str += "},"; + + all_path_stats["key0.key1.key2"] += 1; + all_path_stats["key0.key1.key3"] += 1; + all_path_stats["key0.key4"] += 1; + current_path += 3; + + while (current_path < num_paths) { + std::string key = "key" + std::to_string(current_path); + if (std::rand() % 2 == 0) { + json_str += "\"" + key + "\":{"; + json_str += + "\"nested" + std::to_string(current_path) + "\":" + std::to_string(88); + json_str += "},"; + all_path_stats[key + ".nested" + std::to_string(current_path)] += 1; + } else { + // add a simple path + json_str += "\"" + key + "\":\"" + std::to_string(88) + "\","; + all_path_stats[key] += 1; + } + current_path++; + } + + json_str = json_str.substr(0, json_str.length() - 1); + json_str += "}"; + + vectorized::Field str(json_str); + column_string->insert_data(json_str.data(), json_str.size()); + (*inserted_jsonstr)[i] = json_str; + } + return all_path_stats; + } + + static schema_util::PathToNoneNullValues fill_object_column_with_test_data( + auto& column_object, int size, std::unordered_map<int, std::string>* inserted_jsonstr) { + auto type_string = std::make_shared<vectorized::DataTypeString>(); + auto column = type_string->create_column(); + auto column_string = assert_cast<ColumnString*>(column.get()); + auto res = fill_string_column_with_test_data(column_string, size, inserted_jsonstr); + vectorized::ParseConfig config; + config.enable_flatten_nested = false; + parse_json_to_variant(*column_object, *column_string, config); + return res; + } + + static schema_util::PathToNoneNullValues fill_object_column_with_nested_test_data( + auto& column_object, int size, std::unordered_map<int, std::string>* inserted_jsonstr) { + auto type_string = std::make_shared<vectorized::DataTypeString>(); + auto 
column = type_string->create_column(); + auto column_string = assert_cast<ColumnString*>(column.get()); + auto res = fill_string_column_with_nested_test_data(column_string, size, inserted_jsonstr); + vectorized::ParseConfig config; + config.enable_flatten_nested = false; + parse_json_to_variant(*column_object, *column_string, config); + return res; + } +}; + +} // namespace doris diff --git a/be/test/vec/common/schema_util_rowset_test.cpp b/be/test/vec/common/schema_util_rowset_test.cpp new file mode 100644 index 00000000000..eb4b2ad4c39 --- /dev/null +++ b/be/test/vec/common/schema_util_rowset_test.cpp @@ -0,0 +1,265 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include <gmock/gmock-more-matchers.h> +#include <gtest/gtest.h> + +#include "olap/rowset/beta_rowset_writer.h" +#include "olap/rowset/rowset_factory.h" +#include "olap/rowset/segment_v2/variant_column_writer_impl.h" +#include "olap/storage_engine.h" +#include "olap/tablet_schema.h" +#include "vec/common/schema_util.h" +#include "vec/json/parse2column.h" + +using namespace doris::vectorized; + +using namespace doris::segment_v2; + +using namespace doris; + +constexpr static uint32_t MAX_PATH_LEN = 1024; +constexpr static std::string_view dest_dir = "/ut_dir/schema_util_test"; +constexpr static std::string_view tmp_dir = "./ut_dir/tmp"; + +class SchemaUtilRowsetTest : public testing::Test { +protected: + void SetUp() override { + // absolute dir + char buffer[MAX_PATH_LEN]; + EXPECT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr); + _curreent_dir = std::string(buffer); + _absolute_dir = _curreent_dir + std::string(dest_dir); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(_absolute_dir).ok()); + + // tmp dir + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(tmp_dir).ok()); + std::vector<StorePath> paths; + paths.emplace_back(std::string(tmp_dir), 1024000000); + auto tmp_file_dirs = std::make_unique<segment_v2::TmpFileDirs>(paths); + EXPECT_TRUE(tmp_file_dirs->init().ok()); + ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs)); + + // storage engine + doris::EngineOptions options; + auto engine = std::make_unique<StorageEngine>(options); + _engine_ref = engine.get(); + _data_dir = std::make_unique<DataDir>(*_engine_ref, _absolute_dir); + static_cast<void>(_data_dir->update_capacity()); + EXPECT_TRUE(_data_dir->init(true).ok()); + ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); + } + void TearDown() override { + 
//EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); + _engine_ref = nullptr; + ExecEnv::GetInstance()->set_storage_engine(nullptr); + } + +public: + SchemaUtilRowsetTest() = default; + virtual ~SchemaUtilRowsetTest() = default; + +private: + StorageEngine* _engine_ref = nullptr; + std::unique_ptr<DataDir> _data_dir = nullptr; + TabletSharedPtr _tablet = nullptr; + std::string _absolute_dir; + std::string _curreent_dir; +}; + +static void construct_column(ColumnPB* column_pb, int32_t col_unique_id, + const std::string& column_type, const std::string& column_name, + bool is_key = false) { + column_pb->set_unique_id(col_unique_id); + column_pb->set_name(column_name); + column_pb->set_type(column_type); + column_pb->set_is_key(is_key); + column_pb->set_is_nullable(false); + if (column_type == "VARIANT") { + column_pb->set_variant_max_subcolumns_count(3); + } +} + +// static void construct_tablet_index(TabletIndexPB* tablet_index, int64_t index_id, const std::string& index_name, int32_t col_unique_id) { +// tablet_index->set_index_id(index_id); +// tablet_index->set_index_name(index_name); +// tablet_index->set_index_type(IndexType::INVERTED); +// tablet_index->add_col_unique_id(col_unique_id); +// } + +static std::unordered_map<int32_t, schema_util::PathToNoneNullValues> all_path_stats; +static void fill_string_column_with_test_data(auto& column_string, int size, int uid) { + std::srand(42); + for (int i = 0; i < size; i++) { + std::string json_str = "{"; + int num_pairs = std::rand() % 10 + 1; + for (int j = 0; j < num_pairs; j++) { + std::string key = "key" + std::to_string(j); + if (std::rand() % 2 == 0) { + int value = std::rand() % 100; + json_str += "\"" + key + "\" : " + std::to_string(value); + } else { + std::string value = "str" + std::to_string(std::rand() % 
100); + json_str += "\"" + key + "\" : \"" + value + "\""; + } + if (j < num_pairs - 1) { + json_str += ", "; + } + all_path_stats[uid][key] += 1; + } + json_str += "}"; + vectorized::Field str(json_str); + column_string->insert_data(json_str.data(), json_str.size()); + } +} + +static void fill_varaint_column(auto& variant_column, int size, int uid) { + auto type_string = std::make_shared<vectorized::DataTypeString>(); + auto column = type_string->create_column(); + auto column_string = assert_cast<ColumnString*>(column.get()); + fill_string_column_with_test_data(column_string, size, uid); + vectorized::ParseConfig config; + config.enable_flatten_nested = false; + parse_json_to_variant(*variant_column, *column_string, config); +} + +static void fill_block_with_test_data(vectorized::Block* block, int size) { + auto columns = block->mutate_columns(); + // insert key + for (int i = 0; i < size; i++) { + vectorized::Field key = i; + columns[0]->insert(key); + } + + // insert v1 + fill_varaint_column(columns[1], size, 1); + + // insert v2 + for (int i = 0; i < size; i++) { + vectorized::Field v2("V2"); + columns[2]->insert(v2); + } + + // insert v3 + fill_varaint_column(columns[3], size, 3); + + // insert v4 + for (int i = 0; i < size; i++) { + vectorized::Field v4(i); + columns[4]->insert(v4); + } +} +static int64_t inc_id = 1000; +static RowsetWriterContext rowset_writer_context(const std::unique_ptr<DataDir>& data_dir, + const TabletSchemaSPtr& schema, + const std::string& tablet_path) { + RowsetWriterContext context; + RowsetId rowset_id; + rowset_id.init(inc_id); + context.rowset_id = rowset_id; + context.rowset_type = BETA_ROWSET; + context.data_dir = data_dir.get(); + context.rowset_state = VISIBLE; + context.tablet_schema = schema; + context.tablet_path = tablet_path; + context.version = Version(inc_id, inc_id); + context.max_rows_per_segment = 200; + inc_id++; + return context; +} + +static RowsetSharedPtr create_rowset(auto& rowset_writer, const 
TabletSchemaSPtr& tablet_schema) { + vectorized::Block block = tablet_schema->create_block(); + fill_block_with_test_data(&block, 1000); + auto st = rowset_writer->add_block(&block); + EXPECT_TRUE(st.ok()) << st.msg(); + st = rowset_writer->flush(); + EXPECT_TRUE(st.ok()) << st.msg(); + + RowsetSharedPtr rowset; + EXPECT_TRUE(rowset_writer->build(rowset).ok()); + EXPECT_TRUE(rowset->num_segments() == 5); + return rowset; +} + +TEST_F(SchemaUtilRowsetTest, collect_path_stats_and_get_compaction_schema) { + // 1.create tablet schema + TabletSchemaPB schema_pb; + construct_column(schema_pb.add_column(), 0, "INT", "key", true); + construct_column(schema_pb.add_column(), 1, "VARIANT", "v1"); + construct_column(schema_pb.add_column(), 2, "STRING", "v2"); + construct_column(schema_pb.add_column(), 3, "VARIANT", "v3"); + construct_column(schema_pb.add_column(), 4, "INT", "v4"); + TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>(); + tablet_schema->init_from_pb(schema_pb); + + // 2. create tablet + TabletMetaSharedPtr tablet_meta(new TabletMeta(tablet_schema)); + _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get()); + EXPECT_TRUE(_tablet->init().ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); + + // 3. 
create rowset + std::vector<RowsetSharedPtr> rowsets; + for (int i = 0; i < 5; i++) { + const auto& res = RowsetFactory::create_rowset_writer( + *_engine_ref, + rowset_writer_context(_data_dir, tablet_schema, _tablet->tablet_path()), false); + EXPECT_TRUE(res.has_value()) << res.error(); + const auto& rowset_writer = res.value(); + auto rowset = create_rowset(rowset_writer, tablet_schema); + EXPECT_TRUE(_tablet->add_rowset(rowset).ok()); + rowsets.push_back(rowset); + } + + std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats; + for (const auto& rowset : rowsets) { + auto st = schema_util::collect_path_stats(rowset, path_stats); + EXPECT_TRUE(st.ok()) << st.msg(); + } + + for (const auto& [uid, path_stats] : path_stats) { + for (const auto& [path, size] : path_stats) { + EXPECT_EQ(all_path_stats[uid][path], size); + } + } + + // 4. get compaction schema + TabletSchemaSPtr compaction_schema = tablet_schema; + auto st = schema_util::get_compaction_schema(rowsets, compaction_schema); + EXPECT_TRUE(st.ok()) << st.msg(); + + // 5. 
check compaction schema + std::unordered_map<int32_t, std::vector<std::string>> compaction_schema_map; + for (const auto& column : compaction_schema->columns()) { + if (column->parent_unique_id() > 0) { + compaction_schema_map[column->parent_unique_id()].push_back(column->name()); + } + } + for (auto& [uid, paths] : compaction_schema_map) { + EXPECT_EQ(paths.size(), 4); + std::sort(paths.begin(), paths.end()); + EXPECT_TRUE(paths[0].ends_with("__DORIS_VARIANT_SPARSE__")); + EXPECT_TRUE(paths[1].ends_with("key0")); + EXPECT_TRUE(paths[2].ends_with("key1")); + EXPECT_TRUE(paths[3].ends_with("key2")); + } +} diff --git a/be/test/vec/common/schema_util_test.cpp b/be/test/vec/common/schema_util_test.cpp new file mode 100644 index 00000000000..ffa790ddf49 --- /dev/null +++ b/be/test/vec/common/schema_util_test.cpp @@ -0,0 +1,357 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "vec/common/schema_util.h" + +#include <gmock/gmock-more-matchers.h> +#include <gtest/gtest.h> + +#include "olap/rowset/segment_v2/variant_column_writer_impl.h" + +using namespace doris::vectorized; + +using namespace doris::segment_v2; + +using namespace doris; + +class SchemaUtilTest : public testing::Test { +public: + SchemaUtilTest() = default; + virtual ~SchemaUtilTest() = default; +}; + +void construct_column(ColumnPB* column_pb, TabletIndexPB* tablet_index, int64_t index_id, + const std::string& index_name, int32_t col_unique_id, + const std::string& column_type, const std::string& column_name, + const IndexType& index_type) { + column_pb->set_unique_id(col_unique_id); + column_pb->set_name(column_name); + column_pb->set_type(column_type); + column_pb->set_is_nullable(true); + column_pb->set_is_bf_column(true); + tablet_index->set_index_id(index_id); + tablet_index->set_index_name(index_name); + tablet_index->set_index_type(index_type); + tablet_index->add_col_unique_id(col_unique_id); +} + +void construct_subcolumn(TabletSchemaSPtr schema, const FieldType& type, int32_t col_unique_id, + std::string_view path, std::vector<TabletColumn>* subcolumns) { + TabletColumn subcol; + subcol.set_type(type); + subcol.set_is_nullable(true); + subcol.set_unique_id(-1); + subcol.set_parent_unique_id(col_unique_id); + vectorized::PathInData col_path(path); + subcol.set_path_info(col_path); + subcol.set_name(col_path.get_path()); + + if (type == FieldType::OLAP_FIELD_TYPE_ARRAY) { + TabletColumn array_item_col; + // double not support inverted index + array_item_col.set_type(FieldType::OLAP_FIELD_TYPE_DOUBLE); + array_item_col.set_is_nullable(true); + array_item_col.set_unique_id(-1); + array_item_col.set_parent_unique_id(col_unique_id); + + subcol.add_sub_column(array_item_col); + } + + schema->append_column(subcol); + subcolumns->emplace_back(std::move(subcol)); +} + +// void construct_subcolumn(TabletSchemaSPtr schema, const FieldType& type, +// int32_t 
col_unique_id, std::string_view path, +// std::vector<TabletColumn>* subcolumns) { +// TabletColumn subcol; +// subcol.set_type(type); +// subcol.set_is_nullable(true); +// subcol.set_unique_id(-1); +// subcol.set_parent_unique_id(col_unique_id); +// vectorized::PathInData col_path(path); +// subcol.set_path_info(col_path); +// subcol.set_name(col_path.get_path()); +// schema->append_column(subcol); +// subcolumns->emplace_back(std::move(subcol)); +// } + +TEST_F(SchemaUtilTest, inherit_column_attributes) { + TabletSchemaPB schema_pb; + schema_pb.set_keys_type(KeysType::DUP_KEYS); + schema_pb.set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2); + + construct_column(schema_pb.add_column(), schema_pb.add_index(), 10000, "key_index", 0, "INT", + "key", IndexType::INVERTED); + construct_column(schema_pb.add_column(), schema_pb.add_index(), 10001, "v1_index", 1, "VARIANT", + "v1", IndexType::INVERTED); + construct_column(schema_pb.add_column(), schema_pb.add_index(), 10003, "v3_index", 3, "VARIANT", + "v3", IndexType::INVERTED); + + TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>(); + tablet_schema->init_from_pb(schema_pb); + std::vector<TabletColumn> subcolumns; + + construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_STRING, 1, "v1.b", &subcolumns); + construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_INT, 1, "v1.c", &subcolumns); + + construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_ARRAY, 3, "v3.d", &subcolumns); + construct_subcolumn(tablet_schema, FieldType::OLAP_FIELD_TYPE_FLOAT, 3, "v3.a", &subcolumns); + + schema_util::inherit_column_attributes(tablet_schema); + for (const auto& col : subcolumns) { + switch (col._parent_col_unique_id) { + case 1: + EXPECT_TRUE(tablet_schema->inverted_index(col) != nullptr); + break; + case 3: + EXPECT_TRUE(tablet_schema->inverted_index(col) == nullptr); + break; + default: + EXPECT_TRUE(false); + } + } + EXPECT_EQ(tablet_schema->inverted_indexes().size(), 7); + + 
for (const auto& col : tablet_schema->_cols) { + if (!col->is_extracted_column()) { + continue; + } + switch (col->_parent_col_unique_id) { + case 1: + EXPECT_TRUE(col->is_bf_column()); + break; + case 3: + EXPECT_TRUE(!col->is_bf_column()); + break; + default: + EXPECT_TRUE(false); + } + } +} + +static std::unordered_map<std::string, int> construct_column_map_with_random_values( + auto& column_map, int key_size, int value_size, const std::string& prefix) { + std::unordered_map<std::string, int> key_value_counts; + auto& key = assert_cast<ColumnString&>(column_map->get_keys()); + auto& value = assert_cast<ColumnString&>(column_map->get_values()); + auto& offsets = column_map->get_offsets(); + + std::srand(42); + + for (int i = 0; i < key_size; ++i) { + std::string current_key = prefix + std::to_string(i); + + int value_count = std::rand() % value_size + 1; + key_value_counts[current_key] = value_count; + + for (int j = 0; j < value_count; ++j) { + key.insert_data(current_key.data(), current_key.size()); + auto value_str = prefix + std::to_string(j); + value.insert_data(value_str.data(), value_str.size()); + } + offsets.push_back(key.size()); + } + + return key_value_counts; +} + +TEST_F(SchemaUtilTest, calculate_variant_stats) { + VariantStatisticsPB stats; + auto column_map = ColumnMap::create(ColumnString::create(), ColumnString::create(), + ColumnArray::ColumnOffsets::create()); + + const auto& key_value_counts = + construct_column_map_with_random_values(column_map, 200, 100, "key_"); + + // calculate stats + schema_util::calculate_variant_stats(*column_map, &stats, 0, 200); + EXPECT_EQ(stats.sparse_column_non_null_size_size(), key_value_counts.size()); + + for (const auto& kv : key_value_counts) { + auto it = stats.sparse_column_non_null_size().find(kv.first); + EXPECT_NE(it, stats.sparse_column_non_null_size().end()); + EXPECT_EQ(it->second, kv.second); + } + + // test with different key size + column_map->clear(); + const auto& key_value_counts2 = + 
construct_column_map_with_random_values(column_map, 3000, 100, "key_"); + schema_util::calculate_variant_stats(*column_map, &stats, 0, 3000); + EXPECT_EQ(stats.sparse_column_non_null_size_size(), 3000); + + for (const auto& [path, size] : stats.sparse_column_non_null_size()) { + auto first_size = key_value_counts.find(path) == key_value_counts.end() + ? 0 + : key_value_counts.find(path)->second; + auto second_size = key_value_counts2.find(path) == key_value_counts2.end() + ? 0 + : key_value_counts2.find(path)->second; + EXPECT_EQ(size, first_size + second_size); + } + + // test with max size + column_map->clear(); + const auto& key_value_counts3 = construct_column_map_with_random_values( + column_map, VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE, 5, "key2_"); + schema_util::calculate_variant_stats(*column_map, &stats, 0, + VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE); + EXPECT_EQ(VariantStatistics::MAX_SPARSE_DATA_STATISTICS_SIZE, + stats.sparse_column_non_null_size_size()); + + for (const auto& [path, size] : stats.sparse_column_non_null_size()) { + auto first_size = key_value_counts.find(path) == key_value_counts.end() + ? 0 + : key_value_counts.find(path)->second; + auto second_size = key_value_counts2.find(path) == key_value_counts2.end() + ? 0 + : key_value_counts2.find(path)->second; + auto third_size = key_value_counts3.find(path) == key_value_counts3.end() + ? 
0 + : key_value_counts3.find(path)->second; + EXPECT_EQ(size, first_size + second_size + third_size); + } +} + +TEST_F(SchemaUtilTest, get_subpaths) { + TabletColumn variant; + variant.set_unique_id(1); + variant.set_variant_max_subcolumns_count(3); + + std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats; + path_stats[1] = { + {"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}, {"path5", 200}}; + + // get subpaths + std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info; + schema_util::get_subpaths(variant, path_stats, uid_to_paths_set_info); + + EXPECT_EQ(uid_to_paths_set_info[1].sub_path_set.size(), 3); + EXPECT_EQ(uid_to_paths_set_info[1].sparse_path_set.size(), 2); + + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path1") != + uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path2") != + uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path3") != + uid_to_paths_set_info[1].sub_path_set.end()); + + EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path4") != + uid_to_paths_set_info[1].sparse_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path5") != + uid_to_paths_set_info[1].sparse_path_set.end()); +} + +TEST_F(SchemaUtilTest, get_subpaths_equal_to_max) { + TabletColumn variant; + variant.set_unique_id(1); + variant.set_variant_max_subcolumns_count(3); + + std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats; + path_stats[1] = {{"path1", 1000}, {"path2", 800}, {"path3", 500}}; + + std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info; + schema_util::get_subpaths(variant, path_stats, uid_to_paths_set_info); + + EXPECT_EQ(uid_to_paths_set_info[1].sub_path_set.size(), 3); + EXPECT_EQ(uid_to_paths_set_info[1].sparse_path_set.size(), 0); + + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path1") != + 
uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path2") != + uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path3") != + uid_to_paths_set_info[1].sub_path_set.end()); +} + +TEST_F(SchemaUtilTest, get_subpaths_multiple_variants) { + TabletColumn variant1; + variant1.set_unique_id(1); + variant1.set_variant_max_subcolumns_count(3); + + TabletColumn variant2; + variant2.set_unique_id(2); + variant2.set_variant_max_subcolumns_count(2); + + TabletColumn variant3; + variant3.set_unique_id(3); + variant3.set_variant_max_subcolumns_count(4); + + std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats; + path_stats[1] = { + {"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}, {"path5", 200}}; + path_stats[2] = {{"path1", 1000}, {"path2", 800}}; + path_stats[3] = {{"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}}; + path_stats[4] = { + {"path1", 1000}, {"path2", 800}, {"path3", 500}, {"path4", 300}, {"path5", 200}}; + + std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info; + schema_util::get_subpaths(variant1, path_stats, uid_to_paths_set_info); + schema_util::get_subpaths(variant2, path_stats, uid_to_paths_set_info); + schema_util::get_subpaths(variant3, path_stats, uid_to_paths_set_info); + + EXPECT_EQ(uid_to_paths_set_info[1].sub_path_set.size(), 3); + EXPECT_EQ(uid_to_paths_set_info[1].sparse_path_set.size(), 2); + + EXPECT_EQ(uid_to_paths_set_info[2].sub_path_set.size(), 2); + EXPECT_EQ(uid_to_paths_set_info[2].sparse_path_set.size(), 0); + + EXPECT_EQ(uid_to_paths_set_info[3].sub_path_set.size(), 4); + EXPECT_EQ(uid_to_paths_set_info[3].sparse_path_set.size(), 0); + + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path1") != + uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path2") != + 
uid_to_paths_set_info[1].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sub_path_set.find("path3") != + uid_to_paths_set_info[1].sub_path_set.end()); + + EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path4") != + uid_to_paths_set_info[1].sparse_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[1].sparse_path_set.find("path5") != + uid_to_paths_set_info[1].sparse_path_set.end()); + + EXPECT_TRUE(uid_to_paths_set_info[2].sub_path_set.find("path1") != + uid_to_paths_set_info[2].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[2].sub_path_set.find("path2") != + uid_to_paths_set_info[2].sub_path_set.end()); + + EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path1") != + uid_to_paths_set_info[3].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path2") != + uid_to_paths_set_info[3].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path3") != + uid_to_paths_set_info[3].sub_path_set.end()); + EXPECT_TRUE(uid_to_paths_set_info[3].sub_path_set.find("path4") != + uid_to_paths_set_info[3].sub_path_set.end()); +} + +TEST_F(SchemaUtilTest, get_subpaths_no_path_stats) { + TabletColumn variant; + variant.set_unique_id(1); + variant.set_variant_max_subcolumns_count(3); + + std::unordered_map<int32_t, schema_util::PathToNoneNullValues> path_stats; + path_stats[2] = {{"path1", 1000}, {"path2", 800}}; + + std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info; + schema_util::get_subpaths(variant, path_stats, uid_to_paths_set_info); + + EXPECT_EQ(uid_to_paths_set_info[1].sub_path_set.size(), 0); + EXPECT_EQ(uid_to_paths_set_info[1].sparse_path_set.size(), 0); +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org