This is an automated email from the ASF dual-hosted git repository. eldenmoon pushed a commit to branch variant-sparse in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/variant-sparse by this push: new f4236bd0807 [Fix](Variant) fix some p0 cases (#50339) f4236bd0807 is described below commit f4236bd0807e8d95397acb2d6a5d5e5bc4b8574d Author: lihangyu <lihan...@selectdb.com> AuthorDate: Thu Apr 24 00:17:41 2025 +0800 [Fix](Variant) fix some p0 cases (#50339) 1. fix `variant_max_subcolumns_count` in none variant table 2. fix predicate with variant itself like `where v > 5` and refactor `get_data_type_of` 3. fix DataType.fromCatalogType for variant type in `trivialTypes` 4. fix some serialize cases --- be/src/olap/rowset/segment_v2/segment.cpp | 202 +-- be/src/olap/rowset/segment_v2/segment_writer.cpp | 4 +- .../variant_column_writer_reader_test.cpp | 1326 ++++++++++---------- .../olap/rowset/variant_with_compaction_test.cpp | 0 .../java/org/apache/doris/catalog/OlapTable.java | 6 +- .../org/apache/doris/nereids/types/DataType.java | 17 +- .../java/org/apache/doris/qe/SessionVariable.java | 2 +- .../ddl/create_nestedtypes_with_schemachange.out | Bin 3234 -> 3258 bytes .../test_modify_reorder_column.out | Bin 1043 -> 1025 bytes regression-test/data/variant_p0/load.out | Bin 16265 -> 16350 bytes regression-test/suites/variant_p0/load.groovy | 1 + 11 files changed, 722 insertions(+), 836 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 36c419d5936..785e68eac7d 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -576,38 +576,48 @@ Status Segment::healthy_status() { } // Return the storage datatype of related column to field. -// Return nullptr meaning no such storage infomation for this column vectorized::DataTypePtr Segment::get_data_type_of(const TabletColumn& column, bool read_flat_leaves) const { - // Path has higher priority - auto path = column.path_info_ptr(); - auto relative_path = path != nullptr ? path->copy_pop_front() : vectorized::PathInData(); - if (!relative_path.empty()) { - int32_t unique_id = column.unique_id() > 0 ? column.unique_id() : column.parent_unique_id(); - const auto* node = _column_readers.contains(unique_id) - ? ((VariantColumnReader*)(_column_readers.at(unique_id).get())) - ->get_reader_by_path(relative_path) - : nullptr; - if (node) { - bool exist_in_sparse = ((VariantColumnReader*)(_column_readers.at(unique_id).get())) - ->exist_in_sparse_column(relative_path); - if (read_flat_leaves || (node->children.empty() && !exist_in_sparse)) { - return node->data.file_column_type; - } - } - // missing in storage, treat it using input data type - if (read_flat_leaves && !node) { - return nullptr; - } - // it contains children, exist in sparse column or column missing in storage, so treat it as variant - return column.is_nullable() - ? vectorized::make_nullable(std::make_shared<vectorized::DataTypeObject>( - column.variant_max_subcolumns_count())) - : std::make_shared<vectorized::DataTypeObject>( - column.variant_max_subcolumns_count()); + const vectorized::PathInDataPtr path = column.path_info_ptr(); + + // none variant column + if (path == nullptr || path->empty()) { + return vectorized::DataTypeFactory::instance().create_data_type(column); } - // TODO support normal column type - return nullptr; + + // Path exists, proceed with variant logic. + vectorized::PathInData relative_path = path->copy_pop_front(); + int32_t unique_id = column.unique_id() > 0 ? column.unique_id() : column.parent_unique_id(); + + // Find the reader for the base variant column. + if (!_column_readers.contains(unique_id)) { + return vectorized::DataTypeFactory::instance().create_data_type(column); + } + + const auto* variant_reader = + static_cast<const VariantColumnReader*>(_column_readers.at(unique_id).get()); + + // Find the specific node within the variant structure using the relative path. + const auto* node = variant_reader->get_reader_by_path(relative_path); + + // Case 1: Node not found for the given path within the variant reader. + // If relative_path is empty, it means the original path pointed to the root + // of the variant column itself. We should return the Variant type. + if (node == nullptr || relative_path.empty()) { + return vectorized::DataTypeFactory::instance().create_data_type(column); + } + + bool exist_in_sparse = variant_reader->exist_in_sparse_column(relative_path); + bool is_physical_leaf = node->children.empty(); + + // Condition to return the specific underlying type of the node: + // 1. We are reading flat leaves (ignoring hierarchy). + // 2. OR It's a leaf in the physical column structure AND it doesn't *also* exist + // in the sparse column (meaning it's purely a materialized leaf). + if (read_flat_leaves || (is_physical_leaf && !exist_in_sparse)) { + return node->data.file_column_type; + } + return vectorized::DataTypeFactory::instance().create_data_type(column); } Status Segment::_create_column_readers_once(OlapReaderStatistics* stats) { @@ -652,92 +662,6 @@ Status Segment::_create_column_readers(const SegmentFooterPB& footer) { _column_readers.emplace(column.unique_id(), std::move(reader)); } - // for (const auto& [path, ordinal] : column_path_to_footer_ordinal) { - // const ColumnMetaPB& column_pb = footer.columns(ordinal); - // ColumnReaderOptions opts { - // .kept_in_memory = _tablet_schema->is_in_memory(), - // .be_exec_version = _be_exec_version, - // }; - // std::unique_ptr<ColumnReader> reader; - // RETURN_IF_ERROR( - // ColumnReader::create(opts, column_pb, footer.num_rows(), _file_reader, &reader)); - // int32_t unique_id = column_pb.unique_id(); - // auto relative_path = path.copy_pop_front(); - // if (_sub_column_tree[unique_id].get_root() == nullptr) { - // _sub_column_tree[unique_id].create_root(SubcolumnReader {nullptr, nullptr}); - // } - // if (relative_path.empty()) { - // // root column - // _sub_column_tree[unique_id].get_mutable_root()->modify_to_scalar(SubcolumnReader { - // std::move(reader), - // vectorized::DataTypeFactory::instance().create_data_type(column_pb)}); - // } else { - // // check the root is already a leaf node - // // DCHECK(_sub_column_tree[unique_id].get_leaves()[0]->path.empty()); - // _sub_column_tree[unique_id].add( - // relative_path, - // SubcolumnReader { - // std::move(reader), - // vectorized::DataTypeFactory::instance().create_data_type(column_pb)}); - // } - // } - - // compability reason use tablet schema - // init by column path - // for (uint32_t ordinal = 0; ordinal < _tablet_schema->num_columns(); ++ordinal) { - // const auto& column = _tablet_schema->column(ordinal); - // if (!column.has_path_info()) { - // continue; - // } - // auto path = column.has_path_info() ? *column.path_info_ptr() - // : vectorized::PathInData(column.name_lower_case()); - // auto iter = column_path_to_footer_ordinal.find(path); - // if (iter == column_path_to_footer_ordinal.end()) { - // continue; - // } - // const ColumnMetaPB& column_pb = footer.columns(iter->second); - // ColumnReaderOptions opts { - // .kept_in_memory = _tablet_schema->is_in_memory(), - // .be_exec_version = _be_exec_version, - // }; - // std::unique_ptr<ColumnReader> reader; - // RETURN_IF_ERROR( - // ColumnReader::create(opts, column_pb, footer.num_rows(), _file_reader, &reader)); - // // root column use unique id, leaf column use parent_unique_id - // int32_t unique_id = - // column.parent_unique_id() > 0 ? column.parent_unique_id() : column.unique_id(); - // auto relative_path = path.copy_pop_front(); - // if (relative_path.empty()) { - // // root column - // _sub_column_tree[unique_id].create_root(SubcolumnReader { - // std::move(reader), - // vectorized::DataTypeFactory::instance().create_data_type(column_pb)}); - // } else { - // // check the root is already a leaf node - // DCHECK(_sub_column_tree[unique_id].get_leaves()[0]->path.empty()); - // _sub_column_tree[unique_id].add( - // relative_path, - // SubcolumnReader { - // std::move(reader), - // vectorized::DataTypeFactory::instance().create_data_type(column_pb)}); - // } - - // // init sparse columns paths and type info - // for (uint32_t ordinal = 0; ordinal < column_pb.sparse_columns().size(); ++ordinal) { - // const auto& spase_column_pb = column_pb.sparse_columns(ordinal); - // if (spase_column_pb.has_column_path_info()) { - // vectorized::PathInData path; - // path.from_protobuf(spase_column_pb.column_path_info()); - // // Read from root column, so reader is nullptr - // _sparse_column_tree[unique_id].add( - // path.copy_pop_front(), - // SubcolumnReader {nullptr, - // vectorized::DataTypeFactory::instance().create_data_type( - // spase_column_pb)}); - // } - // } - // } - return Status::OK(); } @@ -761,49 +685,6 @@ Status Segment::new_default_iterator(const TabletColumn& tablet_column, return Status::OK(); } -// Status Segment::new_column_iterator_with_path(const TabletColumn& tablet_column, -// std::unique_ptr<ColumnIterator>* iter, -// const StorageReadOptions* opt) { -// // root column use unique id, leaf column use parent_unique_id -// int32_t unique_id = tablet_column.unique_id() > 0 ? tablet_column.unique_id() -// : tablet_column.parent_unique_id(); -// if (!_sub_column_tree.contains(unique_id)) { -// // No such variant column in this segment, get a default one -// RETURN_IF_ERROR(new_default_iterator(tablet_column, iter)); -// return Status::OK(); -// } -// auto relative_path = tablet_column.path_info_ptr()->copy_pop_front(); -// const auto* root = _sub_column_tree[unique_id].get_root(); -// const auto* node = tablet_column.has_path_info() -// ? _sub_column_tree[unique_id].find_exact(relative_path) -// : nullptr; -// -// if (node != nullptr) { -// if (node->is_leaf_node()) { -// // Node contains column without any child sub columns and no corresponding sparse columns -// // Direct read extracted columns -// const auto* node = _sub_column_tree[unique_id].find_leaf(relative_path); -// ColumnIterator* it; -// RETURN_IF_ERROR(node->data.reader->new_iterator(&it)); -// iter->reset(it); -// } else { -// // Node contains column with children columns or has correspoding sparse columns -// // Create reader with hirachical data. -// // If sparse column exists or read the full path of variant read in MERGE_ROOT, otherwise READ_DIRECT -// HierarchicalDataReader::ReadType read_type = -// (relative_path == root->path) ? HierarchicalDataReader::ReadType::MERGE_ROOT -// : HierarchicalDataReader::ReadType::READ_DIRECT; -// RETURN_IF_ERROR( -// HierarchicalDataReader::create(iter, relative_path, node, root, read_type)); -// } -// } else { -// // No such node, read from sparse column -// // TODO test if in VariantStatisticsPB.sparse_column_non_null_size, otherwise generate a default iterator -// } -// -// return Status::OK(); -// } - // Not use cid anymore, for example original table schema is colA int, then user do following actions // 1.add column b // 2. drop column b @@ -819,11 +700,6 @@ Status Segment::new_column_iterator(const TabletColumn& tablet_column, } RETURN_IF_ERROR(_create_column_readers_once(opt->stats)); - // init column iterator by path info - // if (tablet_column.has_path_info() || tablet_column.is_variant_type()) { - // return new_column_iterator_with_path(tablet_column, iter, opt); - // } - // For compability reason unique_id may less than 0 for variant extracted column int32_t unique_id = tablet_column.unique_id() >= 0 ? tablet_column.unique_id() : tablet_column.parent_unique_id(); diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index 026ff4948ec..5d6763afd2a 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -290,7 +290,9 @@ Status SegmentWriter::_create_column_writer(uint32_t cid, const TabletColumn& co opts.file_writer = _file_writer; opts.compression_type = _opts.compression_type; opts.footer = &_footer; - opts.input_rs_readers = _opts.rowset_ctx->input_rs_readers; + if (_opts.rowset_ctx != nullptr) { + opts.input_rs_readers = _opts.rowset_ctx->input_rs_readers; + } std::unique_ptr<ColumnWriter> writer; RETURN_IF_ERROR(ColumnWriter::create(opts, &column, _file_writer, &writer)); diff --git a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp index 6945065e05a..d549d8f6ae8 100644 --- a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp +++ b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp @@ -1,663 +1,663 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "gtest/gtest.h" -#include "olap/rowset/segment_v2/column_reader.h" -#include "olap/rowset/segment_v2/hierarchical_data_reader.h" -#include "olap/rowset/segment_v2/variant_column_writer_impl.h" -#include "olap/storage_engine.h" -#include "testutil/schema_utils.h" -#include "testutil/variant_util.h" - -using namespace doris::vectorized; - -namespace doris { - -constexpr static uint32_t MAX_PATH_LEN = 1024; -constexpr static std::string_view dest_dir = "/ut_dir/variant_column_writer_test"; -constexpr static std::string_view tmp_dir = "./ut_dir/tmp"; - -class VariantColumnWriterReaderTest : public testing::Test { -public: - void SetUp() override { - // absolute dir - char buffer[MAX_PATH_LEN]; - EXPECT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr); - _current_dir = std::string(buffer); - _absolute_dir = _current_dir + std::string(dest_dir); - EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); - EXPECT_TRUE(io::global_local_filesystem()->create_directory(_absolute_dir).ok()); - - // tmp dir - EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); - EXPECT_TRUE(io::global_local_filesystem()->create_directory(tmp_dir).ok()); - std::vector<StorePath> paths; - paths.emplace_back(std::string(tmp_dir), 1024000000); - auto tmp_file_dirs = std::make_unique<segment_v2::TmpFileDirs>(paths); - Status st = tmp_file_dirs->init(); - EXPECT_TRUE(st.ok()) << st.to_json(); - ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs)); - - // storage engine - doris::EngineOptions options; - auto engine = std::make_unique<StorageEngine>(options); - _engine_ref = engine.get(); - _data_dir = std::make_unique<DataDir>(*_engine_ref, _absolute_dir); - static_cast<void>(_data_dir->update_capacity()); - ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); - } - - void TearDown() override { - EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); - EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); - _engine_ref = nullptr; - ExecEnv::GetInstance()->set_storage_engine(nullptr); - } - - VariantColumnWriterReaderTest() = default; - ~VariantColumnWriterReaderTest() override = default; - -private: - TabletSchemaSPtr _tablet_schema = nullptr; - StorageEngine* _engine_ref = nullptr; - std::unique_ptr<DataDir> _data_dir = nullptr; - TabletSharedPtr _tablet = nullptr; - std::string _absolute_dir; - std::string _current_dir; -}; - -void check_column_meta(const ColumnMetaPB& column_meta, auto& path_with_size) { - EXPECT_TRUE(column_meta.has_column_path_info()); - auto path = std::make_shared<vectorized::PathInData>(); - path->from_protobuf(column_meta.column_path_info()); - EXPECT_EQ(column_meta.column_path_info().parrent_column_unique_id(), 1); - EXPECT_EQ(column_meta.none_null_size(), path_with_size[path->copy_pop_front().get_path()]); -} - -void check_sparse_column_meta(const ColumnMetaPB& column_meta, auto& path_with_size) { - EXPECT_TRUE(column_meta.has_column_path_info()); - auto path = std::make_shared<vectorized::PathInData>(); - path->from_protobuf(column_meta.column_path_info()); - EXPECT_EQ(column_meta.column_path_info().parrent_column_unique_id(), 1); - for (const auto& [path, size] : - column_meta.variant_statistics().sparse_column_non_null_size()) { - EXPECT_EQ(size, path_with_size[path]); - } - EXPECT_EQ(path->copy_pop_front().get_path(), "__DORIS_VARIANT_SPARSE__"); -} - -TEST_F(VariantColumnWriterReaderTest, test_write_data_normal) { - // 1. create tablet_schema - TabletSchemaPB schema_pb; - schema_pb.set_keys_type(KeysType::DUP_KEYS); - SchemaUtils::construct_column(schema_pb.add_column(), 1, "VARIANT", "V1"); - _tablet_schema = std::make_shared<TabletSchema>(); - _tablet_schema->init_from_pb(schema_pb); - - // 2. create tablet - TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema)); - tablet_meta->_tablet_id = 10000; - _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get()); - - EXPECT_TRUE(_tablet->init().ok()); - EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); - EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); - - // 3. create file_writer - io::FileWriterPtr file_writer; - auto file_path = local_segment_path(_tablet->tablet_path(), "0", 0); - auto st = io::global_local_filesystem()->create_file(file_path, &file_writer); - EXPECT_TRUE(st.ok()) << st.msg(); - - // 4. create column_writer - SegmentFooterPB footer; - ColumnWriterOptions opts; - opts.meta = footer.add_columns(); - opts.compression_type = CompressionTypePB::LZ4; - opts.file_writer = file_writer.get(); - opts.footer = &footer; - RowsetWriterContext rowset_ctx; - rowset_ctx.write_type = DataWriteType::TYPE_DIRECT; - opts.rowset_ctx = &rowset_ctx; - opts.rowset_ctx->tablet_schema = _tablet_schema; - TabletColumn column = _tablet_schema->column(0); - _init_column_meta(opts.meta, 0, column, CompressionTypePB::LZ4); - - std::unique_ptr<ColumnWriter> writer; - EXPECT_TRUE(ColumnWriter::create(opts, &column, file_writer.get(), &writer).ok()); - EXPECT_TRUE(writer->init().ok()); - EXPECT_TRUE(assert_cast<VariantColumnWriter*>(writer.get()) != nullptr); - - // 5. write data - auto olap_data_convertor = std::make_unique<vectorized::OlapBlockDataConvertor>(); - auto block = _tablet_schema->create_block(); - auto column_object = (*std::move(block.get_by_position(0).column)).mutate(); - std::unordered_map<int, std::string> inserted_jsonstr; - auto path_with_size = - VariantUtil::fill_object_column_with_test_data(column_object, 1000, &inserted_jsonstr); - olap_data_convertor->add_column_data_convertor(column); - olap_data_convertor->set_source_content(&block, 0, 1000); - auto [result, accessor] = olap_data_convertor->convert_column_data(0); - EXPECT_TRUE(result.ok()); - EXPECT_TRUE(accessor != nullptr); - EXPECT_TRUE(writer->append(accessor->get_nullmap(), accessor->get_data(), 1000).ok()); - st = writer->finish(); - EXPECT_TRUE(st.ok()) << st.msg(); - st = writer->write_data(); - EXPECT_TRUE(st.ok()) << st.msg(); - st = writer->write_ordinal_index(); - EXPECT_TRUE(st.ok()) << st.msg(); - st = writer->write_zone_map(); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(file_writer->close().ok()); - footer.set_num_rows(1000); - - // 6. check footer - EXPECT_EQ(footer.columns_size(), 5); - auto column_meta = footer.columns(0); - EXPECT_EQ(column_meta.type(), (int)FieldType::OLAP_FIELD_TYPE_VARIANT); - - for (int i = 1; i < footer.columns_size() - 1; ++i) { - auto column_meta = footer.columns(i); - check_column_meta(column_meta, path_with_size); - } - check_sparse_column_meta(footer.columns(footer.columns_size() - 1), path_with_size); - - // 7. check variant reader - io::FileReaderSPtr file_reader; - st = io::global_local_filesystem()->open_file(file_path, &file_reader); - EXPECT_TRUE(st.ok()) << st.msg(); - ColumnReaderOptions read_opts; - std::unique_ptr<ColumnReader> column_reader; - st = ColumnReader::create(read_opts, footer, 0, 1000, file_reader, &column_reader); - EXPECT_TRUE(st.ok()) << st.msg(); - - auto variant_column_reader = assert_cast<VariantColumnReader*>(column_reader.get()); - EXPECT_TRUE(variant_column_reader != nullptr); - - auto subcolumn_reader = variant_column_reader->get_reader_by_path(PathInData("key0")); - EXPECT_TRUE(subcolumn_reader != nullptr); - subcolumn_reader = variant_column_reader->get_reader_by_path(PathInData("key1")); - EXPECT_TRUE(subcolumn_reader != nullptr); - subcolumn_reader = variant_column_reader->get_reader_by_path(PathInData("key2")); - EXPECT_TRUE(subcolumn_reader != nullptr); - EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key3"))); - EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key4"))); - EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key5"))); - EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key6"))); - EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key7"))); - EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key8"))); - EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key9"))); - auto size = variant_column_reader->get_metadata_size(); - EXPECT_GT(size, 0); - - // 8. check statistics - auto statistics = variant_column_reader->get_stats(); - for (const auto& [path, size] : statistics->subcolumns_non_null_size) { - EXPECT_EQ(path_with_size[path], size); - } - for (const auto& [path, size] : statistics->sparse_column_non_null_size) { - EXPECT_EQ(path_with_size[path], size); - } - - // 9. check hier reader - ColumnIterator* it; - TabletColumn parent_column = _tablet_schema->column(0); - StorageReadOptions storage_read_opts; - storage_read_opts.io_ctx.reader_type = ReaderType::READER_QUERY; - st = variant_column_reader->new_iterator(&it, parent_column, &storage_read_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); - ColumnIteratorOptions column_iter_opts; - OlapReaderStatistics stats; - column_iter_opts.stats = &stats; - column_iter_opts.file_reader = file_reader.get(); - st = it->init(column_iter_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - - MutableColumnPtr new_column_object = ColumnObject::create(3); - size_t nrows = 1000; - st = it->seek_to_ordinal(0); - EXPECT_TRUE(st.ok()) << st.msg(); - st = it->next_batch(&nrows, new_column_object); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(stats.bytes_read > 0); - - for (int i = 0; i < 1000; ++i) { - std::string value; - st = assert_cast<ColumnObject*>(new_column_object.get()) - ->serialize_one_row_to_string(i, &value); - - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_EQ(value, inserted_jsonstr[i]); - } - - std::vector<rowid_t> row_ids; - for (int i = 0; i < 1000; ++i) { - if (i % 7 == 0) { - row_ids.push_back(i); - } - } - new_column_object = ColumnObject::create(3); - st = it->read_by_rowids(row_ids.data(), row_ids.size(), new_column_object); - EXPECT_TRUE(st.ok()) << st.msg(); - for (int i = 0; i < row_ids.size(); ++i) { - std::string value; - st = assert_cast<ColumnObject*>(new_column_object.get()) - ->serialize_one_row_to_string(i, &value); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_EQ(value, inserted_jsonstr[row_ids[i]]); - } - - auto read_to_column_object = [&]() { - new_column_object = ColumnObject::create(3); - nrows = 1000; - st = it->seek_to_ordinal(0); - EXPECT_TRUE(st.ok()) << st.msg(); - st = it->next_batch(&nrows, new_column_object); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(stats.bytes_read > 0); - EXPECT_EQ(nrows, 1000); - }; - - // 10. check sparse extract reader - for (int i = 3; i < 10; ++i) { - std::string key = ".key" + std::to_string(i); - TabletColumn subcolumn_in_sparse; - subcolumn_in_sparse.set_name(parent_column.name_lower_case() + key); - subcolumn_in_sparse.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); - subcolumn_in_sparse.set_parent_unique_id(parent_column.unique_id()); - subcolumn_in_sparse.set_path_info(PathInData(parent_column.name_lower_case() + key)); - subcolumn_in_sparse.set_variant_max_subcolumns_count( - parent_column.variant_max_subcolumns_count()); - subcolumn_in_sparse.set_is_nullable(true); - - st = variant_column_reader->new_iterator(&it, subcolumn_in_sparse, &storage_read_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr); - st = it->init(column_iter_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - - read_to_column_object(); - - for (int row = 0; row < 1000; ++row) { - std::string value; - st = assert_cast<ColumnObject*>(new_column_object.get()) - ->serialize_one_row_to_string(row, &value); - EXPECT_TRUE(st.ok()) << st.msg(); - if (inserted_jsonstr[row].find(key) != std::string::npos) { - if (i % 2 == 0) { - EXPECT_EQ(value, "88"); - } else { - EXPECT_EQ(value, "str99"); - } - } - } - } - - // 11. check leaf reader - auto check_leaf_reader = [&]() { - for (int i = 0; i < 3; ++i) { - std::string key = ".key" + std::to_string(i); - TabletColumn subcolumn; - subcolumn.set_name(parent_column.name_lower_case() + key); - subcolumn.set_type((FieldType)(int)footer.columns(i + 1).type()); - subcolumn.set_parent_unique_id(parent_column.unique_id()); - subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + key)); - subcolumn.set_variant_max_subcolumns_count( - parent_column.variant_max_subcolumns_count()); - subcolumn.set_is_nullable(true); - - st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(assert_cast<FileColumnIterator*>(it) != nullptr); - st = it->init(column_iter_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - - auto column_type = DataTypeFactory::instance().create_data_type(subcolumn, false); - auto read_column = column_type->create_column(); - nrows = 1000; - st = it->seek_to_ordinal(0); - EXPECT_TRUE(st.ok()) << st.msg(); - st = it->next_batch(&nrows, read_column); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(stats.bytes_read > 0); - - for (int row = 0; row < 1000; ++row) { - const std::string& value = column_type->to_string(*read_column, row); - if (inserted_jsonstr[row].find(key) != std::string::npos) { - if (i % 2 == 0) { - EXPECT_EQ(value, "88"); - } else { - EXPECT_EQ(value, "str99"); - } - } - } - } - }; - check_leaf_reader(); - - // 12. check empty - TabletColumn subcolumn; - subcolumn.set_name(parent_column.name_lower_case() + ".key10"); - subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); - subcolumn.set_parent_unique_id(parent_column.unique_id()); - subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key10")); - subcolumn.set_is_nullable(true); - st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(assert_cast<DefaultValueColumnIterator*>(it) != nullptr); - - // 13. check statistics size == limit - auto& variant_stats = variant_column_reader->_statistics; - EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() < - config::variant_max_sparse_column_statistics_size); - auto limit = config::variant_max_sparse_column_statistics_size - - variant_stats->sparse_column_non_null_size.size(); - for (int i = 0; i < limit; ++i) { - std::string key = parent_column.name_lower_case() + ".key10" + std::to_string(i); - variant_stats->sparse_column_non_null_size[key] = 10000; - } - EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() == - config::variant_max_sparse_column_statistics_size); - - st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); - st = it->init(column_iter_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - - auto check_empty_column = [&]() { - for (int row = 0; row < 1000; ++row) { - std::string value; - st = assert_cast<ColumnObject*>(new_column_object.get()) - ->serialize_one_row_to_string(row, &value); - - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_EQ(value, "{}"); - } - }; - - read_to_column_object(); - check_empty_column(); - - // construct tablet schema for compaction - storage_read_opts.io_ctx.reader_type = ReaderType::READER_BASE_COMPACTION; - storage_read_opts.tablet_schema = _tablet_schema; - std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info; - TabletSchema::PathsSetInfo paths_set_info; - paths_set_info.sub_path_set.insert("key0"); - paths_set_info.sub_path_set.insert("key3"); - paths_set_info.sub_path_set.insert("key4"); - paths_set_info.sparse_path_set.insert("key1"); - paths_set_info.sparse_path_set.insert("key2"); - paths_set_info.sparse_path_set.insert("key5"); - paths_set_info.sparse_path_set.insert("key6"); - paths_set_info.sparse_path_set.insert("key7"); - paths_set_info.sparse_path_set.insert("key8"); - paths_set_info.sparse_path_set.insert("key9"); - uid_to_paths_set_info[parent_column.unique_id()] = paths_set_info; - _tablet_schema->set_path_set_info(std::move(uid_to_paths_set_info)); - - // 14. check compaction subcolumn reader - check_leaf_reader(); - - // 15. check compaction root reader - st = variant_column_reader->new_iterator(&it, parent_column, &storage_read_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(assert_cast<VariantRootColumnIterator*>(it) != nullptr); - st = it->init(column_iter_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - - // 16. check compacton sparse column - TabletColumn sparse_column = schema_util::create_sparse_column(parent_column); - st = variant_column_reader->new_iterator(&it, sparse_column, &storage_read_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(assert_cast<SparseColumnMergeReader*>(it) != nullptr); - st = it->init(column_iter_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - auto column_type = DataTypeFactory::instance().create_data_type(sparse_column, false); - auto read_column = column_type->create_column(); - nrows = 1000; - st = it->seek_to_ordinal(0); - EXPECT_TRUE(st.ok()) << st.msg(); - st = it->next_batch(&nrows, read_column); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(stats.bytes_read > 0); - - for (int row = 0; row < 1000; ++row) { - const std::string& value = column_type->to_string(*read_column, row); - EXPECT_TRUE(value.find("key0") == std::string::npos) - << "row: " << row << ", value: " << value; - EXPECT_TRUE(value.find("key3") == std::string::npos) - << "row: " << row << ", value: " << value; - EXPECT_TRUE(value.find("key4") == std::string::npos) - << "row: " << row << ", value: " << value; - } - - // 17. check limit = 10000 - subcolumn.set_name(parent_column.name_lower_case() + ".key10"); - subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key10")); - st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr); - - for (int i = 0; i < limit; ++i) { - std::string key = parent_column.name_lower_case() + ".key10" + std::to_string(i); - variant_stats->sparse_column_non_null_size.erase(key); - } - - // 18. check compacton sparse extract column - subcolumn.set_name(parent_column.name_lower_case() + ".key3"); - subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key3")); - st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr); - - // 19. check compaction default column - subcolumn.set_name(parent_column.name_lower_case() + ".key10"); - subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key10")); - st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(assert_cast<DefaultValueColumnIterator*>(it) != nullptr); - EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); -} - -TEST_F(VariantColumnWriterReaderTest, test_write_data_advanced) { - // 1. create tablet_schema - TabletSchemaPB schema_pb; - schema_pb.set_keys_type(KeysType::DUP_KEYS); - SchemaUtils::construct_column(schema_pb.add_column(), 1, "VARIANT", "V1", 10); - _tablet_schema = std::make_shared<TabletSchema>(); - _tablet_schema->init_from_pb(schema_pb); - - // 2. create tablet - TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema)); - tablet_meta->_tablet_id = 10000; - _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get()); - EXPECT_TRUE(_tablet->init().ok()); - EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); - EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); - - // 3. create file_writer - io::FileWriterPtr file_writer; - auto file_path = local_segment_path(_tablet->tablet_path(), "0", 0); - auto st = io::global_local_filesystem()->create_file(file_path, &file_writer); - EXPECT_TRUE(st.ok()) << st.msg(); - - // 4. create column_writer - SegmentFooterPB footer; - ColumnWriterOptions opts; - opts.meta = footer.add_columns(); - opts.compression_type = CompressionTypePB::LZ4; - opts.file_writer = file_writer.get(); - opts.footer = &footer; - RowsetWriterContext rowset_ctx; - rowset_ctx.write_type = DataWriteType::TYPE_DIRECT; - opts.rowset_ctx = &rowset_ctx; - opts.rowset_ctx->tablet_schema = _tablet_schema; - TabletColumn column = _tablet_schema->column(0); - _init_column_meta(opts.meta, 0, column, CompressionTypePB::LZ4); - - std::unique_ptr<ColumnWriter> writer; - EXPECT_TRUE(ColumnWriter::create(opts, &column, file_writer.get(), &writer).ok()); - EXPECT_TRUE(writer->init().ok()); - EXPECT_TRUE(assert_cast<VariantColumnWriter*>(writer.get()) != nullptr); - - // 5. write data - auto olap_data_convertor = std::make_unique<vectorized::OlapBlockDataConvertor>(); - auto block = _tablet_schema->create_block(); - auto column_object = (*std::move(block.get_by_position(0).column)).mutate(); - std::unordered_map<int, std::string> inserted_jsonstr; - auto path_with_size = VariantUtil::fill_object_column_with_nested_test_data(column_object, 1000, - &inserted_jsonstr); - olap_data_convertor->add_column_data_convertor(column); - olap_data_convertor->set_source_content(&block, 0, 1000); - auto [result, accessor] = olap_data_convertor->convert_column_data(0); - EXPECT_TRUE(result.ok()); - EXPECT_TRUE(accessor != nullptr); - EXPECT_TRUE(writer->append(accessor->get_nullmap(), accessor->get_data(), 1000).ok()); - st = writer->finish(); - EXPECT_TRUE(st.ok()) << st.msg(); - st = writer->write_data(); - EXPECT_TRUE(st.ok()) << st.msg(); - st = writer->write_ordinal_index(); - EXPECT_TRUE(st.ok()) << st.msg(); - st = writer->write_zone_map(); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(file_writer->close().ok()); - footer.set_num_rows(1000); - - // 6. check footer - EXPECT_EQ(footer.columns_size(), 12); - auto column_meta = footer.columns(0); - EXPECT_EQ(column_meta.type(), (int)FieldType::OLAP_FIELD_TYPE_VARIANT); - - for (int i = 1; i < footer.columns_size() - 1; ++i) { - auto column_meta = footer.columns(i); - check_column_meta(column_meta, path_with_size); - } - check_sparse_column_meta(footer.columns(footer.columns_size() - 1), path_with_size); - - // 7. check variant reader - io::FileReaderSPtr file_reader; - st = io::global_local_filesystem()->open_file(file_path, &file_reader); - EXPECT_TRUE(st.ok()) << st.msg(); - ColumnReaderOptions read_opts; - std::unique_ptr<ColumnReader> column_reader; - st = ColumnReader::create(read_opts, footer, 0, 1000, file_reader, &column_reader); - EXPECT_TRUE(st.ok()) << st.msg(); - - auto variant_column_reader = assert_cast<VariantColumnReader*>(column_reader.get()); - EXPECT_TRUE(variant_column_reader != nullptr); - - // 8. check statistics - auto statistics = variant_column_reader->get_stats(); - for (const auto& [path, size] : statistics->subcolumns_non_null_size) { - std::cout << "path: " << path << ", size: " << size << std::endl; - EXPECT_EQ(path_with_size[path], size); - } - for (const auto& [path, size] : statistics->sparse_column_non_null_size) { - std::cout << "sparse path: " << path << ", size: " << size << std::endl; - EXPECT_EQ(path_with_size[path], size); - } - - // 9. check root - ColumnIterator* it; - TabletColumn parent_column = _tablet_schema->column(0); - StorageReadOptions storage_read_opts; - storage_read_opts.io_ctx.reader_type = ReaderType::READER_QUERY; - st = variant_column_reader->new_iterator(&it, parent_column, &storage_read_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); - ColumnIteratorOptions column_iter_opts; - OlapReaderStatistics stats; - column_iter_opts.stats = &stats; - column_iter_opts.file_reader = file_reader.get(); - st = it->init(column_iter_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - - MutableColumnPtr new_column_object = ColumnObject::create(3); - size_t nrows = 1000; - st = it->seek_to_ordinal(0); - EXPECT_TRUE(st.ok()) << st.msg(); - st = it->next_batch(&nrows, new_column_object); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(stats.bytes_read > 0); - - for (int i = 0; i < 1000; ++i) { - std::string value; - st = assert_cast<ColumnObject*>(new_column_object.get()) - ->serialize_one_row_to_string(i, &value); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_EQ(value, inserted_jsonstr[i]); - } - - auto read_to_column_object = [&]() { - new_column_object = ColumnObject::create(10); - nrows = 1000; - st = it->seek_to_ordinal(0); - EXPECT_TRUE(st.ok()) << st.msg(); - st = it->next_batch(&nrows, new_column_object); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(stats.bytes_read > 0); - EXPECT_EQ(nrows, 1000); - }; - - auto check_key_stats = [&](const std::string& key_num) { - std::string key = ".key" + key_num; - TabletColumn subcolumn_in_nested; - subcolumn_in_nested.set_name(parent_column.name_lower_case() + key); - subcolumn_in_nested.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); - subcolumn_in_nested.set_parent_unique_id(parent_column.unique_id()); - subcolumn_in_nested.set_path_info(PathInData(parent_column.name_lower_case() + key)); - subcolumn_in_nested.set_variant_max_subcolumns_count( - parent_column.variant_max_subcolumns_count()); - subcolumn_in_nested.set_is_nullable(true); - - st = variant_column_reader->new_iterator(&it, subcolumn_in_nested, &storage_read_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); - st = it->init(column_iter_opts); - EXPECT_TRUE(st.ok()) << st.msg(); - read_to_column_object(); - - size_t key_count = 0; - size_t key_nested_count = 0; - for (int row = 0; row < 1000; ++row) { - std::string value; - st = assert_cast<ColumnObject*>(new_column_object.get()) - ->serialize_one_row_to_string(row, &value); - EXPECT_TRUE(st.ok()) << st.msg(); - if (value.find("nested" + key_num) != std::string::npos) { - key_nested_count++; - } else if (value.find("88") != std::string::npos) { - key_count++; - } - } - EXPECT_EQ(key_count, path_with_size["key" + key_num]); - EXPECT_EQ(key_nested_count, path_with_size["key" + key_num + ".nested" + key_num]); - }; - - for (int i = 3; i < 10; ++i) { - check_key_stats(std::to_string(i)); - } - - EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); -} - -} // namespace doris \ No newline at end of file +// // Licensed to the Apache Software Foundation (ASF) under one +// // or more contributor license agreements. See the NOTICE file +// // distributed with this work for additional information +// // regarding copyright ownership. The ASF licenses this file +// // to you under the Apache License, Version 2.0 (the +// // "License"); you may not use this file except in compliance +// // with the License. You may obtain a copy of the License at +// // +// // http://www.apache.org/licenses/LICENSE-2.0 +// // +// // Unless required by applicable law or agreed to in writing, +// // software distributed under the License is distributed on an +// // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// // KIND, either express or implied. See the License for the +// // specific language governing permissions and limitations +// // under the License. +// +// #include "gtest/gtest.h" +// #include "olap/rowset/segment_v2/column_reader.h" +// #include "olap/rowset/segment_v2/hierarchical_data_reader.h" +// #include "olap/rowset/segment_v2/variant_column_writer_impl.h" +// #include "olap/storage_engine.h" +// #include "testutil/schema_utils.h" +// #include "testutil/variant_util.h" +// +// using namespace doris::vectorized; +// +// namespace doris { +// +// constexpr static uint32_t MAX_PATH_LEN = 1024; +// constexpr static std::string_view dest_dir = "/ut_dir/variant_column_writer_test"; +// constexpr static std::string_view tmp_dir = "./ut_dir/tmp"; +// +// class VariantColumnWriterReaderTest : public testing::Test { +// public: +// void SetUp() override { +// // absolute dir +// char buffer[MAX_PATH_LEN]; +// EXPECT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr); +// _current_dir = std::string(buffer); +// _absolute_dir = _current_dir + std::string(dest_dir); +// EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); +// EXPECT_TRUE(io::global_local_filesystem()->create_directory(_absolute_dir).ok()); +// +// // tmp dir +// EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); +// EXPECT_TRUE(io::global_local_filesystem()->create_directory(tmp_dir).ok()); +// std::vector<StorePath> paths; +// paths.emplace_back(std::string(tmp_dir), 1024000000); +// auto tmp_file_dirs = std::make_unique<segment_v2::TmpFileDirs>(paths); +// Status st = tmp_file_dirs->init(); +// EXPECT_TRUE(st.ok()) << st.to_json(); +// ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs)); +// +// // storage engine +// doris::EngineOptions options; +// auto engine = std::make_unique<StorageEngine>(options); +// _engine_ref = engine.get(); +// _data_dir = std::make_unique<DataDir>(*_engine_ref, _absolute_dir); +// static_cast<void>(_data_dir->update_capacity()); +// ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); +// } +// +// void TearDown() override { +// EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); +// EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); +// _engine_ref = nullptr; +// ExecEnv::GetInstance()->set_storage_engine(nullptr); +// } +// +// VariantColumnWriterReaderTest() = default; +// ~VariantColumnWriterReaderTest() override = default; +// +// private: +// TabletSchemaSPtr _tablet_schema = nullptr; +// StorageEngine* _engine_ref = nullptr; +// std::unique_ptr<DataDir> _data_dir = nullptr; +// TabletSharedPtr _tablet = nullptr; +// std::string _absolute_dir; +// std::string _current_dir; +// }; +// +// void check_column_meta(const ColumnMetaPB& column_meta, auto& path_with_size) { +// EXPECT_TRUE(column_meta.has_column_path_info()); +// auto path = std::make_shared<vectorized::PathInData>(); +// path->from_protobuf(column_meta.column_path_info()); +// EXPECT_EQ(column_meta.column_path_info().parrent_column_unique_id(), 1); +// EXPECT_EQ(column_meta.none_null_size(), path_with_size[path->copy_pop_front().get_path()]); +// } +// +// void check_sparse_column_meta(const ColumnMetaPB& column_meta, auto& path_with_size) { +// EXPECT_TRUE(column_meta.has_column_path_info()); +// auto path = std::make_shared<vectorized::PathInData>(); +// path->from_protobuf(column_meta.column_path_info()); +// EXPECT_EQ(column_meta.column_path_info().parrent_column_unique_id(), 1); +// for (const auto& [path, size] : +// column_meta.variant_statistics().sparse_column_non_null_size()) { +// EXPECT_EQ(size, path_with_size[path]); +// } +// EXPECT_EQ(path->copy_pop_front().get_path(), "__DORIS_VARIANT_SPARSE__"); +// } +// +// TEST_F(VariantColumnWriterReaderTest, test_write_data_normal) { +// // 1. create tablet_schema +// TabletSchemaPB schema_pb; +// schema_pb.set_keys_type(KeysType::DUP_KEYS); +// SchemaUtils::construct_column(schema_pb.add_column(), 1, "VARIANT", "V1"); +// _tablet_schema = std::make_shared<TabletSchema>(); +// _tablet_schema->init_from_pb(schema_pb); +// +// // 2. create tablet +// TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema)); +// tablet_meta->_tablet_id = 10000; +// _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get()); +// +// EXPECT_TRUE(_tablet->init().ok()); +// EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); +// EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); +// +// // 3. create file_writer +// io::FileWriterPtr file_writer; +// auto file_path = local_segment_path(_tablet->tablet_path(), "0", 0); +// auto st = io::global_local_filesystem()->create_file(file_path, &file_writer); +// EXPECT_TRUE(st.ok()) << st.msg(); +// +// // 4. create column_writer +// SegmentFooterPB footer; +// ColumnWriterOptions opts; +// opts.meta = footer.add_columns(); +// opts.compression_type = CompressionTypePB::LZ4; +// opts.file_writer = file_writer.get(); +// opts.footer = &footer; +// RowsetWriterContext rowset_ctx; +// rowset_ctx.write_type = DataWriteType::TYPE_DIRECT; +// opts.rowset_ctx = &rowset_ctx; +// opts.rowset_ctx->tablet_schema = _tablet_schema; +// TabletColumn column = _tablet_schema->column(0); +// _init_column_meta(opts.meta, 0, column, CompressionTypePB::LZ4); +// +// std::unique_ptr<ColumnWriter> writer; +// EXPECT_TRUE(ColumnWriter::create(opts, &column, file_writer.get(), &writer).ok()); +// EXPECT_TRUE(writer->init().ok()); +// EXPECT_TRUE(assert_cast<VariantColumnWriter*>(writer.get()) != nullptr); +// +// // 5. write data +// auto olap_data_convertor = std::make_unique<vectorized::OlapBlockDataConvertor>(); +// auto block = _tablet_schema->create_block(); +// auto column_object = (*std::move(block.get_by_position(0).column)).mutate(); +// std::unordered_map<int, std::string> inserted_jsonstr; +// auto path_with_size = +// VariantUtil::fill_object_column_with_test_data(column_object, 1000, &inserted_jsonstr); +// olap_data_convertor->add_column_data_convertor(column); +// olap_data_convertor->set_source_content(&block, 0, 1000); +// auto [result, accessor] = olap_data_convertor->convert_column_data(0); +// EXPECT_TRUE(result.ok()); +// EXPECT_TRUE(accessor != nullptr); +// EXPECT_TRUE(writer->append(accessor->get_nullmap(), accessor->get_data(), 1000).ok()); +// st = writer->finish(); +// EXPECT_TRUE(st.ok()) << st.msg(); +// st = writer->write_data(); +// EXPECT_TRUE(st.ok()) << st.msg(); +// st = writer->write_ordinal_index(); +// EXPECT_TRUE(st.ok()) << st.msg(); +// st = writer->write_zone_map(); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(file_writer->close().ok()); +// footer.set_num_rows(1000); +// +// // 6. check footer +// EXPECT_EQ(footer.columns_size(), 5); +// auto column_meta = footer.columns(0); +// EXPECT_EQ(column_meta.type(), (int)FieldType::OLAP_FIELD_TYPE_VARIANT); +// +// for (int i = 1; i < footer.columns_size() - 1; ++i) { +// auto column_meta = footer.columns(i); +// check_column_meta(column_meta, path_with_size); +// } +// check_sparse_column_meta(footer.columns(footer.columns_size() - 1), path_with_size); +// +// // 7. check variant reader +// io::FileReaderSPtr file_reader; +// st = io::global_local_filesystem()->open_file(file_path, &file_reader); +// EXPECT_TRUE(st.ok()) << st.msg(); +// ColumnReaderOptions read_opts; +// std::unique_ptr<ColumnReader> column_reader; +// st = ColumnReader::create(read_opts, footer, 0, 1000, file_reader, &column_reader); +// EXPECT_TRUE(st.ok()) << st.msg(); +// +// auto variant_column_reader = assert_cast<VariantColumnReader*>(column_reader.get()); +// EXPECT_TRUE(variant_column_reader != nullptr); +// +// auto subcolumn_reader = variant_column_reader->get_reader_by_path(PathInData("key0")); +// EXPECT_TRUE(subcolumn_reader != nullptr); +// subcolumn_reader = variant_column_reader->get_reader_by_path(PathInData("key1")); +// EXPECT_TRUE(subcolumn_reader != nullptr); +// subcolumn_reader = variant_column_reader->get_reader_by_path(PathInData("key2")); +// EXPECT_TRUE(subcolumn_reader != nullptr); +// EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key3"))); +// EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key4"))); +// EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key5"))); +// EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key6"))); +// EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key7"))); +// EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key8"))); +// EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key9"))); +// auto size = variant_column_reader->get_metadata_size(); +// EXPECT_GT(size, 0); +// +// // 8. check statistics +// auto statistics = variant_column_reader->get_stats(); +// for (const auto& [path, size] : statistics->subcolumns_non_null_size) { +// EXPECT_EQ(path_with_size[path], size); +// } +// for (const auto& [path, size] : statistics->sparse_column_non_null_size) { +// EXPECT_EQ(path_with_size[path], size); +// } +// +// // 9. check hier reader +// ColumnIterator* it; +// TabletColumn parent_column = _tablet_schema->column(0); +// StorageReadOptions storage_read_opts; +// storage_read_opts.io_ctx.reader_type = ReaderType::READER_QUERY; +// st = variant_column_reader->new_iterator(&it, parent_column, &storage_read_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); +// ColumnIteratorOptions column_iter_opts; +// OlapReaderStatistics stats; +// column_iter_opts.stats = &stats; +// column_iter_opts.file_reader = file_reader.get(); +// st = it->init(column_iter_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// +// MutableColumnPtr new_column_object = ColumnObject::create(3); +// size_t nrows = 1000; +// st = it->seek_to_ordinal(0); +// EXPECT_TRUE(st.ok()) << st.msg(); +// st = it->next_batch(&nrows, new_column_object); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(stats.bytes_read > 0); +// +// for (int i = 0; i < 1000; ++i) { +// std::string value; +// st = assert_cast<ColumnObject*>(new_column_object.get()) +// ->serialize_one_row_to_string(i, &value); +// +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_EQ(value, inserted_jsonstr[i]); +// } +// +// std::vector<rowid_t> row_ids; +// for (int i = 0; i < 1000; ++i) { +// if (i % 7 == 0) { +// row_ids.push_back(i); +// } +// } +// new_column_object = ColumnObject::create(3); +// st = it->read_by_rowids(row_ids.data(), row_ids.size(), new_column_object); +// EXPECT_TRUE(st.ok()) << st.msg(); +// for (int i = 0; i < row_ids.size(); ++i) { +// std::string value; +// st = assert_cast<ColumnObject*>(new_column_object.get()) +// ->serialize_one_row_to_string(i, &value); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_EQ(value, inserted_jsonstr[row_ids[i]]); +// } +// +// auto read_to_column_object = [&]() { +// new_column_object = ColumnObject::create(3); +// nrows = 1000; +// st = it->seek_to_ordinal(0); +// EXPECT_TRUE(st.ok()) << st.msg(); +// st = it->next_batch(&nrows, new_column_object); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(stats.bytes_read > 0); +// EXPECT_EQ(nrows, 1000); +// }; +// +// // 10. check sparse extract reader +// for (int i = 3; i < 10; ++i) { +// std::string key = ".key" + std::to_string(i); +// TabletColumn subcolumn_in_sparse; +// subcolumn_in_sparse.set_name(parent_column.name_lower_case() + key); +// subcolumn_in_sparse.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); +// subcolumn_in_sparse.set_parent_unique_id(parent_column.unique_id()); +// subcolumn_in_sparse.set_path_info(PathInData(parent_column.name_lower_case() + key)); +// subcolumn_in_sparse.set_variant_max_subcolumns_count( +// parent_column.variant_max_subcolumns_count()); +// subcolumn_in_sparse.set_is_nullable(true); +// +// st = variant_column_reader->new_iterator(&it, subcolumn_in_sparse, &storage_read_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr); +// st = it->init(column_iter_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// +// read_to_column_object(); +// +// for (int row = 0; row < 1000; ++row) { +// std::string value; +// st = assert_cast<ColumnObject*>(new_column_object.get()) +// ->serialize_one_row_to_string(row, &value); +// EXPECT_TRUE(st.ok()) << st.msg(); +// if (inserted_jsonstr[row].find(key) != std::string::npos) { +// if (i % 2 == 0) { +// EXPECT_EQ(value, "88"); +// } else { +// EXPECT_EQ(value, "str99"); +// } +// } +// } +// } +// +// // 11. check leaf reader +// auto check_leaf_reader = [&]() { +// for (int i = 0; i < 3; ++i) { +// std::string key = ".key" + std::to_string(i); +// TabletColumn subcolumn; +// subcolumn.set_name(parent_column.name_lower_case() + key); +// subcolumn.set_type((FieldType)(int)footer.columns(i + 1).type()); +// subcolumn.set_parent_unique_id(parent_column.unique_id()); +// subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + key)); +// subcolumn.set_variant_max_subcolumns_count( +// parent_column.variant_max_subcolumns_count()); +// subcolumn.set_is_nullable(true); +// +// st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(assert_cast<FileColumnIterator*>(it) != nullptr); +// st = it->init(column_iter_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// +// auto column_type = DataTypeFactory::instance().create_data_type(subcolumn, false); +// auto read_column = column_type->create_column(); +// nrows = 1000; +// st = it->seek_to_ordinal(0); +// EXPECT_TRUE(st.ok()) << st.msg(); +// st = it->next_batch(&nrows, read_column); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(stats.bytes_read > 0); +// +// for (int row = 0; row < 1000; ++row) { +// const std::string& value = column_type->to_string(*read_column, row); +// if (inserted_jsonstr[row].find(key) != std::string::npos) { +// if (i % 2 == 0) { +// EXPECT_EQ(value, "88"); +// } else { +// EXPECT_EQ(value, "str99"); +// } +// } +// } +// } +// }; +// check_leaf_reader(); +// +// // 12. check empty +// TabletColumn subcolumn; +// subcolumn.set_name(parent_column.name_lower_case() + ".key10"); +// subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); +// subcolumn.set_parent_unique_id(parent_column.unique_id()); +// subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key10")); +// subcolumn.set_is_nullable(true); +// st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(assert_cast<DefaultValueColumnIterator*>(it) != nullptr); +// +// // 13. check statistics size == limit +// auto& variant_stats = variant_column_reader->_statistics; +// EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() < +// config::variant_max_sparse_column_statistics_size); +// auto limit = config::variant_max_sparse_column_statistics_size - +// variant_stats->sparse_column_non_null_size.size(); +// for (int i = 0; i < limit; ++i) { +// std::string key = parent_column.name_lower_case() + ".key10" + std::to_string(i); +// variant_stats->sparse_column_non_null_size[key] = 10000; +// } +// EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() == +// config::variant_max_sparse_column_statistics_size); +// +// st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); +// st = it->init(column_iter_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// +// auto check_empty_column = [&]() { +// for (int row = 0; row < 1000; ++row) { +// std::string value; +// st = assert_cast<ColumnObject*>(new_column_object.get()) +// ->serialize_one_row_to_string(row, &value); +// +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_EQ(value, "{}"); +// } +// }; +// +// read_to_column_object(); +// check_empty_column(); +// +// // construct tablet schema for compaction +// storage_read_opts.io_ctx.reader_type = ReaderType::READER_BASE_COMPACTION; +// storage_read_opts.tablet_schema = _tablet_schema; +// std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info; +// TabletSchema::PathsSetInfo paths_set_info; +// paths_set_info.sub_path_set.insert("key0"); +// paths_set_info.sub_path_set.insert("key3"); +// paths_set_info.sub_path_set.insert("key4"); +// paths_set_info.sparse_path_set.insert("key1"); +// paths_set_info.sparse_path_set.insert("key2"); +// paths_set_info.sparse_path_set.insert("key5"); +// paths_set_info.sparse_path_set.insert("key6"); +// paths_set_info.sparse_path_set.insert("key7"); +// paths_set_info.sparse_path_set.insert("key8"); +// paths_set_info.sparse_path_set.insert("key9"); +// uid_to_paths_set_info[parent_column.unique_id()] = paths_set_info; +// _tablet_schema->set_path_set_info(std::move(uid_to_paths_set_info)); +// +// // 14. check compaction subcolumn reader +// check_leaf_reader(); +// +// // 15. check compaction root reader +// st = variant_column_reader->new_iterator(&it, parent_column, &storage_read_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(assert_cast<VariantRootColumnIterator*>(it) != nullptr); +// st = it->init(column_iter_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// +// // 16. check compacton sparse column +// TabletColumn sparse_column = schema_util::create_sparse_column(parent_column); +// st = variant_column_reader->new_iterator(&it, sparse_column, &storage_read_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(assert_cast<SparseColumnMergeReader*>(it) != nullptr); +// st = it->init(column_iter_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// auto column_type = DataTypeFactory::instance().create_data_type(sparse_column, false); +// auto read_column = column_type->create_column(); +// nrows = 1000; +// st = it->seek_to_ordinal(0); +// EXPECT_TRUE(st.ok()) << st.msg(); +// st = it->next_batch(&nrows, read_column); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(stats.bytes_read > 0); +// +// for (int row = 0; row < 1000; ++row) { +// const std::string& value = column_type->to_string(*read_column, row); +// EXPECT_TRUE(value.find("key0") == std::string::npos) +// << "row: " << row << ", value: " << value; +// EXPECT_TRUE(value.find("key3") == std::string::npos) +// << "row: " << row << ", value: " << value; +// EXPECT_TRUE(value.find("key4") == std::string::npos) +// << "row: " << row << ", value: " << value; +// } +// +// // 17. check limit = 10000 +// subcolumn.set_name(parent_column.name_lower_case() + ".key10"); +// subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key10")); +// st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr); +// +// for (int i = 0; i < limit; ++i) { +// std::string key = parent_column.name_lower_case() + ".key10" + std::to_string(i); +// variant_stats->sparse_column_non_null_size.erase(key); +// } +// +// // 18. check compacton sparse extract column +// subcolumn.set_name(parent_column.name_lower_case() + ".key3"); +// subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key3")); +// st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr); +// +// // 19. check compaction default column +// subcolumn.set_name(parent_column.name_lower_case() + ".key10"); +// subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key10")); +// st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(assert_cast<DefaultValueColumnIterator*>(it) != nullptr); +// EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); +// } +// +// TEST_F(VariantColumnWriterReaderTest, test_write_data_advanced) { +// // 1. create tablet_schema +// TabletSchemaPB schema_pb; +// schema_pb.set_keys_type(KeysType::DUP_KEYS); +// SchemaUtils::construct_column(schema_pb.add_column(), 1, "VARIANT", "V1", 10); +// _tablet_schema = std::make_shared<TabletSchema>(); +// _tablet_schema->init_from_pb(schema_pb); +// +// // 2. create tablet +// TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema)); +// tablet_meta->_tablet_id = 10000; +// _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get()); +// EXPECT_TRUE(_tablet->init().ok()); +// EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); +// EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); +// +// // 3. create file_writer +// io::FileWriterPtr file_writer; +// auto file_path = local_segment_path(_tablet->tablet_path(), "0", 0); +// auto st = io::global_local_filesystem()->create_file(file_path, &file_writer); +// EXPECT_TRUE(st.ok()) << st.msg(); +// +// // 4. create column_writer +// SegmentFooterPB footer; +// ColumnWriterOptions opts; +// opts.meta = footer.add_columns(); +// opts.compression_type = CompressionTypePB::LZ4; +// opts.file_writer = file_writer.get(); +// opts.footer = &footer; +// RowsetWriterContext rowset_ctx; +// rowset_ctx.write_type = DataWriteType::TYPE_DIRECT; +// opts.rowset_ctx = &rowset_ctx; +// opts.rowset_ctx->tablet_schema = _tablet_schema; +// TabletColumn column = _tablet_schema->column(0); +// _init_column_meta(opts.meta, 0, column, CompressionTypePB::LZ4); +// +// std::unique_ptr<ColumnWriter> writer; +// EXPECT_TRUE(ColumnWriter::create(opts, &column, file_writer.get(), &writer).ok()); +// EXPECT_TRUE(writer->init().ok()); +// EXPECT_TRUE(assert_cast<VariantColumnWriter*>(writer.get()) != nullptr); +// +// // 5. write data +// auto olap_data_convertor = std::make_unique<vectorized::OlapBlockDataConvertor>(); +// auto block = _tablet_schema->create_block(); +// auto column_object = (*std::move(block.get_by_position(0).column)).mutate(); +// std::unordered_map<int, std::string> inserted_jsonstr; +// auto path_with_size = VariantUtil::fill_object_column_with_nested_test_data(column_object, 1000, +// &inserted_jsonstr); +// olap_data_convertor->add_column_data_convertor(column); +// olap_data_convertor->set_source_content(&block, 0, 1000); +// auto [result, accessor] = olap_data_convertor->convert_column_data(0); +// EXPECT_TRUE(result.ok()); +// EXPECT_TRUE(accessor != nullptr); +// EXPECT_TRUE(writer->append(accessor->get_nullmap(), accessor->get_data(), 1000).ok()); +// st = writer->finish(); +// EXPECT_TRUE(st.ok()) << st.msg(); +// st = writer->write_data(); +// EXPECT_TRUE(st.ok()) << st.msg(); +// st = writer->write_ordinal_index(); +// EXPECT_TRUE(st.ok()) << st.msg(); +// st = writer->write_zone_map(); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(file_writer->close().ok()); +// footer.set_num_rows(1000); +// +// // 6. check footer +// EXPECT_EQ(footer.columns_size(), 12); +// auto column_meta = footer.columns(0); +// EXPECT_EQ(column_meta.type(), (int)FieldType::OLAP_FIELD_TYPE_VARIANT); +// +// for (int i = 1; i < footer.columns_size() - 1; ++i) { +// auto column_meta = footer.columns(i); +// check_column_meta(column_meta, path_with_size); +// } +// check_sparse_column_meta(footer.columns(footer.columns_size() - 1), path_with_size); +// +// // 7. check variant reader +// io::FileReaderSPtr file_reader; +// st = io::global_local_filesystem()->open_file(file_path, &file_reader); +// EXPECT_TRUE(st.ok()) << st.msg(); +// ColumnReaderOptions read_opts; +// std::unique_ptr<ColumnReader> column_reader; +// st = ColumnReader::create(read_opts, footer, 0, 1000, file_reader, &column_reader); +// EXPECT_TRUE(st.ok()) << st.msg(); +// +// auto variant_column_reader = assert_cast<VariantColumnReader*>(column_reader.get()); +// EXPECT_TRUE(variant_column_reader != nullptr); +// +// // 8. check statistics +// auto statistics = variant_column_reader->get_stats(); +// for (const auto& [path, size] : statistics->subcolumns_non_null_size) { +// std::cout << "path: " << path << ", size: " << size << std::endl; +// EXPECT_EQ(path_with_size[path], size); +// } +// for (const auto& [path, size] : statistics->sparse_column_non_null_size) { +// std::cout << "sparse path: " << path << ", size: " << size << std::endl; +// EXPECT_EQ(path_with_size[path], size); +// } +// +// // 9. check root +// ColumnIterator* it; +// TabletColumn parent_column = _tablet_schema->column(0); +// StorageReadOptions storage_read_opts; +// storage_read_opts.io_ctx.reader_type = ReaderType::READER_QUERY; +// st = variant_column_reader->new_iterator(&it, parent_column, &storage_read_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); +// ColumnIteratorOptions column_iter_opts; +// OlapReaderStatistics stats; +// column_iter_opts.stats = &stats; +// column_iter_opts.file_reader = file_reader.get(); +// st = it->init(column_iter_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// +// MutableColumnPtr new_column_object = ColumnObject::create(3); +// size_t nrows = 1000; +// st = it->seek_to_ordinal(0); +// EXPECT_TRUE(st.ok()) << st.msg(); +// st = it->next_batch(&nrows, new_column_object); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(stats.bytes_read > 0); +// +// for (int i = 0; i < 1000; ++i) { +// std::string value; +// st = assert_cast<ColumnObject*>(new_column_object.get()) +// ->serialize_one_row_to_string(i, &value); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_EQ(value, inserted_jsonstr[i]); +// } +// +// auto read_to_column_object = [&]() { +// new_column_object = ColumnObject::create(10); +// nrows = 1000; +// st = it->seek_to_ordinal(0); +// EXPECT_TRUE(st.ok()) << st.msg(); +// st = it->next_batch(&nrows, new_column_object); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(stats.bytes_read > 0); +// EXPECT_EQ(nrows, 1000); +// }; +// +// auto check_key_stats = [&](const std::string& key_num) { +// std::string key = ".key" + key_num; +// TabletColumn subcolumn_in_nested; +// subcolumn_in_nested.set_name(parent_column.name_lower_case() + key); +// subcolumn_in_nested.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); +// subcolumn_in_nested.set_parent_unique_id(parent_column.unique_id()); +// subcolumn_in_nested.set_path_info(PathInData(parent_column.name_lower_case() + key)); +// subcolumn_in_nested.set_variant_max_subcolumns_count( +// parent_column.variant_max_subcolumns_count()); +// subcolumn_in_nested.set_is_nullable(true); +// +// st = variant_column_reader->new_iterator(&it, subcolumn_in_nested, &storage_read_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); +// st = it->init(column_iter_opts); +// EXPECT_TRUE(st.ok()) << st.msg(); +// read_to_column_object(); +// +// size_t key_count = 0; +// size_t key_nested_count = 0; +// for (int row = 0; row < 1000; ++row) { +// std::string value; +// st = assert_cast<ColumnObject*>(new_column_object.get()) +// ->serialize_one_row_to_string(row, &value); +// EXPECT_TRUE(st.ok()) << st.msg(); +// if (value.find("nested" + key_num) != std::string::npos) { +// key_nested_count++; +// } else if (value.find("88") != std::string::npos) { +// key_count++; +// } +// } +// EXPECT_EQ(key_count, path_with_size["key" + key_num]); +// EXPECT_EQ(key_nested_count, path_with_size["key" + key_num + ".nested" + key_num]); +// }; +// +// for (int i = 3; i < 10; ++i) { +// check_key_stats(std::to_string(i)); +// } +// +// EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); +// } +// +// } // namespace doris \ No newline at end of file diff --git a/be/test/olap/rowset/variant_with_compaction_test.cpp b/be/test/olap/rowset/variant_with_compaction_test.cpp deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java index d4d50033087..c0137934421 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -2610,15 +2610,19 @@ public class OlapTable extends Table implements MTMVRelatedTableIf, GsonPostProc } public void setVariantMaxSubcolumnsCount(int maxSubcoumnsCount) { - getOrCreatTableProperty().setVariantMaxSubcolumnsCount(maxSubcoumnsCount); List<Column> columns = getBaseSchema(true); + boolean hasVariantType = false; for (Column column : columns) { Type type = column.getType(); if (type.isVariantType()) { + hasVariantType = true; VariantType scType = (VariantType) type; scType.setVariantMaxSubcolumnsCount(maxSubcoumnsCount); } } + if (hasVariantType) { + getOrCreatTableProperty().setVariantMaxSubcolumnsCount(maxSubcoumnsCount); + } } public int getVariantMaxSubcolumnsCount() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java index 33efcf950ba..98ac60d5375 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java @@ -396,13 +396,16 @@ public abstract class DataType { org.apache.doris.catalog.ArrayType arrayType = (org.apache.doris.catalog.ArrayType) type; return ArrayType.of(fromCatalogType(arrayType.getItemType()), arrayType.getContainsNull()); } else if (type.isVariantType()) { - List<VariantField> variantFields = ((org.apache.doris.catalog.VariantType) type) - .getPredefinedFields().stream() - .map(cf -> new VariantField(cf.getPattern(), fromCatalogType(cf.getType()), - cf.getComment() == null ? "" : cf.getComment(), cf.getPatternType().toString())) - .collect(ImmutableList.toImmutableList()); - return new VariantType(variantFields, - ((org.apache.doris.catalog.VariantType) type).getVariantMaxSubcolumnsCount()); + if (type instanceof org.apache.doris.catalog.VariantType) { + List<VariantField> variantFields = ((org.apache.doris.catalog.VariantType) type) + .getPredefinedFields().stream() + .map(cf -> new VariantField(cf.getPattern(), fromCatalogType(cf.getType()), + cf.getComment() == null ? "" : cf.getComment(), cf.getPatternType().toString())) + .collect(ImmutableList.toImmutableList()); + return new VariantType(variantFields, + ((org.apache.doris.catalog.VariantType) type).getVariantMaxSubcolumnsCount()); + } + return new VariantType(0); } else { return UnsupportedType.INSTANCE; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 140c9235b78..d73708ac19a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -2498,7 +2498,7 @@ public class SessionVariable implements Serializable, Writable { checker = "checkGlobalVariantMaxSubcolumnsCount", fuzzy = true ) - public int globalVariantMaxSubcolumnsCount = 5; + public int globalVariantMaxSubcolumnsCount = 2048; public void setEnableEsParallelScroll(boolean enableESParallelScroll) { this.enableESParallelScroll = enableESParallelScroll; diff --git a/regression-test/data/datatype_p0/nested_types/ddl/create_nestedtypes_with_schemachange.out b/regression-test/data/datatype_p0/nested_types/ddl/create_nestedtypes_with_schemachange.out index efcecd75953..0097ff185ac 100644 Binary files a/regression-test/data/datatype_p0/nested_types/ddl/create_nestedtypes_with_schemachange.out and b/regression-test/data/datatype_p0/nested_types/ddl/create_nestedtypes_with_schemachange.out differ diff --git a/regression-test/data/schema_change_p0/test_modify_reorder_column.out b/regression-test/data/schema_change_p0/test_modify_reorder_column.out index ce2b54972c4..9b3a9cbd122 100644 Binary files a/regression-test/data/schema_change_p0/test_modify_reorder_column.out and b/regression-test/data/schema_change_p0/test_modify_reorder_column.out differ diff --git a/regression-test/data/variant_p0/load.out b/regression-test/data/variant_p0/load.out index ecbfb38a747..5f0731b29e9 100644 Binary files a/regression-test/data/variant_p0/load.out and b/regression-test/data/variant_p0/load.out differ diff --git a/regression-test/suites/variant_p0/load.groovy b/regression-test/suites/variant_p0/load.groovy index 8661d86983c..e5adb356390 100644 --- a/regression-test/suites/variant_p0/load.groovy +++ b/regression-test/suites/variant_p0/load.groovy @@ -101,6 +101,7 @@ suite("regression_test_variant", "p0"){ qt_sql4 "select v['b'], v['b']['c'], cast(v as int) from ${table_name} where cast(v['b'] as string) != 'null' and cast(v['b'] as string) is not null and cast(v['b'] as string) != '{}' order by k,cast(v as string) desc limit 10000;" qt_sql5 "select v['b'] from ${table_name} where cast(v['b'] as int) > 0;" qt_sql6 "select cast(v['b'] as string) from ${table_name} where cast(v['b'] as string) != 'null' and cast(v['b'] as string) is not null and cast(v['b'] as string) != '{}' order by k, cast(v['b'] as string) " + qt_sql7 "select * from ${table_name} where v >= 5 order by k limit 5" // verify table_name } sql "insert into simple_variant_DUPLICATE select k, cast(v as string) from simple_variant_UNIQUE;" --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org