This is an automated email from the ASF dual-hosted git repository. eldenmoon pushed a commit to branch variant-sparse in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/variant-sparse by this push: new 23a520dae92 [fix](ut) fix variant_column_writer_reader_test.cpp column_object_test.cpp 23a520dae92 is described below commit 23a520dae92f192a4f6935f8e8de19d4b2c143ad Author: Sun Chenyang <suncheny...@selectdb.com> AuthorDate: Thu Apr 24 23:25:44 2025 +0800 [fix](ut) fix variant_column_writer_reader_test.cpp column_object_test.cpp --- be/src/olap/rowset/segment_v2/column_reader.h | 2 +- .../variant_column_writer_reader_test.cpp | 1326 ++++++++--------- be/test/vec/columns/column_object_test.cpp | 1568 ++++++++++---------- 3 files changed, 1453 insertions(+), 1443 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index c0c6ca06882..363bc010f16 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -89,7 +89,7 @@ struct ColumnReaderOptions { int be_exec_version = -1; - const TabletSchemaSPtr tablet_schema = nullptr; + TabletSchemaSPtr tablet_schema = nullptr; }; struct ColumnIteratorOptions { diff --git a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp index 0138d83adb5..d1c20ca2306 100644 --- a/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp +++ b/be/test/olap/rowset/segment_v2/variant_column_writer_reader_test.cpp @@ -1,663 +1,663 @@ -// // Licensed to the Apache Software Foundation (ASF) under one -// // or more contributor license agreements. See the NOTICE file -// // distributed with this work for additional information -// // regarding copyright ownership. The ASF licenses this file -// // to you under the Apache License, Version 2.0 (the -// // "License"); you may not use this file except in compliance -// // with the License. 
You may obtain a copy of the License at -// // -// // http://www.apache.org/licenses/LICENSE-2.0 -// // -// // Unless required by applicable law or agreed to in writing, -// // software distributed under the License is distributed on an -// // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// // KIND, either express or implied. See the License for the -// // specific language governing permissions and limitations -// // under the License. -// -// #include "gtest/gtest.h" -// #include "olap/rowset/segment_v2/column_reader.h" -// #include "olap/rowset/segment_v2/hierarchical_data_reader.h" -// #include "olap/rowset/segment_v2/variant_column_writer_impl.h" -// #include "olap/storage_engine.h" -// #include "testutil/schema_utils.h" -// #include "testutil/variant_util.h" -// -// using namespace doris::vectorized; -// -// namespace doris { -// -// constexpr static uint32_t MAX_PATH_LEN = 1024; -// constexpr static std::string_view dest_dir = "/ut_dir/variant_column_writer_test"; -// constexpr static std::string_view tmp_dir = "./ut_dir/tmp"; -// -// class VariantColumnWriterReaderTest : public testing::Test { -// public: -// void SetUp() override { -// // absolute dir -// char buffer[MAX_PATH_LEN]; -// EXPECT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr); -// _current_dir = std::string(buffer); -// _absolute_dir = _current_dir + std::string(dest_dir); -// EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); -// EXPECT_TRUE(io::global_local_filesystem()->create_directory(_absolute_dir).ok()); -// -// // tmp dir -// EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); -// EXPECT_TRUE(io::global_local_filesystem()->create_directory(tmp_dir).ok()); -// std::vector<StorePath> paths; -// paths.emplace_back(std::string(tmp_dir), 1024000000); -// auto tmp_file_dirs = std::make_unique<segment_v2::TmpFileDirs>(paths); -// Status st = tmp_file_dirs->init(); -// EXPECT_TRUE(st.ok()) << st.to_json(); -// 
ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs)); -// -// // storage engine -// doris::EngineOptions options; -// auto engine = std::make_unique<StorageEngine>(options); -// _engine_ref = engine.get(); -// _data_dir = std::make_unique<DataDir>(*_engine_ref, _absolute_dir); -// static_cast<void>(_data_dir->update_capacity()); -// ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); -// } -// -// void TearDown() override { -// EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); -// EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); -// _engine_ref = nullptr; -// ExecEnv::GetInstance()->set_storage_engine(nullptr); -// } -// -// VariantColumnWriterReaderTest() = default; -// ~VariantColumnWriterReaderTest() override = default; -// -// private: -// TabletSchemaSPtr _tablet_schema = nullptr; -// StorageEngine* _engine_ref = nullptr; -// std::unique_ptr<DataDir> _data_dir = nullptr; -// TabletSharedPtr _tablet = nullptr; -// std::string _absolute_dir; -// std::string _current_dir; -// }; -// -// void check_column_meta(const ColumnMetaPB& column_meta, auto& path_with_size) { -// EXPECT_TRUE(column_meta.has_column_path_info()); -// auto path = std::make_shared<vectorized::PathInData>(); -// path->from_protobuf(column_meta.column_path_info()); -// EXPECT_EQ(column_meta.column_path_info().parrent_column_unique_id(), 1); -// EXPECT_EQ(column_meta.none_null_size(), path_with_size[path->copy_pop_front().get_path()]); -// } -// -// void check_sparse_column_meta(const ColumnMetaPB& column_meta, auto& path_with_size) { -// EXPECT_TRUE(column_meta.has_column_path_info()); -// auto path = std::make_shared<vectorized::PathInData>(); -// path->from_protobuf(column_meta.column_path_info()); -// EXPECT_EQ(column_meta.column_path_info().parrent_column_unique_id(), 1); -// for (const auto& [path, size] : -// column_meta.variant_statistics().sparse_column_non_null_size()) { -// EXPECT_EQ(size, 
path_with_size[path]); -// } -// EXPECT_EQ(path->copy_pop_front().get_path(), "__DORIS_VARIANT_SPARSE__"); -// } -// -// TEST_F(VariantColumnWriterReaderTest, test_write_data_normal) { -// // 1. create tablet_schema -// TabletSchemaPB schema_pb; -// schema_pb.set_keys_type(KeysType::DUP_KEYS); -// SchemaUtils::construct_column(schema_pb.add_column(), 1, "VARIANT", "V1"); -// _tablet_schema = std::make_shared<TabletSchema>(); -// _tablet_schema->init_from_pb(schema_pb); -// -// // 2. create tablet -// TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema)); -// tablet_meta->_tablet_id = 10000; -// _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get()); -// -// EXPECT_TRUE(_tablet->init().ok()); -// EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); -// EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); -// -// // 3. create file_writer -// io::FileWriterPtr file_writer; -// auto file_path = local_segment_path(_tablet->tablet_path(), "0", 0); -// auto st = io::global_local_filesystem()->create_file(file_path, &file_writer); -// EXPECT_TRUE(st.ok()) << st.msg(); -// -// // 4. 
create column_writer -// SegmentFooterPB footer; -// ColumnWriterOptions opts; -// opts.meta = footer.add_columns(); -// opts.compression_type = CompressionTypePB::LZ4; -// opts.file_writer = file_writer.get(); -// opts.footer = &footer; -// RowsetWriterContext rowset_ctx; -// rowset_ctx.write_type = DataWriteType::TYPE_DIRECT; -// opts.rowset_ctx = &rowset_ctx; -// opts.rowset_ctx->tablet_schema = _tablet_schema; -// TabletColumn column = _tablet_schema->column(0); -// _init_column_meta(opts.meta, 0, column, CompressionTypePB::LZ4); -// -// std::unique_ptr<ColumnWriter> writer; -// EXPECT_TRUE(ColumnWriter::create(opts, &column, file_writer.get(), &writer).ok()); -// EXPECT_TRUE(writer->init().ok()); -// EXPECT_TRUE(assert_cast<VariantColumnWriter*>(writer.get()) != nullptr); -// -// // 5. write data -// auto olap_data_convertor = std::make_unique<vectorized::OlapBlockDataConvertor>(); -// auto block = _tablet_schema->create_block(); -// auto column_object = (*std::move(block.get_by_position(0).column)).mutate(); -// std::unordered_map<int, std::string> inserted_jsonstr; -// auto path_with_size = -// VariantUtil::fill_object_column_with_test_data(column_object, 1000, &inserted_jsonstr); -// olap_data_convertor->add_column_data_convertor(column); -// olap_data_convertor->set_source_content(&block, 0, 1000); -// auto [result, accessor] = olap_data_convertor->convert_column_data(0); -// EXPECT_TRUE(result.ok()); -// EXPECT_TRUE(accessor != nullptr); -// EXPECT_TRUE(writer->append(accessor->get_nullmap(), accessor->get_data(), 1000).ok()); -// st = writer->finish(); -// EXPECT_TRUE(st.ok()) << st.msg(); -// st = writer->write_data(); -// EXPECT_TRUE(st.ok()) << st.msg(); -// st = writer->write_ordinal_index(); -// EXPECT_TRUE(st.ok()) << st.msg(); -// st = writer->write_zone_map(); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(file_writer->close().ok()); -// footer.set_num_rows(1000); -// -// // 6. 
check footer -// EXPECT_EQ(footer.columns_size(), 5); -// auto column_meta = footer.columns(0); -// EXPECT_EQ(column_meta.type(), (int)FieldType::OLAP_FIELD_TYPE_VARIANT); -// -// for (int i = 1; i < footer.columns_size() - 1; ++i) { -// auto column_meta = footer.columns(i); -// check_column_meta(column_meta, path_with_size); -// } -// check_sparse_column_meta(footer.columns(footer.columns_size() - 1), path_with_size); -// -// // 7. check variant reader -// io::FileReaderSPtr file_reader; -// st = io::global_local_filesystem()->open_file(file_path, &file_reader); -// EXPECT_TRUE(st.ok()) << st.msg(); -// ColumnReaderOptions read_opts; -// std::unique_ptr<ColumnReader> column_reader; -// st = ColumnReader::create(read_opts, footer, 0, 1000, file_reader, &column_reader); -// EXPECT_TRUE(st.ok()) << st.msg(); -// -// auto variant_column_reader = assert_cast<VariantColumnReader*>(column_reader.get()); -// EXPECT_TRUE(variant_column_reader != nullptr); -// -// auto subcolumn_reader = variant_column_reader->get_reader_by_path(PathInData("key0")); -// EXPECT_TRUE(subcolumn_reader != nullptr); -// subcolumn_reader = variant_column_reader->get_reader_by_path(PathInData("key1")); -// EXPECT_TRUE(subcolumn_reader != nullptr); -// subcolumn_reader = variant_column_reader->get_reader_by_path(PathInData("key2")); -// EXPECT_TRUE(subcolumn_reader != nullptr); -// EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key3"))); -// EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key4"))); -// EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key5"))); -// EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key6"))); -// EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key7"))); -// EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key8"))); -// EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key9"))); -// auto size = 
variant_column_reader->get_metadata_size(); -// EXPECT_GT(size, 0); -// -// // 8. check statistics -// auto statistics = variant_column_reader->get_stats(); -// for (const auto& [path, size] : statistics->subcolumns_non_null_size) { -// EXPECT_EQ(path_with_size[path], size); -// } -// for (const auto& [path, size] : statistics->sparse_column_non_null_size) { -// EXPECT_EQ(path_with_size[path], size); -// } -// -// // 9. check hier reader -// ColumnIterator* it; -// TabletColumn parent_column = _tablet_schema->column(0); -// StorageReadOptions storage_read_opts; -// storage_read_opts.io_ctx.reader_type = ReaderType::READER_QUERY; -// st = variant_column_reader->new_iterator(&it, parent_column, &storage_read_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); -// ColumnIteratorOptions column_iter_opts; -// OlapReaderStatistics stats; -// column_iter_opts.stats = &stats; -// column_iter_opts.file_reader = file_reader.get(); -// st = it->init(column_iter_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// -// MutableColumnPtr new_column_object = ColumnObject::create(3); -// size_t nrows = 1000; -// st = it->seek_to_ordinal(0); -// EXPECT_TRUE(st.ok()) << st.msg(); -// st = it->next_batch(&nrows, new_column_object); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(stats.bytes_read > 0); -// -// for (int i = 0; i < 1000; ++i) { -// std::string value; -// st = assert_cast<ColumnObject*>(new_column_object.get()) -// ->serialize_one_row_to_string(i, &value); -// -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_EQ(value, inserted_jsonstr[i]); -// } -// -// std::vector<rowid_t> row_ids; -// for (int i = 0; i < 1000; ++i) { -// if (i % 7 == 0) { -// row_ids.push_back(i); -// } -// } -// new_column_object = ColumnObject::create(3); -// st = it->read_by_rowids(row_ids.data(), row_ids.size(), new_column_object); -// EXPECT_TRUE(st.ok()) << st.msg(); -// for (int i = 0; i < row_ids.size(); ++i) { -// std::string 
value; -// st = assert_cast<ColumnObject*>(new_column_object.get()) -// ->serialize_one_row_to_string(i, &value); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_EQ(value, inserted_jsonstr[row_ids[i]]); -// } -// -// auto read_to_column_object = [&]() { -// new_column_object = ColumnObject::create(3); -// nrows = 1000; -// st = it->seek_to_ordinal(0); -// EXPECT_TRUE(st.ok()) << st.msg(); -// st = it->next_batch(&nrows, new_column_object); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(stats.bytes_read > 0); -// EXPECT_EQ(nrows, 1000); -// }; -// -// // 10. check sparse extract reader -// for (int i = 3; i < 10; ++i) { -// std::string key = ".key" + std::to_string(i); -// TabletColumn subcolumn_in_sparse; -// subcolumn_in_sparse.set_name(parent_column.name_lower_case() + key); -// subcolumn_in_sparse.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); -// subcolumn_in_sparse.set_parent_unique_id(parent_column.unique_id()); -// subcolumn_in_sparse.set_path_info(PathInData(parent_column.name_lower_case() + key)); -// subcolumn_in_sparse.set_variant_max_subcolumns_count( -// parent_column.variant_max_subcolumns_count()); -// subcolumn_in_sparse.set_is_nullable(true); -// -// st = variant_column_reader->new_iterator(&it, subcolumn_in_sparse, &storage_read_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr); -// st = it->init(column_iter_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// -// read_to_column_object(); -// -// for (int row = 0; row < 1000; ++row) { -// std::string value; -// st = assert_cast<ColumnObject*>(new_column_object.get()) -// ->serialize_one_row_to_string(row, &value); -// EXPECT_TRUE(st.ok()) << st.msg(); -// if (inserted_jsonstr[row].find(key) != std::string::npos) { -// if (i % 2 == 0) { -// EXPECT_EQ(value, "88"); -// } else { -// EXPECT_EQ(value, "str99"); -// } -// } -// } -// } -// -// // 11. 
check leaf reader -// auto check_leaf_reader = [&]() { -// for (int i = 0; i < 3; ++i) { -// std::string key = ".key" + std::to_string(i); -// TabletColumn subcolumn; -// subcolumn.set_name(parent_column.name_lower_case() + key); -// subcolumn.set_type((FieldType)(int)footer.columns(i + 1).type()); -// subcolumn.set_parent_unique_id(parent_column.unique_id()); -// subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + key)); -// subcolumn.set_variant_max_subcolumns_count( -// parent_column.variant_max_subcolumns_count()); -// subcolumn.set_is_nullable(true); -// -// st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(assert_cast<FileColumnIterator*>(it) != nullptr); -// st = it->init(column_iter_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// -// auto column_type = DataTypeFactory::instance().create_data_type(subcolumn, false); -// auto read_column = column_type->create_column(); -// nrows = 1000; -// st = it->seek_to_ordinal(0); -// EXPECT_TRUE(st.ok()) << st.msg(); -// st = it->next_batch(&nrows, read_column); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(stats.bytes_read > 0); -// -// for (int row = 0; row < 1000; ++row) { -// const std::string& value = column_type->to_string(*read_column, row); -// if (inserted_jsonstr[row].find(key) != std::string::npos) { -// if (i % 2 == 0) { -// EXPECT_EQ(value, "88"); -// } else { -// EXPECT_EQ(value, "str99"); -// } -// } -// } -// } -// }; -// check_leaf_reader(); -// -// // 12. 
check empty -// TabletColumn subcolumn; -// subcolumn.set_name(parent_column.name_lower_case() + ".key10"); -// subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); -// subcolumn.set_parent_unique_id(parent_column.unique_id()); -// subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key10")); -// subcolumn.set_is_nullable(true); -// st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(assert_cast<DefaultValueColumnIterator*>(it) != nullptr); -// -// // 13. check statistics size == limit -// auto& variant_stats = variant_column_reader->_statistics; -// EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() < -// config::variant_max_sparse_column_statistics_size); -// auto limit = config::variant_max_sparse_column_statistics_size - -// variant_stats->sparse_column_non_null_size.size(); -// for (int i = 0; i < limit; ++i) { -// std::string key = parent_column.name_lower_case() + ".key10" + std::to_string(i); -// variant_stats->sparse_column_non_null_size[key] = 10000; -// } -// EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() == -// config::variant_max_sparse_column_statistics_size); -// -// st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); -// st = it->init(column_iter_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// -// auto check_empty_column = [&]() { -// for (int row = 0; row < 1000; ++row) { -// std::string value; -// st = assert_cast<ColumnObject*>(new_column_object.get()) -// ->serialize_one_row_to_string(row, &value); -// -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_EQ(value, "{}"); -// } -// }; -// -// read_to_column_object(); -// check_empty_column(); -// -// // construct tablet schema for compaction -// storage_read_opts.io_ctx.reader_type = ReaderType::READER_BASE_COMPACTION; -// 
storage_read_opts.tablet_schema = _tablet_schema; -// std::unordered_map<int32_t, TabletSchema::PathsSetInfo> uid_to_paths_set_info; -// TabletSchema::PathsSetInfo paths_set_info; -// paths_set_info.sub_path_set.insert("key0"); -// paths_set_info.sub_path_set.insert("key3"); -// paths_set_info.sub_path_set.insert("key4"); -// paths_set_info.sparse_path_set.insert("key1"); -// paths_set_info.sparse_path_set.insert("key2"); -// paths_set_info.sparse_path_set.insert("key5"); -// paths_set_info.sparse_path_set.insert("key6"); -// paths_set_info.sparse_path_set.insert("key7"); -// paths_set_info.sparse_path_set.insert("key8"); -// paths_set_info.sparse_path_set.insert("key9"); -// uid_to_paths_set_info[parent_column.unique_id()] = paths_set_info; -// _tablet_schema->set_path_set_info(std::move(uid_to_paths_set_info)); -// -// // 14. check compaction subcolumn reader -// check_leaf_reader(); -// -// // 15. check compaction root reader -// st = variant_column_reader->new_iterator(&it, parent_column, &storage_read_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(assert_cast<VariantRootColumnIterator*>(it) != nullptr); -// st = it->init(column_iter_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// -// // 16. 
check compacton sparse column -// TabletColumn sparse_column = schema_util::create_sparse_column(parent_column); -// st = variant_column_reader->new_iterator(&it, sparse_column, &storage_read_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(assert_cast<SparseColumnMergeReader*>(it) != nullptr); -// st = it->init(column_iter_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// auto column_type = DataTypeFactory::instance().create_data_type(sparse_column, false); -// auto read_column = column_type->create_column(); -// nrows = 1000; -// st = it->seek_to_ordinal(0); -// EXPECT_TRUE(st.ok()) << st.msg(); -// st = it->next_batch(&nrows, read_column); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(stats.bytes_read > 0); -// -// for (int row = 0; row < 1000; ++row) { -// const std::string& value = column_type->to_string(*read_column, row); -// EXPECT_TRUE(value.find("key0") == std::string::npos) -// << "row: " << row << ", value: " << value; -// EXPECT_TRUE(value.find("key3") == std::string::npos) -// << "row: " << row << ", value: " << value; -// EXPECT_TRUE(value.find("key4") == std::string::npos) -// << "row: " << row << ", value: " << value; -// } -// -// // 17. check limit = 10000 -// subcolumn.set_name(parent_column.name_lower_case() + ".key10"); -// subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key10")); -// st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr); -// -// for (int i = 0; i < limit; ++i) { -// std::string key = parent_column.name_lower_case() + ".key10" + std::to_string(i); -// variant_stats->sparse_column_non_null_size.erase(key); -// } -// -// // 18. 
check compacton sparse extract column -// subcolumn.set_name(parent_column.name_lower_case() + ".key3"); -// subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key3")); -// st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr); -// -// // 19. check compaction default column -// subcolumn.set_name(parent_column.name_lower_case() + ".key10"); -// subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key10")); -// st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(assert_cast<DefaultValueColumnIterator*>(it) != nullptr); -// EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); -// } -// -// TEST_F(VariantColumnWriterReaderTest, test_write_data_advanced) { -// // 1. create tablet_schema -// TabletSchemaPB schema_pb; -// schema_pb.set_keys_type(KeysType::DUP_KEYS); -// SchemaUtils::construct_column(schema_pb.add_column(), 1, "VARIANT", "V1", 10); -// _tablet_schema = std::make_shared<TabletSchema>(); -// _tablet_schema->init_from_pb(schema_pb); -// -// // 2. create tablet -// TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema)); -// tablet_meta->_tablet_id = 10000; -// _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get()); -// EXPECT_TRUE(_tablet->init().ok()); -// EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); -// EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); -// -// // 3. create file_writer -// io::FileWriterPtr file_writer; -// auto file_path = local_segment_path(_tablet->tablet_path(), "0", 0); -// auto st = io::global_local_filesystem()->create_file(file_path, &file_writer); -// EXPECT_TRUE(st.ok()) << st.msg(); -// -// // 4. 
create column_writer -// SegmentFooterPB footer; -// ColumnWriterOptions opts; -// opts.meta = footer.add_columns(); -// opts.compression_type = CompressionTypePB::LZ4; -// opts.file_writer = file_writer.get(); -// opts.footer = &footer; -// RowsetWriterContext rowset_ctx; -// rowset_ctx.write_type = DataWriteType::TYPE_DIRECT; -// opts.rowset_ctx = &rowset_ctx; -// opts.rowset_ctx->tablet_schema = _tablet_schema; -// TabletColumn column = _tablet_schema->column(0); -// _init_column_meta(opts.meta, 0, column, CompressionTypePB::LZ4); -// -// std::unique_ptr<ColumnWriter> writer; -// EXPECT_TRUE(ColumnWriter::create(opts, &column, file_writer.get(), &writer).ok()); -// EXPECT_TRUE(writer->init().ok()); -// EXPECT_TRUE(assert_cast<VariantColumnWriter*>(writer.get()) != nullptr); -// -// // 5. write data -// auto olap_data_convertor = std::make_unique<vectorized::OlapBlockDataConvertor>(); -// auto block = _tablet_schema->create_block(); -// auto column_object = (*std::move(block.get_by_position(0).column)).mutate(); -// std::unordered_map<int, std::string> inserted_jsonstr; -// auto path_with_size = VariantUtil::fill_object_column_with_nested_test_data(column_object, 1000, -// &inserted_jsonstr); -// olap_data_convertor->add_column_data_convertor(column); -// olap_data_convertor->set_source_content(&block, 0, 1000); -// auto [result, accessor] = olap_data_convertor->convert_column_data(0); -// EXPECT_TRUE(result.ok()); -// EXPECT_TRUE(accessor != nullptr); -// EXPECT_TRUE(writer->append(accessor->get_nullmap(), accessor->get_data(), 1000).ok()); -// st = writer->finish(); -// EXPECT_TRUE(st.ok()) << st.msg(); -// st = writer->write_data(); -// EXPECT_TRUE(st.ok()) << st.msg(); -// st = writer->write_ordinal_index(); -// EXPECT_TRUE(st.ok()) << st.msg(); -// st = writer->write_zone_map(); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(file_writer->close().ok()); -// footer.set_num_rows(1000); -// -// // 6. 
check footer -// EXPECT_EQ(footer.columns_size(), 12); -// auto column_meta = footer.columns(0); -// EXPECT_EQ(column_meta.type(), (int)FieldType::OLAP_FIELD_TYPE_VARIANT); -// -// for (int i = 1; i < footer.columns_size() - 1; ++i) { -// auto column_meta = footer.columns(i); -// check_column_meta(column_meta, path_with_size); -// } -// check_sparse_column_meta(footer.columns(footer.columns_size() - 1), path_with_size); -// -// // 7. check variant reader -// io::FileReaderSPtr file_reader; -// st = io::global_local_filesystem()->open_file(file_path, &file_reader); -// EXPECT_TRUE(st.ok()) << st.msg(); -// ColumnReaderOptions read_opts; -// std::unique_ptr<ColumnReader> column_reader; -// st = ColumnReader::create(read_opts, footer, 0, 1000, file_reader, &column_reader); -// EXPECT_TRUE(st.ok()) << st.msg(); -// -// auto variant_column_reader = assert_cast<VariantColumnReader*>(column_reader.get()); -// EXPECT_TRUE(variant_column_reader != nullptr); -// -// // 8. check statistics -// auto statistics = variant_column_reader->get_stats(); -// for (const auto& [path, size] : statistics->subcolumns_non_null_size) { -// std::cout << "path: " << path << ", size: " << size << std::endl; -// EXPECT_EQ(path_with_size[path], size); -// } -// for (const auto& [path, size] : statistics->sparse_column_non_null_size) { -// std::cout << "sparse path: " << path << ", size: " << size << std::endl; -// EXPECT_EQ(path_with_size[path], size); -// } -// -// // 9. 
check root -// ColumnIterator* it; -// TabletColumn parent_column = _tablet_schema->column(0); -// StorageReadOptions storage_read_opts; -// storage_read_opts.io_ctx.reader_type = ReaderType::READER_QUERY; -// st = variant_column_reader->new_iterator(&it, parent_column, &storage_read_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); -// ColumnIteratorOptions column_iter_opts; -// OlapReaderStatistics stats; -// column_iter_opts.stats = &stats; -// column_iter_opts.file_reader = file_reader.get(); -// st = it->init(column_iter_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// -// MutableColumnPtr new_column_object = ColumnObject::create(3); -// size_t nrows = 1000; -// st = it->seek_to_ordinal(0); -// EXPECT_TRUE(st.ok()) << st.msg(); -// st = it->next_batch(&nrows, new_column_object); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(stats.bytes_read > 0); -// -// for (int i = 0; i < 1000; ++i) { -// std::string value; -// st = assert_cast<ColumnObject*>(new_column_object.get()) -// ->serialize_one_row_to_string(i, &value); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_EQ(value, inserted_jsonstr[i]); -// } -// -// auto read_to_column_object = [&]() { -// new_column_object = ColumnObject::create(10); -// nrows = 1000; -// st = it->seek_to_ordinal(0); -// EXPECT_TRUE(st.ok()) << st.msg(); -// st = it->next_batch(&nrows, new_column_object); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(stats.bytes_read > 0); -// EXPECT_EQ(nrows, 1000); -// }; -// -// auto check_key_stats = [&](const std::string& key_num) { -// std::string key = ".key" + key_num; -// TabletColumn subcolumn_in_nested; -// subcolumn_in_nested.set_name(parent_column.name_lower_case() + key); -// subcolumn_in_nested.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); -// subcolumn_in_nested.set_parent_unique_id(parent_column.unique_id()); -// subcolumn_in_nested.set_path_info(PathInData(parent_column.name_lower_case() + key)); -// 
subcolumn_in_nested.set_variant_max_subcolumns_count( -// parent_column.variant_max_subcolumns_count()); -// subcolumn_in_nested.set_is_nullable(true); -// -// st = variant_column_reader->new_iterator(&it, subcolumn_in_nested, &storage_read_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); -// st = it->init(column_iter_opts); -// EXPECT_TRUE(st.ok()) << st.msg(); -// read_to_column_object(); -// -// size_t key_count = 0; -// size_t key_nested_count = 0; -// for (int row = 0; row < 1000; ++row) { -// std::string value; -// st = assert_cast<ColumnObject*>(new_column_object.get()) -// ->serialize_one_row_to_string(row, &value); -// EXPECT_TRUE(st.ok()) << st.msg(); -// if (value.find("nested" + key_num) != std::string::npos) { -// key_nested_count++; -// } else if (value.find("88") != std::string::npos) { -// key_count++; -// } -// } -// EXPECT_EQ(key_count, path_with_size["key" + key_num]); -// EXPECT_EQ(key_nested_count, path_with_size["key" + key_num + ".nested" + key_num]); -// }; -// -// for (int i = 3; i < 10; ++i) { -// check_key_stats(std::to_string(i)); -// } -// -// EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); -// } -// -// } // namespace doris \ No newline at end of file +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gtest/gtest.h" +#include "olap/rowset/segment_v2/column_reader.h" +#include "olap/rowset/segment_v2/hierarchical_data_reader.h" +#include "olap/rowset/segment_v2/variant_column_writer_impl.h" +#include "olap/storage_engine.h" +#include "testutil/schema_utils.h" +#include "testutil/variant_util.h" + +using namespace doris::vectorized; + +namespace doris { + +constexpr static uint32_t MAX_PATH_LEN = 1024; +constexpr static std::string_view dest_dir = "/ut_dir/variant_column_writer_test"; +constexpr static std::string_view tmp_dir = "./ut_dir/tmp"; + +class VariantColumnWriterReaderTest : public testing::Test { +public: + void SetUp() override { + // absolute dir + char buffer[MAX_PATH_LEN]; + EXPECT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr); + _current_dir = std::string(buffer); + _absolute_dir = _current_dir + std::string(dest_dir); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(_absolute_dir).ok()); + + // tmp dir + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(tmp_dir).ok()); + std::vector<StorePath> paths; + paths.emplace_back(std::string(tmp_dir), 1024000000); + auto tmp_file_dirs = std::make_unique<segment_v2::TmpFileDirs>(paths); + Status st = tmp_file_dirs->init(); + EXPECT_TRUE(st.ok()) << st.to_json(); + ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs)); + + // storage engine + doris::EngineOptions options; + auto engine = 
std::make_unique<StorageEngine>(options); + _engine_ref = engine.get(); + _data_dir = std::make_unique<DataDir>(*_engine_ref, _absolute_dir); + static_cast<void>(_data_dir->update_capacity()); + ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); + } + + void TearDown() override { + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); + _engine_ref = nullptr; + ExecEnv::GetInstance()->set_storage_engine(nullptr); + } + + VariantColumnWriterReaderTest() = default; + ~VariantColumnWriterReaderTest() override = default; + +private: + TabletSchemaSPtr _tablet_schema = nullptr; + StorageEngine* _engine_ref = nullptr; + std::unique_ptr<DataDir> _data_dir = nullptr; + TabletSharedPtr _tablet = nullptr; + std::string _absolute_dir; + std::string _current_dir; +}; + +void check_column_meta(const ColumnMetaPB& column_meta, auto& path_with_size) { + EXPECT_TRUE(column_meta.has_column_path_info()); + auto path = std::make_shared<vectorized::PathInData>(); + path->from_protobuf(column_meta.column_path_info()); + EXPECT_EQ(column_meta.column_path_info().parrent_column_unique_id(), 1); + EXPECT_EQ(column_meta.none_null_size(), path_with_size[path->copy_pop_front().get_path()]); +} + +void check_sparse_column_meta(const ColumnMetaPB& column_meta, auto& path_with_size) { + EXPECT_TRUE(column_meta.has_column_path_info()); + auto path = std::make_shared<vectorized::PathInData>(); + path->from_protobuf(column_meta.column_path_info()); + EXPECT_EQ(column_meta.column_path_info().parrent_column_unique_id(), 1); + for (const auto& [path, size] : + column_meta.variant_statistics().sparse_column_non_null_size()) { + EXPECT_EQ(size, path_with_size[path]); + } + EXPECT_EQ(path->copy_pop_front().get_path(), "__DORIS_VARIANT_SPARSE__"); +} + +TEST_F(VariantColumnWriterReaderTest, test_write_data_normal) { + // 1. 
create tablet_schema + TabletSchemaPB schema_pb; + schema_pb.set_keys_type(KeysType::DUP_KEYS); + SchemaUtils::construct_column(schema_pb.add_column(), 1, "VARIANT", "V1"); + _tablet_schema = std::make_shared<TabletSchema>(); + _tablet_schema->init_from_pb(schema_pb); + + // 2. create tablet + TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema)); + tablet_meta->_tablet_id = 10000; + _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get()); + + EXPECT_TRUE(_tablet->init().ok()); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); + + // 3. create file_writer + io::FileWriterPtr file_writer; + auto file_path = local_segment_path(_tablet->tablet_path(), "0", 0); + auto st = io::global_local_filesystem()->create_file(file_path, &file_writer); + EXPECT_TRUE(st.ok()) << st.msg(); + + // 4. create column_writer + SegmentFooterPB footer; + ColumnWriterOptions opts; + opts.meta = footer.add_columns(); + opts.compression_type = CompressionTypePB::LZ4; + opts.file_writer = file_writer.get(); + opts.footer = &footer; + RowsetWriterContext rowset_ctx; + rowset_ctx.write_type = DataWriteType::TYPE_DIRECT; + opts.rowset_ctx = &rowset_ctx; + opts.rowset_ctx->tablet_schema = _tablet_schema; + TabletColumn column = _tablet_schema->column(0); + _init_column_meta(opts.meta, 0, column, CompressionTypePB::LZ4); + + std::unique_ptr<ColumnWriter> writer; + EXPECT_TRUE(ColumnWriter::create(opts, &column, file_writer.get(), &writer).ok()); + EXPECT_TRUE(writer->init().ok()); + EXPECT_TRUE(assert_cast<VariantColumnWriter*>(writer.get()) != nullptr); + + // 5. 
write data + auto olap_data_convertor = std::make_unique<vectorized::OlapBlockDataConvertor>(); + auto block = _tablet_schema->create_block(); + auto column_object = (*std::move(block.get_by_position(0).column)).mutate(); + std::unordered_map<int, std::string> inserted_jsonstr; + auto path_with_size = + VariantUtil::fill_object_column_with_test_data(column_object, 1000, &inserted_jsonstr); + olap_data_convertor->add_column_data_convertor(column); + olap_data_convertor->set_source_content(&block, 0, 1000); + auto [result, accessor] = olap_data_convertor->convert_column_data(0); + EXPECT_TRUE(result.ok()); + EXPECT_TRUE(accessor != nullptr); + EXPECT_TRUE(writer->append(accessor->get_nullmap(), accessor->get_data(), 1000).ok()); + st = writer->finish(); + EXPECT_TRUE(st.ok()) << st.msg(); + st = writer->write_data(); + EXPECT_TRUE(st.ok()) << st.msg(); + st = writer->write_ordinal_index(); + EXPECT_TRUE(st.ok()) << st.msg(); + st = writer->write_zone_map(); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(file_writer->close().ok()); + footer.set_num_rows(1000); + + // 6. check footer + EXPECT_EQ(footer.columns_size(), 5); + auto column_meta = footer.columns(0); + EXPECT_EQ(column_meta.type(), (int)FieldType::OLAP_FIELD_TYPE_VARIANT); + + for (int i = 1; i < footer.columns_size() - 1; ++i) { + auto column_meta = footer.columns(i); + check_column_meta(column_meta, path_with_size); + } + check_sparse_column_meta(footer.columns(footer.columns_size() - 1), path_with_size); + + // 7. 
check variant reader + io::FileReaderSPtr file_reader; + st = io::global_local_filesystem()->open_file(file_path, &file_reader); + EXPECT_TRUE(st.ok()) << st.msg(); + ColumnReaderOptions read_opts; + read_opts.tablet_schema = _tablet_schema; + std::unique_ptr<ColumnReader> column_reader; + st = ColumnReader::create(read_opts, footer, 0, 1000, file_reader, &column_reader); + EXPECT_TRUE(st.ok()) << st.msg(); + + auto variant_column_reader = assert_cast<VariantColumnReader*>(column_reader.get()); + EXPECT_TRUE(variant_column_reader != nullptr); + + auto subcolumn_reader = variant_column_reader->get_reader_by_path(PathInData("key0")); + EXPECT_TRUE(subcolumn_reader != nullptr); + subcolumn_reader = variant_column_reader->get_reader_by_path(PathInData("key1")); + EXPECT_TRUE(subcolumn_reader != nullptr); + subcolumn_reader = variant_column_reader->get_reader_by_path(PathInData("key2")); + EXPECT_TRUE(subcolumn_reader != nullptr); + EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key3"))); + EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key4"))); + EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key5"))); + EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key6"))); + EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key7"))); + EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key8"))); + EXPECT_TRUE(variant_column_reader->exist_in_sparse_column(PathInData("key9"))); + auto size = variant_column_reader->get_metadata_size(); + EXPECT_GT(size, 0); + + // 8. check statistics + auto statistics = variant_column_reader->get_stats(); + for (const auto& [path, size] : statistics->subcolumns_non_null_size) { + EXPECT_EQ(path_with_size[path], size); + } + for (const auto& [path, size] : statistics->sparse_column_non_null_size) { + EXPECT_EQ(path_with_size[path], size); + } + + // 9. 
check hier reader + ColumnIterator* it; + TabletColumn parent_column = _tablet_schema->column(0); + StorageReadOptions storage_read_opts; + storage_read_opts.io_ctx.reader_type = ReaderType::READER_QUERY; + st = variant_column_reader->new_iterator(&it, parent_column, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); + ColumnIteratorOptions column_iter_opts; + OlapReaderStatistics stats; + column_iter_opts.stats = &stats; + column_iter_opts.file_reader = file_reader.get(); + st = it->init(column_iter_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + + MutableColumnPtr new_column_object = ColumnObject::create(3); + size_t nrows = 1000; + st = it->seek_to_ordinal(0); + EXPECT_TRUE(st.ok()) << st.msg(); + st = it->next_batch(&nrows, new_column_object); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(stats.bytes_read > 0); + + for (int i = 0; i < 1000; ++i) { + std::string value; + st = assert_cast<ColumnObject*>(new_column_object.get()) + ->serialize_one_row_to_string(i, &value); + + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_EQ(value, inserted_jsonstr[i]); + } + + std::vector<rowid_t> row_ids; + for (int i = 0; i < 1000; ++i) { + if (i % 7 == 0) { + row_ids.push_back(i); + } + } + new_column_object = ColumnObject::create(3); + st = it->read_by_rowids(row_ids.data(), row_ids.size(), new_column_object); + EXPECT_TRUE(st.ok()) << st.msg(); + for (int i = 0; i < row_ids.size(); ++i) { + std::string value; + st = assert_cast<ColumnObject*>(new_column_object.get()) + ->serialize_one_row_to_string(i, &value); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_EQ(value, inserted_jsonstr[row_ids[i]]); + } + + auto read_to_column_object = [&]() { + new_column_object = ColumnObject::create(3); + nrows = 1000; + st = it->seek_to_ordinal(0); + EXPECT_TRUE(st.ok()) << st.msg(); + st = it->next_batch(&nrows, new_column_object); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(stats.bytes_read > 0); + EXPECT_EQ(nrows, 
1000); + }; + + // 10. check sparse extract reader + for (int i = 3; i < 10; ++i) { + std::string key = ".key" + std::to_string(i); + TabletColumn subcolumn_in_sparse; + subcolumn_in_sparse.set_name(parent_column.name_lower_case() + key); + subcolumn_in_sparse.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + subcolumn_in_sparse.set_parent_unique_id(parent_column.unique_id()); + subcolumn_in_sparse.set_path_info(PathInData(parent_column.name_lower_case() + key)); + subcolumn_in_sparse.set_variant_max_subcolumns_count( + parent_column.variant_max_subcolumns_count()); + subcolumn_in_sparse.set_is_nullable(true); + + st = variant_column_reader->new_iterator(&it, subcolumn_in_sparse, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr); + st = it->init(column_iter_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + + read_to_column_object(); + + for (int row = 0; row < 1000; ++row) { + std::string value; + st = assert_cast<ColumnObject*>(new_column_object.get()) + ->serialize_one_row_to_string(row, &value); + EXPECT_TRUE(st.ok()) << st.msg(); + if (inserted_jsonstr[row].find(key) != std::string::npos) { + if (i % 2 == 0) { + EXPECT_EQ(value, "88"); + } else { + EXPECT_EQ(value, "str99"); + } + } + } + } + + // 11. 
check leaf reader + auto check_leaf_reader = [&]() { + for (int i = 0; i < 3; ++i) { + std::string key = ".key" + std::to_string(i); + TabletColumn subcolumn; + subcolumn.set_name(parent_column.name_lower_case() + key); + subcolumn.set_type((FieldType)(int)footer.columns(i + 1).type()); + subcolumn.set_parent_unique_id(parent_column.unique_id()); + subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + key)); + subcolumn.set_variant_max_subcolumns_count( + parent_column.variant_max_subcolumns_count()); + subcolumn.set_is_nullable(true); + + st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<FileColumnIterator*>(it) != nullptr); + st = it->init(column_iter_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + + auto column_type = DataTypeFactory::instance().create_data_type(subcolumn, false); + auto read_column = column_type->create_column(); + nrows = 1000; + st = it->seek_to_ordinal(0); + EXPECT_TRUE(st.ok()) << st.msg(); + st = it->next_batch(&nrows, read_column); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(stats.bytes_read > 0); + + for (int row = 0; row < 1000; ++row) { + const std::string& value = column_type->to_string(*read_column, row); + if (inserted_jsonstr[row].find(key) != std::string::npos) { + if (i % 2 == 0) { + EXPECT_EQ(value, "88"); + } else { + EXPECT_EQ(value, "str99"); + } + } + } + } + }; + check_leaf_reader(); + + // 12. 
check empty + TabletColumn subcolumn; + subcolumn.set_name(parent_column.name_lower_case() + ".key10"); + subcolumn.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + subcolumn.set_parent_unique_id(parent_column.unique_id()); + subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key10")); + subcolumn.set_is_nullable(true); + st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<DefaultValueColumnIterator*>(it) != nullptr); + + // 13. check statistics size == limit + auto& variant_stats = variant_column_reader->_statistics; + EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() < + config::variant_max_sparse_column_statistics_size); + auto limit = config::variant_max_sparse_column_statistics_size - + variant_stats->sparse_column_non_null_size.size(); + for (int i = 0; i < limit; ++i) { + std::string key = parent_column.name_lower_case() + ".key10" + std::to_string(i); + variant_stats->sparse_column_non_null_size[key] = 10000; + } + EXPECT_TRUE(variant_stats->sparse_column_non_null_size.size() == + config::variant_max_sparse_column_statistics_size); + + st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); + st = it->init(column_iter_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + + auto check_empty_column = [&]() { + for (int row = 0; row < 1000; ++row) { + std::string value; + st = assert_cast<ColumnObject*>(new_column_object.get()) + ->serialize_one_row_to_string(row, &value); + + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_EQ(value, "{}"); + } + }; + + read_to_column_object(); + check_empty_column(); + + // construct tablet schema for compaction + storage_read_opts.io_ctx.reader_type = ReaderType::READER_BASE_COMPACTION; + storage_read_opts.tablet_schema = _tablet_schema; + std::unordered_map<int32_t, TabletSchema::PathsSetInfo> 
uid_to_paths_set_info; + TabletSchema::PathsSetInfo paths_set_info; + paths_set_info.sub_path_set.insert("key0"); + paths_set_info.sub_path_set.insert("key3"); + paths_set_info.sub_path_set.insert("key4"); + paths_set_info.sparse_path_set.insert("key1"); + paths_set_info.sparse_path_set.insert("key2"); + paths_set_info.sparse_path_set.insert("key5"); + paths_set_info.sparse_path_set.insert("key6"); + paths_set_info.sparse_path_set.insert("key7"); + paths_set_info.sparse_path_set.insert("key8"); + paths_set_info.sparse_path_set.insert("key9"); + uid_to_paths_set_info[parent_column.unique_id()] = paths_set_info; + _tablet_schema->set_path_set_info(std::move(uid_to_paths_set_info)); + + // 14. check compaction subcolumn reader + check_leaf_reader(); + + // 15. check compaction root reader + st = variant_column_reader->new_iterator(&it, parent_column, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<VariantRootColumnIterator*>(it) != nullptr); + st = it->init(column_iter_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + + // 16. 
check compacton sparse column + TabletColumn sparse_column = schema_util::create_sparse_column(parent_column); + st = variant_column_reader->new_iterator(&it, sparse_column, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<SparseColumnMergeReader*>(it) != nullptr); + st = it->init(column_iter_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + auto column_type = DataTypeFactory::instance().create_data_type(sparse_column, false); + auto read_column = column_type->create_column(); + nrows = 1000; + st = it->seek_to_ordinal(0); + EXPECT_TRUE(st.ok()) << st.msg(); + st = it->next_batch(&nrows, read_column); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(stats.bytes_read > 0); + + for (int row = 0; row < 1000; ++row) { + const std::string& value = column_type->to_string(*read_column, row); + EXPECT_TRUE(value.find("key0") == std::string::npos) + << "row: " << row << ", value: " << value; + EXPECT_TRUE(value.find("key3") == std::string::npos) + << "row: " << row << ", value: " << value; + EXPECT_TRUE(value.find("key4") == std::string::npos) + << "row: " << row << ", value: " << value; + } + + // 17. check limit = 10000 + subcolumn.set_name(parent_column.name_lower_case() + ".key10"); + subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key10")); + st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr); + + for (int i = 0; i < limit; ++i) { + std::string key = parent_column.name_lower_case() + ".key10" + std::to_string(i); + variant_stats->sparse_column_non_null_size.erase(key); + } + + // 18. 
check compacton sparse extract column + subcolumn.set_name(parent_column.name_lower_case() + ".key3"); + subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key3")); + st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<SparseColumnExtractReader*>(it) != nullptr); + + // 19. check compaction default column + subcolumn.set_name(parent_column.name_lower_case() + ".key10"); + subcolumn.set_path_info(PathInData(parent_column.name_lower_case() + ".key10")); + st = variant_column_reader->new_iterator(&it, subcolumn, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<DefaultValueColumnIterator*>(it) != nullptr); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); +} + +TEST_F(VariantColumnWriterReaderTest, test_write_data_advanced) { + // 1. create tablet_schema + TabletSchemaPB schema_pb; + schema_pb.set_keys_type(KeysType::DUP_KEYS); + SchemaUtils::construct_column(schema_pb.add_column(), 1, "VARIANT", "V1", 10); + _tablet_schema = std::make_shared<TabletSchema>(); + _tablet_schema->init_from_pb(schema_pb); + + // 2. create tablet + TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema)); + tablet_meta->_tablet_id = 10000; + _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get()); + EXPECT_TRUE(_tablet->init().ok()); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); + + // 3. create file_writer + io::FileWriterPtr file_writer; + auto file_path = local_segment_path(_tablet->tablet_path(), "0", 0); + auto st = io::global_local_filesystem()->create_file(file_path, &file_writer); + EXPECT_TRUE(st.ok()) << st.msg(); + + // 4. 
create column_writer + SegmentFooterPB footer; + ColumnWriterOptions opts; + opts.meta = footer.add_columns(); + opts.compression_type = CompressionTypePB::LZ4; + opts.file_writer = file_writer.get(); + opts.footer = &footer; + RowsetWriterContext rowset_ctx; + rowset_ctx.write_type = DataWriteType::TYPE_DIRECT; + opts.rowset_ctx = &rowset_ctx; + opts.rowset_ctx->tablet_schema = _tablet_schema; + TabletColumn column = _tablet_schema->column(0); + _init_column_meta(opts.meta, 0, column, CompressionTypePB::LZ4); + + std::unique_ptr<ColumnWriter> writer; + EXPECT_TRUE(ColumnWriter::create(opts, &column, file_writer.get(), &writer).ok()); + EXPECT_TRUE(writer->init().ok()); + EXPECT_TRUE(assert_cast<VariantColumnWriter*>(writer.get()) != nullptr); + + // 5. write data + auto olap_data_convertor = std::make_unique<vectorized::OlapBlockDataConvertor>(); + auto block = _tablet_schema->create_block(); + auto column_object = (*std::move(block.get_by_position(0).column)).mutate(); + std::unordered_map<int, std::string> inserted_jsonstr; + auto path_with_size = VariantUtil::fill_object_column_with_nested_test_data(column_object, 1000, + &inserted_jsonstr); + olap_data_convertor->add_column_data_convertor(column); + olap_data_convertor->set_source_content(&block, 0, 1000); + auto [result, accessor] = olap_data_convertor->convert_column_data(0); + EXPECT_TRUE(result.ok()); + EXPECT_TRUE(accessor != nullptr); + EXPECT_TRUE(writer->append(accessor->get_nullmap(), accessor->get_data(), 1000).ok()); + st = writer->finish(); + EXPECT_TRUE(st.ok()) << st.msg(); + st = writer->write_data(); + EXPECT_TRUE(st.ok()) << st.msg(); + st = writer->write_ordinal_index(); + EXPECT_TRUE(st.ok()) << st.msg(); + st = writer->write_zone_map(); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(file_writer->close().ok()); + footer.set_num_rows(1000); + + // 6. 
check footer + EXPECT_EQ(footer.columns_size(), 12); + auto column_meta = footer.columns(0); + EXPECT_EQ(column_meta.type(), (int)FieldType::OLAP_FIELD_TYPE_VARIANT); + + for (int i = 1; i < footer.columns_size() - 1; ++i) { + auto column_meta = footer.columns(i); + check_column_meta(column_meta, path_with_size); + } + check_sparse_column_meta(footer.columns(footer.columns_size() - 1), path_with_size); + + // 7. check variant reader + io::FileReaderSPtr file_reader; + st = io::global_local_filesystem()->open_file(file_path, &file_reader); + EXPECT_TRUE(st.ok()) << st.msg(); + ColumnReaderOptions read_opts; + read_opts.tablet_schema = _tablet_schema; + std::unique_ptr<ColumnReader> column_reader; + st = ColumnReader::create(read_opts, footer, 0, 1000, file_reader, &column_reader); + EXPECT_TRUE(st.ok()) << st.msg(); + + auto variant_column_reader = assert_cast<VariantColumnReader*>(column_reader.get()); + EXPECT_TRUE(variant_column_reader != nullptr); + + // 8. check statistics + auto statistics = variant_column_reader->get_stats(); + for (const auto& [path, size] : statistics->subcolumns_non_null_size) { + EXPECT_EQ(path_with_size[path], size); + } + for (const auto& [path, size] : statistics->sparse_column_non_null_size) { + EXPECT_EQ(path_with_size[path], size); + } + + // 9. 
check root + ColumnIterator* it; + TabletColumn parent_column = _tablet_schema->column(0); + StorageReadOptions storage_read_opts; + storage_read_opts.io_ctx.reader_type = ReaderType::READER_QUERY; + st = variant_column_reader->new_iterator(&it, parent_column, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); + ColumnIteratorOptions column_iter_opts; + OlapReaderStatistics stats; + column_iter_opts.stats = &stats; + column_iter_opts.file_reader = file_reader.get(); + st = it->init(column_iter_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + + MutableColumnPtr new_column_object = ColumnObject::create(3); + size_t nrows = 1000; + st = it->seek_to_ordinal(0); + EXPECT_TRUE(st.ok()) << st.msg(); + st = it->next_batch(&nrows, new_column_object); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(stats.bytes_read > 0); + + for (int i = 0; i < 1000; ++i) { + std::string value; + st = assert_cast<ColumnObject*>(new_column_object.get()) + ->serialize_one_row_to_string(i, &value); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_EQ(value, inserted_jsonstr[i]); + } + + auto read_to_column_object = [&]() { + new_column_object = ColumnObject::create(10); + nrows = 1000; + st = it->seek_to_ordinal(0); + EXPECT_TRUE(st.ok()) << st.msg(); + st = it->next_batch(&nrows, new_column_object); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(stats.bytes_read > 0); + EXPECT_EQ(nrows, 1000); + }; + + auto check_key_stats = [&](const std::string& key_num) { + std::string key = ".key" + key_num; + TabletColumn subcolumn_in_nested; + subcolumn_in_nested.set_name(parent_column.name_lower_case() + key); + subcolumn_in_nested.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + subcolumn_in_nested.set_parent_unique_id(parent_column.unique_id()); + subcolumn_in_nested.set_path_info(PathInData(parent_column.name_lower_case() + key)); + subcolumn_in_nested.set_variant_max_subcolumns_count( + 
parent_column.variant_max_subcolumns_count()); + subcolumn_in_nested.set_is_nullable(true); + + st = variant_column_reader->new_iterator(&it, subcolumn_in_nested, &storage_read_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + EXPECT_TRUE(assert_cast<HierarchicalDataReader*>(it) != nullptr); + st = it->init(column_iter_opts); + EXPECT_TRUE(st.ok()) << st.msg(); + read_to_column_object(); + + size_t key_count = 0; + size_t key_nested_count = 0; + for (int row = 0; row < 1000; ++row) { + std::string value; + st = assert_cast<ColumnObject*>(new_column_object.get()) + ->serialize_one_row_to_string(row, &value); + EXPECT_TRUE(st.ok()) << st.msg(); + if (value.find("nested" + key_num) != std::string::npos) { + key_nested_count++; + } else if (value.find("88") != std::string::npos) { + key_count++; + } + } + EXPECT_EQ(key_count, path_with_size["key" + key_num]); + EXPECT_EQ(key_nested_count, path_with_size["key" + key_num + ".nested" + key_num]); + }; + + for (int i = 3; i < 10; ++i) { + check_key_stats(std::to_string(i)); + } + + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); +} + +} // namespace doris \ No newline at end of file diff --git a/be/test/vec/columns/column_object_test.cpp b/be/test/vec/columns/column_object_test.cpp index b08e57dd19e..7fd827c8919 100644 --- a/be/test/vec/columns/column_object_test.cpp +++ b/be/test/vec/columns/column_object_test.cpp @@ -5,9 +5,9 @@ // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at -// + // http://www.apache.org/licenses/LICENSE-2.0 -// + // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -15,790 +15,800 @@ // specific language governing permissions and limitations // under the License. 
-// #include "vec/columns/column_object.h" -// -// #include <gmock/gmock-more-matchers.h> -// #include <gtest/gtest.h> -// #include <rapidjson/prettywriter.h> -// #include <stdio.h> -// -// #include "runtime/jsonb_value.h" -// #include "testutil/variant_util.h" -// #include "vec/common/string_ref.h" -// #include "vec/core/field.h" -// #include "vec/core/types.h" -// #include "vec/data_types/data_type_array.h" -// #include "vec/data_types/data_type_factory.hpp" -// -// using namespace doris::vectorized; -// -// using namespace doris; -// // #define ADD_SUB_COLUMN(key) \ -// // varaint->add_sub_column(PathInData(std::string_view(key)), 0); -// -// void convert_field_to_rapidjson(const vectorized::Field& field, rapidjson::Value& target, -// rapidjson::Document::AllocatorType& allocator) { -// switch (field.get_type()) { -// case vectorized::Field::Types::Null: -// target.SetNull(); -// break; -// case vectorized::Field::Types::Int64: -// target.SetInt64(field.get<Int64>()); -// break; -// case vectorized::Field::Types::Float64: -// target.SetDouble(field.get<Float64>()); -// break; -// case vectorized::Field::Types::JSONB: { -// const auto& val = field.get<JsonbField>(); -// JsonbValue* json_val = JsonbDocument::createValue(val.get_value(), val.get_size()); -// convert_jsonb_to_rapidjson(*json_val, target, allocator); -// break; -// } -// case vectorized::Field::Types::String: { -// const String& val = field.get<String>(); -// target.SetString(val.data(), cast_set<rapidjson::SizeType>(val.size())); -// break; -// } -// case vectorized::Field::Types::Array: { -// const vectorized::Array& array = field.get<Array>(); -// target.SetArray(); -// for (const vectorized::Field& item : array) { -// rapidjson::Value val; -// convert_field_to_rapidjson(item, val, allocator); -// target.PushBack(val, allocator); -// } -// break; -// } -// case vectorized::Field::Types::VariantMap: { -// const vectorized::VariantMap& map = field.get<VariantMap>(); -// target.SetObject(); -// for 
(const auto& item : map) { -// if (item.second.is_null()) { -// continue; -// } -// rapidjson::Value key; -// key.SetString(item.first.get_path().data(), -// cast_set<rapidjson::SizeType>(item.first.get_path().size())); -// rapidjson::Value val; -// convert_field_to_rapidjson(item.second, val, allocator); -// if (val.IsNull() && item.first.empty()) { -// // skip null value with empty key, indicate the null json value of root in variant map, -// // usally padding in nested arrays -// continue; -// } -// target.AddMember(key, val, allocator); -// } -// break; -// } -// default: -// throw doris::Exception(ErrorCode::INTERNAL_ERROR, "unkown field type: {}", -// field.get_type_name()); -// break; -// } -// } -// -// void convert_variant_map_to_rapidjson(const vectorized::VariantMap& map, rapidjson::Value& target, -// rapidjson::Document::AllocatorType& allocator) { -// target.SetObject(); -// for (const auto& item : map) { -// if (item.second.is_null()) { -// continue; -// } -// rapidjson::Value key; -// key.SetString(item.first.get_path().data(), -// cast_set<rapidjson::SizeType>(item.first.get_path().size())); -// rapidjson::Value val; -// convert_field_to_rapidjson(item.second, val, allocator); -// if (val.IsNull() && item.first.empty()) { -// // skip null value with empty key, indicate the null json value of root in variant map, -// // usally padding in nested arrays -// continue; -// } -// target.AddMember(key, val, allocator); -// } -// } -// -// void convert_array_to_rapidjson(const vectorized::Array& array, rapidjson::Value& target, -// rapidjson::Document::AllocatorType& allocator) { -// target.SetArray(); -// for (const vectorized::Field& item : array) { -// rapidjson::Value val; -// convert_field_to_rapidjson(item, val, allocator); -// target.PushBack(val, allocator); -// } -// } -// -// TEST(ColumnVariantTest, insert_try_insert) { -// auto v = VariantUtil::construct_dst_varint_column(); -// FieldInfo info; -// info.scalar_type_id = TypeIndex::Nothing; -// 
info.num_dimensions = 0; -// PathInData path("v.f"); -// auto sub = v->get_subcolumn(path); -// Int64 value = 43; -// sub->insert(value, info); -// -// info.num_dimensions = 1; -// sub->insert(value, info); -// -// info.num_dimensions = 2; -// sub->insert(value, info); -// } -// -// TEST(ColumnVariantTest, basic_finalize) { -// auto variant = VariantUtil::construct_basic_varint_column(); -// // 4. finalize -// EXPECT_TRUE(variant->finalize(ColumnObject::FinalizeMode::WRITE_MODE).ok()); -// EXPECT_EQ(variant->size(), 10); -// -// // check finalized subcolumn -// // 5 subcolumn + 1 root -// EXPECT_EQ(variant->subcolumns.size(), 6); -// for (const auto& column : variant->subcolumns) { -// if (column->data.is_root) { -// continue; -// } -// EXPECT_EQ(column->data.data.size(), 1); -// } -// -// // check sparse column -// const auto& offsets = variant->serialized_sparse_column_offsets(); -// for (int row = 0; row < 5; ++row) { -// EXPECT_EQ(offsets[row], 0); -// } -// for (int row = 5; row < 10; ++row) { -// EXPECT_EQ(offsets[row] - offsets[row - 1], 3); -// } -// } -// -// TEST(ColumnVariantTest, basic_deserialize) { -// auto variant = VariantUtil::construct_basic_varint_column(); -// -// // 4. 
finalize -// EXPECT_TRUE(variant->finalize(ColumnObject::FinalizeMode::WRITE_MODE).ok()); -// EXPECT_EQ(variant->size(), 10); -// -// const auto& [path, value] = variant->get_sparse_data_paths_and_values(); -// const auto& offsets = variant->serialized_sparse_column_offsets(); -// for (size_t row = 5; row < 10; ++row) { -// size_t start = offsets[row - 1]; -// size_t end = offsets[row]; -// -// auto data = path->get_data_at(start); -// EXPECT_EQ(data, StringRef("v.b.d", 5)); -// auto pair = variant->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(pair.first.get<Int64>(), 30); -// -// auto data2 = path->get_data_at(start); -// auto pair2 = variant->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(data2, StringRef("v.c.d", 5)); -// EXPECT_EQ(pair2.first.get<Int64>(), 30); -// -// auto data3 = path->get_data_at(start); -// auto pair3 = variant->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(data3, StringRef("v.d.d", 5)); -// EXPECT_EQ(pair3.first.get<String>(), "50"); -// EXPECT_EQ(start, end); -// } -// } -// -// TEST(ColumnVariantTest, basic_inset_range_from) { -// auto src = VariantUtil::construct_basic_varint_column(); -// EXPECT_TRUE(src->finalize(ColumnObject::FinalizeMode::WRITE_MODE).ok()); -// EXPECT_EQ(src->size(), 10); -// -// // dst is an empty column, has 5 subcolumn + 1 root -// auto dst = VariantUtil::construct_dst_varint_column(); -// -// // subcolumn->subcolumn v.b v.f v.e -// // subcolumn->sparse_column v.a v.c -// // sparse_column->subcolumn v.b.d v.c.d -// // sparse_column->sparse_column v.d.d -// dst->insert_range_from(*src, 0, 10); -// dst->finalize(); -// EXPECT_EQ(dst->size(), 10); -// -// // 5 subcolumn -// EXPECT_EQ(dst->subcolumns.size(), 6); -// ColumnObject::Subcolumns dst_subcolumns = dst->subcolumns; -// std::sort( -// dst_subcolumns.begin(), dst_subcolumns.end(), -// [](const auto& lhsItem, const auto& rhsItem) { return lhsItem->path < rhsItem->path; }); -// -// for (const auto& column : 
dst_subcolumns) { -// if (column->data.is_root) { -// continue; -// } -// EXPECT_EQ(column->data.data.size(), 1); -// EXPECT_EQ(column->data.data[0]->size(), 10); -// if (column->path.get_path().size() == 3) { -// EXPECT_EQ(column->data.get_non_null_value_size(), 10); -// } else { -// EXPECT_EQ(column->path.get_path().size(), 5); -// EXPECT_EQ(column->data.get_non_null_value_size(), 5); -// for (size_t row = 0; row != 5; ++row) { -// EXPECT_TRUE(column->data.data[0]->is_null_at(row)); -// } -// for (size_t row = 5; row != 10; ++row) { -// EXPECT_EQ((*column->data.data[0])[row].get<Int64>(), 30); -// } -// } -// } -// -// // check sparse column -// const auto& [path, value] = dst->get_sparse_data_paths_and_values(); -// const auto& offsets = dst->serialized_sparse_column_offsets(); -// -// // v.a v.c -// for (int row = 0; row < 5; ++row) { -// size_t start = offsets[row - 1]; -// size_t end = offsets[row]; -// -// auto data = path->get_data_at(start); -// EXPECT_EQ(data, StringRef("v.a", 3)); -// auto pair = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(pair.first.get<Int64>(), 20); -// -// auto data2 = path->get_data_at(start); -// EXPECT_EQ(data2, StringRef("v.c", 3)); -// auto pair2 = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(pair2.first.get<Int64>(), 20); -// -// EXPECT_EQ(start, end); -// } -// -// // v.a v.c v.d.d -// for (int row = 5; row < 10; ++row) { -// size_t start = offsets[row - 1]; -// size_t end = offsets[row]; -// -// auto data = path->get_data_at(start); -// EXPECT_EQ(data, StringRef("v.a", 3)); -// auto pair = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(pair.first.get<Int64>(), 20); -// -// auto data2 = path->get_data_at(start); -// EXPECT_EQ(data2, StringRef("v.c", 3)); -// auto pair2 = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(pair2.first.get<Int64>(), 20); -// -// auto data3 = path->get_data_at(start); -// EXPECT_EQ(data3, StringRef("v.d.d", 5)); -// 
auto pair3 = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(pair3.first.get<String>(), "50"); -// -// EXPECT_EQ(start, end); -// } -// } -// -// auto convert_to_jsonb_field(auto serde, auto& column) { -// vectorized::DataTypeSerDe::FormatOptions options; -// options.escape_char = '\\'; -// auto tmp_col = ColumnString::create(); -// VectorBufferWriter write_buffer(*tmp_col.get()); -// EXPECT_TRUE(serde->serialize_column_to_json(column, 0, 1, write_buffer, options).ok()); -// -// write_buffer.commit(); -// auto str_ref = tmp_col->get_data_at(0); -// Slice data((char*)(str_ref.data), str_ref.size); -// -// auto jsonb_type = doris::vectorized::DataTypeFactory::instance().create_data_type( -// TypeIndex::JSONB, false); -// auto jsonb_serde = jsonb_type->get_serde(); -// auto jsonb_column = jsonb_type->create_column(); -// -// DataTypeSerDe::FormatOptions format_options; -// format_options.converted_from_string = true; -// EXPECT_TRUE( -// jsonb_serde->deserialize_one_cell_from_json(*jsonb_column, data, format_options).ok()); -// auto res = jsonb_column->get_data_at(0); -// return JsonbField(res.data, res.size); -// } -// -// auto convert_string_to_jsonb_field(auto& column) { -// auto str_ref = column.get_data_at(0); -// Slice data((char*)(str_ref.data), str_ref.size); -// -// auto jsonb_type = doris::vectorized::DataTypeFactory::instance().create_data_type( -// TypeIndex::JSONB, false); -// auto jsonb_serde = jsonb_type->get_serde(); -// auto jsonb_column = jsonb_type->create_column(); -// DataTypeSerDe::FormatOptions format_options; -// format_options.converted_from_string = true; -// format_options.escape_char = '\\'; -// -// EXPECT_TRUE( -// jsonb_serde->deserialize_one_cell_from_json(*jsonb_column, data, format_options).ok()); -// auto res = jsonb_column->get_data_at(0); -// return JsonbField(res.data, res.size); -// } -// -// doris::vectorized::Field get_jsonb_field(std::string_view type) { -// static std::unordered_map<std::string_view, 
doris::vectorized::Field> field_map; -// if (field_map.empty()) { -// DataTypePtr data_type_int = doris::vectorized::DataTypeFactory::instance().create_data_type( -// TypeIndex::Int8, false); -// DataTypePtr data_type_array_int = -// std::make_shared<doris::vectorized::DataTypeArray>(data_type_int); -// auto array_column_int = data_type_array_int->create_column(); -// array_column_int->insert(VariantUtil::get_field("array_int")); -// auto array_serde_int = data_type_array_int->get_serde(); -// field_map["array_int"] = convert_to_jsonb_field(array_serde_int, *array_column_int); -// -// DataTypePtr data_type_str = doris::vectorized::DataTypeFactory::instance().create_data_type( -// TypeIndex::String, false); -// DataTypePtr data_type_array_str = -// std::make_shared<doris::vectorized::DataTypeArray>(data_type_str); -// auto array_column_str = data_type_array_str->create_column(); -// array_column_str->insert(VariantUtil::get_field("array_str")); -// auto array_serde_str = data_type_array_str->get_serde(); -// field_map["array_str"] = convert_to_jsonb_field(array_serde_str, *array_column_str); -// -// auto column_int = data_type_int->create_column(); -// column_int->insert(VariantUtil::get_field("int")); -// auto serde_int = data_type_int->get_serde(); -// field_map["int"] = convert_to_jsonb_field(serde_int, *column_int); -// -// // auto column_str = data_type_str->create_column(); -// // column_str->insert(VariantUtil::get_field("string")); -// // field_map["string"] = convert_string_to_jsonb_field(*column_str); -// } -// return field_map[type]; -// } -// -// // std::string convert_jsonb_field_to_string(doris::vectorized::Field jsonb) { -// // const auto& val = jsonb.get<JsonbField>(); -// // const JsonbValue* json_val = JsonbDocument::createValue(val.get_value(), val.get_size()); -// -// // rapidjson::Document doc; -// // doc.SetObject(); -// // rapidjson::Document::AllocatorType& allocator = doc.GetAllocator(); -// // rapidjson::Value json_value; -// // 
convert_jsonb_to_rapidjson(*json_val, json_value, allocator); -// // doc.AddMember("value", json_value, allocator); -// // rapidjson::StringBuffer buffer; -// // rapidjson::PrettyWriter<rapidjson::StringBuffer> writer(buffer); -// // doc.Accept(writer); -// // return std::string(buffer.GetString()); -// // } -// -// std::string convert_field_to_string(doris::vectorized::Field array) { +#include "vec/columns/column_object.h" + +#include <gmock/gmock-more-matchers.h> +#include <gtest/gtest.h> +#include <rapidjson/prettywriter.h> +#include <stdio.h> + +#include "runtime/jsonb_value.h" +#include "testutil/variant_util.h" +#include "vec/common/string_ref.h" +#include "vec/core/field.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type_array.h" +#include "vec/data_types/data_type_factory.hpp" + +using namespace doris::vectorized; + +using namespace doris; +// #define ADD_SUB_COLUMN(key) \ +// varaint->add_sub_column(PathInData(std::string_view(key)), 0); + +void convert_field_to_rapidjson(const vectorized::Field& field, rapidjson::Value& target, + rapidjson::Document::AllocatorType& allocator) { + switch (field.get_type()) { + case vectorized::Field::Types::Null: + target.SetNull(); + break; + case vectorized::Field::Types::Int64: + target.SetInt64(field.get<Int64>()); + break; + case vectorized::Field::Types::Float64: + target.SetDouble(field.get<Float64>()); + break; + case vectorized::Field::Types::JSONB: { + const auto& val = field.get<JsonbField>(); + JsonbValue* json_val = JsonbDocument::createValue(val.get_value(), val.get_size()); + convert_jsonb_to_rapidjson(*json_val, target, allocator); + break; + } + case vectorized::Field::Types::String: { + const String& val = field.get<String>(); + target.SetString(val.data(), cast_set<rapidjson::SizeType>(val.size())); + break; + } + case vectorized::Field::Types::Array: { + const vectorized::Array& array = field.get<Array>(); + target.SetArray(); + for (const vectorized::Field& item : array) { + 
rapidjson::Value val; + convert_field_to_rapidjson(item, val, allocator); + target.PushBack(val, allocator); + } + break; + } + case vectorized::Field::Types::VariantMap: { + const vectorized::VariantMap& map = field.get<VariantMap>(); + target.SetObject(); + for (const auto& item : map) { + if (item.second.is_null()) { + continue; + } + rapidjson::Value key; + key.SetString(item.first.get_path().data(), + cast_set<rapidjson::SizeType>(item.first.get_path().size())); + rapidjson::Value val; + convert_field_to_rapidjson(item.second, val, allocator); + if (val.IsNull() && item.first.empty()) { + // skip null value with empty key, indicate the null json value of root in variant map, + // usally padding in nested arrays + continue; + } + target.AddMember(key, val, allocator); + } + break; + } + default: + throw doris::Exception(ErrorCode::INTERNAL_ERROR, "unkown field type: {}", + field.get_type_name()); + break; + } +} + +void convert_variant_map_to_rapidjson(const vectorized::VariantMap& map, rapidjson::Value& target, + rapidjson::Document::AllocatorType& allocator) { + target.SetObject(); + for (const auto& item : map) { + if (item.second.is_null()) { + continue; + } + rapidjson::Value key; + key.SetString(item.first.get_path().data(), + cast_set<rapidjson::SizeType>(item.first.get_path().size())); + rapidjson::Value val; + convert_field_to_rapidjson(item.second, val, allocator); + if (val.IsNull() && item.first.empty()) { + // skip null value with empty key, indicate the null json value of root in variant map, + // usally padding in nested arrays + continue; + } + target.AddMember(key, val, allocator); + } +} + +void convert_array_to_rapidjson(const vectorized::Array& array, rapidjson::Value& target, + rapidjson::Document::AllocatorType& allocator) { + target.SetArray(); + for (const vectorized::Field& item : array) { + rapidjson::Value val; + convert_field_to_rapidjson(item, val, allocator); + target.PushBack(val, allocator); + } +} + +TEST(ColumnVariantTest, 
insert_try_insert) { + auto v = VariantUtil::construct_dst_varint_column(); + FieldInfo info; + info.scalar_type_id = TypeIndex::Nothing; + info.num_dimensions = 0; + PathInData path("v.f"); + auto sub = v->get_subcolumn(path); + Int64 value = 43; + sub->insert(value, info); + + info.num_dimensions = 1; + sub->insert(value, info); + + info.num_dimensions = 2; + sub->insert(value, info); +} + +TEST(ColumnVariantTest, basic_finalize) { + auto variant = VariantUtil::construct_basic_varint_column(); + // 4. finalize + EXPECT_TRUE(variant->finalize(ColumnObject::FinalizeMode::WRITE_MODE).ok()); + EXPECT_TRUE(variant->pick_subcolumns_to_sparse_column({}).ok()); + EXPECT_EQ(variant->size(), 10); + + // check finalized subcolumn + // 5 subcolumn + 1 root + EXPECT_EQ(variant->subcolumns.size(), 6); + for (const auto& column : variant->subcolumns) { + if (column->data.is_root) { + continue; + } + EXPECT_EQ(column->data.data.size(), 1); + } + + // check sparse column + const auto& offsets = variant->serialized_sparse_column_offsets(); + for (int row = 0; row < 5; ++row) { + EXPECT_EQ(offsets[row], 0); + } + for (int row = 5; row < 10; ++row) { + EXPECT_EQ(offsets[row] - offsets[row - 1], 3); + } +} + +TEST(ColumnVariantTest, basic_deserialize) { + auto variant = VariantUtil::construct_basic_varint_column(); + + // 4. 
finalize + EXPECT_TRUE(variant->finalize(ColumnObject::FinalizeMode::WRITE_MODE).ok()); + EXPECT_TRUE(variant->pick_subcolumns_to_sparse_column({}).ok()); + EXPECT_EQ(variant->size(), 10); + + const auto& [path, value] = variant->get_sparse_data_paths_and_values(); + const auto& offsets = variant->serialized_sparse_column_offsets(); + for (size_t row = 5; row < 10; ++row) { + size_t start = offsets[row - 1]; + size_t end = offsets[row]; + + auto data = path->get_data_at(start); + EXPECT_EQ(data, StringRef("v.b.d", 5)); + auto pair = variant->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(pair.first.get<Int64>(), 30); + + auto data2 = path->get_data_at(start); + auto pair2 = variant->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(data2, StringRef("v.c.d", 5)); + EXPECT_EQ(pair2.first.get<Int64>(), 30); + + auto data3 = path->get_data_at(start); + auto pair3 = variant->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(data3, StringRef("v.d.d", 5)); + EXPECT_EQ(pair3.first.get<String>(), "50"); + EXPECT_EQ(start, end); + } +} + +TEST(ColumnVariantTest, basic_inset_range_from) { + auto src = VariantUtil::construct_basic_varint_column(); + EXPECT_TRUE(src->finalize(ColumnObject::FinalizeMode::WRITE_MODE).ok()); + EXPECT_TRUE(src->pick_subcolumns_to_sparse_column({}).ok()); + EXPECT_EQ(src->size(), 10); + + // dst is an empty column, has 5 subcolumn + 1 root + auto dst = VariantUtil::construct_dst_varint_column(); + + // subcolumn->subcolumn v.b v.f v.e + // subcolumn->sparse_column v.a v.c + // sparse_column->subcolumn v.b.d v.c.d + // sparse_column->sparse_column v.d.d + dst->insert_range_from(*src, 0, 10); + dst->finalize(); + EXPECT_EQ(dst->size(), 10); + + // 5 subcolumn + EXPECT_EQ(dst->subcolumns.size(), 6); + ColumnObject::Subcolumns dst_subcolumns = dst->subcolumns; + std::sort( + dst_subcolumns.begin(), dst_subcolumns.end(), + [](const auto& lhsItem, const auto& rhsItem) { return lhsItem->path < rhsItem->path; }); + + for 
(const auto& column : dst_subcolumns) { + if (column->data.is_root) { + continue; + } + EXPECT_EQ(column->data.data.size(), 1); + EXPECT_EQ(column->data.data[0]->size(), 10); + if (column->path.get_path().size() == 3) { + EXPECT_EQ(column->data.get_non_null_value_size(), 10); + } else { + EXPECT_EQ(column->path.get_path().size(), 5); + EXPECT_EQ(column->data.get_non_null_value_size(), 5); + for (size_t row = 0; row != 5; ++row) { + EXPECT_TRUE(column->data.data[0]->is_null_at(row)); + } + for (size_t row = 5; row != 10; ++row) { + EXPECT_EQ((*column->data.data[0])[row].get<Int64>(), 30); + } + } + } + + // check sparse column + const auto& [path, value] = dst->get_sparse_data_paths_and_values(); + const auto& offsets = dst->serialized_sparse_column_offsets(); + + // v.a v.c + for (int row = 0; row < 5; ++row) { + size_t start = offsets[row - 1]; + size_t end = offsets[row]; + + auto data = path->get_data_at(start); + EXPECT_EQ(data, StringRef("v.a", 3)); + auto pair = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(pair.first.get<Int64>(), 20); + + auto data2 = path->get_data_at(start); + EXPECT_EQ(data2, StringRef("v.c", 3)); + auto pair2 = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(pair2.first.get<Int64>(), 20); + + EXPECT_EQ(start, end); + } + + // v.a v.c v.d.d + for (int row = 5; row < 10; ++row) { + size_t start = offsets[row - 1]; + size_t end = offsets[row]; + + auto data = path->get_data_at(start); + EXPECT_EQ(data, StringRef("v.a", 3)); + auto pair = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(pair.first.get<Int64>(), 20); + + auto data2 = path->get_data_at(start); + EXPECT_EQ(data2, StringRef("v.c", 3)); + auto pair2 = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(pair2.first.get<Int64>(), 20); + + auto data3 = path->get_data_at(start); + EXPECT_EQ(data3, StringRef("v.d.d", 5)); + auto pair3 = dst->deserialize_from_sparse_column(value, start++); + 
EXPECT_EQ(pair3.first.get<String>(), "50"); + + EXPECT_EQ(start, end); + } +} + +auto convert_to_jsonb_field(auto serde, auto& column) { + vectorized::DataTypeSerDe::FormatOptions options; + options.escape_char = '\\'; + auto tmp_col = ColumnString::create(); + VectorBufferWriter write_buffer(*tmp_col.get()); + EXPECT_TRUE(serde->serialize_column_to_json(column, 0, 1, write_buffer, options).ok()); + + write_buffer.commit(); + auto str_ref = tmp_col->get_data_at(0); + Slice data((char*)(str_ref.data), str_ref.size); + + auto jsonb_type = doris::vectorized::DataTypeFactory::instance().create_data_type( + TypeIndex::JSONB, false); + auto jsonb_serde = jsonb_type->get_serde(); + auto jsonb_column = jsonb_type->create_column(); + + DataTypeSerDe::FormatOptions format_options; + format_options.converted_from_string = true; + EXPECT_TRUE( + jsonb_serde->deserialize_one_cell_from_json(*jsonb_column, data, format_options).ok()); + auto res = jsonb_column->get_data_at(0); + return JsonbField(res.data, res.size); +} + +auto convert_string_to_jsonb_field(auto& column) { + auto str_ref = column.get_data_at(0); + Slice data((char*)(str_ref.data), str_ref.size); + + auto jsonb_type = doris::vectorized::DataTypeFactory::instance().create_data_type( + TypeIndex::JSONB, false); + auto jsonb_serde = jsonb_type->get_serde(); + auto jsonb_column = jsonb_type->create_column(); + DataTypeSerDe::FormatOptions format_options; + format_options.converted_from_string = true; + format_options.escape_char = '\\'; + + EXPECT_TRUE( + jsonb_serde->deserialize_one_cell_from_json(*jsonb_column, data, format_options).ok()); + auto res = jsonb_column->get_data_at(0); + return JsonbField(res.data, res.size); +} + +doris::vectorized::Field get_jsonb_field(std::string_view type) { + static std::unordered_map<std::string_view, doris::vectorized::Field> field_map; + if (field_map.empty()) { + DataTypePtr data_type_int = doris::vectorized::DataTypeFactory::instance().create_data_type( + TypeIndex::Int8, 
false); + DataTypePtr data_type_array_int = + std::make_shared<doris::vectorized::DataTypeArray>(data_type_int); + auto array_column_int = data_type_array_int->create_column(); + array_column_int->insert(VariantUtil::get_field("array_int")); + auto array_serde_int = data_type_array_int->get_serde(); + field_map["array_int"] = convert_to_jsonb_field(array_serde_int, *array_column_int); + + DataTypePtr data_type_str = doris::vectorized::DataTypeFactory::instance().create_data_type( + TypeIndex::String, false); + DataTypePtr data_type_array_str = + std::make_shared<doris::vectorized::DataTypeArray>(data_type_str); + auto array_column_str = data_type_array_str->create_column(); + array_column_str->insert(VariantUtil::get_field("array_str")); + auto array_serde_str = data_type_array_str->get_serde(); + field_map["array_str"] = convert_to_jsonb_field(array_serde_str, *array_column_str); + + auto column_int = data_type_int->create_column(); + column_int->insert(VariantUtil::get_field("int")); + auto serde_int = data_type_int->get_serde(); + field_map["int"] = convert_to_jsonb_field(serde_int, *column_int); + + // auto column_str = data_type_str->create_column(); + // column_str->insert(VariantUtil::get_field("string")); + // field_map["string"] = convert_string_to_jsonb_field(*column_str); + } + return field_map[type]; +} + +// std::string convert_jsonb_field_to_string(doris::vectorized::Field jsonb) { +// const auto& val = jsonb.get<JsonbField>(); +// const JsonbValue* json_val = JsonbDocument::createValue(val.get_value(), val.get_size()); + // rapidjson::Document doc; // doc.SetObject(); // rapidjson::Document::AllocatorType& allocator = doc.GetAllocator(); // rapidjson::Value json_value; -// // DataTypeSerDe::convert_field_to_rapidjson(array, json_value, allocator); +// convert_jsonb_to_rapidjson(*json_val, json_value, allocator); // doc.AddMember("value", json_value, allocator); // rapidjson::StringBuffer buffer; // rapidjson::PrettyWriter<rapidjson::StringBuffer> 
writer(buffer); // doc.Accept(writer); // return std::string(buffer.GetString()); // } -// -// TEST(ColumnVariantTest, is_null_at) { -// auto v = VariantUtil::construct_dst_varint_column(); -// PathInData path("v.f"); -// auto sub = v->get_subcolumn(path); -// std::cout << sub->get_least_common_typeBase()->get_name() << std::endl; -// EXPECT_TRUE(sub->is_null_at(0)); -// -// auto v1 = VariantUtil::construct_advanced_varint_column(); -// PathInData path1("v.b.d"); -// auto sub1 = v1->get_subcolumn(path1); -// EXPECT_TRUE(sub1->is_null_at(2)); -// EXPECT_ANY_THROW(sub1->is_null_at(16)); -// vectorized::Field f; -// EXPECT_ANY_THROW(sub1->get(16, f)); -// std::cout << sub1->num_rows << std::endl; -// EXPECT_NO_THROW(sub1->resize(sub1->num_rows)); -// -// auto [sparse_column_keys, sparse_column_values] = v1->get_sparse_data_paths_and_values(); -// std::string_view pa("v.a"); -// EXPECT_NO_THROW( -// sub1->serialize_to_sparse_column(sparse_column_keys, pa, sparse_column_values, 2)); -// EXPECT_ANY_THROW( -// sub1->serialize_to_sparse_column(sparse_column_keys, pa, sparse_column_values, 16)); -// } -// -// TEST(ColumnVariantTest, advanced_finalize) { -// auto variant = VariantUtil::construct_advanced_varint_column(); -// -// // 4. 
finalize -// EXPECT_TRUE(variant->finalize(ColumnObject::FinalizeMode::WRITE_MODE).ok()); -// EXPECT_EQ(variant->size(), 15); -// -// // check finalized subcolumn -// // 5 subcolumn + 1 root -// EXPECT_EQ(variant->subcolumns.size(), 6); -// for (const auto& column : variant->subcolumns) { -// if (column->data.is_root) { -// continue; -// } -// EXPECT_EQ(column->data.data.size(), 1); -// } -// -// // check sparse column -// const auto& offsets = variant->serialized_sparse_column_offsets(); -// for (int row = 0; row < 5; ++row) { -// EXPECT_EQ(offsets[row] - offsets[row - 1], 0); -// } -// for (int row = 5; row < 15; ++row) { -// EXPECT_EQ(offsets[row] - offsets[row - 1], 3); -// } -// -// { -// // Test fill_path_column_from_sparse_data -// auto map = std::make_unique<NullMap>(15, 0); -// vectorized::ColumnObject::fill_path_column_from_sparse_data( -// *variant->get_subcolumn({}) /*root*/, map.get(), StringRef {"array"}, -// variant->get_sparse_column(), 0, 5); -// vectorized::ColumnObject::fill_path_column_from_sparse_data( -// *variant->get_subcolumn({}) /*root*/, map.get(), StringRef {"array"}, -// variant->get_sparse_column(), 5, 15); -// } -// } -// -// TEST(ColumnVariantTest, advanced_deserialize) { -// auto variant = VariantUtil::construct_advanced_varint_column(); -// -// // 4. 
finalize -// EXPECT_TRUE(variant->finalize(ColumnObject::FinalizeMode::WRITE_MODE).ok()); -// EXPECT_EQ(variant->size(), 15); -// -// const auto& [path, value] = variant->get_sparse_data_paths_and_values(); -// const auto& offsets = variant->serialized_sparse_column_offsets(); -// for (size_t row = 5; row < 10; ++row) { -// size_t start = offsets[row - 1]; -// size_t end = offsets[row]; -// -// auto data = path->get_data_at(start); -// auto pair = variant->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(data, StringRef("v.b.d", 5)); -// EXPECT_EQ(convert_field_to_string(pair.first), -// convert_field_to_string(get_jsonb_field("array_int"))); -// -// auto data2 = path->get_data_at(start); -// auto pair2 = variant->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(data2, StringRef("v.c.d", 5)); -// EXPECT_EQ(convert_field_to_string(pair2.first), -// convert_field_to_string(VariantUtil::get_field("string"))); -// -// auto data3 = path->get_data_at(start); -// auto pair3 = variant->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(data3, StringRef("v.d.d", 5)); -// EXPECT_EQ(convert_field_to_string(pair3.first), -// convert_field_to_string(get_jsonb_field("array_int"))); -// EXPECT_EQ(start, end); -// } -// -// for (size_t row = 10; row < 15; ++row) { -// size_t start = offsets[row - 1]; -// size_t end = offsets[row]; -// -// auto data = path->get_data_at(start); -// auto pair = variant->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(data, StringRef("v.b.d", 5)); -// EXPECT_EQ(convert_field_to_string(pair.first), -// convert_field_to_string(get_jsonb_field("array_str"))); -// -// auto data2 = path->get_data_at(start); -// auto pair2 = variant->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(data2, StringRef("v.c.d", 5)); -// EXPECT_EQ(convert_field_to_string(pair2.first), -// convert_field_to_string(get_jsonb_field("int"))); -// -// auto data3 = path->get_data_at(start); -// auto pair3 = 
variant->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(data3, StringRef("v.d.d", 5)); -// EXPECT_EQ(convert_field_to_string(pair3.first), -// convert_field_to_string(get_jsonb_field("array_str"))); -// EXPECT_EQ(start, end); -// } -// } -// -// TEST(ColumnVariantTest, advanced_insert_range_from) { -// auto src = VariantUtil::construct_advanced_varint_column(); -// EXPECT_TRUE(src->finalize(ColumnObject::FinalizeMode::WRITE_MODE).ok()); -// EXPECT_EQ(src->size(), 15); -// -// auto dst = VariantUtil::construct_dst_varint_column(); -// -// // subcolumn->subcolumn v.b v.f v.e -// // subcolumn->sparse_column v.a v.c -// // sparse_column->subcolumn v.b.d v.c.d -// // sparse_column->sparse_column v.d.d -// dst->insert_range_from(*src, 0, src->size()); -// dst->finalize(); -// EXPECT_EQ(dst->size(), 15); -// -// EXPECT_EQ(dst->subcolumns.size(), 6); -// ColumnObject::Subcolumns dst_subcolumns = dst->subcolumns; -// -// std::sort( -// dst_subcolumns.begin(), dst_subcolumns.end(), -// [](const auto& lhsItem, const auto& rhsItem) { return lhsItem->path < rhsItem->path; }); -// -// // subcolumns -// for (const auto& column : dst_subcolumns) { -// if (column->data.is_root) { -// continue; -// } -// EXPECT_EQ(column->data.data.size(), 1); -// EXPECT_EQ(column->data.data[0]->size(), 15); -// -// if (column->path.get_path().size() == 3) { -// EXPECT_EQ(column->data.get_non_null_value_size(), 15); -// if (column->path.get_path() == "v.b") { -// EXPECT_EQ(assert_cast<const DataTypeNullable*>(column->data.data_types[0].get()) -// ->get_nested_type() -// ->get_type_id(), -// TypeIndex::JSONB); -// } -// } else if (column->path.get_path().size() == 5) { -// EXPECT_EQ(column->data.get_non_null_value_size(), 10); -// EXPECT_EQ(assert_cast<const DataTypeNullable*>(column->data.data_types[0].get()) -// ->get_nested_type() -// ->get_type_id(), -// TypeIndex::JSONB); -// for (size_t row = 0; row < 5; ++row) { -// EXPECT_TRUE(column->data.data[0]->is_null_at(row)); -// } -// } 
-// } -// -// // sparse columns -// const auto& [path, value] = dst->get_sparse_data_paths_and_values(); -// const auto& offsets = dst->serialized_sparse_column_offsets(); -// -// // v.a v.c -// for (int row = 0; row < 5; ++row) { -// size_t start = offsets[row - 1]; -// size_t end = offsets[row]; -// -// auto data = path->get_data_at(start); -// EXPECT_EQ(data, StringRef("v.a", 3)); -// auto pair = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(pair.first.get<Int64>(), 20); -// -// auto data2 = path->get_data_at(start); -// EXPECT_EQ(data2, StringRef("v.c", 3)); -// auto pair2 = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(convert_field_to_string(pair2.first), -// convert_field_to_string(VariantUtil::get_field("array_int"))); -// -// EXPECT_EQ(start, end); -// } -// -// for (int row = 5; row < 10; ++row) { -// size_t start = offsets[row - 1]; -// size_t end = offsets[row]; -// -// auto data = path->get_data_at(start); -// auto pair = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(data, StringRef("v.a", 3)); -// EXPECT_EQ(pair.first.get<Int64>(), 20); -// -// auto data2 = path->get_data_at(start); -// auto pair2 = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(data2, StringRef("v.c", 3)); -// EXPECT_EQ(convert_field_to_string(pair2.first), -// convert_field_to_string(VariantUtil::get_field("array_int"))); -// -// auto data3 = path->get_data_at(start); -// auto pair3 = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(data3, StringRef("v.d.d", 5)); -// EXPECT_EQ(convert_field_to_string(pair3.first), -// convert_field_to_string(get_jsonb_field("array_int"))); -// -// EXPECT_EQ(start, end); -// } -// -// for (int row = 10; row < 15; ++row) { -// size_t start = offsets[row - 1]; -// size_t end = offsets[row]; -// -// auto data = path->get_data_at(start); -// auto pair = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(data, StringRef("v.a", 3)); -// 
EXPECT_EQ(pair.first.get<Int64>(), 20); -// -// auto data2 = path->get_data_at(start); -// auto pair2 = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(data2, StringRef("v.c", 3)); -// EXPECT_EQ(convert_field_to_string(pair2.first), -// convert_field_to_string(VariantUtil::get_field("array_int"))); -// -// auto data3 = path->get_data_at(start); -// auto pair3 = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(data3, StringRef("v.d.d", 5)); -// EXPECT_EQ(convert_field_to_string(pair3.first), -// convert_field_to_string(get_jsonb_field("array_str"))); -// -// EXPECT_EQ(start, end); -// } -// } -// -// TEST(ColumnVariantTest, empty_inset_range_from) { -// auto src = VariantUtil::construct_varint_column_only_subcolumns(); -// EXPECT_TRUE(src->finalize(ColumnObject::FinalizeMode::WRITE_MODE).ok()); -// EXPECT_EQ(src->size(), 6); -// -// // dst is an empty column -// auto dst = ColumnObject::create(5); -// -// // subcolumn->subcolumn v.a v.b v.c v.f v.e -// dst->insert_range_from(*src, 0, 6); -// EXPECT_EQ(dst->size(), 6); -// -// // 5 subcolumn -// EXPECT_EQ(dst->subcolumns.size(), 6); -// -// for (const auto& column : dst->subcolumns) { -// if (column->data.is_root) { -// EXPECT_EQ(column->data.data.size(), 1); -// EXPECT_EQ(column->data.data[0]->size(), 6); -// EXPECT_EQ(column->data.get_non_null_value_size(), 1); -// continue; -// } -// EXPECT_EQ(column->data.data.size(), 1); -// EXPECT_EQ(column->data.data[0]->size(), 6); -// EXPECT_EQ(column->data.get_non_null_value_size(), 5); -// } -// -// // empty sparse column -// const auto& [path, value] = dst->get_sparse_data_paths_and_values(); -// const auto& offsets = dst->serialized_sparse_column_offsets(); -// EXPECT_EQ(offsets[4], offsets[-1]); -// EXPECT_EQ(path->size(), value->size()); -// -// auto src_contains_seven_subcolumns = VariantUtil::construct_varint_column_more_subcolumns(); -// -// EXPECT_TRUE( -// 
src_contains_seven_subcolumns->finalize(ColumnObject::FinalizeMode::WRITE_MODE).ok()); -// EXPECT_EQ(src_contains_seven_subcolumns->size(), 5); -// -// // subcolumn->subcolumn v.a v.b v.c v.f v.e -// // add sprase columns v.s v.x v.y v.z -// dst->insert_range_from(*src_contains_seven_subcolumns, 0, 5); -// EXPECT_EQ(dst->size(), 11); -// -// // 5 subcolumn -// EXPECT_EQ(dst->subcolumns.size(), 6); -// -// for (int row = 0; row < 6; ++row) { -// size_t start = offsets[row - 1]; -// size_t end = offsets[row]; -// -// EXPECT_EQ(start, end); -// } -// -// // v.s v.x v.y v.z -// for (int row = 6; row < 11; ++row) { -// size_t start = offsets[row - 1]; -// size_t end = offsets[row]; -// -// auto data0 = path->get_data_at(start); -// EXPECT_EQ(data0, StringRef("v.s", 3)); -// auto pair0 = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(convert_field_to_string(pair0.first), -// convert_field_to_string(VariantUtil::get_field("string"))); -// -// auto data = path->get_data_at(start); -// EXPECT_EQ(data, StringRef("v.x", 3)); -// auto pair = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(pair.first.get<Int16>(), std::numeric_limits<Int16>::max()); -// -// auto data2 = path->get_data_at(start); -// EXPECT_EQ(data2, StringRef("v.y", 3)); -// auto pair2 = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(pair2.first.get<Int32>(), std::numeric_limits<Int32>::max()); -// -// auto data3 = path->get_data_at(start); -// EXPECT_EQ(data3, StringRef("v.z", 3)); -// auto pair3 = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(pair3.first.get<Int64>(), -// Int64(static_cast<Int64>(std::numeric_limits<Int32>::max()) + 1)); -// -// EXPECT_EQ(start, end); -// } -// -// auto src_contains_subcoumns_and_sparse_columns = VariantUtil::construct_basic_varint_column(); -// EXPECT_TRUE(src_contains_subcoumns_and_sparse_columns -// ->finalize(ColumnObject::FinalizeMode::WRITE_MODE) -// .ok()); -// 
EXPECT_EQ(src_contains_subcoumns_and_sparse_columns->size(), 10); -// -// // subcolumn->subcolumn v.a v.b v.c v.f v.e -// // add sprase columns v.s v.x v.y v.b.d v.c.d v.d.d -// dst->insert_range_from(*src_contains_subcoumns_and_sparse_columns, 0, 10); -// EXPECT_EQ(dst->size(), 21); -// -// // 5 subcolumn -// EXPECT_EQ(dst->subcolumns.size(), 6); -// -// for (int row = 0; row < 6; ++row) { -// size_t start = offsets[row - 1]; -// size_t end = offsets[row]; -// -// EXPECT_EQ(start, end); -// } -// -// // v.x v.y -// for (int row = 6; row < 11; ++row) { -// size_t start = offsets[row - 1]; -// size_t end = offsets[row]; -// -// auto data0 = path->get_data_at(start); -// EXPECT_EQ(data0, StringRef("v.s", 3)); -// auto pair0 = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(convert_field_to_string(pair0.first), -// convert_field_to_string(VariantUtil::get_field("string"))); -// -// auto data = path->get_data_at(start); -// EXPECT_EQ(data, StringRef("v.x", 3)); -// auto pair = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(pair.first.get<Int16>(), std::numeric_limits<Int16>::max()); -// -// auto data2 = path->get_data_at(start); -// EXPECT_EQ(data2, StringRef("v.y", 3)); -// auto pair2 = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(pair2.first.get<Int32>(), std::numeric_limits<Int32>::max()); -// -// auto data3 = path->get_data_at(start); -// EXPECT_EQ(data3, StringRef("v.z", 3)); -// auto pair3 = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(pair3.first.get<Int64>(), -// Int64(static_cast<Int64>(std::numeric_limits<Int32>::max()) + 1)); -// -// EXPECT_EQ(start, end); -// } -// -// for (int row = 11; row < 16; ++row) { -// size_t start = offsets[row - 1]; -// size_t end = offsets[row]; -// -// EXPECT_EQ(start, end); -// } -// -// //v.b.d v.c.d v.d.d -// for (int row = 16; row < 21; ++row) { -// size_t start = offsets[row - 1]; -// size_t end = offsets[row]; -// -// auto data = 
path->get_data_at(start); -// EXPECT_EQ(data, StringRef("v.b.d", 5)); -// auto pair = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(pair.first.get<Int64>(), 30); -// -// auto data2 = path->get_data_at(start); -// auto pair2 = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(data2, StringRef("v.c.d", 5)); -// EXPECT_EQ(pair2.first.get<Int64>(), 30); -// -// auto data3 = path->get_data_at(start); -// auto pair3 = dst->deserialize_from_sparse_column(value, start++); -// EXPECT_EQ(data3, StringRef("v.d.d", 5)); -// EXPECT_EQ(pair3.first.get<String>(), "50"); -// EXPECT_EQ(start, end); -// } -// } \ No newline at end of file + +std::string convert_field_to_string(doris::vectorized::Field array) { + rapidjson::Document doc; + doc.SetObject(); + rapidjson::Document::AllocatorType& allocator = doc.GetAllocator(); + rapidjson::Value json_value; + // DataTypeSerDe::convert_field_to_rapidjson(array, json_value, allocator); + doc.AddMember("value", json_value, allocator); + rapidjson::StringBuffer buffer; + rapidjson::PrettyWriter<rapidjson::StringBuffer> writer(buffer); + doc.Accept(writer); + return std::string(buffer.GetString()); +} + +TEST(ColumnVariantTest, is_null_at) { + auto v = VariantUtil::construct_dst_varint_column(); + PathInData path("v.f"); + auto sub = v->get_subcolumn(path); + std::cout << sub->get_least_common_typeBase()->get_name() << std::endl; + EXPECT_TRUE(sub->is_null_at(0)); + + auto v1 = VariantUtil::construct_advanced_varint_column(); + PathInData path1("v.b.d"); + auto sub1 = v1->get_subcolumn(path1); + EXPECT_TRUE(sub1->is_null_at(2)); + EXPECT_ANY_THROW(sub1->is_null_at(16)); + vectorized::Field f; + EXPECT_ANY_THROW(sub1->get(16, f)); + std::cout << sub1->num_rows << std::endl; + EXPECT_NO_THROW(sub1->resize(sub1->num_rows)); + + auto [sparse_column_keys, sparse_column_values] = v1->get_sparse_data_paths_and_values(); + std::string_view pa("v.a"); + EXPECT_NO_THROW( + 
sub1->serialize_to_sparse_column(sparse_column_keys, pa, sparse_column_values, 2)); + EXPECT_ANY_THROW( + sub1->serialize_to_sparse_column(sparse_column_keys, pa, sparse_column_values, 16)); +} + +TEST(ColumnVariantTest, advanced_finalize) { + auto variant = VariantUtil::construct_advanced_varint_column(); + + // 4. finalize + EXPECT_TRUE(variant->finalize(ColumnObject::FinalizeMode::WRITE_MODE).ok()); + EXPECT_TRUE(variant->pick_subcolumns_to_sparse_column({}).ok()); + EXPECT_EQ(variant->size(), 15); + + // check finalized subcolumn + // 5 subcolumn + 1 root + EXPECT_EQ(variant->subcolumns.size(), 6); + for (const auto& column : variant->subcolumns) { + if (column->data.is_root) { + continue; + } + EXPECT_EQ(column->data.data.size(), 1); + } + + // check sparse column + const auto& offsets = variant->serialized_sparse_column_offsets(); + for (int row = 0; row < 5; ++row) { + EXPECT_EQ(offsets[row] - offsets[row - 1], 0); + } + for (int row = 5; row < 15; ++row) { + EXPECT_EQ(offsets[row] - offsets[row - 1], 3); + } + + { + // Test fill_path_column_from_sparse_data + auto map = std::make_unique<NullMap>(15, 0); + vectorized::ColumnObject::fill_path_column_from_sparse_data( + *variant->get_subcolumn({}) /*root*/, map.get(), StringRef {"array"}, + variant->get_sparse_column(), 0, 5); + vectorized::ColumnObject::fill_path_column_from_sparse_data( + *variant->get_subcolumn({}) /*root*/, map.get(), StringRef {"array"}, + variant->get_sparse_column(), 5, 15); + } +} + +TEST(ColumnVariantTest, advanced_deserialize) { + auto variant = VariantUtil::construct_advanced_varint_column(); + + // 4. 
finalize + EXPECT_TRUE(variant->finalize(ColumnObject::FinalizeMode::WRITE_MODE).ok()); + EXPECT_TRUE(variant->pick_subcolumns_to_sparse_column({}).ok()); + EXPECT_EQ(variant->size(), 15); + + const auto& [path, value] = variant->get_sparse_data_paths_and_values(); + const auto& offsets = variant->serialized_sparse_column_offsets(); + for (size_t row = 5; row < 10; ++row) { + size_t start = offsets[row - 1]; + size_t end = offsets[row]; + + auto data = path->get_data_at(start); + auto pair = variant->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(data, StringRef("v.b.d", 5)); + EXPECT_EQ(convert_field_to_string(pair.first), + convert_field_to_string(get_jsonb_field("array_int"))); + + auto data2 = path->get_data_at(start); + auto pair2 = variant->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(data2, StringRef("v.c.d", 5)); + EXPECT_EQ(convert_field_to_string(pair2.first), + convert_field_to_string(VariantUtil::get_field("string"))); + + auto data3 = path->get_data_at(start); + auto pair3 = variant->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(data3, StringRef("v.d.d", 5)); + EXPECT_EQ(convert_field_to_string(pair3.first), + convert_field_to_string(get_jsonb_field("array_int"))); + EXPECT_EQ(start, end); + } + + for (size_t row = 10; row < 15; ++row) { + size_t start = offsets[row - 1]; + size_t end = offsets[row]; + + auto data = path->get_data_at(start); + auto pair = variant->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(data, StringRef("v.b.d", 5)); + EXPECT_EQ(convert_field_to_string(pair.first), + convert_field_to_string(get_jsonb_field("array_str"))); + + auto data2 = path->get_data_at(start); + auto pair2 = variant->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(data2, StringRef("v.c.d", 5)); + EXPECT_EQ(convert_field_to_string(pair2.first), + convert_field_to_string(get_jsonb_field("int"))); + + auto data3 = path->get_data_at(start); + auto pair3 = 
variant->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(data3, StringRef("v.d.d", 5)); + EXPECT_EQ(convert_field_to_string(pair3.first), + convert_field_to_string(get_jsonb_field("array_str"))); + EXPECT_EQ(start, end); + } +} + +TEST(ColumnVariantTest, advanced_insert_range_from) { + auto src = VariantUtil::construct_advanced_varint_column(); + EXPECT_TRUE(src->finalize(ColumnObject::FinalizeMode::WRITE_MODE).ok()); + EXPECT_TRUE(src->pick_subcolumns_to_sparse_column({}).ok()); + EXPECT_EQ(src->size(), 15); + + auto dst = VariantUtil::construct_dst_varint_column(); + + // subcolumn->subcolumn v.b v.f v.e + // subcolumn->sparse_column v.a v.c + // sparse_column->subcolumn v.b.d v.c.d + // sparse_column->sparse_column v.d.d + dst->insert_range_from(*src, 0, src->size()); + dst->finalize(); + EXPECT_EQ(dst->size(), 15); + + EXPECT_EQ(dst->subcolumns.size(), 6); + ColumnObject::Subcolumns dst_subcolumns = dst->subcolumns; + + std::sort( + dst_subcolumns.begin(), dst_subcolumns.end(), + [](const auto& lhsItem, const auto& rhsItem) { return lhsItem->path < rhsItem->path; }); + + // subcolumns + for (const auto& column : dst_subcolumns) { + if (column->data.is_root) { + continue; + } + EXPECT_EQ(column->data.data.size(), 1); + EXPECT_EQ(column->data.data[0]->size(), 15); + + if (column->path.get_path().size() == 3) { + EXPECT_EQ(column->data.get_non_null_value_size(), 15); + if (column->path.get_path() == "v.b") { + EXPECT_EQ(assert_cast<const DataTypeNullable*>(column->data.data_types[0].get()) + ->get_nested_type() + ->get_type_id(), + TypeIndex::JSONB); + } + } else if (column->path.get_path().size() == 5) { + EXPECT_EQ(column->data.get_non_null_value_size(), 10); + EXPECT_EQ(assert_cast<const DataTypeNullable*>(column->data.data_types[0].get()) + ->get_nested_type() + ->get_type_id(), + TypeIndex::JSONB); + for (size_t row = 0; row < 5; ++row) { + EXPECT_TRUE(column->data.data[0]->is_null_at(row)); + } + } + } + + // sparse columns + const auto& [path, 
value] = dst->get_sparse_data_paths_and_values(); + const auto& offsets = dst->serialized_sparse_column_offsets(); + + // v.a v.c + for (int row = 0; row < 5; ++row) { + size_t start = offsets[row - 1]; + size_t end = offsets[row]; + + auto data = path->get_data_at(start); + EXPECT_EQ(data, StringRef("v.a", 3)); + auto pair = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(pair.first.get<Int64>(), 20); + + auto data2 = path->get_data_at(start); + EXPECT_EQ(data2, StringRef("v.c", 3)); + auto pair2 = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(convert_field_to_string(pair2.first), + convert_field_to_string(VariantUtil::get_field("array_int"))); + + EXPECT_EQ(start, end); + } + + for (int row = 5; row < 10; ++row) { + size_t start = offsets[row - 1]; + size_t end = offsets[row]; + + auto data = path->get_data_at(start); + auto pair = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(data, StringRef("v.a", 3)); + EXPECT_EQ(pair.first.get<Int64>(), 20); + + auto data2 = path->get_data_at(start); + auto pair2 = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(data2, StringRef("v.c", 3)); + EXPECT_EQ(convert_field_to_string(pair2.first), + convert_field_to_string(VariantUtil::get_field("array_int"))); + + auto data3 = path->get_data_at(start); + auto pair3 = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(data3, StringRef("v.d.d", 5)); + EXPECT_EQ(convert_field_to_string(pair3.first), + convert_field_to_string(get_jsonb_field("array_int"))); + + EXPECT_EQ(start, end); + } + + for (int row = 10; row < 15; ++row) { + size_t start = offsets[row - 1]; + size_t end = offsets[row]; + + auto data = path->get_data_at(start); + auto pair = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(data, StringRef("v.a", 3)); + EXPECT_EQ(pair.first.get<Int64>(), 20); + + auto data2 = path->get_data_at(start); + auto pair2 = dst->deserialize_from_sparse_column(value, start++); + 
EXPECT_EQ(data2, StringRef("v.c", 3)); + EXPECT_EQ(convert_field_to_string(pair2.first), + convert_field_to_string(VariantUtil::get_field("array_int"))); + + auto data3 = path->get_data_at(start); + auto pair3 = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(data3, StringRef("v.d.d", 5)); + EXPECT_EQ(convert_field_to_string(pair3.first), + convert_field_to_string(get_jsonb_field("array_str"))); + + EXPECT_EQ(start, end); + } +} + +TEST(ColumnVariantTest, empty_inset_range_from) { + auto src = VariantUtil::construct_varint_column_only_subcolumns(); + EXPECT_TRUE(src->finalize(ColumnObject::FinalizeMode::WRITE_MODE).ok()); + EXPECT_TRUE(src->pick_subcolumns_to_sparse_column({}).ok()); + EXPECT_EQ(src->size(), 6); + + // dst is an empty column + auto dst = ColumnObject::create(5); + + // subcolumn->subcolumn v.a v.b v.c v.f v.e + dst->insert_range_from(*src, 0, 6); + EXPECT_EQ(dst->size(), 6); + + // 5 subcolumn + EXPECT_EQ(dst->subcolumns.size(), 6); + + for (const auto& column : dst->subcolumns) { + if (column->data.is_root) { + EXPECT_EQ(column->data.data.size(), 1); + EXPECT_EQ(column->data.data[0]->size(), 6); + EXPECT_EQ(column->data.get_non_null_value_size(), 1); + continue; + } + EXPECT_EQ(column->data.data.size(), 1); + EXPECT_EQ(column->data.data[0]->size(), 6); + EXPECT_EQ(column->data.get_non_null_value_size(), 5); + } + + // empty sparse column + const auto& [path, value] = dst->get_sparse_data_paths_and_values(); + const auto& offsets = dst->serialized_sparse_column_offsets(); + EXPECT_EQ(offsets[4], offsets[-1]); + EXPECT_EQ(path->size(), value->size()); + + auto src_contains_seven_subcolumns = VariantUtil::construct_varint_column_more_subcolumns(); + + EXPECT_TRUE( + src_contains_seven_subcolumns->finalize(ColumnObject::FinalizeMode::WRITE_MODE).ok()); + EXPECT_TRUE(src_contains_seven_subcolumns->pick_subcolumns_to_sparse_column({}).ok()); + EXPECT_EQ(src_contains_seven_subcolumns->size(), 5); + + // subcolumn->subcolumn v.a v.b v.c 
v.f v.e + // add sparse columns v.s v.x v.y v.z + dst->insert_range_from(*src_contains_seven_subcolumns, 0, 5); + EXPECT_EQ(dst->size(), 11); + + // 5 subcolumn + EXPECT_EQ(dst->subcolumns.size(), 6); + + for (int row = 0; row < 6; ++row) { + size_t start = offsets[row - 1]; + size_t end = offsets[row]; + + EXPECT_EQ(start, end); + } + + // v.s v.x v.y v.z + for (int row = 6; row < 11; ++row) { + size_t start = offsets[row - 1]; + size_t end = offsets[row]; + + auto data0 = path->get_data_at(start); + EXPECT_EQ(data0, StringRef("v.s", 3)); + auto pair0 = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(convert_field_to_string(pair0.first), + convert_field_to_string(VariantUtil::get_field("string"))); + + auto data = path->get_data_at(start); + EXPECT_EQ(data, StringRef("v.x", 3)); + auto pair = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(pair.first.get<Int16>(), std::numeric_limits<Int16>::max()); + + auto data2 = path->get_data_at(start); + EXPECT_EQ(data2, StringRef("v.y", 3)); + auto pair2 = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(pair2.first.get<Int32>(), std::numeric_limits<Int32>::max()); + + auto data3 = path->get_data_at(start); + EXPECT_EQ(data3, StringRef("v.z", 3)); + auto pair3 = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(pair3.first.get<Int64>(), + Int64(static_cast<Int64>(std::numeric_limits<Int32>::max()) + 1)); + + EXPECT_EQ(start, end); + } + + auto src_contains_subcoumns_and_sparse_columns = VariantUtil::construct_basic_varint_column(); + EXPECT_TRUE(src_contains_subcoumns_and_sparse_columns + ->finalize(ColumnObject::FinalizeMode::WRITE_MODE) + .ok()); + EXPECT_TRUE( + src_contains_subcoumns_and_sparse_columns->pick_subcolumns_to_sparse_column({}).ok()); + EXPECT_EQ(src_contains_subcoumns_and_sparse_columns->size(), 10); + + // subcolumn->subcolumn v.a v.b v.c v.f v.e + // add sparse columns v.s v.x v.y v.b.d v.c.d v.d.d + 
dst->insert_range_from(*src_contains_subcoumns_and_sparse_columns, 0, 10); + EXPECT_EQ(dst->size(), 21); + + // 5 subcolumn + EXPECT_EQ(dst->subcolumns.size(), 6); + + for (int row = 0; row < 6; ++row) { + size_t start = offsets[row - 1]; + size_t end = offsets[row]; + + EXPECT_EQ(start, end); + } + + // v.s v.x v.y v.z + for (int row = 6; row < 11; ++row) { + size_t start = offsets[row - 1]; + size_t end = offsets[row]; + + auto data0 = path->get_data_at(start); + EXPECT_EQ(data0, StringRef("v.s", 3)); + auto pair0 = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(convert_field_to_string(pair0.first), + convert_field_to_string(VariantUtil::get_field("string"))); + + auto data = path->get_data_at(start); + EXPECT_EQ(data, StringRef("v.x", 3)); + auto pair = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(pair.first.get<Int16>(), std::numeric_limits<Int16>::max()); + + auto data2 = path->get_data_at(start); + EXPECT_EQ(data2, StringRef("v.y", 3)); + auto pair2 = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(pair2.first.get<Int32>(), std::numeric_limits<Int32>::max()); + + auto data3 = path->get_data_at(start); + EXPECT_EQ(data3, StringRef("v.z", 3)); + auto pair3 = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(pair3.first.get<Int64>(), + Int64(static_cast<Int64>(std::numeric_limits<Int32>::max()) + 1)); + + EXPECT_EQ(start, end); + } + + for (int row = 11; row < 16; ++row) { + size_t start = offsets[row - 1]; + size_t end = offsets[row]; + + EXPECT_EQ(start, end); + } + + //v.b.d v.c.d v.d.d + for (int row = 16; row < 21; ++row) { + size_t start = offsets[row - 1]; + size_t end = offsets[row]; + + auto data = path->get_data_at(start); + EXPECT_EQ(data, StringRef("v.b.d", 5)); + auto pair = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(pair.first.get<Int64>(), 30); + + auto data2 = path->get_data_at(start); + auto pair2 = dst->deserialize_from_sparse_column(value, start++); + 
EXPECT_EQ(data2, StringRef("v.c.d", 5)); + EXPECT_EQ(pair2.first.get<Int64>(), 30); + + auto data3 = path->get_data_at(start); + auto pair3 = dst->deserialize_from_sparse_column(value, start++); + EXPECT_EQ(data3, StringRef("v.d.d", 5)); + EXPECT_EQ(pair3.first.get<String>(), "50"); + EXPECT_EQ(start, end); + } +} \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org