This is an automated email from the ASF dual-hosted git repository. jianliangqi pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 6509c3b3d36 [test](index compaction) Add index compaction full flow UT test (#45746) 6509c3b3d36 is described below commit 6509c3b3d362e52e9ed9a92041dd44d009352431 Author: qiye <l...@selectdb.com> AuthorDate: Thu Dec 26 21:11:55 2024 +0800 [test](index compaction) Add index compaction full flow UT test (#45746) 1. Add index compaction full flow UT tests 2. Add index compaction performance test, disabled by default. --- .../index_compaction_performance_test.cpp | 265 ++++++ .../compaction/index_compaction_test.cpp | 912 ++++++++++++++++++++- .../compaction/util/index_compaction_utils.cpp | 275 +++++-- .../inverted_index/data/sorted_wikipedia-50-1.json | 50 ++ .../inverted_index/data/sorted_wikipedia-50-2.json | 50 ++ 5 files changed, 1442 insertions(+), 110 deletions(-) diff --git a/be/test/olap/rowset/segment_v2/inverted_index/compaction/index_compaction_performance_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/compaction/index_compaction_performance_test.cpp new file mode 100644 index 00000000000..566680e8b1e --- /dev/null +++ b/be/test/olap/rowset/segment_v2/inverted_index/compaction/index_compaction_performance_test.cpp @@ -0,0 +1,265 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gmock/gmock.h> + +#include <filesystem> +#include <map> +#include <string> + +#include "olap/utils.h" +#include "util/index_compaction_utils.cpp" + +namespace doris { + +using namespace doris::vectorized; + +constexpr static uint32_t MAX_PATH_LEN = 1024; +constexpr static std::string_view dest_dir = "./ut_dir/inverted_index_test"; +constexpr static std::string_view tmp_dir = "./ut_dir/tmp"; + +class DISABLED_IndexCompactionPerformanceTest : public ::testing::Test { +protected: + void SetUp() override { + // absolute dir + char buffer[MAX_PATH_LEN]; + EXPECT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr); + _current_dir = std::string(buffer); + _absolute_dir = _current_dir + std::string(dest_dir); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(_absolute_dir).ok()); + + // tmp dir + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(tmp_dir).ok()); + std::vector<StorePath> paths; + paths.emplace_back(std::string(tmp_dir), 1024000000); + auto tmp_file_dirs = std::make_unique<segment_v2::TmpFileDirs>(paths); + Status st = tmp_file_dirs->init(); + EXPECT_TRUE(st.ok()) << st.to_json(); + ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs)); + + // storage engine + doris::EngineOptions options; + auto engine = std::make_unique<StorageEngine>(options); + _engine_ref = engine.get(); + _data_dir = std::make_unique<DataDir>(*_engine_ref, _absolute_dir); + static_cast<void>(_data_dir->update_capacity()); + ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); + config::enable_segcompaction = false; + config::string_type_length_soft_limit_bytes = 2147483643; + config::inverted_index_dict_path = + _current_dir + 
"/be/src/clucene/src/contribs-lib/CLucene/analysis/jieba/dict"; + } + void TearDown() override { + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); + _engine_ref = nullptr; + ExecEnv::GetInstance()->set_storage_engine(nullptr); + } + + DISABLED_IndexCompactionPerformanceTest() = default; + ~DISABLED_IndexCompactionPerformanceTest() override = default; + + void _build_wiki_tablet(const KeysType& keys_type, + const InvertedIndexStorageFormatPB& storage_format, + const std::map<std::string, std::string>& properties) { + // tablet_schema + TabletSchemaPB schema_pb; + schema_pb.set_keys_type(keys_type); + schema_pb.set_inverted_index_storage_format(storage_format); + + IndexCompactionUtils::construct_column(schema_pb.add_column(), 0, "STRING", "title"); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10001, + "idx_content", 1, "STRING", "content", properties); + IndexCompactionUtils::construct_column(schema_pb.add_column(), 2, "STRING", "redirect"); + IndexCompactionUtils::construct_column(schema_pb.add_column(), 3, "STRING", "namespace"); + if (keys_type == KeysType::UNIQUE_KEYS) { + // unique table must contain the DELETE_SIGN column + auto* column_pb = schema_pb.add_column(); + IndexCompactionUtils::construct_column(column_pb, 4, "TINYINT", DELETE_SIGN); + column_pb->set_length(1); + column_pb->set_index_length(1); + column_pb->set_is_nullable(false); + } + _tablet_schema = std::make_shared<TabletSchema>(); + _tablet_schema->init_from_pb(schema_pb); + + // tablet + TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema)); + if (keys_type == KeysType::UNIQUE_KEYS) { + tablet_meta->_enable_unique_key_merge_on_write = true; + } + + _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get()); + 
EXPECT_TRUE(_tablet->init().ok()); + } + + void _run_normal_wiki_test() { + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); + std::string data_dir = + _current_dir + "/be/test/olap/rowset/segment_v2/inverted_index/data/performance"; + std::vector<std::string> data_files; + for (const auto& entry : std::filesystem::directory_iterator(data_dir)) { + if (entry.is_regular_file()) { + std::string filename = entry.path().filename().string(); + if (filename.starts_with("wikipedia") && filename.ends_with(".json")) { + std::cout << "Found file: " << filename << std::endl; + data_files.push_back(entry.path().string()); + } + } + } + + std::vector<RowsetSharedPtr> rowsets(data_files.size()); + auto custom_check_build_rowsets = [](const int32_t& size) { EXPECT_EQ(size, 1); }; + IndexCompactionUtils::build_rowsets<IndexCompactionUtils::WikiDataRow>( + _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, data_files, _inc_id, + custom_check_build_rowsets, true); + + auto custom_check_index = [](const BaseCompaction& compaction, + const RowsetWriterContext& ctx) { + EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 1); + EXPECT_TRUE(ctx.columns_to_do_index_compaction.size() == 1); + EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(1)); + EXPECT_TRUE(compaction._output_rowset->num_segments() == 1) + << compaction._output_rowset->num_segments(); + }; + + RowsetSharedPtr output_rowset_index; + Status st; + { + OlapStopWatch watch; + st = IndexCompactionUtils::do_compaction(rowsets, _engine_ref, _tablet, true, + output_rowset_index, custom_check_index, + 10000000); + std::cout << "index compaction time: " << watch.get_elapse_second() << "s" << std::endl; + } + EXPECT_TRUE(st.ok()) << st.to_string(); + + const auto& seg_path = output_rowset_index->segment_path(0); + EXPECT_TRUE(seg_path.has_value()) << seg_path.error(); + 
auto inverted_index_file_reader_index = IndexCompactionUtils::init_index_file_reader( + output_rowset_index, seg_path.value(), + _tablet_schema->get_inverted_index_storage_format()); + + auto custom_check_normal = [](const BaseCompaction& compaction, + const RowsetWriterContext& ctx) { + EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 1); + EXPECT_TRUE(ctx.columns_to_do_index_compaction.size() == 0); + EXPECT_TRUE(compaction._output_rowset->num_segments() == 1); + }; + + RowsetSharedPtr output_rowset_normal; + { + OlapStopWatch watch; + st = IndexCompactionUtils::do_compaction(rowsets, _engine_ref, _tablet, false, + output_rowset_normal, custom_check_normal, + 10000000); + std::cout << "normal compaction time: " << watch.get_elapse_second() << "s" + << std::endl; + } + EXPECT_TRUE(st.ok()) << st.to_string(); + const auto& seg_path_normal = output_rowset_normal->segment_path(0); + EXPECT_TRUE(seg_path_normal.has_value()) << seg_path_normal.error(); + auto inverted_index_file_reader_normal = IndexCompactionUtils::init_index_file_reader( + output_rowset_normal, seg_path_normal.value(), + _tablet_schema->get_inverted_index_storage_format()); + + // check index file terms + for (int idx = 10001; idx < 10002; idx++) { + auto dir_idx = inverted_index_file_reader_index->_open(idx, ""); + EXPECT_TRUE(dir_idx.has_value()) << dir_idx.error(); + auto dir_normal = inverted_index_file_reader_normal->_open(idx, ""); + EXPECT_TRUE(dir_normal.has_value()) << dir_normal.error(); + st = IndexCompactionUtils::check_idx_file_correctness(dir_idx->get(), + dir_normal->get()); + EXPECT_TRUE(st.ok()) << st.to_string(); + } + } + +private: + TabletSchemaSPtr _tablet_schema = nullptr; + StorageEngine* _engine_ref = nullptr; + std::unique_ptr<DataDir> _data_dir = nullptr; + TabletSharedPtr _tablet = nullptr; + std::string _absolute_dir; + std::string _current_dir; + int64_t _inc_id = 1000; +}; + +TEST_F(DISABLED_IndexCompactionPerformanceTest, tes_wikipedia_dup_v2_english) 
{ + std::map<std::string, std::string> properties; + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_ENGLISH); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + _build_wiki_tablet(KeysType::DUP_KEYS, InvertedIndexStorageFormatPB::V2, properties); + _run_normal_wiki_test(); +} + +TEST_F(DISABLED_IndexCompactionPerformanceTest, tes_wikipedia_dup_v2_unicode) { + std::map<std::string, std::string> properties; + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_UNICODE); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + _build_wiki_tablet(KeysType::DUP_KEYS, InvertedIndexStorageFormatPB::V2, properties); + _run_normal_wiki_test(); +} + +TEST_F(DISABLED_IndexCompactionPerformanceTest, tes_wikipedia_dup_v2_chinese) { + std::map<std::string, std::string> properties; + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + _build_wiki_tablet(KeysType::DUP_KEYS, InvertedIndexStorageFormatPB::V2, properties); + _run_normal_wiki_test(); +} + +TEST_F(DISABLED_IndexCompactionPerformanceTest, tes_wikipedia_mow_v2_english) { + std::map<std::string, std::string> properties; + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_ENGLISH); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + _build_wiki_tablet(KeysType::UNIQUE_KEYS, InvertedIndexStorageFormatPB::V2, properties); + 
_run_normal_wiki_test(); +} + +TEST_F(DISABLED_IndexCompactionPerformanceTest, tes_wikipedia_mow_v2_unicode) { + std::map<std::string, std::string> properties; + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_UNICODE); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + _build_wiki_tablet(KeysType::UNIQUE_KEYS, InvertedIndexStorageFormatPB::V2, properties); + _run_normal_wiki_test(); +} + +TEST_F(DISABLED_IndexCompactionPerformanceTest, tes_wikipedia_mow_v2_chinese) { + std::map<std::string, std::string> properties; + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + _build_wiki_tablet(KeysType::UNIQUE_KEYS, InvertedIndexStorageFormatPB::V2, properties); + _run_normal_wiki_test(); +} +} // namespace doris diff --git a/be/test/olap/rowset/segment_v2/inverted_index/compaction/index_compaction_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/compaction/index_compaction_test.cpp index 264786570e7..64aec3ffa4a 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/compaction/index_compaction_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/compaction/index_compaction_test.cpp @@ -17,6 +17,7 @@ #include <gmock/gmock.h> +#include "olap/utils.h" #include "util/index_compaction_utils.cpp" namespace doris { @@ -55,7 +56,25 @@ protected: _data_dir = std::make_unique<DataDir>(*_engine_ref, _absolute_dir); static_cast<void>(_data_dir->update_capacity()); ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); + config::inverted_index_dict_path = + _current_dir + "/be/src/clucene/src/contribs-lib/CLucene/analysis/jieba/dict"; + } + void TearDown() override { + 
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); + _engine_ref = nullptr; + ExecEnv::GetInstance()->set_storage_engine(nullptr); + // reset config + config::inverted_index_max_buffered_docs = -1; + config::compaction_batch_size = -1; + config::inverted_index_compaction_enable = false; + } + IndexCompactionTest() = default; + ~IndexCompactionTest() override = default; + + void _build_tablet() { // tablet_schema TabletSchemaPB schema_pb; schema_pb.set_keys_type(KeysType::DUP_KEYS); @@ -65,8 +84,10 @@ protected: "key_index", 0, "INT", "key"); IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10001, "v1_index", 1, "STRING", "v1"); + std::map<std::string, std::string> properties; + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_UNICODE); IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10002, - "v2_index", 2, "STRING", "v2", true); + "v2_index", 2, "STRING", "v2", properties); IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10003, "v3_index", 3, "INT", "v3"); _tablet_schema = std::make_shared<TabletSchema>(); @@ -78,16 +99,625 @@ protected: _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get()); EXPECT_TRUE(_tablet->init().ok()); } - void TearDown() override { - EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); - EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); - EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok()); - _engine_ref = nullptr; - ExecEnv::GetInstance()->set_storage_engine(nullptr); + + void _build_wiki_tablet(const KeysType& keys_type, + const InvertedIndexStorageFormatPB& storage_format) { + // tablet_schema 
+ TabletSchemaPB schema_pb; + schema_pb.set_keys_type(keys_type); + schema_pb.set_inverted_index_storage_format(storage_format); + + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10000, + "idx_title", 0, "STRING", "title", + std::map<std::string, std::string>(), true); + // parser = english, support_phrase = true, lower_case = true, char_filter = none + std::map<std::string, std::string> properties; + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_ENGLISH); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, ""); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10001, + "idx_content_1", 1, "STRING", "content_1", + properties); + properties.clear(); + // parser = english, support_phrase = true, lower_case = true, char_filter = char_replace + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_ENGLISH); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "char_replace"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " "); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10002, + "idx_content_2", 2, "STRING", "content_2", + properties); + properties.clear(); + // parser = english, support_phrase = true, lower_case = false, char_filter = none + 
properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_ENGLISH); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_FALSE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, ""); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10003, + "idx_content_3", 3, "STRING", "content_3", + properties); + properties.clear(); + // parser = english, support_phrase = true, lower_case = false, char_filter = char_replace + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_ENGLISH); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_FALSE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "char_replace"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " "); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10004, + "idx_content_4", 4, "STRING", "content_4", + properties); + properties.clear(); + // parser = english, support_phrase = false, lower_case = true, char_filter = none + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_ENGLISH); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, ""); + 
IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10005, + "idx_content_5", 5, "STRING", "content_5", + properties); + properties.clear(); + // parser = english, support_phrase = false, lower_case = true, char_filter = char_replace + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_ENGLISH); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "char_replace"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " "); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10006, + "idx_content_6", 6, "STRING", "content_6", + properties); + properties.clear(); + // parser = english, support_phrase = false, lower_case = false, char_filter = none + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_ENGLISH); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_FALSE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, ""); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10007, + "idx_content_7", 7, "STRING", "content_7", + properties); + properties.clear(); + // parser = english, support_phrase = false, lower_case = false, char_filter = char_replace + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_ENGLISH); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO); + 
properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_FALSE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "char_replace"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " "); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10008, + "idx_content_8", 8, "STRING", "content_8", + properties); + properties.clear(); + // parser = unicode, support_phrase = true, lower_case = true, char_filter = none + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_UNICODE); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, ""); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10009, + "idx_content_9", 9, "STRING", "content_9", + properties); + properties.clear(); + // parser = unicode, support_phrase = true, lower_case = true, char_filter = char_replace + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_UNICODE); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "char_replace"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " "); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10010, + "idx_content_10", 10, "STRING", "content_10", + properties); + properties.clear(); 
+ // parser = unicode, support_phrase = true, lower_case = false, char_filter = none + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_UNICODE); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_FALSE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, ""); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10011, + "idx_content_11", 11, "STRING", "content_11", + properties); + properties.clear(); + // parser = unicode, support_phrase = true, lower_case = false, char_filter = char_replace + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_UNICODE); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_FALSE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "char_replace"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " "); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10012, + "idx_content_12", 12, "STRING", "content_12", + properties); + properties.clear(); + // parser = unicode, support_phrase = false, lower_case = true, char_filter = none + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_UNICODE); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, ""); + 
properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, ""); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10013, + "idx_content_13", 13, "STRING", "content_13", + properties); + properties.clear(); + // parser = unicode, support_phrase = false, lower_case = true, char_filter = char_replace + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_UNICODE); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "char_replace"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " "); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10014, + "idx_content_14", 14, "STRING", "content_14", + properties); + properties.clear(); + // parser = unicode, support_phrase = false, lower_case = false, char_filter = none + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_UNICODE); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_FALSE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, ""); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10015, + "idx_content_15", 15, "STRING", "content_15", + properties); + properties.clear(); + // parser = unicode, support_phrase = false, lower_case = false, char_filter = char_replace + properties.emplace(INVERTED_INDEX_PARSER_KEY, 
INVERTED_INDEX_PARSER_UNICODE); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_FALSE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "char_replace"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " "); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10016, + "idx_content_16", 16, "STRING", "content_16", + properties); + properties.clear(); + // parser = chinese, parser_mode = fine_grained, support_phrase = true, lower_case = true, char_filter = none + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE); + properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY, INVERTED_INDEX_PARSER_FINE_GRANULARITY); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, ""); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10017, + "idx_content_17", 17, "STRING", "content_17", + properties); + properties.clear(); + // parser = chinese, parser_mode = fine_grained, support_phrase = true, lower_case = true, char_filter = char_replace + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE); + properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY, INVERTED_INDEX_PARSER_FINE_GRANULARITY); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + 
properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "char_replace"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " "); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10018, + "idx_content_18", 18, "STRING", "content_18", + properties); + properties.clear(); + // parser = chinese, parser_mode = fine_grained, support_phrase = true, lower_case = false, char_filter = none + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE); + properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY, INVERTED_INDEX_PARSER_FINE_GRANULARITY); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_FALSE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, ""); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10019, + "idx_content_19", 19, "STRING", "content_19", + properties); + properties.clear(); + // parser = chinese, parser_mode = fine_grained, support_phrase = true, lower_case = false, char_filter = char_replace + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE); + properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY, INVERTED_INDEX_PARSER_FINE_GRANULARITY); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_FALSE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "char_replace"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " "); + 
IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10020, + "idx_content_20", 20, "STRING", "content_20", + properties); + properties.clear(); + // parser = chinese, parser_mode = fine_grained, support_phrase = false, lower_case = true, char_filter = none + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE); + properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY, INVERTED_INDEX_PARSER_FINE_GRANULARITY); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, ""); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10021, + "idx_content_21", 21, "STRING", "content_21", + properties); + properties.clear(); + // parser = chinese, parser_mode = fine_grained, support_phrase = false, lower_case = true, char_filter = char_replace + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE); + properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY, INVERTED_INDEX_PARSER_FINE_GRANULARITY); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "char_replace"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " "); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10022, + "idx_content_22", 22, "STRING", "content_22", + properties); + properties.clear(); + // parser = chinese, parser_mode = fine_grained, 
support_phrase = false, lower_case = false, char_filter = none + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE); + properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY, INVERTED_INDEX_PARSER_FINE_GRANULARITY); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_FALSE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, ""); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10023, + "idx_content_23", 23, "STRING", "content_23", + properties); + properties.clear(); + // parser = chinese, parser_mode = fine_grained, support_phrase = false, lower_case = false, char_filter = char_replace + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE); + properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY, INVERTED_INDEX_PARSER_FINE_GRANULARITY); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_FALSE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "char_replace"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " "); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10024, + "idx_content_24", 24, "STRING", "content_24", + properties); + properties.clear(); + // parser = chinese, parser_mode = coarse_grained, support_phrase = true, lower_case = true, char_filter = none + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE); + properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY, + 
INVERTED_INDEX_PARSER_COARSE_GRANULARITY); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, ""); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10025, + "idx_content_25", 25, "STRING", "content_25", + properties); + properties.clear(); + // parser = chinese, parser_mode = coarse_grained, support_phrase = true, lower_case = true, char_filter = char_replace + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE); + properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY, + INVERTED_INDEX_PARSER_COARSE_GRANULARITY); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "char_replace"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " "); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10026, + "idx_content_26", 26, "STRING", "content_26", + properties); + properties.clear(); + // parser = chinese, parser_mode = coarse_grained, support_phrase = true, lower_case = false, char_filter = none + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE); + properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY, + INVERTED_INDEX_PARSER_COARSE_GRANULARITY); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, 
INVERTED_INDEX_PARSER_FALSE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, ""); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10027, + "idx_content_27", 27, "STRING", "content_27", + properties); + properties.clear(); + // parser = chinese, parser_mode = coarse_grained, support_phrase = true, lower_case = false, char_filter = char_replace + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE); + properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY, + INVERTED_INDEX_PARSER_COARSE_GRANULARITY); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_FALSE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "char_replace"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " "); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10028, + "idx_content_28", 28, "STRING", "content_28", + properties); + properties.clear(); + // parser = chinese, parser_mode = coarse_grained, support_phrase = false, lower_case = true, char_filter = none + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE); + properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY, + INVERTED_INDEX_PARSER_COARSE_GRANULARITY); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, ""); + 
properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, ""); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10029, + "idx_content_29", 29, "STRING", "content_29", + properties); + properties.clear(); + // parser = chinese, parser_mode = coarse_grained, support_phrase = false, lower_case = true, char_filter = char_replace + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE); + properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY, + INVERTED_INDEX_PARSER_COARSE_GRANULARITY); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_TRUE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "char_replace"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " "); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10030, + "idx_content_30", 30, "STRING", "content_30", + properties); + properties.clear(); + // parser = chinese, parser_mode = coarse_grained, support_phrase = false, lower_case = false, char_filter = none + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE); + properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY, + INVERTED_INDEX_PARSER_COARSE_GRANULARITY); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_FALSE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, ""); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, ""); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10031, + "idx_content_31", 31, "STRING", "content_31", + 
properties); + properties.clear(); + // parser = chinese, parser_mode = coarse_grained, support_phrase = false, lower_case = false, char_filter = char_replace + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_CHINESE); + properties.emplace(INVERTED_INDEX_PARSER_MODE_KEY, + INVERTED_INDEX_PARSER_COARSE_GRANULARITY); + properties.emplace(INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY, + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO); + properties.emplace(INVERTED_INDEX_PARSER_LOWERCASE_KEY, INVERTED_INDEX_PARSER_FALSE); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, "char_replace"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, "._"); + properties.emplace(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, " "); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10032, + "idx_content_32", 32, "STRING", "content_32", + properties); + properties.clear(); + // parser = none, ignore_above = 256 + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_NONE); + properties.emplace(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY, "256"); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10033, + "idx_content_33", 33, "STRING", "content_33", + properties); + properties.clear(); + // parser = none, ignore_above = 16383 + properties.emplace(INVERTED_INDEX_PARSER_KEY, INVERTED_INDEX_PARSER_NONE); + properties.emplace(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY, "16383"); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10034, + "idx_content_34", 34, "STRING", "content_34", + properties); + + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10035, + "idx_redirect", 35, "STRING", "redirect"); + IndexCompactionUtils::construct_column(schema_pb.add_column(), schema_pb.add_index(), 10036, + "idx_namespace", 36, "STRING", "namespace"); + + if (keys_type == KeysType::UNIQUE_KEYS) { + // unique table must 
contain the DELETE_SIGN column
+            auto* column_pb = schema_pb.add_column();
+            IndexCompactionUtils::construct_column(column_pb, 37, "TINYINT", DELETE_SIGN);
+            column_pb->set_length(1);
+            column_pb->set_index_length(1);
+            column_pb->set_is_nullable(false);
+        }
+
+        _tablet_schema = std::make_shared<TabletSchema>();
+        _tablet_schema->init_from_pb(schema_pb);
+
+        // tablet
+        TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema));
+        if (keys_type == KeysType::UNIQUE_KEYS) {
+            tablet_meta->_enable_unique_key_merge_on_write = true;
+        }
+        _tablet = std::make_shared<Tablet>(*_engine_ref, tablet_meta, _data_dir.get());
+        EXPECT_TRUE(_tablet->init().ok());
     }
-    IndexCompactionTest() = default;
-    ~IndexCompactionTest() override = default;
+    // Builds rowsets from three wiki JSON files (the third path repeats the second), optionally
+    // appends a delete-predicate rowset, then calls IndexCompactionUtils::do_compaction twice on
+    // the same inputs: once with the flag set to true ("index compaction") and once with false
+    // ("normal compaction"). Finally, for every index id in [10000, 10037), the index
+    // directories of both outputs are passed to IndexCompactionUtils::check_idx_file_correctness.
+    //   with_delete: when true, a delete-predicate rowset built from delete_pred is added to
+    //                the tablet and to the compaction inputs.
+    //   delete_pred: delete condition text; only read when with_delete is true.
+    //   max_rows_per_segment: forwarded unchanged to do_compaction.
+    //   output_rowset_segment_number: expected num_segments() of each compaction output.
+    void _run_normal_wiki_test(bool with_delete = false, const std::string& delete_pred = "",
+                               int64_t max_rows_per_segment = 100000,
+                               int output_rowset_segment_number = 1) {
+        // start from an empty tablet directory
+        EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+        EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
+        std::string data_file1 =
+                _current_dir +
+                "/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-1.json";
+        std::string data_file2 =
+                _current_dir +
+                "/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-2.json";
+        // for MOW table to delete
+        std::string data_file3 =
+                _current_dir +
+                "/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-2.json";
+        std::vector<std::string> data_files;
+        data_files.push_back(data_file1);
+        data_files.push_back(data_file2);
+        data_files.push_back(data_file3);
+
+        std::vector<RowsetSharedPtr> rowsets(data_files.size());
+        // For UNIQUE_KEYS the expected size is one less than the column count; the wiki schema
+        // adds a DELETE_SIGN column without an index for that key type.
+        auto custom_check_build_rowsets = [this](const int32_t& size) {
+            auto keys_type = _tablet_schema->keys_type();
+            if (keys_type == KeysType::UNIQUE_KEYS) {
+                EXPECT_EQ(size, _tablet_schema->num_columns() - 1);
+            } else {
+                EXPECT_EQ(size, _tablet_schema->num_columns());
+            }
+        };
+        IndexCompactionUtils::build_rowsets<IndexCompactionUtils::WikiDataRow>(
+                _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, data_files, _inc_id,
+                custom_check_build_rowsets, false, 50);
+
+        if (with_delete) {
+            // create delete predicate rowset and add to tablet
+            auto delete_rowset = IndexCompactionUtils::create_delete_predicate_rowset(
+                    _tablet_schema, delete_pred, _inc_id);
+            EXPECT_TRUE(_tablet->add_rowset(delete_rowset).ok());
+            EXPECT_TRUE(_tablet->rowset_map().size() == (data_files.size() + 1));
+            rowsets.push_back(delete_rowset);
+            EXPECT_TRUE(rowsets.size() == (data_files.size() + 1));
+        }
+        // Checks for the pass with index compaction turned on.
+        auto custom_check_index = [this, output_rowset_segment_number](
+                                          const BaseCompaction& compaction,
+                                          const RowsetWriterContext& ctx) {
+            auto keys_type = _tablet_schema->keys_type();
+            if (keys_type == KeysType::UNIQUE_KEYS) {
+                EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(),
+                          _tablet_schema->num_columns() - 1);
+                EXPECT_EQ(ctx.columns_to_do_index_compaction.size(),
+                          _tablet_schema->num_columns() - 1);
+            } else {
+                EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(),
+                          _tablet_schema->num_columns());
+                EXPECT_EQ(ctx.columns_to_do_index_compaction.size(), _tablet_schema->num_columns());
+            }
+            EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(0));
+            EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(1));
+            EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(2));
+            EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(3));
+            EXPECT_EQ(compaction._output_rowset->num_segments(), output_rowset_segment_number)
+                    << compaction._output_rowset->num_segments();
+        };
+
+        RowsetSharedPtr output_rowset_index;
+        Status st;
+        {
+            OlapStopWatch watch;
+            st = IndexCompactionUtils::do_compaction(rowsets, _engine_ref, _tablet, true,
+                                                     output_rowset_index, custom_check_index,
+                                                     max_rows_per_segment);
+            std::cout << "index compaction time: " << watch.get_elapse_second() << "s" << std::endl;
+        }
+        EXPECT_TRUE(st.ok()) << st.to_string();
+
+        // Checks for the pass with index compaction turned off: no column may be selected.
+        auto custom_check_normal = [this, output_rowset_segment_number](
+                                           const BaseCompaction& compaction,
+                                           const RowsetWriterContext& ctx) {
+            auto keys_type = _tablet_schema->keys_type();
+            if (keys_type == KeysType::UNIQUE_KEYS) {
+                EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(),
+                          _tablet_schema->num_columns() - 1);
+            } else {
+                EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(),
+                          _tablet_schema->num_columns());
+            }
+            EXPECT_TRUE(ctx.columns_to_do_index_compaction.empty());
+            EXPECT_TRUE(compaction._output_rowset->num_segments() == output_rowset_segment_number)
+                    << compaction._output_rowset->num_segments();
+        };
+
+        RowsetSharedPtr output_rowset_normal;
+        {
+            OlapStopWatch watch;
+            st = IndexCompactionUtils::do_compaction(rowsets, _engine_ref, _tablet, false,
+                                                     output_rowset_normal, custom_check_normal,
+                                                     max_rows_per_segment);
+            std::cout << "normal compaction time: " << watch.get_elapse_second() << "s"
+                      << std::endl;
+        }
+        EXPECT_TRUE(st.ok()) << st.to_string();
+
+        auto num_segments_idx = output_rowset_index->num_segments();
+        auto num_segments_normal = output_rowset_normal->num_segments();
+        for (int idx = 10000; idx < 10037; idx++) {
+            // Compare each count with 1 explicitly: `a == b == 1` parses as `(a == b) == 1`,
+            // which is true for any two equal counts and would then compare segment 0 only.
+            if (num_segments_idx == 1 && num_segments_normal == 1) {
+                // check index file terms for single segment
+                const auto& seg_path = output_rowset_index->segment_path(0);
+                EXPECT_TRUE(seg_path.has_value()) << seg_path.error();
+                auto inverted_index_file_reader_index =
+                        IndexCompactionUtils::init_index_file_reader(
+                                output_rowset_index, seg_path.value(),
+                                _tablet_schema->get_inverted_index_storage_format());
+
+                const auto& seg_path_normal = output_rowset_normal->segment_path(0);
+                EXPECT_TRUE(seg_path_normal.has_value()) << seg_path_normal.error();
+                auto inverted_index_file_reader_normal =
+                        IndexCompactionUtils::init_index_file_reader(
+                                output_rowset_normal, seg_path_normal.value(),
+                                _tablet_schema->get_inverted_index_storage_format());
+
+                auto dir_idx = inverted_index_file_reader_index->_open(idx, "");
+                EXPECT_TRUE(dir_idx.has_value()) << dir_idx.error();
+                auto dir_normal = inverted_index_file_reader_normal->_open(idx, "");
+                EXPECT_TRUE(dir_normal.has_value()) << dir_normal.error();
+                st = IndexCompactionUtils::check_idx_file_correctness(dir_idx->get(),
+                                                                      dir_normal->get());
+                EXPECT_TRUE(st.ok()) << st.to_string();
+            } else {
+                // check index file terms for multiple segments
+                std::vector<std::unique_ptr<DorisCompoundReader>> dirs_idx(num_segments_idx);
+                for (int i = 0; i < num_segments_idx; i++) {
+                    const auto& seg_path = output_rowset_index->segment_path(i);
+                    EXPECT_TRUE(seg_path.has_value()) << seg_path.error();
+                    auto inverted_index_file_reader_index =
+                            IndexCompactionUtils::init_index_file_reader(
+                                    output_rowset_index, seg_path.value(),
+                                    _tablet_schema->get_inverted_index_storage_format());
+                    auto dir_idx = inverted_index_file_reader_index->_open(idx, "");
+                    EXPECT_TRUE(dir_idx.has_value()) << dir_idx.error();
+                    dirs_idx[i] = std::move(dir_idx.value());
+                }
+                std::vector<std::unique_ptr<DorisCompoundReader>> dirs_normal(num_segments_normal);
+                for (int i = 0; i < num_segments_normal; i++) {
+                    const auto& seg_path = output_rowset_normal->segment_path(i);
+                    EXPECT_TRUE(seg_path.has_value()) << seg_path.error();
+                    auto inverted_index_file_reader_normal =
+                            IndexCompactionUtils::init_index_file_reader(
+                                    output_rowset_normal, seg_path.value(),
+                                    _tablet_schema->get_inverted_index_storage_format());
+                    auto dir_normal = inverted_index_file_reader_normal->_open(idx, "");
+                    EXPECT_TRUE(dir_normal.has_value()) << dir_normal.error();
+                    dirs_normal[i] = std::move(dir_normal.value());
+                }
+                st = IndexCompactionUtils::check_idx_file_correctness(dirs_idx, dirs_normal);
+                EXPECT_TRUE(st.ok()) << st.to_string();
+            }
+        }
+    }
 private:
     TabletSchemaSPtr _tablet_schema = nullptr;
@@ -96,9 +726,11 @@ private:
     TabletSharedPtr _tablet = nullptr;
     std::string _absolute_dir;
     std::string _current_dir;
+    int64_t
_inc_id = 1000; }; TEST_F(IndexCompactionTest, tes_write_index_normally) { + _build_tablet(); EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); std::string data_file1 = @@ -111,8 +743,9 @@ TEST_F(IndexCompactionTest, tes_write_index_normally) { std::vector<RowsetSharedPtr> rowsets(data_files.size()); auto custom_check_build_rowsets = [](const int32_t& size) { EXPECT_EQ(size, 4); }; - IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, - data_files, custom_check_build_rowsets); + IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>( + _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, data_files, _inc_id, + custom_check_build_rowsets); auto custom_check_index = [](const BaseCompaction& compaction, const RowsetWriterContext& ctx) { EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4); @@ -179,6 +812,7 @@ TEST_F(IndexCompactionTest, tes_write_index_normally) { } TEST_F(IndexCompactionTest, test_col_unique_ids_empty) { + _build_tablet(); // clear column unique id in tablet index 10001 and rebuild tablet_schema TabletSchemaPB schema_pb; _tablet_schema->to_schema_pb(&schema_pb); @@ -198,8 +832,9 @@ TEST_F(IndexCompactionTest, test_col_unique_ids_empty) { std::vector<RowsetSharedPtr> rowsets(data_files.size()); auto custom_check_build_rowsets = [](const int32_t& size) { EXPECT_EQ(size, 3); }; - IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, - data_files, custom_check_build_rowsets); + IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>( + _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, data_files, _inc_id, + custom_check_build_rowsets); auto custom_check_index = [](const BaseCompaction& compaction, const RowsetWriterContext& ctx) { 
EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4); @@ -229,6 +864,7 @@ TEST_F(IndexCompactionTest, test_col_unique_ids_empty) { } TEST_F(IndexCompactionTest, test_tablet_index_id_not_equal) { + _build_tablet(); // replace unique id from 2 to 1 in tablet index 10002 and rebuild tablet_schema TabletSchemaPB schema_pb; _tablet_schema->to_schema_pb(&schema_pb); @@ -248,8 +884,9 @@ TEST_F(IndexCompactionTest, test_tablet_index_id_not_equal) { std::vector<RowsetSharedPtr> rowsets(data_files.size()); auto custom_check_build_rowsets = [](const int32_t& size) { EXPECT_EQ(size, 3); }; - IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, - data_files, custom_check_build_rowsets); + IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>( + _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, data_files, _inc_id, + custom_check_build_rowsets); auto custom_check_index = [](const BaseCompaction& compaction, const RowsetWriterContext& ctx) { EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4); @@ -279,6 +916,7 @@ TEST_F(IndexCompactionTest, test_tablet_index_id_not_equal) { } TEST_F(IndexCompactionTest, test_tablet_schema_tablet_index_is_null) { + _build_tablet(); // set index suffix in tablet index 10001 and rebuild tablet_schema // simulate the case that index is null, tablet_schema->inverted_index(1) will return nullptr TabletSchemaPB schema_pb; @@ -299,8 +937,9 @@ TEST_F(IndexCompactionTest, test_tablet_schema_tablet_index_is_null) { std::vector<RowsetSharedPtr> rowsets(data_files.size()); auto custom_check_build_rowsets = [](const int32_t& size) { EXPECT_EQ(size, 3); }; - IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, - data_files, custom_check_build_rowsets); + IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>( + _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, data_files, _inc_id, + 
custom_check_build_rowsets); auto custom_check_index = [](const BaseCompaction& compaction, const RowsetWriterContext& ctx) { EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4); @@ -330,6 +969,7 @@ TEST_F(IndexCompactionTest, test_tablet_schema_tablet_index_is_null) { } TEST_F(IndexCompactionTest, test_rowset_schema_tablet_index_is_null) { + _build_tablet(); EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); std::string data_file1 = @@ -342,8 +982,9 @@ TEST_F(IndexCompactionTest, test_rowset_schema_tablet_index_is_null) { std::vector<RowsetSharedPtr> rowsets(data_files.size()); auto custom_check_build_rowsets = [](const int32_t& size) { EXPECT_EQ(size, 4); }; - IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, - data_files, custom_check_build_rowsets); + IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>( + _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, data_files, _inc_id, + custom_check_build_rowsets); auto custom_check_index = [](const BaseCompaction& compaction, const RowsetWriterContext& ctx) { EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4); @@ -375,7 +1016,7 @@ TEST_F(IndexCompactionTest, test_rowset_schema_tablet_index_is_null) { _tablet_schema->get_inverted_index_storage_format()); // check index file - // index 10001 cannot be found in idx file + // index 10001 should be found in idx file, it can be produced by normal compaction auto dir_idx_compaction = inverted_index_file_reader_index->_open(10001, ""); EXPECT_TRUE(dir_idx_compaction.has_value()) << dir_idx_compaction.error(); // check index 10001 term stats @@ -386,6 +1027,7 @@ TEST_F(IndexCompactionTest, test_rowset_schema_tablet_index_is_null) { } TEST_F(IndexCompactionTest, test_tablet_index_properties_not_equal) { + _build_tablet(); // add mock property in 
tablet index 10001 and rebuild tablet_schema // simulate the case that index properties not equal among input rowsets TabletSchemaSPtr mock_schema = std::make_shared<TabletSchema>(); @@ -407,8 +1049,9 @@ TEST_F(IndexCompactionTest, test_tablet_index_properties_not_equal) { std::vector<RowsetSharedPtr> rowsets(data_files.size()); auto custom_check_build_rowsets = [](const int32_t& size) { EXPECT_EQ(size, 4); }; - IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, - data_files, custom_check_build_rowsets); + IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>( + _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, data_files, _inc_id, + custom_check_build_rowsets); auto custom_check_index = [](const BaseCompaction& compaction, const RowsetWriterContext& ctx) { EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4); @@ -443,6 +1086,7 @@ TEST_F(IndexCompactionTest, test_tablet_index_properties_not_equal) { } TEST_F(IndexCompactionTest, test_is_skip_index_compaction_not_empty) { + _build_tablet(); EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); std::string data_file1 = @@ -455,8 +1099,9 @@ TEST_F(IndexCompactionTest, test_is_skip_index_compaction_not_empty) { std::vector<RowsetSharedPtr> rowsets(data_files.size()); auto custom_check_build_rowsets = [](const int32_t& size) { EXPECT_EQ(size, 4); }; - IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, - data_files, custom_check_build_rowsets); + IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>( + _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, data_files, _inc_id, + custom_check_build_rowsets); auto custom_check_index = [](const BaseCompaction& compaction, const RowsetWriterContext& ctx) { 
EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4); @@ -491,6 +1136,7 @@ TEST_F(IndexCompactionTest, test_is_skip_index_compaction_not_empty) { } TEST_F(IndexCompactionTest, test_rowset_fs_nullptr) { + _build_tablet(); EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); std::string data_file1 = @@ -503,8 +1149,9 @@ TEST_F(IndexCompactionTest, test_rowset_fs_nullptr) { std::vector<RowsetSharedPtr> rowsets(data_files.size()); auto custom_check_build_rowsets = [](const int32_t& size) { EXPECT_EQ(size, 4); }; - IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, - data_files, custom_check_build_rowsets); + IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>( + _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, data_files, _inc_id, + custom_check_build_rowsets); auto custom_check_index = [](const BaseCompaction& compaction, const RowsetWriterContext& ctx) { EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4); @@ -529,6 +1176,7 @@ TEST_F(IndexCompactionTest, test_rowset_fs_nullptr) { } TEST_F(IndexCompactionTest, test_input_row_num_zero) { + _build_tablet(); EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); std::string data_file1 = @@ -541,8 +1189,9 @@ TEST_F(IndexCompactionTest, test_input_row_num_zero) { std::vector<RowsetSharedPtr> rowsets(data_files.size()); auto custom_check_build_rowsets = [](const int32_t& size) { EXPECT_EQ(size, 4); }; - IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, - data_files, custom_check_build_rowsets); + IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>( + _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, 
data_files, _inc_id, + custom_check_build_rowsets); auto custom_check_index = [](const BaseCompaction& compaction, const RowsetWriterContext& ctx) { EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4); @@ -582,6 +1231,7 @@ TEST_F(IndexCompactionTest, test_input_row_num_zero) { } TEST_F(IndexCompactionTest, test_cols_to_do_index_compaction_empty) { + _build_tablet(); // add mock property in tablet index 10001, 10002 and rebuild tablet_schema // simulate the case that index properties not equal among input rowsets // the two cols will skip index compaction and make ctx.columns_to_do_index_compaction empty @@ -606,8 +1256,9 @@ TEST_F(IndexCompactionTest, test_cols_to_do_index_compaction_empty) { std::vector<RowsetSharedPtr> rowsets(data_files.size()); auto custom_check_build_rowsets = [](const int32_t& size) { EXPECT_EQ(size, 4); }; - IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, - data_files, custom_check_build_rowsets); + IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>( + _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, data_files, _inc_id, + custom_check_build_rowsets); auto custom_check_index = [](const BaseCompaction& compaction, const RowsetWriterContext& ctx) { EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), 4); @@ -644,6 +1295,7 @@ TEST_F(IndexCompactionTest, test_cols_to_do_index_compaction_empty) { } TEST_F(IndexCompactionTest, test_index_compaction_with_delete) { + _build_tablet(); EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); std::string data_file1 = @@ -656,12 +1308,13 @@ TEST_F(IndexCompactionTest, test_index_compaction_with_delete) { std::vector<RowsetSharedPtr> rowsets(data_files.size()); auto custom_check_build_rowsets = [](const int32_t& size) { EXPECT_EQ(size, 4); }; - 
IndexCompactionUtils::build_rowsets(_data_dir, _tablet_schema, _tablet, _engine_ref, rowsets,
-                                        data_files, custom_check_build_rowsets);
+    IndexCompactionUtils::build_rowsets<IndexCompactionUtils::DataRow>(
+            _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, data_files, _inc_id,
+            custom_check_build_rowsets);
     // create delete predicate rowset and add to tablet
     auto delete_rowset = IndexCompactionUtils::create_delete_predicate_rowset(
-            _tablet_schema, "v1='great'", inc_id++);
+            _tablet_schema, "v1='great'", _inc_id);
     EXPECT_TRUE(_tablet->add_rowset(delete_rowset).ok());
     EXPECT_TRUE(_tablet->rowset_map().size() == 3);
     rowsets.push_back(delete_rowset);
@@ -731,4 +1384,197 @@ TEST_F(IndexCompactionTest, test_index_compaction_with_delete) {
     IndexCompactionUtils::check_meta_and_file(output_rowset_normal, _tablet_schema, query_map);
 }
+// Wiki tablet with DUP_KEYS and inverted index storage format V2, run through
+// _run_normal_wiki_test with its default arguments (no delete predicate).
+TEST_F(IndexCompactionTest, tes_wikipedia_dup_v2) {
+    _build_wiki_tablet(KeysType::DUP_KEYS, InvertedIndexStorageFormatPB::V2);
+    _run_normal_wiki_test();
+}
+
+// Wiki tablet with UNIQUE_KEYS and inverted index storage format V2, run through
+// _run_normal_wiki_test with its default arguments (no delete predicate).
+TEST_F(IndexCompactionTest, tes_wikipedia_mow_v2) {
+    _build_wiki_tablet(KeysType::UNIQUE_KEYS, InvertedIndexStorageFormatPB::V2);
+    _run_normal_wiki_test();
+}
+
+// DUP_KEYS wiki tablet with the delete predicate namespace='Adel, OR' enabled.
+// NOTE(review): per the test name the predicate presumably matches only part of the
+// data - confirm against the JSON fixtures.
+TEST_F(IndexCompactionTest, tes_wikipedia_dup_v2_with_partial_delete) {
+    _build_wiki_tablet(KeysType::DUP_KEYS, InvertedIndexStorageFormatPB::V2);
+    _run_normal_wiki_test(true, "namespace='Adel, OR'");
+}
+
+// UNIQUE_KEYS wiki tablet with the delete predicate namespace='Adel, OR' enabled.
+// NOTE(review): per the test name the predicate presumably matches only part of the
+// data - confirm against the JSON fixtures.
+TEST_F(IndexCompactionTest, tes_wikipedia_mow_v2_with_partial_delete) {
+    _build_wiki_tablet(KeysType::UNIQUE_KEYS, InvertedIndexStorageFormatPB::V2);
+    _run_normal_wiki_test(true, "namespace='Adel, OR'");
+}
+
+// DUP_KEYS wiki tablet compacted together with the delete predicate "title IS NOT NULL".
+// do_compaction is called once with the flag true and once with false; both passes must
+// return OK and both output rowsets are expected to have 0 segments.
+TEST_F(IndexCompactionTest, tes_wikipedia_dup_v2_with_total_delete) {
+    _build_wiki_tablet(KeysType::DUP_KEYS, InvertedIndexStorageFormatPB::V2);
+    std::string delete_pred = "title IS NOT NULL";
+    // start from an empty tablet directory
+    EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+    EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
+    std::string data_file1 =
+            _current_dir +
+            "/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-1.json";
+    std::string data_file2 =
+            _current_dir +
+            "/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-2.json";
+    // for MOW table to delete
+    // (this path is the same file as data_file2)
+    std::string data_file3 =
+            _current_dir +
+            "/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-2.json";
+    std::vector<std::string> data_files;
+    data_files.push_back(data_file1);
+    data_files.push_back(data_file2);
+    data_files.push_back(data_file3);
+
+    std::vector<RowsetSharedPtr> rowsets(data_files.size());
+    // the size handed to this callback must equal the schema's column count
+    auto custom_check_build_rowsets = [this](const int32_t& size) {
+        EXPECT_EQ(size, _tablet_schema->num_columns());
+    };
+    IndexCompactionUtils::build_rowsets<IndexCompactionUtils::WikiDataRow>(
+            _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, data_files, _inc_id,
+            custom_check_build_rowsets, false, 50);
+
+    // create delete predicate rowset and add to tablet
+    auto delete_rowset = IndexCompactionUtils::create_delete_predicate_rowset(_tablet_schema,
+                                                                              delete_pred, _inc_id);
+    EXPECT_TRUE(_tablet->add_rowset(delete_rowset).ok());
+    EXPECT_TRUE(_tablet->rowset_map().size() == (data_files.size() + 1));
+    rowsets.push_back(delete_rowset);
+    EXPECT_TRUE(rowsets.size() == (data_files.size() + 1));
+
+    // Checks for the pass with index compaction turned on.
+    auto custom_check_index = [this](const BaseCompaction& compaction,
+                                     const RowsetWriterContext& ctx) {
+        EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(),
+                  _tablet_schema->num_columns());
+        EXPECT_TRUE(ctx.columns_to_do_index_compaction.size() == _tablet_schema->num_columns());
+        EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(0));
+        EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(1));
+        EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(2));
+        EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(3));
+        // presumably empty because the predicate removes every row - confirm
+        EXPECT_TRUE(compaction._output_rowset->num_segments() == 0);
+    };
+
+    RowsetSharedPtr output_rowset_index;
+    Status st;
+    {
+        OlapStopWatch watch;
+        st = IndexCompactionUtils::do_compaction(rowsets, _engine_ref, _tablet, true,
+                                                 output_rowset_index, custom_check_index);
+        std::cout << "index compaction time: " << watch.get_elapse_second() << "s" << std::endl;
+    }
+    EXPECT_TRUE(st.ok()) << st.to_string();
+
+    // Checks for the pass with index compaction turned off: no column may be selected.
+    auto custom_check_normal = [this](const BaseCompaction& compaction,
+                                      const RowsetWriterContext& ctx) {
+        EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(),
+                  _tablet_schema->num_columns());
+        EXPECT_TRUE(ctx.columns_to_do_index_compaction.size() == 0);
+        EXPECT_TRUE(compaction._output_rowset->num_segments() == 0);
+    };
+
+    RowsetSharedPtr output_rowset_normal;
+    {
+        OlapStopWatch watch;
+        st = IndexCompactionUtils::do_compaction(rowsets, _engine_ref, _tablet, false,
+                                                 output_rowset_normal, custom_check_normal);
+        std::cout << "normal compaction time: " << watch.get_elapse_second() << "s" << std::endl;
+    }
+    EXPECT_TRUE(st.ok()) << st.to_string();
+}
+
+TEST_F(IndexCompactionTest, tes_wikipedia_mow_v2_with_total_delete) {
+    _build_wiki_tablet(KeysType::UNIQUE_KEYS, InvertedIndexStorageFormatPB::V2);
+    std::string delete_pred = "title IS NOT NULL";
+    EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok());
+    EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok());
+    std::string data_file1 =
+            _current_dir +
+            "/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-1.json";
+    std::string data_file2 =
+            _current_dir +
+            "/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-2.json";
+    // for MOW table to delete
+    std::string data_file3 =
+            _current_dir +
+            "/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-2.json";
+    std::vector<std::string> data_files;
+    data_files.push_back(data_file1);
+    data_files.push_back(data_file2);
+    data_files.push_back(data_file3);
+
+    std::vector<RowsetSharedPtr> rowsets(data_files.size());
+    auto
custom_check_build_rowsets = [this](const int32_t& size) { + EXPECT_EQ(size, _tablet_schema->num_columns() - 1); + }; + IndexCompactionUtils::build_rowsets<IndexCompactionUtils::WikiDataRow>( + _data_dir, _tablet_schema, _tablet, _engine_ref, rowsets, data_files, _inc_id, + custom_check_build_rowsets, false, 50); + + // create delete predicate rowset and add to tablet + auto delete_rowset = IndexCompactionUtils::create_delete_predicate_rowset(_tablet_schema, + delete_pred, _inc_id); + EXPECT_TRUE(_tablet->add_rowset(delete_rowset).ok()); + EXPECT_TRUE(_tablet->rowset_map().size() == (data_files.size() + 1)); + rowsets.push_back(delete_rowset); + EXPECT_TRUE(rowsets.size() == (data_files.size() + 1)); + + auto custom_check_index = [this](const BaseCompaction& compaction, + const RowsetWriterContext& ctx) { + EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), + _tablet_schema->num_columns() - 1); + EXPECT_EQ(ctx.columns_to_do_index_compaction.size(), _tablet_schema->num_columns() - 1); + EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(0)); + EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(1)); + EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(2)); + EXPECT_TRUE(ctx.columns_to_do_index_compaction.contains(3)); + EXPECT_TRUE(compaction._output_rowset->num_segments() == 0); + }; + + RowsetSharedPtr output_rowset_index; + Status st; + { + OlapStopWatch watch; + st = IndexCompactionUtils::do_compaction(rowsets, _engine_ref, _tablet, true, + output_rowset_index, custom_check_index); + std::cout << "index compaction time: " << watch.get_elapse_second() << "s" << std::endl; + } + EXPECT_TRUE(st.ok()) << st.to_string(); + + auto custom_check_normal = [this](const BaseCompaction& compaction, + const RowsetWriterContext& ctx) { + EXPECT_EQ(compaction._cur_tablet_schema->inverted_indexes().size(), + _tablet_schema->num_columns() - 1); + EXPECT_TRUE(ctx.columns_to_do_index_compaction.size() == 0); + 
EXPECT_TRUE(compaction._output_rowset->num_segments() == 0); + }; + + RowsetSharedPtr output_rowset_normal; + { + OlapStopWatch watch; + st = IndexCompactionUtils::do_compaction(rowsets, _engine_ref, _tablet, false, + output_rowset_normal, custom_check_normal); + std::cout << "normal compaction time: " << watch.get_elapse_second() << "s" << std::endl; + } + EXPECT_TRUE(st.ok()) << st.to_string(); +} + +TEST_F(IndexCompactionTest, tes_wikipedia_dup_v2_multiple_dest_segments) { + _build_wiki_tablet(KeysType::DUP_KEYS, InvertedIndexStorageFormatPB::V2); + _run_normal_wiki_test(false, "", 50, 3); +} + +TEST_F(IndexCompactionTest, tes_wikipedia_mow_v2_multiple_dest_segments) { + _build_wiki_tablet(KeysType::UNIQUE_KEYS, InvertedIndexStorageFormatPB::V2); + _run_normal_wiki_test(false, "", 50, 2); +} + +TEST_F(IndexCompactionTest, tes_wikipedia_dup_v2_multiple_src_lucene_segments) { + config::inverted_index_max_buffered_docs = 100; + _build_wiki_tablet(KeysType::DUP_KEYS, InvertedIndexStorageFormatPB::V2); + _run_normal_wiki_test(); +} + +TEST_F(IndexCompactionTest, tes_wikipedia_mow_v2_multiple_src_lucene_segments) { + config::inverted_index_max_buffered_docs = 100; + _build_wiki_tablet(KeysType::UNIQUE_KEYS, InvertedIndexStorageFormatPB::V2); + _run_normal_wiki_test(); +} } // namespace doris diff --git a/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp b/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp index 530dca8054c..02353fc5441 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/compaction/util/index_compaction_utils.cpp @@ -21,6 +21,7 @@ #include <iomanip> #include <iostream> #include <memory> +#include <nlohmann/json.hpp> #include <sstream> #include <vector> @@ -38,7 +39,6 @@ namespace doris { -static int64_t inc_id = 1000; const static std::string expected_output = "Max Docs: 
2000\n" "Num Docs: 2000\n" @@ -76,8 +76,18 @@ class IndexCompactionUtils { std::string url; int num; }; + struct WikiDataRow { + std::string title; + std::string content; + std::string redirect; + std::string space; + }; + + template <typename T> + static std::vector<T> read_data(const std::string& file_name); - static std::vector<DataRow> read_data(const std::string file_name) { + template <> + std::vector<DataRow> read_data<DataRow>(const std::string& file_name) { std::ifstream file(file_name); EXPECT_TRUE(file.is_open()); @@ -103,6 +113,38 @@ class IndexCompactionUtils { return data; } + template <> + std::vector<WikiDataRow> read_data<WikiDataRow>(const std::string& file_name) { + std::ifstream file(file_name); + EXPECT_TRUE(file.is_open()); + + std::vector<WikiDataRow> data; + std::string line; + + while (std::getline(file, line)) { + if (line.empty()) { + continue; + } + // catch parse exception and continue + try { + nlohmann::json j = nlohmann::json::parse(line); + WikiDataRow row; + row.title = j.value("title", "null"); + row.content = j.value("content", "null"); + row.redirect = j.value("redirect", "null"); + row.space = j.value("space", "null"); + + data.emplace_back(std::move(row)); + } catch (const std::exception& e) { + std::cout << "parse json error: " << e.what() << std::endl; + continue; + } + } + + file.close(); + return data; + } + static bool query_bkd(const TabletIndex* index, std::shared_ptr<InvertedIndexFileReader>& inverted_index_file_reader, const std::vector<int>& query_data, @@ -233,85 +275,82 @@ class IndexCompactionUtils { r->close(); _CLLDELETE(r); } - static Status check_idx_file_correctness(lucene::store::Directory* index_reader, - lucene::store::Directory* tmp_index_reader) { - lucene::index::IndexReader* idx_reader = lucene::index::IndexReader::open(index_reader); - lucene::index::IndexReader* tmp_idx_reader = - lucene::index::IndexReader::open(tmp_index_reader); - + static Status 
check_idx_file_correctness_impl(lucene::index::IndexReader* idx_reader, + lucene::index::IndexReader* normal_idx_reader) { // compare numDocs - if (idx_reader->numDocs() != tmp_idx_reader->numDocs()) { + if (idx_reader->numDocs() != normal_idx_reader->numDocs()) { return Status::InternalError( "index compaction correctness check failed, numDocs not equal, idx_numDocs={}, " - "tmp_idx_numDocs={}", - idx_reader->numDocs(), tmp_idx_reader->numDocs()); + "normal_idx_numDocs={}", + idx_reader->numDocs(), normal_idx_reader->numDocs()); } lucene::index::TermEnum* term_enum = idx_reader->terms(); - lucene::index::TermEnum* tmp_term_enum = tmp_idx_reader->terms(); + lucene::index::TermEnum* normal_term_enum = normal_idx_reader->terms(); lucene::index::TermDocs* term_docs = nullptr; - lucene::index::TermDocs* tmp_term_docs = nullptr; + lucene::index::TermDocs* normal_term_docs = nullptr; // iterate TermEnum - while (term_enum->next() && tmp_term_enum->next()) { + while (term_enum->next() && normal_term_enum->next()) { std::string token = lucene_wcstoutf8string(term_enum->term(false)->text(), term_enum->term(false)->textLength()); std::string field = lucene_wcstoutf8string( term_enum->term(false)->field(), lenOfString(term_enum->term(false)->field())); - std::string tmp_token = lucene_wcstoutf8string( - tmp_term_enum->term(false)->text(), tmp_term_enum->term(false)->textLength()); - std::string tmp_field = - lucene_wcstoutf8string(tmp_term_enum->term(false)->field(), - lenOfString(tmp_term_enum->term(false)->field())); + std::string normal_token = + lucene_wcstoutf8string(normal_term_enum->term(false)->text(), + normal_term_enum->term(false)->textLength()); + std::string normal_field = + lucene_wcstoutf8string(normal_term_enum->term(false)->field(), + lenOfString(normal_term_enum->term(false)->field())); // compare token and field - if (field != tmp_field) { + if (field != normal_field) { return Status::InternalError( "index compaction correctness check failed, fields not 
equal, field={}, " - "tmp_field={}", + "normal_field={}", field, field); } - if (token != tmp_token) { + if (token != normal_token) { return Status::InternalError( "index compaction correctness check failed, tokens not equal, token={}, " - "tmp_token={}", - token, tmp_token); + "normal_token={}", + token, normal_token); } // get term's docId and freq term_docs = idx_reader->termDocs(term_enum->term(false)); - tmp_term_docs = tmp_idx_reader->termDocs(tmp_term_enum->term(false)); + normal_term_docs = normal_idx_reader->termDocs(normal_term_enum->term(false)); // compare term's docId and freq - while (term_docs->next() && tmp_term_docs->next()) { - if (term_docs->doc() != tmp_term_docs->doc() || - term_docs->freq() != tmp_term_docs->freq()) { + while (term_docs->next() && normal_term_docs->next()) { + if (term_docs->doc() != normal_term_docs->doc() || + term_docs->freq() != normal_term_docs->freq()) { return Status::InternalError( "index compaction correctness check failed, docId or freq not equal, " - "docId={}, tmp_docId={}, freq={}, tmp_freq={}", - term_docs->doc(), tmp_term_docs->doc(), term_docs->freq(), - tmp_term_docs->freq()); + "docId={}, normal_docId={}, freq={}, normal_freq={}", + term_docs->doc(), normal_term_docs->doc(), term_docs->freq(), + normal_term_docs->freq()); } } // check if there are remaining docs - if (term_docs->next() || tmp_term_docs->next()) { + if (term_docs->next() || normal_term_docs->next()) { return Status::InternalError( "index compaction correctness check failed, number of docs not equal for " - "term={}, tmp_term={}", - token, tmp_token); + "term={}, normal_term={}", + token, normal_token); } if (term_docs) { term_docs->close(); _CLLDELETE(term_docs); } - if (tmp_term_docs) { - tmp_term_docs->close(); - _CLLDELETE(tmp_term_docs); + if (normal_term_docs) { + normal_term_docs->close(); + _CLLDELETE(normal_term_docs); } } // check if there are remaining terms - if (term_enum->next() || tmp_term_enum->next()) { + if (term_enum->next() 
|| normal_term_enum->next()) { return Status::InternalError( "index compaction correctness check failed, number of terms not equal"); } @@ -319,27 +358,61 @@ class IndexCompactionUtils { term_enum->close(); _CLLDELETE(term_enum); } - if (tmp_term_enum) { - tmp_term_enum->close(); - _CLLDELETE(tmp_term_enum); + if (normal_term_enum) { + normal_term_enum->close(); + _CLLDELETE(normal_term_enum); } if (idx_reader) { idx_reader->close(); _CLLDELETE(idx_reader); } - if (tmp_idx_reader) { - tmp_idx_reader->close(); - _CLLDELETE(tmp_idx_reader); + if (normal_idx_reader) { + normal_idx_reader->close(); + _CLLDELETE(normal_idx_reader); } return Status::OK(); } + static Status check_idx_file_correctness(lucene::store::Directory* index_reader, + lucene::store::Directory* normal_index_reader) { + lucene::index::IndexReader* idx_reader = lucene::index::IndexReader::open(index_reader); + lucene::index::IndexReader* normal_idx_reader = + lucene::index::IndexReader::open(normal_index_reader); + + return check_idx_file_correctness_impl(idx_reader, normal_idx_reader); + } + + static Status check_idx_file_correctness( + const std::vector<std::unique_ptr<DorisCompoundReader>>& index_readers, + const std::vector<std::unique_ptr<DorisCompoundReader>>& normal_index_readers) { + ValueArray<lucene::index::IndexReader*> readers(index_readers.size()); + for (int i = 0; i < index_readers.size(); i++) { + lucene::index::IndexReader* idx_reader = + lucene::index::IndexReader::open(index_readers[i].get()); + readers[i] = idx_reader; + } + ValueArray<lucene::index::IndexReader*> normal_readers(normal_index_readers.size()); + for (int i = 0; i < normal_index_readers.size(); i++) { + lucene::index::IndexReader* normal_idx_reader = + lucene::index::IndexReader::open(normal_index_readers[i].get()); + normal_readers[i] = normal_idx_reader; + } + + auto* idx_reader = new lucene::index::MultiReader(&readers, true); + auto* normal_idx_reader = new lucene::index::MultiReader(&normal_readers, true); + + 
return check_idx_file_correctness_impl(idx_reader, normal_idx_reader); + } + static Status do_compaction( const std::vector<RowsetSharedPtr>& rowsets, StorageEngine* engine_ref, const TabletSharedPtr& tablet, bool is_index_compaction, RowsetSharedPtr& rowset_ptr, const std::function<void(const BaseCompaction&, const RowsetWriterContext&)> - custom_check = nullptr) { + custom_check = nullptr, + int64_t max_rows_per_segment = 100000) { config::inverted_index_compaction_enable = is_index_compaction; + // control max rows in one block + config::compaction_batch_size = max_rows_per_segment; // only base compaction can handle delete predicate BaseCompaction compaction(*engine_ref, tablet); compaction._input_rowsets = std::move(rowsets); @@ -349,12 +422,13 @@ class IndexCompactionUtils { create_input_rowsets_readers(compaction, input_rs_readers); RowsetWriterContext ctx; + ctx.max_rows_per_segment = max_rows_per_segment; RETURN_IF_ERROR(compaction.construct_output_rowset_writer(ctx)); compaction._stats.rowid_conversion = compaction._rowid_conversion.get(); RETURN_IF_ERROR(Merger::vertical_merge_rowsets( tablet, compaction.compaction_type(), *(compaction._cur_tablet_schema), - input_rs_readers, compaction._output_rs_writer.get(), 100000, 5, + input_rs_readers, compaction._output_rs_writer.get(), max_rows_per_segment - 1, 5, &compaction._stats)); const auto& dst_writer = @@ -409,36 +483,41 @@ class IndexCompactionUtils { } static RowsetSharedPtr create_delete_predicate_rowset(const TabletSchemaSPtr& schema, - std::string pred, int64_t version) { + std::string pred, int64& inc_id) { DeletePredicatePB del_pred; del_pred.add_sub_predicates(pred); del_pred.set_version(1); RowsetMetaSharedPtr rsm(new RowsetMeta()); - init_rs_meta(rsm, version, version); + init_rs_meta(rsm, inc_id, inc_id); RowsetId id; - id.init(version); + id.init(inc_id); rsm->set_rowset_id(id); rsm->set_delete_predicate(std::move(del_pred)); rsm->set_tablet_schema(schema); + inc_id++; return 
std::make_shared<BetaRowset>(schema, rsm, ""); } static void construct_column(ColumnPB* column_pb, TabletIndexPB* tablet_index, int64_t index_id, const std::string& index_name, int32_t col_unique_id, const std::string& column_type, const std::string& column_name, - bool parser = false) { + const std::map<std::string, std::string>& properties = + std::map<std::string, std::string>(), + bool is_key = false) { column_pb->set_unique_id(col_unique_id); column_pb->set_name(column_name); column_pb->set_type(column_type); - column_pb->set_is_key(false); + column_pb->set_is_key(is_key); column_pb->set_is_nullable(true); tablet_index->set_index_id(index_id); tablet_index->set_index_name(index_name); tablet_index->set_index_type(IndexType::INVERTED); tablet_index->add_col_unique_id(col_unique_id); - if (parser) { - auto* properties = tablet_index->mutable_properties(); - (*properties)[INVERTED_INDEX_PARSER_KEY] = INVERTED_INDEX_PARSER_UNICODE; + if (!properties.empty()) { + auto* pros = tablet_index->mutable_properties(); + for (const auto& [key, value] : properties) { + (*pros)[key] = value; + } } } @@ -521,7 +600,8 @@ class IndexCompactionUtils { static RowsetWriterContext rowset_writer_context(const std::unique_ptr<DataDir>& data_dir, const TabletSchemaSPtr& schema, - const std::string& tablet_path) { + const std::string& tablet_path, int64& inc_id, + int64 max_rows_per_segment = 200) { RowsetWriterContext context; RowsetId rowset_id; rowset_id.init(inc_id); @@ -532,23 +612,28 @@ class IndexCompactionUtils { context.tablet_schema = schema; context.tablet_path = tablet_path; context.version = Version(inc_id, inc_id); - context.max_rows_per_segment = 200; + context.max_rows_per_segment = max_rows_per_segment; inc_id++; return context; } + template <typename T> static void build_rowsets(const std::unique_ptr<DataDir>& data_dir, const TabletSchemaSPtr& schema, const TabletSharedPtr& tablet, StorageEngine* engine_ref, std::vector<RowsetSharedPtr>& rowsets, - const 
std::vector<std::string>& data_files, - const std::function<void(const int32_t&)> custom_check = nullptr) { - std::vector<std::vector<DataRow>> data; - for (auto file : data_files) { - data.emplace_back(read_data(file)); + const std::vector<std::string>& data_files, int64& inc_id, + const std::function<void(const int32_t&)> custom_check = nullptr, + const bool& is_performance = false, + int64 max_rows_per_segment = 200) { + std::vector<std::vector<T>> data; + for (const auto& file : data_files) { + data.emplace_back(read_data<T>(file)); } for (int i = 0; i < data.size(); i++) { const auto& res = RowsetFactory::create_rowset_writer( - *engine_ref, rowset_writer_context(data_dir, schema, tablet->tablet_path()), + *engine_ref, + rowset_writer_context(data_dir, schema, tablet->tablet_path(), inc_id, + max_rows_per_segment), false); EXPECT_TRUE(res.has_value()) << res.error(); const auto& rowset_writer = res.value(); @@ -556,24 +641,58 @@ class IndexCompactionUtils { vectorized::Block block = schema->create_block(); auto columns = block.mutate_columns(); for (const auto& row : data[i]) { - vectorized::Field key = int32_t(row.key); - vectorized::Field v1(row.word); - vectorized::Field v2(row.url); - vectorized::Field v3 = int32_t(row.num); - columns[0]->insert(key); - columns[1]->insert(v1); - columns[2]->insert(v2); - columns[3]->insert(v3); + if constexpr (std::is_same_v<T, DataRow>) { + vectorized::Field key = int32_t(row.key); + vectorized::Field v1(row.word); + vectorized::Field v2(row.url); + vectorized::Field v3 = int32_t(row.num); + columns[0]->insert(key); + columns[1]->insert(v1); + columns[2]->insert(v2); + columns[3]->insert(v3); + } else if constexpr (std::is_same_v<T, WikiDataRow>) { + vectorized::Field title(row.title); + vectorized::Field content(row.content); + vectorized::Field redirect(row.redirect); + vectorized::Field space(row.space); + columns[0]->insert(title); + if (is_performance) { + columns[1]->insert(content); + columns[2]->insert(redirect); 
+ columns[3]->insert(space); + if (schema->keys_type() == UNIQUE_KEYS) { + uint8_t num = 0; + columns[4]->insert_data((const char*)&num, sizeof(num)); + } + } else { + for (int j = 1; j < 35; j++) { + columns[j]->insert(content); + } + columns[35]->insert(redirect); + columns[36]->insert(space); + if (schema->keys_type() == UNIQUE_KEYS) { + uint8_t num = 0; + columns[37]->insert_data((const char*)&num, sizeof(num)); + } + } + } } - EXPECT_TRUE(rowset_writer->add_block(&block).ok()); - EXPECT_TRUE(rowset_writer->flush().ok()); + + Status st = rowset_writer->add_block(&block); + EXPECT_TRUE(st.ok()) << st.to_string(); + st = rowset_writer->flush(); + EXPECT_TRUE(st.ok()) << st.to_string(); const auto& dst_writer = dynamic_cast<BaseBetaRowsetWriter*>(rowset_writer.get()); check_idx_file_writer_closed(dst_writer, true); - EXPECT_TRUE(rowset_writer->build(rowsets[i]).ok()); - EXPECT_TRUE(tablet->add_rowset(rowsets[i]).ok()); - EXPECT_TRUE(rowsets[i]->num_segments() == 5); + st = rowset_writer->build(rowsets[i]); + EXPECT_TRUE(st.ok()) << st.to_string(); + st = tablet->add_rowset(rowsets[i]); + EXPECT_TRUE(st.ok()) << st.to_string(); + EXPECT_TRUE(rowsets[i]->num_segments() == + (rowsets[i]->num_rows() / max_rows_per_segment)) + << rowsets[i]->num_segments(); // check rowset meta and file for (int seg_id = 0; seg_id < rowsets[i]->num_segments(); seg_id++) { @@ -583,7 +702,8 @@ class IndexCompactionUtils { const auto& file_name = fmt::format("{}/{}_{}.idx", rowsets[i]->tablet_path(), rowsets[i]->rowset_id().to_string(), seg_id); int64_t file_size = 0; - EXPECT_TRUE(fs->file_size(file_name, &file_size).ok()); + Status st = fs->file_size(file_name, &file_size); + EXPECT_TRUE(st.ok()) << st.to_string(); EXPECT_EQ(index_info.index_size(), file_size); const auto& seg_path = rowsets[i]->segment_path(seg_id); @@ -593,7 +713,8 @@ class IndexCompactionUtils { auto inverted_index_file_reader = std::make_shared<InvertedIndexFileReader>( fs, std::string(index_file_path_prefix), 
schema->get_inverted_index_storage_format(), index_info); - EXPECT_TRUE(inverted_index_file_reader->init().ok()); + st = inverted_index_file_reader->init(); + EXPECT_TRUE(st.ok()) << st.to_string(); const auto& dirs = inverted_index_file_reader->get_all_directories(); EXPECT_TRUE(dirs.has_value()); if (custom_check) { diff --git a/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-1.json b/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-1.json new file mode 100644 index 00000000000..4cbdc10850f --- /dev/null +++ b/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-1.json @@ -0,0 +1,50 @@ +{"title":"102.2 Smooth FM","content":"{{About|the defunct GMG Radio station which played adult contemporary music|the current station which plays oldies|102.2 Smooth Radio}}\n{{Use British English|date=May 2015}}\n{{Use dmy dates|date=December 2023}}\n{{Infobox radio station\n| logo = SmoothFM london.png\n| logo_size = 100px\n| name = 102.2 Smooth FM (London) (defunct)\n| airdate = 7 June 2005\n| frequency = 102.2 [[megahertz|MHz]]\n| area = [[Greater London]] (FM),<br / [...] +{"title":"1932 Prussian coup d'état","content":"{{Short description|Takeover by Weimar chancellor Franz von Papen}}\n{{Infobox civil conflict\n| title = 1932 Prussian coup d'état\n| subtitle =\n| partof = [[Weimar Republic#Reasons for failure|failure of Weimar Republic]]\n| image = Bundesarchiv Bild 102-13680, Berlin, Verordnung über Ausnahmezustand.jpg\n| caption = The Emergency Decree of President von Hindenburg (Berlin, July 1932)\n| date = 20 July 1932\n| place = [[Free State of Prus [...] 
+{"title":"3467 Bernheim","redirect":"List of minor planets: 3001–4000"} +{"title":"509 Harbourfront","content":"{{short description|Streetcar route in Toronto, Canada}}\n{{Use dmy dates|date=June 2022}}\n{{Infobox rail line\n| box_width = auto\n| name = 509 Harbourfront\n| color = \n| logo = TTC.svg\n| logo_width = 75\n| logo_alt = \n| image = Streetcar 4407 Queens Quay West at Harbourfront [...] +{"title":"A Midsummer Night's Gene","content":"{{Short description|1997 novel by Andrew Harman}}\n{{infobox book | <!-- See [[Wikipedia:WikiProject Novels]] or [[Wikipedia:WikiProject Books]] -->\n| name = A Midsummer Night's Gene\n| title_orig = \n| translator = \n| image = A Midsummer Night's Gene.jpg\n| caption = First edition\n| author = [[Andrew Harman]]\n| illustrator = \n| cover_artist = \n| country = United Kingdom\n| language = English [...] +{"title":"A Sides Win: Singles 1992–2005","content":"{{Infobox album\n| name = A Sides Win: Singles 1992–2005\n| type = greatest\n| artist = [[Sloan (band)|Sloan]]\n| cover = Sloan asideswin.png\n| alt =\n| released = {{Start date|2005|5|3}}\n| recorded = 1992–2005\n| venue =\n| studio =\n| genre = [[Rock and roll|Rock]]\n| length = 54:54\n| label = [[Sony BMG Music Entertainment|Sony / BMG]] {{small|([[Canada]])}}<br />[[Koch Entert [...] +{"title":"Aalesund University College","content":"{{Use dmy dates|date=July 2020}}\n{{coord|62|28|19.87|N|6|14|8.58|E|type:edu_region:NO_dim:1100|display=title}}\n{{Infobox university\n| name = Aalesund University College\n| native_name = Høgskolen i Ålesund\n| latin_name = \n| image = [[File:A-3-rgb.png|200px]]\n| motto = \n| established = 1994\n| type = [[Public University]]\n| rector [...] 
+{"title":"Abbun d'bishmayya","redirect":"Lord's Prayer"} +{"title":"Abraxas Foundation","redirect":"Boyd Rice"} +{"title":"Academy of the Asturian Language","content":"{{Short description|Asturian Institution}}\n{{Use dmy dates|date=October 2013}}\n[[File:Academia de la Llingua Asturiana 1.jpg|thumb|350px|Current headquarters at [[Oviedo]]]]\n[[File:WIKITONGUES- Victor speaking Asturian.webm|thumb|Victor speaking Asturian.]]\nThe '''Academia de la Llingua Asturiana''' or '''Academy of the Asturian Language''' (ALLA) is an Official Institution<ref>Official Decret of Asturian Regional Council 33/1980 [...] +{"title":"Ada, MI","redirect":"Ada Township, Michigan"} +{"title":"Adams Township, Clinton County, OH","redirect":"Adams Township, Clinton County, Ohio"} +{"title":"Adams, Adams County, WI","redirect":"Adams, Adams County, Wisconsin"} +{"title":"Adel, OR","redirect":"Adel, Oregon"} +{"title":"Afognak, AK","redirect":"Afognak, Alaska"} +{"title":"Age fabrication","content":"{{short description|Misrepresenting a person's age}}\n{{refimprove|date=May 2022}}\n{{Use mdy dates|date=April 2018}}\n'''Age fabrication''' occurs when people deliberately misrepresent their true age. This is usually done with intent to garner privileges or [[Social status|status]] that would not otherwise be available to that person (e.g. a minor misrepresenting their age in order to garner the privileges given to adults). It may be done through th [...] 
+{"title":"Agenda, Ashland County, WI","redirect":"Agenda, Wisconsin"} +{"title":"Agua Dulce, CA","redirect":"Agua Dulce, California"} +{"title":"Aguanga, CA","redirect":"Aguanga, California"} +{"title":"Aleksandra Kollontai","redirect":"Alexandra Kollontai"} +{"title":"Aleksandra Mikhailovna Kollontai","redirect":"Alexandra Kollontai"} +{"title":"Aleksei Nikolaevich Tolstoy","redirect":"Aleksey Nikolayevich Tolstoy"} +{"title":"All Mod Cons","content":"{{about|the album by The Jam|the television episode|All Mod Cons (Minder)}}\n{{EngvarB|date=May 2014}}\n{{Use dmy dates|date=March 2021}}\n\n{{Infobox album\n| name = All Mod Cons\n| type = Album\n| artist = [[the Jam]]\n| cover = The_Jam_-_All_Mod_Cons.jpg\n| alt =\n| released = 3 November 1978\n| recorded = 4 July – 17 August 1978\n| venue =\n| studio = [[RAK Studios|RAK]] and [[Eden Studios|Eden]], London\n| g [...] +{"title":"Ana Palacio","content":"{{short description|Spanish politician}}\n{{family name hatnote|de Palacio|del Valle Lersundi|lang=Spanish}}\n{{Infobox officeholder\n| honorific-prefix = [[The Most Excellent]]\n| name = Ana Palacio\n| image = Ana Palacio.jpg\n| caption = Palacio in 2004\n| office = [[Ministry of Foreign Affairs (Spain)|Minister of Foreign Affairs]]\n| term_start = July 20, 2002\n| term_end = April 18, 2004\n| predecessor = [[Jose [...] +{"title":"Anatoli Lunacharsky","redirect":"Anatoly Lunacharsky"} +{"title":"Anatolij Vasil'evich Lunacharskij","redirect":"Anatoly Lunacharsky"} +{"title":"Andrew Harman","content":"{{Use dmy dates|date=February 2018}}\n{{Use British English|date=February 2018}}\n'''Andrew Harman''' (born 1964) is an author from the United Kingdom known for writing pun-filled and farcical [[fantasy fiction]].\n\n== Life ==\nAndrew Harman studied [[biochemistry]] at the [[University of York]], being a member of [[Wentworth College]].\n\nSince 2000, Harman has moved on from writing to create YAY Games, a UK independent publisher of board and card ga [...] 
+{"title":"Angarsk","content":"{{Short description|City in Irkutsk Oblast, Russia}}\n{{Use mdy dates|date=May 2011}}\n{{Expand Russian|topic=geo|date=April 2020}}\n{{Infobox Russian inhabited locality\n|en_name=Angarsk\n|ru_name=Ангарск\n|image_skyline=Angarsk_car_Volga_GAZ-21_(25720495842).jpg\n|image_caption=city center\n|coordinates = {{coord|52|33|N|103|54|E|display=inline,title}}\n|map_label_position=top\n|image_coa=Coat of Arms of Angarsk (Irkutsk oblast).png\n|coa_caption=\n|image_ [...] +{"title":"Animal rescue group","content":"{{short description|Rescue organization is dedicated to pet adoption}}\n{{about|pet rescue|other uses|Rescue|and|Rescue (disambiguation)|and|Animal rescue (disambiguation)}}\n{{Multiple issues|\n{{more footnotes|date=January 2014}}\n{{Original research|date=August 2020}}\n}}\nAn '''animal rescue group''' or '''animal rescue organization''' is a group dedicated to [[pet adoption]]. These groups take unwanted, abandoned, abused, or [[feral|stray]] [...] +{"title":"Annapolis—Kings","content":"<!--uncomment if needed ''For the current|defunct federal|provincial electoral district, see [[Annapolis—Kings (Nova Scotia federal electoral district)]]'' --->\n{{Infobox Canada electoral district\n| name = Annapolis—Kings\n| province = Nova Scotia\n| image = \n| caption = \n| fed-status = defunct\n| fed-district-number = \n| fed-created = 1947\n| fed-abolished = 1952\n| fed- [...] +{"title":"Anne Teresa De Keersmaeker","content":"{{Use dmy dates|date=August 2023}}\n[[File:Anne Teresa De Keersmaeker 2016.jpg|thumb|Anne Teresa De Keersmaeker in 2016]]\n'''Anne Teresa, Baroness De Keersmaeker''' ({{IPA-nl|ˈɑnə teːˈreːzaː dəˈkeːrsmaːkər}}, born 1960 in [[Mechelen]], Belgium, grew up in Wemmel) is a [[contemporary dance]] choreographer. The dance company constructed around her, {{ill|Rosas (dance ensemble)|fr|Compagnie Rosas}}, was in residence at [[La Monnaie]] in [[Br [...] 
+{"title":"Anne Teresa de Keersmaeker","redirect":"Anne Teresa De Keersmaeker"} +{"title":"Antoine Chaudet","redirect":"Antoine-Denis Chaudet"} +{"title":"Apostatic selection","content":"{{short description|Process in evolutionary theory}}\n{{Use dmy dates|date=July 2016}}\n'''Apostatic selection''' is a form of negative [[frequency-dependent selection]]. It describes the survival of individual [[prey]] animals that are different (through [[mutation]]) from their species in a way that makes it more likely for them to be ignored by their [[predator]]s. It operates on [[polymorphism (biology)|polymorphic]] species, species which ha [...] +{"title":"Architecture Without Architects","content":"{{Short description|1964 book by Bernard Rudofsky}}\n{{italic title}}\n[[image:Architecture without Architects cover.JPG|thumb|right|200px|''Architecture Without Architects'' cover]]\n\n'''''Architecture Without Architects: A Short Introduction to Non-Pedigreed Architecture''''' is a book based on the [[New York City|NYC]] [[Museum of Modern Art|MoMA]] exhibition of the same name by [[Bernard Rudofsky]] originally published in 1964. I [...] +{"title":"Argenteuil—Deux-Montagnes","redirect":"Argenteuil—Papineau—Mirabel"} +{"title":"Army of the Pharaohs","content":"{{short description|American hip hop group}}\n\n{{about|the U.S. hip hop group|the military and history topic|military of ancient Egypt}}\n{{Use mdy dates|date=March 2021}}\n\n{{Infobox musical artist\n| name = Army of the Pharaohs\n| image = AOTP 2014.jpg\n| image_upright = 1.1\n| caption = The group in 2014\n| origin = [[Philadelphia]], Pennsylvania, U.S.\n| genre = [[Hip hop music|Hip hop]], [ [...] 
+{"title":"Ashland Global","content":"{{short description|American chemical company}}\n{{Infobox company\n| name = Ashland Global Holdings, Inc.\n| logo = Ashland 4color process.png\n| logo_size = 250px\n| type = [[Public company|Public]]\n| traded_as = {{NYSE|ASH}}<br />[[List of S&P 400 companies|S&P 400 Component]]\n| foundation = 1924\n| location = [[Wilmington, Delaware]], [[United States| U.S.]]\n| key_people = [[ [...] +{"title":"Ashland Oil","redirect":"Ashland Global"} +{"title":"At The Circus","redirect":"At the Circus"} +{"title":"Automobles","redirect":"Car"} +{"title":"Banshu","redirect":"Banshū"} +{"title":"Banshu Province","redirect":"Harima Province"} +{"title":"Battle River—Camrose","content":"<!--uncomment if needed ''For the current|defunct federal|provincial electoral district, see [[Battle River–Camrose (federal electoral district)]]'' --->\n'''Battle River—Camrose''' was a federal [[electoral district (Canada)|electoral district]] in [[Alberta]], Canada, that was represented in the [[House of Commons of Canada]] from 1953 to 1968.\n\nThis riding was created in 1952 from parts of [[Battle River (electoral district)|Battle River]], [...] +{"title":"Battle of Pacocha","content":"{{EngvarB|date=July 2014}}\n{{Use dmy dates|date=July 2014}}\n\n{{Infobox military conflict|\n| conflict = Battle of Pacocha\n| image = Combate de Pacocha.jpg\n| image_size = 300px\n| caption = ''The Naval Combat in the Pacific between HMS SHAH and HMS AMETHYST and the Peruvian Rebel Ironclad Turret Ram HUASCAR on May 29th 1877'', [[William Frederick Mitchell]]\n| date = 29 May 1877\n| place = Off [[Ilo, Peru|Ylo]], [[Pac [...] 
+{"title":"Beant Singh (assassin)","content":"{{Short description|Sikh bodyguard and assassin of Indian Prime Minister Indira Gandhi}}\n{{Other uses|Beant Singh (disambiguation)}}\n{{more citations needed|date=September 2015}}\n{{Use dmy dates|date=June 2020}}\n{{Infobox person\n| name = Beant Singh\n| image = Photograph of Beant Singh, one of two assassins of Indira Gandhi.jpg\n| caption = \n| native_name = \n| birth_name = Beant Singh [...] +{"title":"Bedford railway station","content":"{{Short description|Railway station in Bedfordshire, England}}\n{{about|the station in Bedford, Bedfordshire, England|the proposed station in Bedford, Virginia|Bedford station (Virginia)}}\n{{Use dmy dates|date=March 2015}}\n{{Use British English|date=March 2015}}\n{{Infobox station\n| name = Bedford\n| symbol_location = gb\n| symbol = rail\n| image = Bedford railway station MMB 06 222022.jpg\n| address = \n| borough = [[Bedford]], [[Borough [...] +{"title":"Ben Cheney","content":"{{more citations needed|date=December 2010}}\n{{Use mdy dates|date=August 2023}} \n{{Infobox person\n| name = Ben Cheney\n| birth_name = Ben Bradbury Cheney\n| birth_date = {{birth date|1905|03|24}}\n| birth_place = [[Lima, Montana]], U.S.\n| death_date = {{death date and age|1971|05|18|1905|03|24}}\n| death_place = [[Tacoma, Washington]], U.S.\n}}\n'''Ben Bradbury Cheney''' (March 24, 1905 – May 18, 1971) was an American businessman and sports enthusiast [...] 
+{"title":"Beneventan Script","redirect":"Beneventan script"} +{"title":"Bengt Gabrielsson, Greve Oxenstierna","redirect":"Bengt Gabrielsson Oxenstierna"} diff --git a/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-2.json b/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-2.json new file mode 100644 index 00000000000..859f55797be --- /dev/null +++ b/be/test/olap/rowset/segment_v2/inverted_index/data/sorted_wikipedia-50-2.json @@ -0,0 +1,50 @@ +{"title":"100 Rifles","content":"{{short description|1969 American Western film}}\n{{Use American English|date=October 2021}}\n{{Use mdy dates|date=July 2020}}\n{{Infobox film\n| name = 100 Rifles\n| image = 100 Rifles (movie poster).jpg\n| caption = Theatrical release poster\n| director = [[Tom Gries]]\n| producer = [[Marvin Schwartz]]\n| screenplay = [[Clair Huffaker]]<br>Tom Gries\n| based_on = {{based on|''The Californio''<br>1967 novel [...] +{"title":"4-By The Beatles (EP)","redirect":"4 by the Beatles"} +{"title":"Abitibi (electoral district)","redirect":"Abitibi—Baie-James—Nunavik—Eeyou"} +{"title":"Adblock Plus","content":"{{Distinguish|AdBlock}}\n{{short description|Content-filtering and ad blocking browser extension}}\n{{Use mdy dates|date=March 2022}}\n{{Infobox software\n| name = Adblock Plus\n| logo = Adblock Plus 2014 Logo.svg\n| screenshot = Adblock-plus-1.2-en-preferences-add-exception-xfwm4.png\n| caption = Preferences dialog box of Adblock Plus showing a group of filters\n| developer = Eyeo GmbH<r [...] 
+{"title":"Aframomum melegueta","redirect":"Grains of paradise"} +{"title":"Alexander Stamboliski","redirect":"Aleksandar Stamboliyski"} +{"title":"Always Be My Baby","content":"{{Short description|1996 single by Mariah Carey}}\n{{About||the Sara Evans country song|You'll Always Be My Baby||Always Be My Maybe (disambiguation) {{!}}Always Be My Maybe}}\n{{Use American English|date=September 2020}}\n{{Use mdy dates|date=December 2017}}\n{{Infobox song\n| name = Always Be My Baby\n| cover = Always Be My Baby (Mariah Carey single - cover art).jpg\n| alt = A black-and-white photo of Carey smiling from [...] +{"title":"Annapolis Valley (electoral district)","redirect":"Kings—Hants"} +{"title":"Anyox","content":"{{Short description|Ghost town in British Columbia, Canada}}\n[[File:Anyox British Columbia 1911.jpg|200px|thumb|right|Anyox, British Columbia]]\n'''Anyox''' was a small company-owned mining town in [[British Columbia]], Canada.<ref>{{BCGNIS|36025|Anyox}}</ref> Today it is a [[ghost town]], abandoned and largely destroyed. It is located on the shores of Granby Bay in coastal [[Observatory Inlet]], about {{convert|60|km|mi|0|abbr=off}} southeast of (but without [...] 
+{"title":"Argenteuil—Papineau","redirect":"Argenteuil—Papineau—Mirabel"} +{"title":"Art of Ancient Greece","redirect":"Ancient Greek art"} +{"title":"BSG 75","redirect":"Battlestar Galactica (fictional spacecraft)"} +{"title":"BSG-75","redirect":"Battlestar Galactica (fictional spacecraft)"} +{"title":"BSG75","redirect":"Battlestar Galactica (fictional spacecraft)"} +{"title":"Bangor & Aroostook","redirect":"Bangor and Aroostook Railroad"} +{"title":"Barrie—Simcoe—Bradford","redirect":"Barrie (federal electoral district)"} +{"title":"Bas-Richelieu—Nicolet—Becancour","redirect":"Bécancour—Nicolet—Saurel"} +{"title":"Battleford—Kindersley","content":"'''Battleford—Kindersley''' was a federal [[electoral district (Canada)|electoral district]] (riding) n [[Saskatchewan]], Canada, that was represented in the [[House of Commons of Canada]] from 1968 to 1979.\n\nThis [[Riding (division)|riding]] was created in 1966 from parts of [[Kindersley (electoral district)|Kindersley]], [[The Battlefords (federal electoral district)|The Battlefords]] and [[Rosetown—Biggar (federal electoral district)|Roset [...] +{"title":"Beaches (federal electoral district)","content":"{{for|the defunct provincial electoral district|Beaches (provincial electoral district)}}\n{{Infobox Canada electoral district\n| province = Ontario\n| image = Beaches riding.png\n| caption = Beaches in relation to other electoral districts in Toronto\n| fed-status = defunct\n| fed-district-number = \n| fed-created = 1976\n| fed-abolished = 1987\n| fed-election-first = [...] 
+{"title":"Beauport—Montmorency—Côte-de-Beaupré—Île-d'Orléans","redirect":"Montmorency (federal electoral district)"} +{"title":"Beauport—Montmorency—Orléans","redirect":"Montmorency (federal electoral district)"} +{"title":"Beaver River (federal electoral district)","content":"<!--uncomment if needed ''For the current|defunct federal|provincial electoral district, see [[Beaver River (federal electoral district)]]'' --->\n'''Beaver River''' was a federal [[electoral district (Canada)|electoral district]] represented in the [[House of Commons of Canada]] from 1988 to 1997.\n\nIt was located in the [[provinces and territories of Canada|province]] of [[Alberta]]. This riding was created in 1987, and w [...] +{"title":"Belarus at the 2000 Summer Olympics","content":"{{Use dmy dates|date=September 2021}}\n{{infobox country at games\n| NOC = BLR\n| NOCname = [[Belarus Olympic Committee]]\n| games = Summer Olympics\n| year = 2000\n| flagcaption = \n| oldcode = \n| website = {{url|www.noc.by }} {{in lang|ru|en}}\n| location = [[Sydney]]\n| competitors = 139 (72 men and 67 women)\n| sports = 20\n| flagbearer = [[Sergey Lishtvan]]\n| rank = 23\n| gold = 3\n| silver = 3\n| bronze = 11\n| offici [...] +{"title":"Bella Ciao","redirect":"Bella ciao"} +{"title":"Bernard Fokke","content":"{{Short description|17th-century Frisian-born captain for the Dutch East India Company.}}\n'''Bernard''' or '''Barend Fokke''', sometimes known as '''Barend Fockesz''', was a 17th-century, [[Frisians|Frisian]]-born [[Captain (nautical)|captain]] for the [[Dutch East India Company]]. He was renowned for the uncanny speed of his trips from the [[Dutch Republic]] to [[Java (island)|Java]]. For example, in 1678, he traveled the distance in 3 months and 4 ( [...] 
+{"title":"Berosus","content":"'''Berosus''' may refer to:\n*In Greek mythology:\n**Berosus, father of Tanais by [[Lysippe (Amazon)]]\n**Berosus, father of the [[Sibyl]] Sabbe by Erymanthe\n*[[Berossus]] (3rd century BC), Hellenistic-era Babylonian writer and astronomer\n*[[Berosus (beetle)]], a genus of beetles of the family [[Hydrophilidae]]\n*[[Berosus (crater)]], a lunar crater\n\n{{disambig}}"} +{"title":"Berthier—Montcalm","content":"<!--uncomment if needed ''For the current|defunct federal|provincial electoral district, see [[Berthier—Montcalm (federal electoral district)]]'' --->\n{{Infobox Canada electoral district\n| name = Berthier—Montcalm\n| province = Quebec\n| image = \n| caption = \n| fed-status = defunct\n| fed-district-number = \n| fed-created = 1987\n| fed-abolished = 2003\n| fed-election-first = 1988\n| [...] +{"title":"Billie Dove","content":"{{Short description|American actress (1903–1997)}}\n{{Use American English|date=July 2020}}\n{{Use mdy dates|date=August 2014}}\n{{Infobox person\n| name = Billie Dove\n| birth_name = Bertha Eugenie Bohny\n| image = Billy Dove portrait photograph with roses (retouched).jpg\n| caption = Dove in 1920\n| birth_date = {{Birth date|1903|5|14|mf=yes}}\n| birth_place = New York City, U.S.\n| dea [...] +{"title":"Blainville—Deux-Montagne","redirect":"Blainville—Deux-Montagnes"} +{"title":"Blainville—Deux-Montagnes","content":"{{Use Canadian English|date=January 2023}}\n{{Infobox Canada electoral district\n| name = Blainville—Deux-Montagnes\n| province = Quebec\n| image = \n| caption = \n| fed-status = defunct\n| fed-district-number = \n| fed-created = 1976\n| fed-abolished = 1996\n| fed-election-first = 1979\n| fed-election-last = 1993\n| fed-rep = \n| fed-rep-party = \n| demo-pop-r [...] 
+{"title":"Blow fly","redirect":"Blowfly"} +{"title":"Bombing of Vietnam's Dikes","redirect":"Proposed bombing of Vietnam's dikes"} +{"title":"Bombing of the dikes","redirect":"Proposed bombing of Vietnam's dikes"} +{"title":"Bombing of the dykes","redirect":"Proposed bombing of Vietnam's dikes"} +{"title":"Bonaventure—Îles-de-la-Madeleine","redirect":"Bonaventure (federal electoral district)"} +{"title":"Bonavista—Trinity—Conception","content":"{{Infobox Canada electoral district\n| province = Newfoundland and Labrador\n| image = \n| caption = \n| fed-status = defunct\n| fed-created = 1966\n| fed-abolished = 2003\n| fed-election-first = 1968\n| fed-election-last = 2002 by-election\n}}\n<!--uncomment if needed ''For the current|defunct federal|provincial electoral district, see [[Bonavista—Trinity—Conception (elector [...] +{"title":"Boston & Maine","redirect":"Boston and Maine Railroad"} +{"title":"Boston-area streetcar lines/old","redirect":"Boston-area streetcar lines"} +{"title":"Brampton (federal electoral district)","content":"''For the defunct provincial electoral district, see [[Brampton (provincial electoral district)]].''\n{{Infobox Canada electoral district\n| name = Brampton\n| province = Ontario\n| image = \n| caption = \n| fed-status = defunct\n| fed-district-number = \n| fed-created = 1987\n| fed-abolished = 1996\n| fed-election-first = 1988\n| fed-election-last = 1 [...] +{"title":"Brampton Centre (federal electoral district)","content":"{{short description|Federal electoral district in Ontario, Canada}}\n{{use mdy dates|date=October 2021}}\n{{for|the future provincial electoral district|Brampton Centre (provincial electoral district)}}\n{{Infobox Canada electoral district\n| province = Ontario\n| image = Brampton Centre 2015.svg\n| caption = Brampton Centre in relation to other [[Greater Toronto Area]] districts\n| fe [...] 
+{"title":"Brampton West—Mississauga (federal electoral district)","content":"{{for|the defunct provincial electoral district|Brampton West—Mississauga (provincial electoral district)}}\n{{Infobox Canada electoral district\n| province = Ontario\n| image = [[File:Brampton West-Mississauga (riding map).png|250px]]\n| caption = Map of the riding\n| fed-status = defunct\n| fed-district-number = \n| fed-created = 1996\n| fed-abolished [...] +{"title":"Brampton—Georgetown","content":"<!--- uncomment if needed ''For the current|defunct federal|provincial electoral district, see [[Brampton–Georgetown (electoral district)]]'' --->\n{{Infobox Canada electoral district\n| name = Brampton—Georgetown\n| province = Ontario\n| image = \n| caption = \n| fed-status = defunct\n| fed-district-number = \n| fed-created = 1976\n| fed-abolished = 1987\n| fed-election-f [...] +{"title":"Bras d'Or (electoral district)","redirect":"Cape Breton—Canso"} +{"title":"Brossard—La-Prairie","redirect":"Brossard—La Prairie"} +{"title":"Bruce—Grey","redirect":"Bruce—Grey—Owen Sound (federal electoral district)"} +{"title":"Burin—St. George's","content":"{{Infobox Canada electoral district\n| province = Newfoundland and Labrador\n| image = \n| caption = \n| fed-status = defunct\n| fed-created = 1976\n| fed-abolished = 2003\n| fed-election-first = 1979\n| fed-election-last = 2004\n}}\n<!--uncomment if needed ''For the current|defunct federal|provincial electoral district, see [[Burin—St. George's (electoral district)]]'' --->\n'''Burin— [...] +{"title":"Burnaby (federal electoral district)","content":"{{for|the historical provincial electoral district of the same name|Burnaby (provincial electoral district)}}\n{{Infobox Canada electoral district\n| name = Burnaby\n| province = British Columbia\n| image = \n| caption = \n| fed-status = defunct\n| fed-district-number = \n| fed-created = 1976\n| fed-abolished = 1987\n| fed-election-first = 1979\n| fed-election-last = [...] 
+{"title":"Burnaby—Kingsway","content":"{{Infobox Canada electoral district\n| province = British Columbia\n| image = \n| caption = \n| fed-status = defunct\n| fed-district-number = \n| fed-created = 1987\n| fed-abolished = 1996\n| fed-election-first = 1988\n| fed-election-last = 1993\n| fed-rep = \n| fed-rep-link = \n| fed-rep-party = \n| fed-rep-party-link = \n| demo-pop-ref = \n| demo-area-r [...] +{"title":"Bussells","redirect":"Bussell family"} +{"title":"Cape Breton Highlands—Canso","content":"<!--uncomment if needed ''For the current|defunct federal|provincial electoral district, see [[Cape Breton Highlands—Canso (electoral district)]]'' --->\n{{Infobox Canada electoral district\n| name = Cape Breton Highlands—Canso\n| province = Nova Scotia\n| image = \n| caption = \n| fed-status = defunct\n| fed-district-number = \n| fed-created = 1966\n| fed-abolished [...] --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org