This is an automated email from the ASF dual-hosted git repository. jianliangqi pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 78bd61e6d5e [feat](inverted index) Adding Storage Format V3 for Inverted Index (#44414) 78bd61e6d5e is described below commit 78bd61e6d5e7fb1e869cbbd7c1f987a2dff01918 Author: zzzxl <yangs...@selectdb.com> AuthorDate: Wed Dec 4 17:12:30 2024 +0800 [feat](inverted index) Adding Storage Format V3 for Inverted Index (#44414) Problem Summary: 1. "Mainly added the functionality for compressing inverted index position information and dictionary information." 2. "Position information compression must be enabled by setting inverted_index_storage_format to v3 when creating the table." e.g. ``` CREATE TABLE tbl ( ... ) ENGINE=OLAP DUPLICATE KEY(`x`) COMMENT "OLAP" DISTRIBUTED BY RANDOM BUCKETS 1 PROPERTIES ( "inverted_index_storage_format" = "V3" ); ``` 3. "The dictionary compression feature requires setting inverted_index_storage_format to v3 and configuring dict_compression to true in the properties." e.g. ``` INDEX x_idx (`x`) USING INVERTED PROPERTIES("dict_compression" = "true") COMMENT '' ``` --- be/src/clucene | 2 +- be/src/olap/inverted_index_parser.cpp | 9 ++ be/src/olap/inverted_index_parser.h | 5 + be/src/olap/rowset/beta_rowset.cpp | 18 +++- .../char_filter/char_replace_char_filter.h | 2 +- .../segment_v2/inverted_index_file_reader.cpp | 8 +- .../rowset/segment_v2/inverted_index_file_reader.h | 2 +- .../segment_v2/inverted_index_file_writer.cpp | 22 ++-- .../rowset/segment_v2/inverted_index_file_writer.h | 10 +- .../rowset/segment_v2/inverted_index_writer.cpp | 22 ++++ be/src/olap/tablet_meta.cpp | 3 + .../inverted_index/query/phrase_query_test.cpp | 1 - .../segment_v2/inverted_index_file_writer_test.cpp | 8 +- .../apache/doris/analysis/InvertedIndexUtil.java | 12 ++- .../cloud/datasource/CloudInternalCatalog.java | 6 +- .../apache/doris/common/util/PropertyAnalyzer.java | 6 ++ .../doris/analysis/InvertedIndexUtilTest.java | 47 +++++++++ .../apache/doris/common/PropertyAnalyzerTest.java | 50 
+++++++++ gensrc/proto/olap_file.proto | 1 + gensrc/thrift/Types.thrift | 3 +- .../inverted_index_p0/test_inverted_index_v3.out | 25 +++++ .../test_inverted_index_v3_fault_injection.groovy | 60 +++++++++++ .../test_inverted_index_v3.groovy | 117 +++++++++++++++++++++ 23 files changed, 406 insertions(+), 33 deletions(-) diff --git a/be/src/clucene b/be/src/clucene index 48fa9cc4ec3..a506dbb6c52 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit 48fa9cc4ec32b40bf3b02338d0a1b2cdbc6408cf +Subproject commit a506dbb6c523aa65044eb1c527a066d236172543 diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp index f7e511970d9..f1de5a5e0c1 100644 --- a/be/src/olap/inverted_index_parser.cpp +++ b/be/src/olap/inverted_index_parser.cpp @@ -136,4 +136,13 @@ std::string get_parser_stopwords_from_properties( } } +std::string get_parser_dict_compression_from_properties( + const std::map<std::string, std::string>& properties) { + if (properties.find(INVERTED_INDEX_PARSER_DICT_COMPRESSION_KEY) != properties.end()) { + return properties.at(INVERTED_INDEX_PARSER_DICT_COMPRESSION_KEY); + } else { + return ""; + } +} + } // namespace doris diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h index 0b8426d74c7..f1f85995a20 100644 --- a/be/src/olap/inverted_index_parser.h +++ b/be/src/olap/inverted_index_parser.h @@ -83,6 +83,8 @@ const std::string INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case"; const std::string INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords"; +const std::string INVERTED_INDEX_PARSER_DICT_COMPRESSION_KEY = "dict_compression"; + std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type); InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str); @@ -119,4 +121,7 @@ std::string get_parser_lowercase_from_properties( std::string get_parser_stopwords_from_properties( const std::map<std::string, std::string>& 
properties); +std::string get_parser_dict_compression_from_properties( + const std::map<std::string, std::string>& properties); + } // namespace doris diff --git a/be/src/olap/rowset/beta_rowset.cpp b/be/src/olap/rowset/beta_rowset.cpp index bbb2ca72b4a..cd52deed0c8 100644 --- a/be/src/olap/rowset/beta_rowset.cpp +++ b/be/src/olap/rowset/beta_rowset.cpp @@ -703,10 +703,24 @@ Status BetaRowset::show_nested_index_file(rapidjson::Value* rowset_value, rapidjson::Document::AllocatorType& allocator) { const auto& fs = _rowset_meta->fs(); auto storage_format = _schema->get_inverted_index_storage_format(); - auto format_str = storage_format == InvertedIndexStorageFormatPB::V1 ? "V1" : "V2"; + std::string format_str; + switch (storage_format) { + case InvertedIndexStorageFormatPB::V1: + format_str = "V1"; + break; + case InvertedIndexStorageFormatPB::V2: + format_str = "V2"; + break; + case InvertedIndexStorageFormatPB::V3: + format_str = "V3"; + break; + default: + return Status::InternalError("inverted index storage format error"); + break; + } auto rs_id = rowset_id().to_string(); rowset_value->AddMember("rowset_id", rapidjson::Value(rs_id.c_str(), allocator), allocator); - rowset_value->AddMember("index_storage_format", rapidjson::Value(format_str, allocator), + rowset_value->AddMember("index_storage_format", rapidjson::Value(format_str.c_str(), allocator), allocator); rapidjson::Value segments(rapidjson::kArrayType); for (int seg_id = 0; seg_id < num_segments(); ++seg_id) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h index d9e5080d2d5..1e5e6f5d5ce 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h @@ -17,7 +17,7 @@ #pragma once -#include <CLucene.h> +#include <CLucene.h> // IWYU pragma: keep 
#include <CLucene/analysis/CharFilter.h> #include <bitset> diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp index 113833d560f..8d480829a0c 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp @@ -30,8 +30,8 @@ namespace doris::segment_v2 { Status InvertedIndexFileReader::init(int32_t read_buffer_size) { if (!_inited) { _read_buffer_size = read_buffer_size; - if (_storage_format == InvertedIndexStorageFormatPB::V2) { - auto st = _init_from_v2(read_buffer_size); + if (_storage_format >= InvertedIndexStorageFormatPB::V2) { + auto st = _init_from(read_buffer_size); if (!st.ok()) { return st; } @@ -41,7 +41,7 @@ Status InvertedIndexFileReader::init(int32_t read_buffer_size) { return Status::OK(); } -Status InvertedIndexFileReader::_init_from_v2(int32_t read_buffer_size) { +Status InvertedIndexFileReader::_init_from(int32_t read_buffer_size) { auto index_file_full_path = InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix); std::unique_lock<std::shared_mutex> lock(_mutex); // Lock for writing @@ -79,7 +79,7 @@ Status InvertedIndexFileReader::_init_from_v2(int32_t read_buffer_size) { // 3. 
read file int32_t version = _stream->readInt(); // Read version number - if (version == InvertedIndexStorageFormatPB::V2) { + if (version >= InvertedIndexStorageFormatPB::V2) { DCHECK(version == _storage_format); int32_t numIndices = _stream->readInt(); // Read number of indices ReaderFileEntry* entry = nullptr; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h index 3b7161c7643..443d40cfaf0 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h @@ -70,7 +70,7 @@ public: int64_t get_inverted_file_size() const { return _stream == nullptr ? 0 : _stream->length(); } private: - Status _init_from_v2(int32_t read_buffer_size); + Status _init_from(int32_t read_buffer_size); Result<std::unique_ptr<DorisCompoundReader>> _open(int64_t index_id, const std::string& index_suffix) const; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp index bb373be5ee9..4d6892aa785 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp @@ -150,7 +150,7 @@ Status InvertedIndexFileWriter::close() { } } else { try { - RETURN_IF_ERROR(write_v2()); + RETURN_IF_ERROR(write()); for (const auto& entry : _indices_dirs) { const auto& dir = entry.second; // delete index path, which contains separated inverted index files @@ -293,7 +293,7 @@ Status InvertedIndexFileWriter::write_v1() { return Status::OK(); } -Status InvertedIndexFileWriter::write_v2() { +Status InvertedIndexFileWriter::write() { std::unique_ptr<lucene::store::Directory, DirectoryDeleter> out_dir = nullptr; std::unique_ptr<lucene::store::IndexOutput> compound_file_output = nullptr; ErrorContext error_context; @@ -301,10 +301,10 @@ Status InvertedIndexFileWriter::write_v2() { // Calculate header length and 
initialize offset int64_t current_offset = headerLength(); // Prepare file metadata - auto file_metadata = prepare_file_metadata_v2(current_offset); + auto file_metadata = prepare_file_metadata(current_offset); // Create output stream - auto result = create_output_stream_v2(); + auto result = create_output_stream(); out_dir = std::move(result.first); compound_file_output = std::move(result.second); @@ -315,7 +315,7 @@ Status InvertedIndexFileWriter::write_v2() { write_index_headers_and_metadata(compound_file_output.get(), file_metadata); // Copy file data - copy_files_data_v2(compound_file_output.get(), file_metadata); + copy_files_data(compound_file_output.get(), file_metadata); _total_file_size = compound_file_output->getFilePointer(); _file_info.set_index_size(_total_file_size); @@ -470,7 +470,7 @@ void InvertedIndexFileWriter::write_header_and_data_v1(lucene::store::IndexOutpu std::pair<std::unique_ptr<lucene::store::Directory, DirectoryDeleter>, std::unique_ptr<lucene::store::IndexOutput>> -InvertedIndexFileWriter::create_output_stream_v2() { +InvertedIndexFileWriter::create_output_stream() { io::Path index_path {InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix)}; auto* out_dir = DorisFSDirectoryFactory::getDirectory(_fs, index_path.parent_path().c_str()); @@ -486,15 +486,15 @@ InvertedIndexFileWriter::create_output_stream_v2() { void InvertedIndexFileWriter::write_version_and_indices_count(lucene::store::IndexOutput* output) { // Write the version number - output->writeInt(InvertedIndexStorageFormatPB::V2); + output->writeInt(_storage_format); // Write the number of indices const auto num_indices = static_cast<uint32_t>(_indices_dirs.size()); output->writeInt(num_indices); } -std::vector<InvertedIndexFileWriter::FileMetadata> -InvertedIndexFileWriter::prepare_file_metadata_v2(int64_t& current_offset) { +std::vector<InvertedIndexFileWriter::FileMetadata> InvertedIndexFileWriter::prepare_file_metadata( + int64_t& current_offset) { 
std::vector<FileMetadata> file_metadata; for (const auto& entry : _indices_dirs) { @@ -546,8 +546,8 @@ void InvertedIndexFileWriter::write_index_headers_and_metadata( } } -void InvertedIndexFileWriter::copy_files_data_v2(lucene::store::IndexOutput* output, - const std::vector<FileMetadata>& file_metadata) { +void InvertedIndexFileWriter::copy_files_data(lucene::store::IndexOutput* output, + const std::vector<FileMetadata>& file_metadata) { const int64_t buffer_length = 16384; uint8_t buffer[buffer_length]; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h index ba42ffdceb1..ab7cdbff152 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h @@ -71,7 +71,7 @@ public: Status delete_index(const TabletIndex* index_meta); Status initialize(InvertedIndexDirectoryMap& indices_dirs); virtual ~InvertedIndexFileWriter() = default; - Status write_v2(); + Status write(); Status write_v1(); Status close(); const InvertedIndexFileInfo* get_index_file_info() const { @@ -122,7 +122,7 @@ private: // Helper functions specific to write_v2 virtual std::pair<std::unique_ptr<lucene::store::Directory, DirectoryDeleter>, std::unique_ptr<lucene::store::IndexOutput>> - create_output_stream_v2(); + create_output_stream(); void write_version_and_indices_count(lucene::store::IndexOutput* output); struct FileMetadata { int64_t index_id; @@ -141,11 +141,11 @@ private: length(len), directory(dir) {} }; - std::vector<FileMetadata> prepare_file_metadata_v2(int64_t& current_offset); + std::vector<FileMetadata> prepare_file_metadata(int64_t& current_offset); virtual void write_index_headers_and_metadata(lucene::store::IndexOutput* output, const std::vector<FileMetadata>& file_metadata); - void copy_files_data_v2(lucene::store::IndexOutput* output, - const std::vector<FileMetadata>& file_metadata); + void 
copy_files_data(lucene::store::IndexOutput* output, + const std::vector<FileMetadata>& file_metadata); Status _insert_directory_into_map(int64_t index_id, const std::string& index_suffix, std::shared_ptr<DorisFSDirectory> dir); // Member variables... diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 86a8f89e4c9..02edf2f1976 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -212,6 +212,28 @@ public: (*field)->setOmitTermFreqAndPositions( !(get_parser_phrase_support_string_from_properties(_index_meta->properties()) == INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES)); + DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::create_field_v3", { + if (_index_file_writer->get_storage_format() != InvertedIndexStorageFormatPB::V3) { + return Status::Error<doris::ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>( + "debug point: InvertedIndexColumnWriterImpl::create_field_v3 error"); + } + }) + if (_index_file_writer->get_storage_format() >= InvertedIndexStorageFormatPB::V3) { + (*field)->setIndexVersion(IndexVersion::kV3); + // Only effective in v3 + std::string dict_compression = + get_parser_dict_compression_from_properties(_index_meta->properties()); + DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::create_field_dic_compression", { + if (dict_compression != INVERTED_INDEX_PARSER_TRUE) { + return Status::Error<doris::ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>( + "debug point: " + "InvertedIndexColumnWriterImpl::create_field_dic_compression error"); + } + }) + if (dict_compression == INVERTED_INDEX_PARSER_TRUE) { + (*field)->updateFlag(FlagBits::DICT_COMPRESS); + } + } return Status::OK(); } diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index fc9fc034b0b..d1746836e23 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -203,6 +203,9 @@ TabletMeta::TabletMeta(int64_t table_id, 
int64_t partition_id, int64_t tablet_id case TInvertedIndexFileStorageFormat::V2: schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2); break; + case TInvertedIndexFileStorageFormat::V3: + schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V3); + break; default: schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2); break; diff --git a/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query_test.cpp index f3fb9763c9b..b8e11bece7b 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query_test.cpp @@ -62,7 +62,6 @@ TEST_F(PhraseQueryTest, test_parser_info) { EXPECT_EQ(query_info.slop, res3); EXPECT_EQ(query_info.ordered, res4); EXPECT_EQ(query_info.additional_terms.size(), res5); - std::cout << "--- 1 ---: " << query_info.to_string() << std::endl; }; // "english/history off.gif ~20+" sequential_opt = true diff --git a/be/test/olap/rowset/segment_v2/inverted_index_file_writer_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index_file_writer_test.cpp index b454080434a..41703d49d5e 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index_file_writer_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index_file_writer_test.cpp @@ -506,7 +506,7 @@ TEST_F(InvertedIndexFileWriterTest, WriteV2ExceptionHandlingTest) { EXPECT_CALL(writer_mock, write_index_headers_and_metadata(::testing::_, ::testing::_)) .WillOnce(::testing::Throw(CLuceneError(CL_ERR_IO, "Simulated exception", false))); - Status status = writer_mock.write_v2(); + Status status = writer_mock.write(); ASSERT_FALSE(status.ok()); ASSERT_EQ(status.code(), ErrorCode::INVERTED_INDEX_CLUCENE_ERROR); } @@ -523,7 +523,7 @@ public: MOCK_METHOD((std::pair<std::unique_ptr<lucene::store::Directory, DirectoryDeleter>, std::unique_ptr<lucene::store::IndexOutput>>), - 
create_output_stream_v2, (), (override)); + create_output_stream, (), (override)); }; class InvertedIndexFileWriterMockCreateOutputStreamV1 : public InvertedIndexFileWriter { @@ -622,7 +622,7 @@ TEST_F(InvertedIndexFileWriterTest, WriteV2OutputTest) { auto compound_file_output = std::unique_ptr<DorisFSDirectory::FSIndexOutputV2>(mock_output_v2); compound_file_output->init(file_writer.get()); - EXPECT_CALL(writer_mock, create_output_stream_v2()) + EXPECT_CALL(writer_mock, create_output_stream()) .WillOnce(::testing::Invoke( [&]() -> std::pair<std::unique_ptr<lucene::store::Directory, DirectoryDeleter>, std::unique_ptr<lucene::store::IndexOutput>> { @@ -680,7 +680,7 @@ TEST_F(InvertedIndexFileWriterTest, WriteV2OutputCloseErrorTest) { auto compound_file_output = std::unique_ptr<DorisFSDirectory::FSIndexOutputV2>(mock_output_v2); compound_file_output->init(file_writer.get()); - EXPECT_CALL(writer_mock, create_output_stream_v2()) + EXPECT_CALL(writer_mock, create_output_stream()) .WillOnce(::testing::Invoke( [&]() -> std::pair<std::unique_ptr<lucene::store::Directory, DirectoryDeleter>, std::unique_ptr<lucene::store::IndexOutput>> { diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java index abba2762d56..dd6a1a7612a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java @@ -54,6 +54,8 @@ public class InvertedIndexUtil { public static String INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords"; + public static String INVERTED_INDEX_DICT_COMPRESSION_KEY = "dict_compression"; + public static String getInvertedIndexParser(Map<String, String> properties) { String parser = properties == null ? 
null : properties.get(INVERTED_INDEX_PARSER_KEY); // default is "none" if not set @@ -157,7 +159,8 @@ public class InvertedIndexUtil { INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY, INVERTED_INDEX_PARSER_LOWERCASE_KEY, - INVERTED_INDEX_PARSER_STOPWORDS_KEY + INVERTED_INDEX_PARSER_STOPWORDS_KEY, + INVERTED_INDEX_DICT_COMPRESSION_KEY )); for (String key : properties.keySet()) { @@ -174,6 +177,7 @@ public class InvertedIndexUtil { String ignoreAbove = properties.get(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY); String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE_KEY); String stopWords = properties.get(INVERTED_INDEX_PARSER_STOPWORDS_KEY); + String dictCompression = properties.get(INVERTED_INDEX_DICT_COMPRESSION_KEY); if (parser != null && !parser.matches("none|english|unicode|chinese|standard")) { throw new AnalysisException("Invalid inverted index 'parser' value: " + parser @@ -221,5 +225,11 @@ public class InvertedIndexUtil { throw new AnalysisException("Invalid inverted index 'stopWords' value: " + stopWords + ", stopWords must be none"); } + + if (dictCompression != null && !dictCompression.matches("true|false")) { + throw new AnalysisException( + "Invalid inverted index 'dict_compression' value: " + + dictCompression + ", dict_compression must be true or false"); + } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java index b8a364f9449..e14b4efb0d5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java @@ -354,8 +354,12 @@ public class CloudInternalCatalog extends InternalCatalog { if (invertedIndexFileStorageFormat != null) { if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V1) { 
schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V1); - } else { + } else if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V2) { schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V2); + } else if (invertedIndexFileStorageFormat == TInvertedIndexFileStorageFormat.V3) { + schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V3); + } else { + throw new DdlException("invalid inverted index storage format"); } } schemaBuilder.setRowStorePageSize(pageSize); diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java index 5721db0c27e..915b5d48f01 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java @@ -1126,6 +1126,8 @@ public class PropertyAnalyzer { } else { if (Config.inverted_index_storage_format.equalsIgnoreCase("V1")) { return TInvertedIndexFileStorageFormat.V1; + } else if (Config.inverted_index_storage_format.equalsIgnoreCase("V3")) { + return TInvertedIndexFileStorageFormat.V3; } else { return TInvertedIndexFileStorageFormat.V2; } @@ -1135,9 +1137,13 @@ public class PropertyAnalyzer { return TInvertedIndexFileStorageFormat.V1; } else if (invertedIndexFileStorageFormat.equalsIgnoreCase("v2")) { return TInvertedIndexFileStorageFormat.V2; + } else if (invertedIndexFileStorageFormat.equalsIgnoreCase("v3")) { + return TInvertedIndexFileStorageFormat.V3; } else if (invertedIndexFileStorageFormat.equalsIgnoreCase("default")) { if (Config.inverted_index_storage_format.equalsIgnoreCase("V1")) { return TInvertedIndexFileStorageFormat.V1; + } else if (Config.inverted_index_storage_format.equalsIgnoreCase("V3")) { + return TInvertedIndexFileStorageFormat.V3; } else { return TInvertedIndexFileStorageFormat.V2; } diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/analysis/InvertedIndexUtilTest.java b/fe/fe-core/src/test/java/org/apache/doris/analysis/InvertedIndexUtilTest.java new file mode 100644 index 00000000000..a9be242cf3f --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/analysis/InvertedIndexUtilTest.java @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.analysis; + +import org.apache.doris.common.AnalysisException; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.HashMap; +import java.util.Map; + +public class InvertedIndexUtilTest { + @Test + public void testCheckInvertedIndexProperties() throws AnalysisException { + Map<String, String> properties = new HashMap<>(); + properties.put(InvertedIndexUtil.INVERTED_INDEX_DICT_COMPRESSION_KEY, "true"); + + InvertedIndexUtil.checkInvertedIndexProperties(properties); + + properties.put(InvertedIndexUtil.INVERTED_INDEX_DICT_COMPRESSION_KEY, "invalid_value"); + try { + InvertedIndexUtil.checkInvertedIndexProperties(properties); + Assertions.fail("Expected AnalysisException was not thrown"); + } catch (AnalysisException e) { + Assertions.assertEquals( + "errCode = 2, detailMessage = Invalid inverted index 'dict_compression' value: invalid_value, " + + "dict_compression must be true or false", + e.getMessage()); + } + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java b/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java index 041ca89bfc5..6d708aa0826 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java @@ -28,6 +28,7 @@ import org.apache.doris.catalog.Type; import org.apache.doris.common.util.PropertyAnalyzer; import org.apache.doris.common.util.TimeUtils; import org.apache.doris.resource.Tag; +import org.apache.doris.thrift.TInvertedIndexFileStorageFormat; import org.apache.doris.thrift.TStorageFormat; import org.apache.doris.thrift.TStorageMedium; @@ -37,6 +38,7 @@ import com.google.common.collect.Sets; import org.junit.Assert; import org.junit.Rule; import org.junit.Test; +import org.junit.jupiter.api.Assertions; import org.junit.rules.ExpectedException; import java.time.Instant; @@ -236,4 +238,52 @@ public class 
PropertyAnalyzerTest { Assert.assertTrue(e.getMessage().contains("Storage page size must be between 4KB and 10MB")); } } + + @Test + public void testAnalyzeInvertedIndexFileStorageFormat() throws AnalysisException { + TInvertedIndexFileStorageFormat result = PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(null); + Assertions.assertEquals(TInvertedIndexFileStorageFormat.V2, result); + + Config.inverted_index_storage_format = "V1"; + result = PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(new HashMap<>()); + Assertions.assertEquals(TInvertedIndexFileStorageFormat.V1, result); + + Map<String, String> propertiesWithV1 = new HashMap<>(); + propertiesWithV1.put(PropertyAnalyzer.PROPERTIES_INVERTED_INDEX_STORAGE_FORMAT, "v1"); + result = PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(propertiesWithV1); + Assertions.assertEquals(TInvertedIndexFileStorageFormat.V1, result); + + Map<String, String> propertiesWithV2 = new HashMap<>(); + propertiesWithV2.put(PropertyAnalyzer.PROPERTIES_INVERTED_INDEX_STORAGE_FORMAT, "v2"); + result = PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(propertiesWithV2); + Assertions.assertEquals(TInvertedIndexFileStorageFormat.V2, result); + + Map<String, String> propertiesWithV3 = new HashMap<>(); + propertiesWithV3.put(PropertyAnalyzer.PROPERTIES_INVERTED_INDEX_STORAGE_FORMAT, "v3"); + result = PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(propertiesWithV3); + Assertions.assertEquals(TInvertedIndexFileStorageFormat.V3, result); + + Config.inverted_index_storage_format = "V1"; + Map<String, String> propertiesWithDefaultV1 = new HashMap<>(); + propertiesWithDefaultV1.put(PropertyAnalyzer.PROPERTIES_INVERTED_INDEX_STORAGE_FORMAT, "default"); + result = PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(propertiesWithDefaultV1); + Assertions.assertEquals(TInvertedIndexFileStorageFormat.V1, result); + + Config.inverted_index_storage_format = "V2"; + Map<String, String> propertiesWithDefaultV2 = new 
HashMap<>(); + propertiesWithDefaultV2.put(PropertyAnalyzer.PROPERTIES_INVERTED_INDEX_STORAGE_FORMAT, "default"); + result = PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(propertiesWithDefaultV2); + Assertions.assertEquals(TInvertedIndexFileStorageFormat.V2, result); + + Map<String, String> propertiesWithUnknown = new HashMap<>(); + propertiesWithUnknown.put(PropertyAnalyzer.PROPERTIES_INVERTED_INDEX_STORAGE_FORMAT, "unknown_format"); + try { + PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(propertiesWithUnknown); + Assertions.fail("Expected AnalysisException was not thrown"); + } catch (AnalysisException e) { + Assertions.assertEquals( + "errCode = 2, detailMessage = unknown inverted index storage format: unknown_format", + e.getMessage()); + } + } } diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto index 259f9f2861a..41f8727ed1b 100644 --- a/gensrc/proto/olap_file.proto +++ b/gensrc/proto/olap_file.proto @@ -343,6 +343,7 @@ enum IndexType { enum InvertedIndexStorageFormatPB { V1 = 0; V2 = 1; + V3 = 2; } message TabletIndexPB { diff --git a/gensrc/thrift/Types.thrift b/gensrc/thrift/Types.thrift index 1912f950587..623da9ce067 100644 --- a/gensrc/thrift/Types.thrift +++ b/gensrc/thrift/Types.thrift @@ -124,7 +124,8 @@ enum TStorageBackendType { enum TInvertedIndexFileStorageFormat { DEFAULT = 0, // Default format, unspecified storage method. V1 = 1, // Index per idx: Each index is stored separately based on its identifier. - V2 = 2 // Segment id per idx: Indexes are organized based on segment identifiers, grouping indexes by their associated segment. + V2 = 2 // Segment id per idx: Indexes are organized based on segment identifiers, grouping indexes by their associated segment. 
+ V3 = 3 // Position and dictionary compression } struct TScalarType { diff --git a/regression-test/data/inverted_index_p0/test_inverted_index_v3.out b/regression-test/data/inverted_index_p0/test_inverted_index_v3.out new file mode 100644 index 00000000000..9dc20f3e0e0 --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_inverted_index_v3.out @@ -0,0 +1,25 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +238 + +-- !sql -- +104 + +-- !sql -- +104 + +-- !sql -- +105 + +-- !sql -- +238 + +-- !sql -- +104 + +-- !sql -- +104 + +-- !sql -- +105 + diff --git a/regression-test/suites/fault_injection_p0/test_inverted_index_v3_fault_injection.groovy b/regression-test/suites/fault_injection_p0/test_inverted_index_v3_fault_injection.groovy new file mode 100644 index 00000000000..98c0e110964 --- /dev/null +++ b/regression-test/suites/fault_injection_p0/test_inverted_index_v3_fault_injection.groovy @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+
+suite("test_inverted_index_v3_fault_injection", "nonConcurrent"){
+    def indexTbName1 = "test_inverted_index_v3_fault_injection"
+
+    sql "DROP TABLE IF EXISTS ${indexTbName1}"
+
+    sql """
+      CREATE TABLE ${indexTbName1} (
+        `@timestamp` int(11) NULL COMMENT "",
+        `clientip` varchar(20) NULL COMMENT "",
+        `request` text NULL COMMENT "",
+        `status` int(11) NULL COMMENT "",
+        `size` int(11) NULL COMMENT "",
+        INDEX clientip_idx (`clientip`) COMMENT '',
+        INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`@timestamp`)
+      COMMENT "OLAP"
+      DISTRIBUTED BY RANDOM BUCKETS 1
+      PROPERTIES (
+        "replication_allocation" = "tag.location.default: 1",
+        "inverted_index_storage_format" = "V3"
+      );
+    """
+
+    try {
+        GetDebugPoint().enableDebugPointForAllBEs("InvertedIndexColumnWriterImpl::create_field_v3")
+
+        sql """ INSERT INTO ${indexTbName1} VALUES (1, '40.135.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 200, 24736); """
+    } finally {
+        GetDebugPoint().disableDebugPointForAllBEs("InvertedIndexColumnWriterImpl::create_field_v3")
+    }
+
+    try {
+        GetDebugPoint().enableDebugPointForAllBEs("InvertedIndexColumnWriterImpl::create_field_v3")
+        GetDebugPoint().enableDebugPointForAllBEs("InvertedIndexColumnWriterImpl::create_field_dic_compression")
+
+        sql """ INSERT INTO ${indexTbName1} VALUES (2, '40.135.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 200, 24736); """
+    } finally {
+        GetDebugPoint().disableDebugPointForAllBEs("InvertedIndexColumnWriterImpl::create_field_v3")
+        GetDebugPoint().disableDebugPointForAllBEs("InvertedIndexColumnWriterImpl::create_field_dic_compression")
+    }
+}
\ No newline at end of file
diff --git a/regression-test/suites/inverted_index_p0/test_inverted_index_v3.groovy b/regression-test/suites/inverted_index_p0/test_inverted_index_v3.groovy
new file mode 100644
index 00000000000..ea7dd0b595f
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_inverted_index_v3.groovy
@@ -0,0 +1,117 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_inverted_index_v3", "p0"){
+    def indexTbName1 = "test_inverted_index_v3_1"
+    def indexTbName2 = "test_inverted_index_v3_2"
+
+    sql "DROP TABLE IF EXISTS ${indexTbName1}"
+    sql "DROP TABLE IF EXISTS ${indexTbName2}"
+
+    sql """
+      CREATE TABLE ${indexTbName1} (
+        `@timestamp` int(11) NULL COMMENT "",
+        `clientip` varchar(20) NULL COMMENT "",
+        `request` text NULL COMMENT "",
+        `status` int(11) NULL COMMENT "",
+        `size` int(11) NULL COMMENT "",
+        INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`@timestamp`)
+      COMMENT "OLAP"
+      DISTRIBUTED BY RANDOM BUCKETS 1
+      PROPERTIES (
+        "replication_allocation" = "tag.location.default: 1",
+        "inverted_index_storage_format" = "V2"
+      );
+    """
+
+    sql """
+      CREATE TABLE ${indexTbName2} (
+        `@timestamp` int(11) NULL COMMENT "",
+        `clientip` varchar(20) NULL COMMENT "",
+        `request` text NULL COMMENT "",
+        `status` int(11) NULL COMMENT "",
+        `size` int(11) NULL COMMENT "",
+        INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`@timestamp`)
+      COMMENT "OLAP"
+      DISTRIBUTED BY RANDOM BUCKETS 1
+      PROPERTIES (
+        "replication_allocation" = "tag.location.default: 1",
+        "inverted_index_storage_format" = "V3"
+      );
+    """
+
+    def load_httplogs_data = {table_name, label, read_flag, format_flag, file_name, ignore_failure=false,
+                        expected_succ_rows = -1, load_to_single_tablet = 'true' ->
+
+        // load the json data
+        streamLoad {
+            table "${table_name}"
+
+            // set http request header params
+            set 'label', label + "_" + UUID.randomUUID().toString()
+            set 'read_json_by_line', read_flag
+            set 'format', format_flag
+            file file_name // import json file
+            time 10000 // limit inflight 10s
+            if (expected_succ_rows >= 0) {
+                set 'max_filter_ratio', '1'
+            }
+
+            // if declared a check callback, the default check condition will ignore.
+            // So you must check all condition
+            check { result, exception, startTime, endTime ->
+                if (ignore_failure && expected_succ_rows < 0) { return }
+                if (exception != null) {
+                    throw exception
+                }
+                log.info("Stream load result: ${result}".toString())
+                def json = parseJson(result)
+                assertEquals("success", json.Status.toLowerCase())
+                if (expected_succ_rows >= 0) {
+                    assertEquals(json.NumberLoadedRows, expected_succ_rows)
+                } else {
+                    assertEquals(json.NumberTotalRows, json.NumberLoadedRows + json.NumberUnselectedRows)
+                    assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0)
+                }
+            }
+        }
+    }
+
+    try {
+        load_httplogs_data.call(indexTbName1, indexTbName1, 'true', 'json', 'documents-1000.json')
+        load_httplogs_data.call(indexTbName2, indexTbName2, 'true', 'json', 'documents-1000.json')
+
+        sql "sync"
+
+        qt_sql """ select count() from ${indexTbName1} where request match_any 'hm bg'; """
+        qt_sql """ select count() from ${indexTbName1} where request match_all 'hm bg'; """
+        qt_sql """ select count() from ${indexTbName1} where request match_phrase 'hm bg'; """
+        qt_sql """ select count() from ${indexTbName1} where request match_phrase_prefix 'hm bg'; """
+
+        qt_sql """ select count() from ${indexTbName2} where request match_any 'hm bg'; """
+        qt_sql """ select count() from ${indexTbName2} where request match_all 'hm bg'; """
+        qt_sql """ select count() from ${indexTbName2} where request match_phrase 'hm bg'; """
+        qt_sql """ select count() from ${indexTbName2} where request match_phrase_prefix 'hm bg'; """
+
+    } finally {
+    }
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org