This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 78bd61e6d5e [feat](inverted index) Adding Storage Format V3 for 
Inverted Index (#44414)
78bd61e6d5e is described below

commit 78bd61e6d5e7fb1e869cbbd7c1f987a2dff01918
Author: zzzxl <yangs...@selectdb.com>
AuthorDate: Wed Dec 4 17:12:30 2024 +0800

    [feat](inverted index) Adding Storage Format V3 for Inverted Index (#44414)
    
    Problem Summary:
    
    1. "Mainly added the functionality for compressing inverted index
    position information and dictionary information."
    
    2. "Position information compression must be enabled by setting
    inverted_index_storage_format to v3 when creating the table."
    
    e.g.
    ```
        CREATE TABLE tbl (
              ...
        ) ENGINE=OLAP
        DUPLICATE KEY(`x`)
        COMMENT "OLAP"
        DISTRIBUTED BY RANDOM BUCKETS 1
        PROPERTIES (
        "inverted_index_storage_format" = "V3"
        );
    ```
    3. "The dictionary compression feature requires setting
    inverted_index_storage_format to v3 and configuring dict_compression to
    true in the properties."
    
    e.g.
    ```
    INDEX x_idx (`x`) USING INVERTED PROPERTIES("dict_compression" = "true") 
COMMENT ''
    ```
---
 be/src/clucene                                     |   2 +-
 be/src/olap/inverted_index_parser.cpp              |   9 ++
 be/src/olap/inverted_index_parser.h                |   5 +
 be/src/olap/rowset/beta_rowset.cpp                 |  18 +++-
 .../char_filter/char_replace_char_filter.h         |   2 +-
 .../segment_v2/inverted_index_file_reader.cpp      |   8 +-
 .../rowset/segment_v2/inverted_index_file_reader.h |   2 +-
 .../segment_v2/inverted_index_file_writer.cpp      |  22 ++--
 .../rowset/segment_v2/inverted_index_file_writer.h |  10 +-
 .../rowset/segment_v2/inverted_index_writer.cpp    |  22 ++++
 be/src/olap/tablet_meta.cpp                        |   3 +
 .../inverted_index/query/phrase_query_test.cpp     |   1 -
 .../segment_v2/inverted_index_file_writer_test.cpp |   8 +-
 .../apache/doris/analysis/InvertedIndexUtil.java   |  12 ++-
 .../cloud/datasource/CloudInternalCatalog.java     |   6 +-
 .../apache/doris/common/util/PropertyAnalyzer.java |   6 ++
 .../doris/analysis/InvertedIndexUtilTest.java      |  47 +++++++++
 .../apache/doris/common/PropertyAnalyzerTest.java  |  50 +++++++++
 gensrc/proto/olap_file.proto                       |   1 +
 gensrc/thrift/Types.thrift                         |   3 +-
 .../inverted_index_p0/test_inverted_index_v3.out   |  25 +++++
 .../test_inverted_index_v3_fault_injection.groovy  |  60 +++++++++++
 .../test_inverted_index_v3.groovy                  | 117 +++++++++++++++++++++
 23 files changed, 406 insertions(+), 33 deletions(-)

diff --git a/be/src/clucene b/be/src/clucene
index 48fa9cc4ec3..a506dbb6c52 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 48fa9cc4ec32b40bf3b02338d0a1b2cdbc6408cf
+Subproject commit a506dbb6c523aa65044eb1c527a066d236172543
diff --git a/be/src/olap/inverted_index_parser.cpp 
b/be/src/olap/inverted_index_parser.cpp
index f7e511970d9..f1de5a5e0c1 100644
--- a/be/src/olap/inverted_index_parser.cpp
+++ b/be/src/olap/inverted_index_parser.cpp
@@ -136,4 +136,13 @@ std::string get_parser_stopwords_from_properties(
     }
 }
 
+std::string get_parser_dict_compression_from_properties(
+        const std::map<std::string, std::string>& properties) {
+    if (properties.find(INVERTED_INDEX_PARSER_DICT_COMPRESSION_KEY) != 
properties.end()) {
+        return properties.at(INVERTED_INDEX_PARSER_DICT_COMPRESSION_KEY);
+    } else {
+        return "";
+    }
+}
+
 } // namespace doris
diff --git a/be/src/olap/inverted_index_parser.h 
b/be/src/olap/inverted_index_parser.h
index 0b8426d74c7..f1f85995a20 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -83,6 +83,8 @@ const std::string INVERTED_INDEX_PARSER_LOWERCASE_KEY = 
"lower_case";
 
 const std::string INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords";
 
+const std::string INVERTED_INDEX_PARSER_DICT_COMPRESSION_KEY = 
"dict_compression";
+
 std::string inverted_index_parser_type_to_string(InvertedIndexParserType 
parser_type);
 
 InvertedIndexParserType get_inverted_index_parser_type_from_string(const 
std::string& parser_str);
@@ -119,4 +121,7 @@ std::string get_parser_lowercase_from_properties(
 std::string get_parser_stopwords_from_properties(
         const std::map<std::string, std::string>& properties);
 
+std::string get_parser_dict_compression_from_properties(
+        const std::map<std::string, std::string>& properties);
+
 } // namespace doris
diff --git a/be/src/olap/rowset/beta_rowset.cpp 
b/be/src/olap/rowset/beta_rowset.cpp
index bbb2ca72b4a..cd52deed0c8 100644
--- a/be/src/olap/rowset/beta_rowset.cpp
+++ b/be/src/olap/rowset/beta_rowset.cpp
@@ -703,10 +703,24 @@ Status 
BetaRowset::show_nested_index_file(rapidjson::Value* rowset_value,
                                           rapidjson::Document::AllocatorType& 
allocator) {
     const auto& fs = _rowset_meta->fs();
     auto storage_format = _schema->get_inverted_index_storage_format();
-    auto format_str = storage_format == InvertedIndexStorageFormatPB::V1 ? 
"V1" : "V2";
+    std::string format_str;
+    switch (storage_format) {
+    case InvertedIndexStorageFormatPB::V1:
+        format_str = "V1";
+        break;
+    case InvertedIndexStorageFormatPB::V2:
+        format_str = "V2";
+        break;
+    case InvertedIndexStorageFormatPB::V3:
+        format_str = "V3";
+        break;
+    default:
+        return Status::InternalError("inverted index storage format error");
+        break;
+    }
     auto rs_id = rowset_id().to_string();
     rowset_value->AddMember("rowset_id", rapidjson::Value(rs_id.c_str(), 
allocator), allocator);
-    rowset_value->AddMember("index_storage_format", 
rapidjson::Value(format_str, allocator),
+    rowset_value->AddMember("index_storage_format", 
rapidjson::Value(format_str.c_str(), allocator),
                             allocator);
     rapidjson::Value segments(rapidjson::kArrayType);
     for (int seg_id = 0; seg_id < num_segments(); ++seg_id) {
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h
 
b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h
index d9e5080d2d5..1e5e6f5d5ce 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h
@@ -17,7 +17,7 @@
 
 #pragma once
 
-#include <CLucene.h>
+#include <CLucene.h> // IWYU pragma: keep
 #include <CLucene/analysis/CharFilter.h>
 
 #include <bitset>
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp
index 113833d560f..8d480829a0c 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp
@@ -30,8 +30,8 @@ namespace doris::segment_v2 {
 Status InvertedIndexFileReader::init(int32_t read_buffer_size) {
     if (!_inited) {
         _read_buffer_size = read_buffer_size;
-        if (_storage_format == InvertedIndexStorageFormatPB::V2) {
-            auto st = _init_from_v2(read_buffer_size);
+        if (_storage_format >= InvertedIndexStorageFormatPB::V2) {
+            auto st = _init_from(read_buffer_size);
             if (!st.ok()) {
                 return st;
             }
@@ -41,7 +41,7 @@ Status InvertedIndexFileReader::init(int32_t 
read_buffer_size) {
     return Status::OK();
 }
 
-Status InvertedIndexFileReader::_init_from_v2(int32_t read_buffer_size) {
+Status InvertedIndexFileReader::_init_from(int32_t read_buffer_size) {
     auto index_file_full_path = 
InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix);
 
     std::unique_lock<std::shared_mutex> lock(_mutex); // Lock for writing
@@ -79,7 +79,7 @@ Status InvertedIndexFileReader::_init_from_v2(int32_t 
read_buffer_size) {
 
         // 3. read file
         int32_t version = _stream->readInt(); // Read version number
-        if (version == InvertedIndexStorageFormatPB::V2) {
+        if (version >= InvertedIndexStorageFormatPB::V2) {
             DCHECK(version == _storage_format);
             int32_t numIndices = _stream->readInt(); // Read number of indices
             ReaderFileEntry* entry = nullptr;
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h 
b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h
index 3b7161c7643..443d40cfaf0 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h
@@ -70,7 +70,7 @@ public:
     int64_t get_inverted_file_size() const { return _stream == nullptr ? 0 : 
_stream->length(); }
 
 private:
-    Status _init_from_v2(int32_t read_buffer_size);
+    Status _init_from(int32_t read_buffer_size);
     Result<std::unique_ptr<DorisCompoundReader>> _open(int64_t index_id,
                                                        const std::string& 
index_suffix) const;
 
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp
index bb373be5ee9..4d6892aa785 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp
@@ -150,7 +150,7 @@ Status InvertedIndexFileWriter::close() {
         }
     } else {
         try {
-            RETURN_IF_ERROR(write_v2());
+            RETURN_IF_ERROR(write());
             for (const auto& entry : _indices_dirs) {
                 const auto& dir = entry.second;
                 // delete index path, which contains separated inverted index 
files
@@ -293,7 +293,7 @@ Status InvertedIndexFileWriter::write_v1() {
     return Status::OK();
 }
 
-Status InvertedIndexFileWriter::write_v2() {
+Status InvertedIndexFileWriter::write() {
     std::unique_ptr<lucene::store::Directory, DirectoryDeleter> out_dir = 
nullptr;
     std::unique_ptr<lucene::store::IndexOutput> compound_file_output = nullptr;
     ErrorContext error_context;
@@ -301,10 +301,10 @@ Status InvertedIndexFileWriter::write_v2() {
         // Calculate header length and initialize offset
         int64_t current_offset = headerLength();
         // Prepare file metadata
-        auto file_metadata = prepare_file_metadata_v2(current_offset);
+        auto file_metadata = prepare_file_metadata(current_offset);
 
         // Create output stream
-        auto result = create_output_stream_v2();
+        auto result = create_output_stream();
         out_dir = std::move(result.first);
         compound_file_output = std::move(result.second);
 
@@ -315,7 +315,7 @@ Status InvertedIndexFileWriter::write_v2() {
         write_index_headers_and_metadata(compound_file_output.get(), 
file_metadata);
 
         // Copy file data
-        copy_files_data_v2(compound_file_output.get(), file_metadata);
+        copy_files_data(compound_file_output.get(), file_metadata);
 
         _total_file_size = compound_file_output->getFilePointer();
         _file_info.set_index_size(_total_file_size);
@@ -470,7 +470,7 @@ void 
InvertedIndexFileWriter::write_header_and_data_v1(lucene::store::IndexOutpu
 
 std::pair<std::unique_ptr<lucene::store::Directory, DirectoryDeleter>,
           std::unique_ptr<lucene::store::IndexOutput>>
-InvertedIndexFileWriter::create_output_stream_v2() {
+InvertedIndexFileWriter::create_output_stream() {
     io::Path index_path 
{InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix)};
 
     auto* out_dir = DorisFSDirectoryFactory::getDirectory(_fs, 
index_path.parent_path().c_str());
@@ -486,15 +486,15 @@ InvertedIndexFileWriter::create_output_stream_v2() {
 
 void 
InvertedIndexFileWriter::write_version_and_indices_count(lucene::store::IndexOutput*
 output) {
     // Write the version number
-    output->writeInt(InvertedIndexStorageFormatPB::V2);
+    output->writeInt(_storage_format);
 
     // Write the number of indices
     const auto num_indices = static_cast<uint32_t>(_indices_dirs.size());
     output->writeInt(num_indices);
 }
 
-std::vector<InvertedIndexFileWriter::FileMetadata>
-InvertedIndexFileWriter::prepare_file_metadata_v2(int64_t& current_offset) {
+std::vector<InvertedIndexFileWriter::FileMetadata> 
InvertedIndexFileWriter::prepare_file_metadata(
+        int64_t& current_offset) {
     std::vector<FileMetadata> file_metadata;
 
     for (const auto& entry : _indices_dirs) {
@@ -546,8 +546,8 @@ void 
InvertedIndexFileWriter::write_index_headers_and_metadata(
     }
 }
 
-void InvertedIndexFileWriter::copy_files_data_v2(lucene::store::IndexOutput* 
output,
-                                                 const 
std::vector<FileMetadata>& file_metadata) {
+void InvertedIndexFileWriter::copy_files_data(lucene::store::IndexOutput* 
output,
+                                              const std::vector<FileMetadata>& 
file_metadata) {
     const int64_t buffer_length = 16384;
     uint8_t buffer[buffer_length];
 
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h 
b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h
index ba42ffdceb1..ab7cdbff152 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h
@@ -71,7 +71,7 @@ public:
     Status delete_index(const TabletIndex* index_meta);
     Status initialize(InvertedIndexDirectoryMap& indices_dirs);
     virtual ~InvertedIndexFileWriter() = default;
-    Status write_v2();
+    Status write();
     Status write_v1();
     Status close();
     const InvertedIndexFileInfo* get_index_file_info() const {
@@ -122,7 +122,7 @@ private:
     // Helper functions specific to write_v2
     virtual std::pair<std::unique_ptr<lucene::store::Directory, 
DirectoryDeleter>,
                       std::unique_ptr<lucene::store::IndexOutput>>
-    create_output_stream_v2();
+    create_output_stream();
     void write_version_and_indices_count(lucene::store::IndexOutput* output);
     struct FileMetadata {
         int64_t index_id;
@@ -141,11 +141,11 @@ private:
                   length(len),
                   directory(dir) {}
     };
-    std::vector<FileMetadata> prepare_file_metadata_v2(int64_t& 
current_offset);
+    std::vector<FileMetadata> prepare_file_metadata(int64_t& current_offset);
     virtual void write_index_headers_and_metadata(lucene::store::IndexOutput* 
output,
                                                   const 
std::vector<FileMetadata>& file_metadata);
-    void copy_files_data_v2(lucene::store::IndexOutput* output,
-                            const std::vector<FileMetadata>& file_metadata);
+    void copy_files_data(lucene::store::IndexOutput* output,
+                         const std::vector<FileMetadata>& file_metadata);
     Status _insert_directory_into_map(int64_t index_id, const std::string& 
index_suffix,
                                       std::shared_ptr<DorisFSDirectory> dir);
     // Member variables...
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index 86a8f89e4c9..02edf2f1976 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -212,6 +212,28 @@ public:
         (*field)->setOmitTermFreqAndPositions(
                 
!(get_parser_phrase_support_string_from_properties(_index_meta->properties()) ==
                   INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES));
+        DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::create_field_v3", {
+            if (_index_file_writer->get_storage_format() != 
InvertedIndexStorageFormatPB::V3) {
+                return 
Status::Error<doris::ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
+                        "debug point: 
InvertedIndexColumnWriterImpl::create_field_v3 error");
+            }
+        })
+        if (_index_file_writer->get_storage_format() >= 
InvertedIndexStorageFormatPB::V3) {
+            (*field)->setIndexVersion(IndexVersion::kV3);
+            // Only effective in v3
+            std::string dict_compression =
+                    
get_parser_dict_compression_from_properties(_index_meta->properties());
+            
DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::create_field_dic_compression", {
+                if (dict_compression != INVERTED_INDEX_PARSER_TRUE) {
+                    return 
Status::Error<doris::ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
+                            "debug point: "
+                            
"InvertedIndexColumnWriterImpl::create_field_dic_compression error");
+                }
+            })
+            if (dict_compression == INVERTED_INDEX_PARSER_TRUE) {
+                (*field)->updateFlag(FlagBits::DICT_COMPRESS);
+            }
+        }
         return Status::OK();
     }
 
diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp
index fc9fc034b0b..d1746836e23 100644
--- a/be/src/olap/tablet_meta.cpp
+++ b/be/src/olap/tablet_meta.cpp
@@ -203,6 +203,9 @@ TabletMeta::TabletMeta(int64_t table_id, int64_t 
partition_id, int64_t tablet_id
     case TInvertedIndexFileStorageFormat::V2:
         
schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2);
         break;
+    case TInvertedIndexFileStorageFormat::V3:
+        
schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V3);
+        break;
     default:
         
schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2);
         break;
diff --git 
a/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query_test.cpp 
b/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query_test.cpp
index f3fb9763c9b..b8e11bece7b 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query_test.cpp
@@ -62,7 +62,6 @@ TEST_F(PhraseQueryTest, test_parser_info) {
         EXPECT_EQ(query_info.slop, res3);
         EXPECT_EQ(query_info.ordered, res4);
         EXPECT_EQ(query_info.additional_terms.size(), res5);
-        std::cout << "--- 1 ---: " << query_info.to_string() << std::endl;
     };
 
     // "english/history off.gif ~20+" sequential_opt = true
diff --git a/be/test/olap/rowset/segment_v2/inverted_index_file_writer_test.cpp 
b/be/test/olap/rowset/segment_v2/inverted_index_file_writer_test.cpp
index b454080434a..41703d49d5e 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index_file_writer_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index_file_writer_test.cpp
@@ -506,7 +506,7 @@ TEST_F(InvertedIndexFileWriterTest, 
WriteV2ExceptionHandlingTest) {
     EXPECT_CALL(writer_mock, write_index_headers_and_metadata(::testing::_, 
::testing::_))
             .WillOnce(::testing::Throw(CLuceneError(CL_ERR_IO, "Simulated 
exception", false)));
 
-    Status status = writer_mock.write_v2();
+    Status status = writer_mock.write();
     ASSERT_FALSE(status.ok());
     ASSERT_EQ(status.code(), ErrorCode::INVERTED_INDEX_CLUCENE_ERROR);
 }
@@ -523,7 +523,7 @@ public:
 
     MOCK_METHOD((std::pair<std::unique_ptr<lucene::store::Directory, 
DirectoryDeleter>,
                            std::unique_ptr<lucene::store::IndexOutput>>),
-                create_output_stream_v2, (), (override));
+                create_output_stream, (), (override));
 };
 
 class InvertedIndexFileWriterMockCreateOutputStreamV1 : public 
InvertedIndexFileWriter {
@@ -622,7 +622,7 @@ TEST_F(InvertedIndexFileWriterTest, WriteV2OutputTest) {
     auto compound_file_output = 
std::unique_ptr<DorisFSDirectory::FSIndexOutputV2>(mock_output_v2);
     compound_file_output->init(file_writer.get());
 
-    EXPECT_CALL(writer_mock, create_output_stream_v2())
+    EXPECT_CALL(writer_mock, create_output_stream())
             .WillOnce(::testing::Invoke(
                     [&]() -> 
std::pair<std::unique_ptr<lucene::store::Directory, DirectoryDeleter>,
                                        
std::unique_ptr<lucene::store::IndexOutput>> {
@@ -680,7 +680,7 @@ TEST_F(InvertedIndexFileWriterTest, 
WriteV2OutputCloseErrorTest) {
     auto compound_file_output = 
std::unique_ptr<DorisFSDirectory::FSIndexOutputV2>(mock_output_v2);
     compound_file_output->init(file_writer.get());
 
-    EXPECT_CALL(writer_mock, create_output_stream_v2())
+    EXPECT_CALL(writer_mock, create_output_stream())
             .WillOnce(::testing::Invoke(
                     [&]() -> 
std::pair<std::unique_ptr<lucene::store::Directory, DirectoryDeleter>,
                                        
std::unique_ptr<lucene::store::IndexOutput>> {
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java 
b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
index abba2762d56..dd6a1a7612a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
@@ -54,6 +54,8 @@ public class InvertedIndexUtil {
 
     public static String INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords";
 
+    public static String INVERTED_INDEX_DICT_COMPRESSION_KEY = 
"dict_compression";
+
     public static String getInvertedIndexParser(Map<String, String> 
properties) {
         String parser = properties == null ? null : 
properties.get(INVERTED_INDEX_PARSER_KEY);
         // default is "none" if not set
@@ -157,7 +159,8 @@ public class InvertedIndexUtil {
                 INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT,
                 INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY,
                 INVERTED_INDEX_PARSER_LOWERCASE_KEY,
-                INVERTED_INDEX_PARSER_STOPWORDS_KEY
+                INVERTED_INDEX_PARSER_STOPWORDS_KEY,
+                INVERTED_INDEX_DICT_COMPRESSION_KEY
         ));
 
         for (String key : properties.keySet()) {
@@ -174,6 +177,7 @@ public class InvertedIndexUtil {
         String ignoreAbove = 
properties.get(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY);
         String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
         String stopWords = properties.get(INVERTED_INDEX_PARSER_STOPWORDS_KEY);
+        String dictCompression = 
properties.get(INVERTED_INDEX_DICT_COMPRESSION_KEY);
 
         if (parser != null && 
!parser.matches("none|english|unicode|chinese|standard")) {
             throw new AnalysisException("Invalid inverted index 'parser' 
value: " + parser
@@ -221,5 +225,11 @@ public class InvertedIndexUtil {
             throw new AnalysisException("Invalid inverted index 'stopWords' 
value: " + stopWords
                     + ", stopWords must be none");
         }
+
+        if (dictCompression != null && !dictCompression.matches("true|false")) 
{
+            throw new AnalysisException(
+                    "Invalid inverted index 'dict_compression' value: "
+                            + dictCompression + ", dict_compression must be 
true or false");
+        }
     }
 }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java
 
b/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java
index b8a364f9449..e14b4efb0d5 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/cloud/datasource/CloudInternalCatalog.java
@@ -354,8 +354,12 @@ public class CloudInternalCatalog extends InternalCatalog {
         if (invertedIndexFileStorageFormat != null) {
             if (invertedIndexFileStorageFormat == 
TInvertedIndexFileStorageFormat.V1) {
                 
schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V1);
-            } else {
+            } else if (invertedIndexFileStorageFormat == 
TInvertedIndexFileStorageFormat.V2) {
                 
schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V2);
+            } else if (invertedIndexFileStorageFormat == 
TInvertedIndexFileStorageFormat.V3) {
+                
schemaBuilder.setInvertedIndexStorageFormat(OlapFile.InvertedIndexStorageFormatPB.V3);
+            } else {
+                throw new DdlException("invalid inverted index storage 
format");
             }
         }
         schemaBuilder.setRowStorePageSize(pageSize);
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java 
b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java
index 5721db0c27e..915b5d48f01 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java
@@ -1126,6 +1126,8 @@ public class PropertyAnalyzer {
         } else {
             if (Config.inverted_index_storage_format.equalsIgnoreCase("V1")) {
                 return TInvertedIndexFileStorageFormat.V1;
+            } else if 
(Config.inverted_index_storage_format.equalsIgnoreCase("V3")) {
+                return TInvertedIndexFileStorageFormat.V3;
             } else {
                 return TInvertedIndexFileStorageFormat.V2;
             }
@@ -1135,9 +1137,13 @@ public class PropertyAnalyzer {
             return TInvertedIndexFileStorageFormat.V1;
         } else if (invertedIndexFileStorageFormat.equalsIgnoreCase("v2")) {
             return TInvertedIndexFileStorageFormat.V2;
+        } else if (invertedIndexFileStorageFormat.equalsIgnoreCase("v3")) {
+            return TInvertedIndexFileStorageFormat.V3;
         } else if (invertedIndexFileStorageFormat.equalsIgnoreCase("default")) 
{
             if (Config.inverted_index_storage_format.equalsIgnoreCase("V1")) {
                 return TInvertedIndexFileStorageFormat.V1;
+            } else if 
(Config.inverted_index_storage_format.equalsIgnoreCase("V3")) {
+                return TInvertedIndexFileStorageFormat.V3;
             } else {
                 return TInvertedIndexFileStorageFormat.V2;
             }
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/analysis/InvertedIndexUtilTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/analysis/InvertedIndexUtilTest.java
new file mode 100644
index 00000000000..a9be242cf3f
--- /dev/null
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/analysis/InvertedIndexUtilTest.java
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.analysis;
+
+import org.apache.doris.common.AnalysisException;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class InvertedIndexUtilTest {
+    @Test
+    public void testCheckInvertedIndexProperties() throws AnalysisException {
+        Map<String, String> properties = new HashMap<>();
+        properties.put(InvertedIndexUtil.INVERTED_INDEX_DICT_COMPRESSION_KEY, 
"true");
+
+        InvertedIndexUtil.checkInvertedIndexProperties(properties);
+
+        properties.put(InvertedIndexUtil.INVERTED_INDEX_DICT_COMPRESSION_KEY, 
"invalid_value");
+        try {
+            InvertedIndexUtil.checkInvertedIndexProperties(properties);
+            Assertions.fail("Expected AnalysisException was not thrown");
+        } catch (AnalysisException e) {
+            Assertions.assertEquals(
+                    "errCode = 2, detailMessage = Invalid inverted index 
'dict_compression' value: invalid_value, "
+                            + "dict_compression must be true or false",
+                    e.getMessage());
+        }
+    }
+}
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java
index 041ca89bfc5..6d708aa0826 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java
@@ -28,6 +28,7 @@ import org.apache.doris.catalog.Type;
 import org.apache.doris.common.util.PropertyAnalyzer;
 import org.apache.doris.common.util.TimeUtils;
 import org.apache.doris.resource.Tag;
+import org.apache.doris.thrift.TInvertedIndexFileStorageFormat;
 import org.apache.doris.thrift.TStorageFormat;
 import org.apache.doris.thrift.TStorageMedium;
 
@@ -37,6 +38,7 @@ import com.google.common.collect.Sets;
 import org.junit.Assert;
 import org.junit.Rule;
 import org.junit.Test;
+import org.junit.jupiter.api.Assertions;
 import org.junit.rules.ExpectedException;
 
 import java.time.Instant;
@@ -236,4 +238,52 @@ public class PropertyAnalyzerTest {
             Assert.assertTrue(e.getMessage().contains("Storage page size must 
be between 4KB and 10MB"));
         }
     }
+
+    @Test
+    public void testAnalyzeInvertedIndexFileStorageFormat() throws 
AnalysisException {
+        TInvertedIndexFileStorageFormat result = 
PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(null);
+        Assertions.assertEquals(TInvertedIndexFileStorageFormat.V2, result);
+
+        Config.inverted_index_storage_format = "V1";
+        result = PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(new 
HashMap<>());
+        Assertions.assertEquals(TInvertedIndexFileStorageFormat.V1, result);
+
+        Map<String, String> propertiesWithV1 = new HashMap<>();
+        
propertiesWithV1.put(PropertyAnalyzer.PROPERTIES_INVERTED_INDEX_STORAGE_FORMAT, 
"v1");
+        result = 
PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(propertiesWithV1);
+        Assertions.assertEquals(TInvertedIndexFileStorageFormat.V1, result);
+
+        Map<String, String> propertiesWithV2 = new HashMap<>();
+        
propertiesWithV2.put(PropertyAnalyzer.PROPERTIES_INVERTED_INDEX_STORAGE_FORMAT, 
"v2");
+        result = 
PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(propertiesWithV2);
+        Assertions.assertEquals(TInvertedIndexFileStorageFormat.V2, result);
+
+        Map<String, String> propertiesWithV3 = new HashMap<>();
+        
propertiesWithV3.put(PropertyAnalyzer.PROPERTIES_INVERTED_INDEX_STORAGE_FORMAT, 
"v3");
+        result = 
PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(propertiesWithV3);
+        Assertions.assertEquals(TInvertedIndexFileStorageFormat.V3, result);
+
+        Config.inverted_index_storage_format = "V1";
+        Map<String, String> propertiesWithDefaultV1 = new HashMap<>();
+        
propertiesWithDefaultV1.put(PropertyAnalyzer.PROPERTIES_INVERTED_INDEX_STORAGE_FORMAT,
 "default");
+        result = 
PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(propertiesWithDefaultV1);
+        Assertions.assertEquals(TInvertedIndexFileStorageFormat.V1, result);
+
+        Config.inverted_index_storage_format = "V2";
+        Map<String, String> propertiesWithDefaultV2 = new HashMap<>();
+        
propertiesWithDefaultV2.put(PropertyAnalyzer.PROPERTIES_INVERTED_INDEX_STORAGE_FORMAT,
 "default");
+        result = 
PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(propertiesWithDefaultV2);
+        Assertions.assertEquals(TInvertedIndexFileStorageFormat.V2, result);
+
+        Map<String, String> propertiesWithUnknown = new HashMap<>();
+        
propertiesWithUnknown.put(PropertyAnalyzer.PROPERTIES_INVERTED_INDEX_STORAGE_FORMAT,
 "unknown_format");
+        try {
+            
PropertyAnalyzer.analyzeInvertedIndexFileStorageFormat(propertiesWithUnknown);
+            Assertions.fail("Expected AnalysisException was not thrown");
+        } catch (AnalysisException e) {
+            Assertions.assertEquals(
+                    "errCode = 2, detailMessage = unknown inverted index 
storage format: unknown_format",
+                    e.getMessage());
+        }
+    }
 }
diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto
index 259f9f2861a..41f8727ed1b 100644
--- a/gensrc/proto/olap_file.proto
+++ b/gensrc/proto/olap_file.proto
@@ -343,6 +343,7 @@ enum IndexType {
 enum InvertedIndexStorageFormatPB {
     V1 = 0;
     V2 = 1;
+    V3 = 2;
 }
 
 message TabletIndexPB {
diff --git a/gensrc/thrift/Types.thrift b/gensrc/thrift/Types.thrift
index 1912f950587..623da9ce067 100644
--- a/gensrc/thrift/Types.thrift
+++ b/gensrc/thrift/Types.thrift
@@ -124,7 +124,8 @@ enum TStorageBackendType {
 enum TInvertedIndexFileStorageFormat {
     DEFAULT = 0, // Default format, unspecified storage method.
     V1 = 1,      // Index per idx: Each index is stored separately based on its identifier.
-    V2 = 2      // Segment id per idx: Indexes are organized based on segment identifiers, grouping indexes by their associated segment.
+    V2 = 2       // Segment id per idx: Indexes are organized based on segment identifiers, grouping indexes by their associated segment.
+    V3 = 3       // Position and dictionary compression
 }
 
 struct TScalarType {
diff --git a/regression-test/data/inverted_index_p0/test_inverted_index_v3.out b/regression-test/data/inverted_index_p0/test_inverted_index_v3.out
new file mode 100644
index 00000000000..9dc20f3e0e0
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_inverted_index_v3.out
@@ -0,0 +1,25 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql --
+238
+
+-- !sql --
+104
+
+-- !sql --
+104
+
+-- !sql --
+105
+
+-- !sql --
+238
+
+-- !sql --
+104
+
+-- !sql --
+104
+
+-- !sql --
+105
+
diff --git a/regression-test/suites/fault_injection_p0/test_inverted_index_v3_fault_injection.groovy b/regression-test/suites/fault_injection_p0/test_inverted_index_v3_fault_injection.groovy
new file mode 100644
index 00000000000..98c0e110964
--- /dev/null
+++ b/regression-test/suites/fault_injection_p0/test_inverted_index_v3_fault_injection.groovy
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_inverted_index_v3_fault_injection", "nonConcurrent"){
+    def indexTbName1 = "test_inverted_index_v3_fault_injection"
+
+    sql "DROP TABLE IF EXISTS ${indexTbName1}"
+
+    sql """
+      CREATE TABLE ${indexTbName1} (
+      `@timestamp` int(11) NULL COMMENT "",
+      `clientip` varchar(20) NULL COMMENT "",
+      `request` text NULL COMMENT "",
+      `status` int(11) NULL COMMENT "",
+      `size` int(11) NULL COMMENT "",
+      INDEX clientip_idx (`clientip`) COMMENT '',
+      INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`@timestamp`)
+      COMMENT "OLAP"
+      DISTRIBUTED BY RANDOM BUCKETS 1
+      PROPERTIES (
+      "replication_allocation" = "tag.location.default: 1",
+      "inverted_index_storage_format" = "V3"
+      );
+    """
+
+    try {
+      
GetDebugPoint().enableDebugPointForAllBEs("InvertedIndexColumnWriterImpl::create_field_v3")
+      
+      sql """ INSERT INTO ${indexTbName1} VALUES (1, '40.135.0.0', 'GET 
/images/hm_bg.jpg HTTP/1.0', 200, 24736); """
+    } finally {
+      
GetDebugPoint().disableDebugPointForAllBEs("InvertedIndexColumnWriterImpl::create_field_v3")
+    }
+
+    try {
+      
GetDebugPoint().enableDebugPointForAllBEs("InvertedIndexColumnWriterImpl::create_field_v3")
+      
GetDebugPoint().enableDebugPointForAllBEs("InvertedIndexColumnWriterImpl::create_field_dic_compression")
+
+      sql """ INSERT INTO ${indexTbName1} VALUES (2, '40.135.0.0', 'GET 
/images/hm_bg.jpg HTTP/1.0', 200, 24736); """
+    } finally {
+      
GetDebugPoint().disableDebugPointForAllBEs("InvertedIndexColumnWriterImpl::create_field_v3")
+      
GetDebugPoint().disableDebugPointForAllBEs("InvertedIndexColumnWriterImpl::create_field_dic_compression")
+    }
+}
\ No newline at end of file
diff --git a/regression-test/suites/inverted_index_p0/test_inverted_index_v3.groovy b/regression-test/suites/inverted_index_p0/test_inverted_index_v3.groovy
new file mode 100644
index 00000000000..ea7dd0b595f
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_inverted_index_v3.groovy
@@ -0,0 +1,117 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_inverted_index_v3", "p0"){
+    def indexTbName1 = "test_inverted_index_v3_1"
+    def indexTbName2 = "test_inverted_index_v3_2"
+
+    sql "DROP TABLE IF EXISTS ${indexTbName1}"
+    sql "DROP TABLE IF EXISTS ${indexTbName2}"
+
+    sql """
+      CREATE TABLE ${indexTbName1} (
+      `@timestamp` int(11) NULL COMMENT "",
+      `clientip` varchar(20) NULL COMMENT "",
+      `request` text NULL COMMENT "",
+      `status` int(11) NULL COMMENT "",
+      `size` int(11) NULL COMMENT "",
+      INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`@timestamp`)
+      COMMENT "OLAP"
+      DISTRIBUTED BY RANDOM BUCKETS 1
+      PROPERTIES (
+      "replication_allocation" = "tag.location.default: 1",
+      "inverted_index_storage_format" = "V2"
+      );
+    """
+
+    sql """
+      CREATE TABLE ${indexTbName2} (
+      `@timestamp` int(11) NULL COMMENT "",
+      `clientip` varchar(20) NULL COMMENT "",
+      `request` text NULL COMMENT "",
+      `status` int(11) NULL COMMENT "",
+      `size` int(11) NULL COMMENT "",
+      INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`@timestamp`)
+      COMMENT "OLAP"
+      DISTRIBUTED BY RANDOM BUCKETS 1
+      PROPERTIES (
+      "replication_allocation" = "tag.location.default: 1",
+      "inverted_index_storage_format" = "V3"
+      );
+    """
+
+    def load_httplogs_data = {table_name, label, read_flag, format_flag, 
file_name, ignore_failure=false,
+                        expected_succ_rows = -1, load_to_single_tablet = 
'true' ->
+        
+        // load the json data
+        streamLoad {
+            table "${table_name}"
+            
+            // set http request header params
+            set 'label', label + "_" + UUID.randomUUID().toString()
+            set 'read_json_by_line', read_flag
+            set 'format', format_flag
+            file file_name // import json file
+            time 10000 // limit inflight 10s
+            if (expected_succ_rows >= 0) {
+                set 'max_filter_ratio', '1'
+            }
+
+            // If a check callback is declared, the default check conditions are skipped,
+            // so the callback must check every condition itself.
+            check { result, exception, startTime, endTime ->
+                       if (ignore_failure && expected_succ_rows < 0) { return }
+                    if (exception != null) {
+                        throw exception
+                    }
+                    log.info("Stream load result: ${result}".toString())
+                    def json = parseJson(result)
+                    assertEquals("success", json.Status.toLowerCase())
+                    if (expected_succ_rows >= 0) {
+                        assertEquals(json.NumberLoadedRows, expected_succ_rows)
+                    } else {
+                        assertEquals(json.NumberTotalRows, 
json.NumberLoadedRows + json.NumberUnselectedRows)
+                        assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes 
> 0)
+                }
+            }
+        }
+    }
+
+    try {
+      load_httplogs_data.call(indexTbName1, indexTbName1, 'true', 'json', 
'documents-1000.json')
+      load_httplogs_data.call(indexTbName2, indexTbName2, 'true', 'json', 
'documents-1000.json')
+      
+      sql "sync"
+
+      qt_sql """ select count() from ${indexTbName1} where request match_any 
'hm bg'; """
+      qt_sql """ select count() from ${indexTbName1} where request match_all 
'hm bg'; """
+      qt_sql """ select count() from ${indexTbName1} where request 
match_phrase 'hm bg'; """
+      qt_sql """ select count() from ${indexTbName1} where request 
match_phrase_prefix 'hm bg'; """
+
+      qt_sql """ select count() from ${indexTbName2} where request match_any 
'hm bg'; """
+      qt_sql """ select count() from ${indexTbName2} where request match_all 
'hm bg'; """
+      qt_sql """ select count() from ${indexTbName2} where request 
match_phrase 'hm bg'; """
+      qt_sql """ select count() from ${indexTbName2} where request 
match_phrase_prefix 'hm bg'; """
+
+    } finally {
+    }
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org


Reply via email to