This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new ca9ebecd0d4 branch-3.0: [fix](inverted index) Clear inverted index cache from file cache #49685 (#49738)
ca9ebecd0d4 is described below

commit ca9ebecd0d4a707b9eb01883801339836a73103b
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed Apr 2 10:47:00 2025 +0800

    branch-3.0: [fix](inverted index) Clear inverted index cache from file cache #49685 (#49738)
    
    Cherry-picked from #49685
    
    Co-authored-by: zzzxl <yangs...@selectdb.com>
---
 be/src/olap/rowset/rowset.cpp                      |  29 ++++++
 be/src/olap/rowset/rowset.h                        |   2 +
 .../olap/rowset/segment_v2/inverted_index_desc.cpp |  13 +++
 .../olap/rowset/segment_v2/inverted_index_desc.h   |   5 +
 be/test/olap/rowset/beta_rowset_test.cpp           | 107 +++++++++++++++++++++
 .../org/apache/doris/regression/suite/Suite.groovy |   8 +-
 .../tablets/test_clean_stale_rs_file_cache.groovy  |   2 +-
 ...=> test_clean_stale_rs_index_file_cache.groovy} |  14 +--
 8 files changed, 169 insertions(+), 11 deletions(-)

diff --git a/be/src/olap/rowset/rowset.cpp b/be/src/olap/rowset/rowset.cpp
index 3b86504090d..737872575ae 100644
--- a/be/src/olap/rowset/rowset.cpp
+++ b/be/src/olap/rowset/rowset.cpp
@@ -22,6 +22,7 @@
 #include "common/config.h"
 #include "io/cache/block_file_cache_factory.h"
 #include "olap/olap_define.h"
+#include "olap/rowset/segment_v2/inverted_index_desc.h"
 #include "olap/segment_loader.h"
 #include "olap/tablet_schema.h"
 #include "util/time.h"
@@ -128,6 +129,14 @@ void Rowset::clear_cache() {
             auto* file_cache = 
io::FileCacheFactory::instance()->get_by_path(file_key);
             file_cache->remove_if_cached_async(file_key);
         }
+
+        // inverted index
+        auto file_names = get_index_file_names();
+        for (const auto& file_name : file_names) {
+            auto file_key = io::BlockFileCache::hash(file_name);
+            auto* file_cache = 
io::FileCacheFactory::instance()->get_by_path(file_key);
+            file_cache->remove_if_cached_async(file_key);
+        }
     }
 }
 
@@ -165,4 +174,24 @@ void Rowset::merge_rowset_meta(const RowsetMeta& other) {
     _schema = _rowset_meta->tablet_schema();
 }
 
+/// Builds the inverted index file name of every segment in this rowset.
+///
+/// For the V1 storage format one name is produced per (segment, inverted index)
+/// pair; for any other format a single name is produced per segment. Names are
+/// ordered by segment id, then by the order of `_schema->inverted_indexes()`.
+///
+/// @return the generated file names; empty when the rowset has no segments.
+std::vector<std::string> Rowset::get_index_file_names() {
+    std::vector<std::string> file_names;
+
+    // Loop-invariant inputs: resolve them once instead of once per generated name.
+    const std::string rowset_id_str = rowset_id().to_string();
+    const bool is_v1 =
+            _schema->get_inverted_index_storage_format() == InvertedIndexStorageFormatPB::V1;
+
+    for (int64_t seg_id = 0; seg_id < num_segments(); ++seg_id) {
+        if (is_v1) {
+            for (const auto& index : _schema->inverted_indexes()) {
+                file_names.emplace_back(segment_v2::InvertedIndexDescriptor::get_index_file_name_v1(
+                        rowset_id_str, seg_id, index->index_id(), index->get_index_suffix()));
+            }
+        } else {
+            file_names.emplace_back(segment_v2::InvertedIndexDescriptor::get_index_file_name_v2(
+                    rowset_id_str, seg_id));
+        }
+    }
+    return file_names;
+}
+
 } // namespace doris
diff --git a/be/src/olap/rowset/rowset.h b/be/src/olap/rowset/rowset.h
index be21f29888e..db6872875a5 100644
--- a/be/src/olap/rowset/rowset.h
+++ b/be/src/olap/rowset/rowset.h
@@ -312,6 +312,8 @@ public:
 
     Result<std::string> segment_path(int64_t seg_id);
 
+    // Returns the inverted index file names for all segments of this rowset:
+    // one name per (segment, index) for the V1 storage format, otherwise one
+    // name per segment.
+    std::vector<std::string> get_index_file_names();
+
 protected:
     friend class RowsetFactory;
 
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_desc.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_desc.cpp
index e909bc1e0a9..faa8dff7a81 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_desc.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_desc.cpp
@@ -76,4 +76,17 @@ std::string 
InvertedIndexDescriptor::get_index_file_cache_key(std::string_view i
     return fmt::format("{}_{}{}", index_path_prefix, index_id, suffix);
 }
 
+/// Builds the V1-format inverted index file name for one index of one segment.
+///
+/// The result is "<rowset_id>_<seg_id>_<index_id>[@<index_path_suffix>]<index_suffix>";
+/// the "@<index_path_suffix>" part is omitted when `index_path_suffix` is empty.
+std::string InvertedIndexDescriptor::get_index_file_name_v1(const std::string& rowset_id,
+                                                            int64_t seg_id, int64_t index_id,
+                                                            std::string_view index_path_suffix) {
+    std::string suffix;
+    if (!index_path_suffix.empty()) {
+        // Append by explicit length instead of treating data() as a C string:
+        // a std::string_view is not guaranteed to be null-terminated, so reading
+        // data() up to the first NUL may run past the end of the view.
+        suffix.reserve(index_path_suffix.size() + 1);
+        suffix.push_back('@');
+        suffix.append(index_path_suffix.data(), index_path_suffix.size());
+    }
+    return fmt::format("{}_{}_{}{}{}", rowset_id, seg_id, index_id, suffix, index_suffix);
+}
+
+/// Builds the V2-format inverted index file name for one segment:
+/// "<rowset_id>_<seg_id><index_suffix>".
+std::string InvertedIndexDescriptor::get_index_file_name_v2(const std::string& rowset_id,
+                                                            int64_t seg_id) {
+    std::string file_name = fmt::format("{}_{}{}", rowset_id, seg_id, index_suffix);
+    return file_name;
+}
+
 } // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_desc.h 
b/be/src/olap/rowset/segment_v2/inverted_index_desc.h
index f421c7f3790..23c85bcf697 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_desc.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_desc.h
@@ -50,6 +50,11 @@ public:
                                                 int64_t index_id,
                                                 std::string_view 
index_path_suffix);
 
+    // Returns the V1-format index file name built from the rowset id, segment id,
+    // index id and the (possibly empty) index path suffix.
+    static std::string get_index_file_name_v1(const std::string& rowset_id, 
int64_t seg_id,
+                                              int64_t index_id, 
std::string_view index_path_suffix);
+
+    // Returns the V2-format index file name built from the rowset id and segment id.
+    static std::string get_index_file_name_v2(const std::string& rowset_id, 
int64_t seg_id);
+
     static const char* get_temporary_null_bitmap_file_name() { return 
"null_bitmap"; }
     static const char* get_temporary_bkd_index_data_file_name() { return 
"bkd"; }
     static const char* get_temporary_bkd_index_meta_file_name() { return 
"bkd_meta"; }
diff --git a/be/test/olap/rowset/beta_rowset_test.cpp 
b/be/test/olap/rowset/beta_rowset_test.cpp
index 2e13436b3d3..a1e6a0ec633 100644
--- a/be/test/olap/rowset/beta_rowset_test.cpp
+++ b/be/test/olap/rowset/beta_rowset_test.cpp
@@ -27,6 +27,7 @@
 #include <gen_cpp/olap_common.pb.h>
 #include <gtest/gtest-message.h>
 #include <gtest/gtest-test-part.h>
+#include <gtest/gtest.h>
 #include <stdint.h>
 #include <unistd.h>
 
@@ -44,6 +45,7 @@
 #include "io/fs/local_file_system.h"
 #include "io/fs/s3_file_system.h"
 #include "io/fs/s3_obj_storage_client.h"
+#include "json2pb/json_to_pb.h"
 #include "olap/data_dir.h"
 #include "olap/olap_common.h"
 #include "olap/options.h"
@@ -170,6 +172,61 @@ protected:
         EXPECT_EQ(Status::OK(), s);
     }
 
+    // Initializes `pb1` from a canned BETA_ROWSET meta (rowset id 540085) whose
+    // version range is replaced by [start, end] and whose creation time is 10000.
+    void init_rs_meta(RowsetMetaSharedPtr& pb1, int64_t start, int64_t end) {
+        // Template meta; its start_version, end_version and creation_time are
+        // placeholders that are overwritten after parsing.
+        std::string meta_json = R"({
+            "rowset_id": 540085,
+            "tablet_id": 15674,
+            "partition_id": 10000,
+            "txn_id": 4045,
+            "tablet_schema_hash": 567997588,
+            "rowset_type": "BETA_ROWSET",
+            "rowset_state": "VISIBLE",
+            "start_version": 2,
+            "end_version": 2,
+            "num_rows": 3929,
+            "total_disk_size": 84699,
+            "data_disk_size": 84464,
+            "index_disk_size": 235,
+            "empty": false,
+            "load_id": {
+                "hi": -5350970832824939812,
+                "lo": -6717994719194512122
+            },
+            "creation_time": 1553765670
+        })";
+
+        RowsetMetaPB meta_pb;
+        json2pb::JsonToProtoMessage(meta_json, &meta_pb);
+        meta_pb.set_creation_time(10000);
+        meta_pb.set_start_version(start);
+        meta_pb.set_end_version(end);
+
+        pb1->init_from_pb(meta_pb);
+    }
+
+    // Fills `column_pb` as a nullable column and `tablet_index` as an INVERTED
+    // index over that column; `properties`, when non-empty, are copied into the
+    // index's property map.
+    void construct_column(ColumnPB* column_pb, TabletIndexPB* tablet_index, int64_t index_id,
+                          const std::string& index_name, int32_t col_unique_id,
+                          const std::string& column_type, const std::string& column_name,
+                          const std::map<std::string, std::string>& properties =
+                                  std::map<std::string, std::string>(),
+                          bool is_key = false) {
+        // Column description.
+        column_pb->set_name(column_name);
+        column_pb->set_type(column_type);
+        column_pb->set_unique_id(col_unique_id);
+        column_pb->set_is_nullable(true);
+        column_pb->set_is_key(is_key);
+
+        // Index description, tied to the column through its unique id.
+        tablet_index->set_index_type(IndexType::INVERTED);
+        tablet_index->set_index_id(index_id);
+        tablet_index->set_index_name(index_name);
+        tablet_index->add_col_unique_id(col_unique_id);
+
+        if (properties.empty()) {
+            return;
+        }
+        auto& index_properties = *tablet_index->mutable_properties();
+        for (const auto& entry : properties) {
+            index_properties[entry.first] = entry.second;
+        }
+    }
+
 private:
     std::unique_ptr<DataDir> _data_dir;
 };
@@ -304,4 +361,54 @@ TEST_F(BetaRowsetTest, AddToBinlogTest) {
     ASSERT_TRUE(s.ok()) << "second add_to_binlog(): " << s;
 }
 
+// Checks the names returned by Rowset::get_index_file_names() for a two-segment
+// rowset whose schema has two inverted indexes, under both storage formats.
+TEST_F(BetaRowsetTest, GetIndexFileNames) {
+    // Builds a two-segment rowset whose schema carries two inverted indexes stored
+    // in `format`, and returns the index file names that rowset reports.
+    const auto index_file_names_for = [this](InvertedIndexStorageFormatPB format) {
+        TabletSchemaPB schema_pb;
+        schema_pb.set_keys_type(KeysType::DUP_KEYS);
+        schema_pb.set_inverted_index_storage_format(format);
+        construct_column(schema_pb.add_column(), schema_pb.add_index(), 10000, "key_index", 0,
+                         "INT", "key");
+        construct_column(schema_pb.add_column(), schema_pb.add_index(), 10001, "v1_index", 1,
+                         "STRING", "v1");
+        auto tablet_schema = std::make_shared<TabletSchema>();
+        tablet_schema->init_from_pb(schema_pb);
+
+        auto rowset_meta = std::make_shared<RowsetMeta>();
+        init_rs_meta(rowset_meta, 1, 1);
+        rowset_meta->set_num_segments(2);
+
+        BetaRowset rowset(tablet_schema, rowset_meta, "");
+        return rowset.get_index_file_names();
+    };
+
+    // v1: one file per (segment, index) pair. Comparing whole vectors also checks
+    // the element count, so a short result fails cleanly instead of being indexed
+    // out of bounds.
+    {
+        const std::vector<std::string> expected = {"540085_0_10000.idx", "540085_0_10001.idx",
+                                                   "540085_1_10000.idx", "540085_1_10001.idx"};
+        ASSERT_EQ(index_file_names_for(InvertedIndexStorageFormatPB::V1), expected);
+    }
+
+    // v2: one file per segment, regardless of how many indexes the schema has.
+    {
+        const std::vector<std::string> expected = {"540085_0.idx", "540085_1.idx"};
+        ASSERT_EQ(index_file_names_for(InvertedIndexStorageFormatPB::V2), expected);
+    }
+}
+
 } // namespace doris
diff --git 
a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy
 
b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy
index 2062f9ad7ef..32f75d2e760 100644
--- 
a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy
+++ 
b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy
@@ -3008,7 +3008,7 @@ class Suite implements GroovyInterceptable {
         }
     }
 
-    def getRowsetFileCacheDirFromBe = { beHttpPort, msHttpPort, tabletId, 
version -> 
+    def getRowsetFileCacheDirFromBe = { beHttpPort, msHttpPort, tabletId, 
version, fileSuffix = "dat" -> 
         def hashValues = []
         def segmentFiles = []
         getSegmentFilesFromMs(msHttpPort, tabletId, version) {
@@ -3018,7 +3018,7 @@ class Suite implements GroovyInterceptable {
                 // 
{"rowset_id":"0","partition_id":"27695","tablet_id":"27700","txn_id":"7057526525952","tablet_schema_hash":0,"rowset_type":"BETA_ROWSET","rowset_state":"COMMITTED","start_version":"3","end_version":"3","version_hash":"0","num_rows":"1","total_disk_size":"895","data_disk_size":"895","index_disk_size":"0","empty":false,"load_id":{"hi":"-1646598626735601581","lo":"-6677682539881484579"},"delete_flag":false,"creation_time":"1736153402","num_segments":"1","rowset_id_v2":"020
 [...]
                 def segmentNum = json.num_segments as int
                 def rowsetId = json.rowset_id_v2 as String
-                segmentFiles = (0..<segmentNum).collect { i -> 
"${rowsetId}_${i}.dat" }
+                segmentFiles = (0..<segmentNum).collect { i -> 
"${rowsetId}_${i}.${fileSuffix}" }
         }
 
         segmentFiles.each {
@@ -3032,7 +3032,7 @@ class Suite implements GroovyInterceptable {
     }
 
     // get table's tablet file cache
-    def getTabletFileCacheDirFromBe = { msHttpPort, table, version ->
+    def getTabletFileCacheDirFromBe = { msHttpPort, table, version, fileSuffix 
= "dat" ->
         // beHost HashFile
         def beHostToHashFile = [:]
 
@@ -3040,7 +3040,7 @@ class Suite implements GroovyInterceptable {
         getTabletsAndHostFromFe.each {
             def beHost = it.Value[1]
             def tabletId = it.Key
-            def hashRet = getRowsetFileCacheDirFromBe(beHost + ":8040", 
msHttpPort, tabletId, version)
+            def hashRet = getRowsetFileCacheDirFromBe(beHost + ":8040", 
msHttpPort, tabletId, version, fileSuffix)
             hashRet.each {
                 def hashFile = it
                 if (beHostToHashFile.containsKey(beHost)) {
diff --git 
a/regression-test/suites/cloud_p0/tablets/test_clean_stale_rs_file_cache.groovy 
b/regression-test/suites/cloud_p0/tablets/test_clean_stale_rs_file_cache.groovy
index 8d41939981a..807e51ae95f 100644
--- 
a/regression-test/suites/cloud_p0/tablets/test_clean_stale_rs_file_cache.groovy
+++ 
b/regression-test/suites/cloud_p0/tablets/test_clean_stale_rs_file_cache.groovy
@@ -66,7 +66,7 @@ suite('test_clean_stale_rs_file_cache', 'docker') {
         sql """
             insert into $table values (10, 1, 'v1'), (20, 2, 'v2'), (30, 3, 
'v3')
         """
-        def cacheDirVersion3 = getTabletFileCacheDirFromBe(msHttpPort, table, 
2)
+        def cacheDirVersion3 = getTabletFileCacheDirFromBe(msHttpPort, table, 
3)
         // version 4
         sql """
             insert into $table values (100, 1, 'v1'), (200, 2, 'v2'), (300, 3, 
'v3')
diff --git 
a/regression-test/suites/cloud_p0/tablets/test_clean_stale_rs_file_cache.groovy 
b/regression-test/suites/cloud_p0/tablets/test_clean_stale_rs_index_file_cache.groovy
similarity index 92%
copy from 
regression-test/suites/cloud_p0/tablets/test_clean_stale_rs_file_cache.groovy
copy to 
regression-test/suites/cloud_p0/tablets/test_clean_stale_rs_index_file_cache.groovy
index 8d41939981a..9077364d577 100644
--- 
a/regression-test/suites/cloud_p0/tablets/test_clean_stale_rs_file_cache.groovy
+++ 
b/regression-test/suites/cloud_p0/tablets/test_clean_stale_rs_index_file_cache.groovy
@@ -18,7 +18,7 @@
 import org.apache.doris.regression.suite.ClusterOptions
 import org.apache.doris.regression.util.Http
 
-suite('test_clean_stale_rs_file_cache', 'docker') {
+suite('test_clean_stale_rs_index_file_cache', 'docker') {
     if (!isCloudMode()) {
         return;
     }
@@ -40,7 +40,9 @@ suite('test_clean_stale_rs_file_cache', 'docker') {
     options.setBeNum(1)
     options.cloudMode = true
 
-    def table = "test_clean_stale_rs_file_cache"
+
+    def table = "test_clean_stale_rs_index_file_cache"
+    sql """ drop table if exists $table; """
 
     docker(options) {
         def ms = cluster.getAllMetaservices().get(0)
@@ -48,7 +50,8 @@ suite('test_clean_stale_rs_file_cache', 'docker') {
         sql """CREATE TABLE $table (
             `k1` int(11) NULL,
             `k2` int(11) NULL,
-            `v1` varchar(2048)
+            `v1` varchar(2048),
+            INDEX v1_idx (`v1`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT ''
             )
             DUPLICATE KEY(`k1`, `k2`)
             COMMENT 'OLAP'
@@ -61,12 +64,12 @@ suite('test_clean_stale_rs_file_cache', 'docker') {
         sql """
             insert into $table values (1, 1, 'v1'), (2, 2, 'v2'), (3, 3, 'v3')
         """
-        def cacheDirVersion2 = getTabletFileCacheDirFromBe(msHttpPort, table, 
2)
+        def cacheDirVersion2 = getTabletFileCacheDirFromBe(msHttpPort, table, 
2, "idx")
         // version 3
         sql """
             insert into $table values (10, 1, 'v1'), (20, 2, 'v2'), (30, 3, 
'v3')
         """
-        def cacheDirVersion3 = getTabletFileCacheDirFromBe(msHttpPort, table, 
2)
+        def cacheDirVersion3 = getTabletFileCacheDirFromBe(msHttpPort, table, 
3, "idx")
         // version 4
         sql """
             insert into $table values (100, 1, 'v1'), (200, 2, 'v2'), (300, 3, 
'v3')
@@ -124,6 +127,5 @@ suite('test_clean_stale_rs_file_cache', 'docker') {
                 "Matching subdir found in: ${subDirs}")
             }
         }
-
     }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to