This is an automated email from the ASF dual-hosted git repository.

hellostephen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new c7089faad6c [fix](filecache) BlockFileCache use-after-free caused by 
destructor order (#61192)
c7089faad6c is described below

commit c7089faad6c5a1584cfc31d5c582344211ddb21e
Author: zhengyu <[email protected]>
AuthorDate: Fri Mar 20 16:11:00 2026 +0800

    [fix](filecache) BlockFileCache use-after-free caused by destructor order 
(#61192)
    
    - In BlockFileCache::initialize_unlocked(), _storage->init(this) is
    executed first; then _ttl_mgr is created holding a raw pointer obtained
    from fs_storage->get_meta_store()
    - Member variable declaration order: _ttl_mgr (declared first), _storage
    (declared last)
    - C++ destroys members in reverse declaration order, so _storage is
    destructed before _ttl_mgr
    - Risk: *_ttl_mgr threads may access already-destructed
    _storage/_meta_store
---
 be/src/io/cache/block_file_cache.cpp         | 138 +++++++++++++++------------
 be/src/io/cache/block_file_cache.h           |   6 ++
 be/src/io/cache/block_file_cache_ttl_mgr.cpp |  31 ++++--
 be/src/io/cache/block_file_cache_ttl_mgr.h   |   5 +-
 4 files changed, 111 insertions(+), 69 deletions(-)

diff --git a/be/src/io/cache/block_file_cache.cpp 
b/be/src/io/cache/block_file_cache.cpp
index a1109443aeb..5d66700636e 100644
--- a/be/src/io/cache/block_file_cache.cpp
+++ b/be/src/io/cache/block_file_cache.cpp
@@ -2198,76 +2198,94 @@ void BlockFileCache::clear_need_update_lru_blocks() {
     *_need_update_lru_blocks_length_recorder << _need_update_lru_blocks.size();
 }
 
+void BlockFileCache::pause_ttl_manager() {
+    if (_ttl_mgr) {
+        _ttl_mgr->stop();
+    }
+}
+
+void BlockFileCache::resume_ttl_manager() {
+    if (_ttl_mgr) {
+        _ttl_mgr->resume();
+    }
+}
+
 std::string BlockFileCache::clear_file_cache_directly() {
+    pause_ttl_manager();
     _lru_dumper->remove_lru_dump_files();
     using namespace std::chrono;
     std::stringstream ss;
     auto start = steady_clock::now();
-    SCOPED_CACHE_LOCK(_mutex, this);
-    LOG_INFO("start clear_file_cache_directly").tag("path", _cache_base_path);
-
-    std::string clear_msg;
-    auto s = _storage->clear(clear_msg);
-    if (!s.ok()) {
-        return clear_msg;
-    }
-
-    int64_t num_files = _files.size();
-    int64_t cache_size = _cur_cache_size;
-    int64_t index_queue_size = _index_queue.get_elements_num(cache_lock);
-    int64_t normal_queue_size = _normal_queue.get_elements_num(cache_lock);
-    int64_t disposible_queue_size = 
_disposable_queue.get_elements_num(cache_lock);
-    int64_t ttl_queue_size = _ttl_queue.get_elements_num(cache_lock);
-
-    int64_t clear_fd_duration = 0;
+    std::string result;
     {
-        // clear FDCache to release fd
-        SCOPED_RAW_TIMER(&clear_fd_duration);
-        for (const auto& [file_key, file_blocks] : _files) {
-            for (const auto& [offset, file_block_cell] : file_blocks) {
-                AccessKeyAndOffset access_key_and_offset(file_key, offset);
-                FDCache::instance()->remove_file_reader(access_key_and_offset);
+        SCOPED_CACHE_LOCK(_mutex, this);
+        LOG_INFO("start clear_file_cache_directly").tag("path", 
_cache_base_path);
+
+        std::string clear_msg;
+        auto s = _storage->clear(clear_msg);
+        if (!s.ok()) {
+            result = clear_msg;
+        } else {
+            int64_t num_files = _files.size();
+            int64_t cache_size = _cur_cache_size;
+            int64_t index_queue_size = 
_index_queue.get_elements_num(cache_lock);
+            int64_t normal_queue_size = 
_normal_queue.get_elements_num(cache_lock);
+            int64_t disposible_queue_size = 
_disposable_queue.get_elements_num(cache_lock);
+            int64_t ttl_queue_size = _ttl_queue.get_elements_num(cache_lock);
+
+            int64_t clear_fd_duration = 0;
+            {
+                // clear FDCache to release fd
+                SCOPED_RAW_TIMER(&clear_fd_duration);
+                for (const auto& [file_key, file_blocks] : _files) {
+                    for (const auto& [offset, file_block_cell] : file_blocks) {
+                        AccessKeyAndOffset access_key_and_offset(file_key, 
offset);
+                        
FDCache::instance()->remove_file_reader(access_key_and_offset);
+                    }
+                }
             }
+
+            _files.clear();
+            _cur_cache_size = 0;
+            _cur_ttl_size = 0;
+            _time_to_key.clear();
+            _key_to_time.clear();
+            _index_queue.clear(cache_lock);
+            _normal_queue.clear(cache_lock);
+            _disposable_queue.clear(cache_lock);
+            _ttl_queue.clear(cache_lock);
+
+            // Update cache metrics immediately so consumers observe the 
cleared state
+            // without waiting for the next background monitor round.
+            _cur_cache_size_metrics->set_value(0);
+            _cur_ttl_cache_size_metrics->set_value(0);
+            _cur_ttl_cache_lru_queue_cache_size_metrics->set_value(0);
+            _cur_ttl_cache_lru_queue_element_count_metrics->set_value(0);
+            _cur_normal_queue_cache_size_metrics->set_value(0);
+            _cur_normal_queue_element_count_metrics->set_value(0);
+            _cur_index_queue_cache_size_metrics->set_value(0);
+            _cur_index_queue_element_count_metrics->set_value(0);
+            _cur_disposable_queue_cache_size_metrics->set_value(0);
+            _cur_disposable_queue_element_count_metrics->set_value(0);
+
+            clear_need_update_lru_blocks();
+
+            ss << "finish clear_file_cache_directly"
+               << " path=" << _cache_base_path << " time_elapsed_ms="
+               << duration_cast<milliseconds>(steady_clock::now() - 
start).count()
+               << " fd_clear_time_ms=" << (clear_fd_duration / 1000000)
+               << " num_files=" << num_files << " cache_size=" << cache_size
+               << " index_queue_size=" << index_queue_size
+               << " normal_queue_size=" << normal_queue_size
+               << " disposible_queue_size=" << disposible_queue_size
+               << "ttl_queue_size=" << ttl_queue_size;
+            result = ss.str();
+            LOG(INFO) << result;
         }
     }
-
-    _files.clear();
-    _cur_cache_size = 0;
-    _cur_ttl_size = 0;
-    _time_to_key.clear();
-    _key_to_time.clear();
-    _index_queue.clear(cache_lock);
-    _normal_queue.clear(cache_lock);
-    _disposable_queue.clear(cache_lock);
-    _ttl_queue.clear(cache_lock);
-
-    // Synchronously update cache metrics so that 
information_schema.file_cache_statistics
-    // reflects the cleared state immediately, without waiting for the next
-    // run_background_monitor cycle.
-    _cur_cache_size_metrics->set_value(0);
-    _cur_ttl_cache_size_metrics->set_value(0);
-    _cur_ttl_cache_lru_queue_cache_size_metrics->set_value(0);
-    _cur_ttl_cache_lru_queue_element_count_metrics->set_value(0);
-    _cur_normal_queue_cache_size_metrics->set_value(0);
-    _cur_normal_queue_element_count_metrics->set_value(0);
-    _cur_index_queue_cache_size_metrics->set_value(0);
-    _cur_index_queue_element_count_metrics->set_value(0);
-    _cur_disposable_queue_cache_size_metrics->set_value(0);
-    _cur_disposable_queue_element_count_metrics->set_value(0);
-
-    clear_need_update_lru_blocks();
-
-    ss << "finish clear_file_cache_directly"
-       << " path=" << _cache_base_path
-       << " time_elapsed_ms=" << 
duration_cast<milliseconds>(steady_clock::now() - start).count()
-       << " fd_clear_time_ms=" << (clear_fd_duration / 1000000) << " 
num_files=" << num_files
-       << " cache_size=" << cache_size << " index_queue_size=" << 
index_queue_size
-       << " normal_queue_size=" << normal_queue_size
-       << " disposible_queue_size=" << disposible_queue_size << 
"ttl_queue_size=" << ttl_queue_size;
-    auto msg = ss.str();
-    LOG(INFO) << msg;
     _lru_dumper->remove_lru_dump_files();
-    return msg;
+    resume_ttl_manager();
+    return result;
 }
 
 std::map<size_t, FileBlockSPtr> BlockFileCache::get_blocks_by_key(const 
UInt128Wrapper& hash) {
diff --git a/be/src/io/cache/block_file_cache.h 
b/be/src/io/cache/block_file_cache.h
index e1c64a665bf..4c155ba2d44 100644
--- a/be/src/io/cache/block_file_cache.h
+++ b/be/src/io/cache/block_file_cache.h
@@ -200,6 +200,9 @@ public:
         if (_cache_background_block_lru_update_thread.joinable()) {
             _cache_background_block_lru_update_thread.join();
         }
+        if (_ttl_mgr) {
+            _ttl_mgr.reset();
+        }
     }
 
     /// Restore cache from local filesystem.
@@ -307,6 +310,9 @@ public:
 
     void update_ttl_atime(const UInt128Wrapper& hash);
 
+    void pause_ttl_manager();
+    void resume_ttl_manager();
+
     std::map<std::string, double> get_stats();
 
     // for be UTs
diff --git a/be/src/io/cache/block_file_cache_ttl_mgr.cpp 
b/be/src/io/cache/block_file_cache_ttl_mgr.cpp
index 7b9e66489cc..939045aa71e 100644
--- a/be/src/io/cache/block_file_cache_ttl_mgr.cpp
+++ b/be/src/io/cache/block_file_cache_ttl_mgr.cpp
@@ -40,16 +40,15 @@ BlockFileCacheTtlMgr::BlockFileCacheTtlMgr(BlockFileCache* 
mgr, CacheBlockMetaSt
         : _mgr(mgr), _meta_store(meta_store), _stop_background(false) {
     _tablet_id_set_size_metrics = std::make_shared<bvar::Status<size_t>>(
             _mgr->get_base_path().c_str(), 
"file_cache_ttl_mgr_tablet_id_set_size", 0);
-    // Start background threads
-    _update_ttl_thread =
-            
std::thread(&BlockFileCacheTtlMgr::run_backgroud_update_ttl_info_map, this);
-    _expiration_check_thread =
-            std::thread(&BlockFileCacheTtlMgr::run_backgroud_expiration_check, 
this);
-    _tablet_id_flush_thread =
-            std::thread(&BlockFileCacheTtlMgr::run_background_tablet_id_flush, 
this);
+    resume();
 }
 
 BlockFileCacheTtlMgr::~BlockFileCacheTtlMgr() {
+    stop();
+}
+
+void BlockFileCacheTtlMgr::stop() {
+    std::lock_guard<std::mutex> lifecycle_lock(_thread_lifecycle_mutex);
     _stop_background.store(true, std::memory_order_release);
 
     if (_update_ttl_thread.joinable()) {
@@ -65,6 +64,22 @@ BlockFileCacheTtlMgr::~BlockFileCacheTtlMgr() {
     }
 }
 
+void BlockFileCacheTtlMgr::resume() {
+    std::lock_guard<std::mutex> lifecycle_lock(_thread_lifecycle_mutex);
+    if (_update_ttl_thread.joinable() || _expiration_check_thread.joinable() ||
+        _tablet_id_flush_thread.joinable()) {
+        return;
+    }
+
+    _stop_background.store(false, std::memory_order_release);
+    _update_ttl_thread =
+            
std::thread(&BlockFileCacheTtlMgr::run_backgroud_update_ttl_info_map, this);
+    _expiration_check_thread =
+            std::thread(&BlockFileCacheTtlMgr::run_backgroud_expiration_check, 
this);
+    _tablet_id_flush_thread =
+            std::thread(&BlockFileCacheTtlMgr::run_background_tablet_id_flush, 
this);
+}
+
 void BlockFileCacheTtlMgr::register_tablet_id(int64_t tablet_id) {
     _tablet_id_queue.enqueue(tablet_id);
 }
@@ -292,4 +307,4 @@ void BlockFileCacheTtlMgr::run_backgroud_expiration_check() 
{
     }
 }
 
-} // namespace doris::io
\ No newline at end of file
+} // namespace doris::io
diff --git a/be/src/io/cache/block_file_cache_ttl_mgr.h 
b/be/src/io/cache/block_file_cache_ttl_mgr.h
index b16b5486b4b..8cd677446f1 100644
--- a/be/src/io/cache/block_file_cache_ttl_mgr.h
+++ b/be/src/io/cache/block_file_cache_ttl_mgr.h
@@ -48,6 +48,8 @@ public:
     ~BlockFileCacheTtlMgr();
 
     void register_tablet_id(int64_t tablet_id);
+    void stop();
+    void resume();
 
     // Background thread to update ttl_info_map
     void run_backgroud_update_ttl_info_map();
@@ -73,10 +75,11 @@ private:
     std::thread _update_ttl_thread;
     std::thread _expiration_check_thread;
     std::thread _tablet_id_flush_thread;
+    std::mutex _thread_lifecycle_mutex;
 
     std::mutex _ttl_info_mutex;
 
     std::shared_ptr<bvar::Status<size_t>> _tablet_id_set_size_metrics;
 };
 
-} // namespace doris::io
\ No newline at end of file
+} // namespace doris::io


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to