This is an automated email from the ASF dual-hosted git repository.

freemandealer pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new c7449c6434f [fix](filecache) avoid crash when late holder cleanup sees 
removed cache cell (#62437)
c7449c6434f is described below

commit c7449c6434fc6899af59bbc31082d93d38c30970
Author: zhengyu <[email protected]>
AuthorDate: Mon Jun 1 16:36:26 2026 +0800

    [fix](filecache) avoid crash when late holder cleanup sees removed cache 
cell (#62437)
    
    Problem Summary:
    When FileBlocksHolder is destroyed late, the corresponding file block
    may already be removed or replaced in block file cache metadata.
    `BlockFileCache::remove()` can dereference a stale cache cell during
    duplicate cleanup and crash.
    
    ### Release note
    
    None
    
    ### Check List (For Author)
    
    - Test <!-- At least one of them must be included. -->
        - [ ] Regression test
        - [ ] Unit Test
        - [ ] Manual test (add detailed scripts or steps below)
        - [x] No need to test or manual test. Explain why:
    - [ ] This is a refactor/code format and no logic has been changed.
            - [ ] Previous test can cover this change.
            - [ ] No code files have been changed.
            - [x] Other reason <!-- Add your reason?  -->
    - Cherry-pick only. The picked commits already include BE unit-test
    coverage, and no local build/test was requested for this task.
    
    - Behavior changed:
        - [ ] No.
        - [x] Yes. <!-- Explain the behavior change -->
    - Avoid crash when late holder cleanup sees a removed or replaced cache
    cell, add warning logs for skipped duplicate remove, and add BE unit
    tests for the stale/replaced-cell cleanup paths.
    
    - Does this need documentation?
        - [x] No.
    - [ ] Yes. <!-- Add document PR link here. eg:
    https://github.com/apache/doris-website/pull/1214 -->
    
    ### Check List (For Reviewer who merge this PR)
    
    - [ ] Confirm the release note
    - [ ] Confirm test cases
    - [ ] Confirm document
    - [ ] Add branch pick label <!-- Add branch pick label that this PR
    should merge into -->
---
 be/src/io/cache/block_file_cache.cpp            |  32 ++++-
 be/test/io/cache/block_file_cache_test.cpp      | 173 ++++++++++++++++++++++++
 be/test/io/cache/block_file_cache_test_common.h |  12 +-
 3 files changed, 215 insertions(+), 2 deletions(-)

diff --git a/be/src/io/cache/block_file_cache.cpp 
b/be/src/io/cache/block_file_cache.cpp
index 3ff1526c32f..9e8f71ff2cb 100644
--- a/be/src/io/cache/block_file_cache.cpp
+++ b/be/src/io/cache/block_file_cache.cpp
@@ -1412,7 +1412,37 @@ void BlockFileCache::remove(FileBlockSPtr file_block, T& 
cache_lock, U& block_lo
     auto tablet_id = file_block->tablet_id();
     auto* cell = get_cell(hash, offset, cache_lock);
     file_block->cell = nullptr;
-    DCHECK(cell);
+    // Holder cleanup can race with prior cache metadata cleanup. In that case,
+    // skip the duplicate remove instead of touching a detached or replaced 
cell.
+    if (cell == nullptr) {
+        LOG(WARNING) << "remove skipped because cache cell is missing. hash=" 
<< hash.to_string()
+                     << " offset=" << offset << " size=" << 
file_block->range().size()
+                     << " type=" << cache_type_to_string(type)
+                     << " state=" << 
FileBlock::state_to_string(file_block->state_unsafe())
+                     << " expiration_time=" << expiration_time << " sync=" << 
sync;
+        return;
+    }
+    if (cell->file_block.get() != file_block.get()) {
+        auto* cell_file_block = cell->file_block.get();
+        LOG(WARNING)
+                << "remove skipped because cache cell points to a different 
file block. hash="
+                << hash.to_string() << " offset=" << offset
+                << " size=" << file_block->range().size() << " type=" << 
cache_type_to_string(type)
+                << " state=" << 
FileBlock::state_to_string(file_block->state_unsafe())
+                << " expiration_time=" << expiration_time << " sync=" << sync 
<< " cell_block_hash="
+                << (cell_file_block ? 
cell_file_block->get_hash_value().to_string() : "<null>")
+                << " cell_block_offset="
+                << (cell_file_block ? 
std::to_string(cell_file_block->offset()) : "<null>")
+                << " cell_block_size="
+                << (cell_file_block ? 
std::to_string(cell_file_block->range().size()) : "<null>")
+                << " cell_block_type="
+                << (cell_file_block ? 
cache_type_to_string(cell_file_block->cache_type())
+                                    : "<null>")
+                << " cell_block_state="
+                << (cell_file_block ? 
FileBlock::state_to_string(cell_file_block->state_unsafe())
+                                    : "<null>");
+        return;
+    }
     DCHECK(cell->queue_iterator);
     if (cell->queue_iterator) {
         auto& queue = get_queue(file_block->cache_type());
diff --git a/be/test/io/cache/block_file_cache_test.cpp 
b/be/test/io/cache/block_file_cache_test.cpp
index 9f5563efb3d..da078c3bff4 100644
--- a/be/test/io/cache/block_file_cache_test.cpp
+++ b/be/test/io/cache/block_file_cache_test.cpp
@@ -2723,6 +2723,179 @@ TEST_F(BlockFileCacheTest, remove_directly) {
     }
 }
 
+TEST_F(BlockFileCacheTest, late_holder_remove_skips_missing_cache_cell) {
+    if (fs::exists(cache_base_path)) {
+        fs::remove_all(cache_base_path);
+    }
+    fs::create_directories(cache_base_path);
+
+    io::FileCacheSettings settings;
+    settings.query_queue_size = 30;
+    settings.query_queue_elements = 5;
+    settings.index_queue_size = 30;
+    settings.index_queue_elements = 5;
+    settings.disposable_queue_size = 30;
+    settings.disposable_queue_elements = 5;
+    settings.capacity = 90;
+    settings.max_file_block_size = 30;
+    settings.max_query_cache_size = 0;
+
+    io::BlockFileCache cache(cache_base_path, settings);
+    ASSERT_TRUE(cache.initialize());
+    wait_until_cache_ready(cache);
+
+    io::CacheContext context;
+    ReadStatistics rstats;
+    context.stats = &rstats;
+    context.cache_type = io::FileCacheType::NORMAL;
+
+    auto key = 
io::BlockFileCache::hash("late-holder-remove-skips-missing-cache-cell");
+    auto holder = std::make_unique<FileBlocksHolder>(cache.get_or_set(key, 0, 
5, context));
+    auto blocks = fromHolder(*holder);
+    ASSERT_EQ(blocks.size(), 1);
+
+    auto file_block = blocks[0];
+    ASSERT_EQ(file_block->get_or_set_downloader(), 
io::FileBlock::get_caller_id());
+    download(file_block);
+    file_block->set_deleting();
+
+    ASSERT_EQ(cache._cur_cache_size, 5);
+    {
+        auto* cache_ptr = &cache;
+        SCOPED_CACHE_LOCK(cache_ptr->_mutex, cache_ptr);
+        auto file_it = cache._files.find(key);
+        ASSERT_NE(file_it, cache._files.end());
+        auto cell_it = file_it->second.find(0);
+        ASSERT_NE(cell_it, file_it->second.end());
+        auto& cell = cell_it->second;
+        ASSERT_TRUE(cell.queue_iterator.has_value());
+
+        auto& queue = cache.get_queue(file_block->cache_type());
+        queue.remove(*cell.queue_iterator, cache_lock);
+        cache._cur_cache_size -= file_block->range().size();
+
+        file_it->second.erase(cell_it);
+        if (file_it->second.empty()) {
+            cache._files.erase(file_it);
+        }
+    }
+
+    blocks.clear();
+    ASSERT_EQ(file_block.use_count(), 2);
+
+    holder.reset();
+
+    EXPECT_EQ(cache._cur_cache_size, 0);
+    EXPECT_EQ(cache.get_file_blocks_num(io::FileCacheType::NORMAL), 0);
+
+    file_block.reset();
+    if (fs::exists(cache_base_path)) {
+        fs::remove_all(cache_base_path);
+    }
+}
+
+TEST_F(BlockFileCacheTest, late_holder_remove_skips_replaced_cache_cell) {
+    if (fs::exists(cache_base_path)) {
+        fs::remove_all(cache_base_path);
+    }
+    fs::create_directories(cache_base_path);
+
+    io::FileCacheSettings settings;
+    settings.query_queue_size = 30;
+    settings.query_queue_elements = 5;
+    settings.index_queue_size = 30;
+    settings.index_queue_elements = 5;
+    settings.disposable_queue_size = 30;
+    settings.disposable_queue_elements = 5;
+    settings.capacity = 90;
+    settings.max_file_block_size = 30;
+    settings.max_query_cache_size = 0;
+
+    io::BlockFileCache cache(cache_base_path, settings);
+    ASSERT_TRUE(cache.initialize());
+    wait_until_cache_ready(cache);
+
+    io::CacheContext context;
+    ReadStatistics rstats;
+    context.stats = &rstats;
+    context.cache_type = io::FileCacheType::NORMAL;
+
+    auto key = 
io::BlockFileCache::hash("late-holder-remove-skips-replaced-cache-cell");
+    auto old_holder = std::make_unique<FileBlocksHolder>(cache.get_or_set(key, 
0, 5, context));
+    auto old_blocks = fromHolder(*old_holder);
+    ASSERT_EQ(old_blocks.size(), 1);
+
+    auto old_file_block = old_blocks[0];
+    ASSERT_EQ(old_file_block->get_or_set_downloader(), 
io::FileBlock::get_caller_id());
+    download(old_file_block);
+    old_file_block->set_deleting();
+
+    ASSERT_EQ(cache._cur_cache_size, 5);
+    {
+        auto* cache_ptr = &cache;
+        SCOPED_CACHE_LOCK(cache_ptr->_mutex, cache_ptr);
+        auto file_it = cache._files.find(key);
+        ASSERT_NE(file_it, cache._files.end());
+        auto cell_it = file_it->second.find(0);
+        ASSERT_NE(cell_it, file_it->second.end());
+        auto& cell = cell_it->second;
+        ASSERT_TRUE(cell.queue_iterator.has_value());
+
+        auto& queue = cache.get_queue(old_file_block->cache_type());
+        queue.remove(*cell.queue_iterator, cache_lock);
+        cache._cur_cache_size -= old_file_block->range().size();
+
+        file_it->second.erase(cell_it);
+        if (file_it->second.empty()) {
+            cache._files.erase(file_it);
+        }
+    }
+
+    old_blocks.clear();
+    ASSERT_EQ(old_file_block.use_count(), 2);
+
+    auto new_holder = std::make_unique<FileBlocksHolder>(cache.get_or_set(key, 
0, 5, context));
+    auto new_blocks = fromHolder(*new_holder);
+    ASSERT_EQ(new_blocks.size(), 1);
+
+    auto new_file_block = new_blocks[0];
+    ASSERT_NE(new_file_block.get(), old_file_block.get());
+    ASSERT_EQ(cache._cur_cache_size, 5);
+    {
+        auto* cache_ptr = &cache;
+        SCOPED_CACHE_LOCK(cache_ptr->_mutex, cache_ptr);
+        auto* cell = cache.get_cell(key, 0, cache_lock);
+        ASSERT_NE(cell, nullptr);
+        ASSERT_EQ(cell->file_block.get(), new_file_block.get());
+    }
+
+    old_holder.reset();
+
+    EXPECT_EQ(old_file_block->cell, nullptr);
+    EXPECT_EQ(cache._cur_cache_size, 5);
+    EXPECT_EQ(cache.get_file_blocks_num(io::FileCacheType::NORMAL), 1);
+    {
+        auto* cache_ptr = &cache;
+        SCOPED_CACHE_LOCK(cache_ptr->_mutex, cache_ptr);
+        auto* cell = cache.get_cell(key, 0, cache_lock);
+        ASSERT_NE(cell, nullptr);
+        EXPECT_EQ(cell->file_block.get(), new_file_block.get());
+    }
+
+    new_blocks.clear();
+    new_file_block->set_deleting();
+    new_file_block.reset();
+    new_holder.reset();
+    old_file_block.reset();
+
+    EXPECT_EQ(cache._cur_cache_size, 0);
+    EXPECT_EQ(cache.get_file_blocks_num(io::FileCacheType::NORMAL), 0);
+
+    if (fs::exists(cache_base_path)) {
+        fs::remove_all(cache_base_path);
+    }
+}
+
 TEST_F(BlockFileCacheTest, test_factory_1) {
     std::string cache_path2 = caches_dir / "cache2" / "";
     std::string cache_path3 = caches_dir / "cache3" / "";
diff --git a/be/test/io/cache/block_file_cache_test_common.h 
b/be/test/io/cache/block_file_cache_test_common.h
index de29f5543d7..17de4bb814b 100644
--- a/be/test/io/cache/block_file_cache_test_common.h
+++ b/be/test/io/cache/block_file_cache_test_common.h
@@ -94,6 +94,16 @@ extern void complete_into_memory(const io::FileBlocksHolder& 
holder);
 extern void test_file_cache(io::FileCacheType cache_type);
 extern void test_file_cache_memory_storage(io::FileCacheType cache_type);
 
+inline void wait_until_cache_ready(io::BlockFileCache& cache) {
+    for (int i = 0; i < 100; ++i) {
+        if (cache.get_async_open_success()) {
+            break;
+        }
+        std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    }
+    ASSERT_TRUE(cache.get_async_open_success());
+}
+
 class BlockFileCacheTest : public testing::Test {
 public:
     static void SetUpTestSuite() {
@@ -133,4 +143,4 @@ private:
     inline static std::unique_ptr<FileCacheFactory> factory = 
std::make_unique<FileCacheFactory>();
 };
 
-} // end of namespace doris::io
\ No newline at end of file
+} // end of namespace doris::io


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to