This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new 085bf949fe2 branch-4.0: [fix](recycler) Prevent KV deletion on 
Accessor failure and handle invalid keys #60059 (#60135)
085bf949fe2 is described below

commit 085bf949fe2d37b6cb70f659abf026f4676ccb17
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Thu Jan 22 16:24:17 2026 +0800

    branch-4.0: [fix](recycler) Prevent KV deletion on Accessor failure and 
handle invalid keys #60059 (#60135)
    
    Cherry-picked from #60059
    
    Co-authored-by: Yixuan Wang <[email protected]>
---
 cloud/src/recycler/recycler.cpp |   4 ++
 cloud/test/mock_accessor.h      |   2 +-
 cloud/test/recycler_test.cpp    | 145 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 150 insertions(+), 1 deletion(-)

diff --git a/cloud/src/recycler/recycler.cpp b/cloud/src/recycler/recycler.cpp
index 3e0ee296f33..c5b72d53457 100644
--- a/cloud/src/recycler/recycler.cpp
+++ b/cloud/src/recycler/recycler.cpp
@@ -2799,6 +2799,10 @@ int InstanceRecycler::recycle_tablets(int64_t table_id, 
int64_t index_id,
             return -1;
         }
         if (tablet_keys.empty() && tablet_idx_keys.empty()) return 0;
+        if (!tablet_keys.empty() &&
+            std::ranges::all_of(tablet_keys, [](const auto& k) { return 
k.first.empty(); })) {
+            return -1;
+        }
         // sort the vector using key's order
         std::sort(tablet_keys.begin(), tablet_keys.end(),
                   [](const auto& prev, const auto& last) { return prev.first < 
last.first; });
diff --git a/cloud/test/mock_accessor.h b/cloud/test/mock_accessor.h
index 01953ebd3a4..6c187b7ca99 100644
--- a/cloud/test/mock_accessor.h
+++ b/cloud/test/mock_accessor.h
@@ -109,7 +109,7 @@ inline auto MockAccessor::get_prefix_range(const 
std::string& path_prefix) {
 }
 
 inline int MockAccessor::delete_prefix_impl(const std::string& path_prefix) {
-    TEST_SYNC_POINT("MockAccessor::delete_prefix");
+    TEST_SYNC_POINT_RETURN_WITH_VALUE("MockAccessor::delete_prefix", (int)0);
     LOG(INFO) << "delete object of prefix=" << path_prefix;
     std::lock_guard lock(mtx_);
 
diff --git a/cloud/test/recycler_test.cpp b/cloud/test/recycler_test.cpp
index 269a23eb44b..976ca7a18d9 100644
--- a/cloud/test/recycler_test.cpp
+++ b/cloud/test/recycler_test.cpp
@@ -8406,4 +8406,149 @@ TEST(RecyclerTest, 
abort_job_for_related_rowset_when_tablet_recycled) {
             << "Should return 0 when tablet is already recycled (parallel 
recycle scenario)";
 }
 
+TEST(RecyclerTest, recycle_tablet_with_delete_file_failure) {
+    // Test case: recycle_tablet with delete_files failure should not cause
+    // the next recycle to hang due to improperly deleted KV entries.
+    //
+    // Bug scenario:
+    // 1. recycle_tablet calls accessor->delete_files, but it fails (returns 
-1)
+    // 2. recycle_tablet returns empty key (line 2784)
+    // 3. The empty key is filtered out by SyncExecutor (line 2740)
+    // 4. Old bug: tablet_keys with empty keys caused use_range_remove logic 
issue (line 2801-2810)
+    // 5. restore_job_keys were still deleted (line 2857-2859) despite tablet 
not being recycled
+    // 6. Next recycle attempt would hang at check_lazy_txn_finished (line 
2760-2763)
+    //
+    // After fix: Empty keys are filtered out by SyncExecutor, so tablet meta 
keys
+    // won't be deleted if recycle_tablet fails.
+
+    auto* sp = SyncPoint::get_instance();
+    DORIS_CLOUD_DEFER {
+        sp->clear_all_call_backs();
+        sp->clear_trace();
+        sp->disable_processing();
+    };
+
+    auto txn_kv = std::make_shared<MemTxnKv>();
+    ASSERT_EQ(txn_kv->init(), 0);
+
+    InstanceInfoPB instance;
+    instance.set_instance_id(::instance_id);
+    auto obj_info = instance.add_obj_info();
+    obj_info->set_id("recycle_tablet_with_delete_file_failure");
+    obj_info->set_ak(config::test_s3_ak);
+    obj_info->set_sk(config::test_s3_sk);
+    obj_info->set_endpoint(config::test_s3_endpoint);
+    obj_info->set_region(config::test_s3_region);
+    obj_info->set_bucket(config::test_s3_bucket);
+    obj_info->set_prefix("recycle_tablet_with_delete_file_failure");
+
+    InstanceRecycler recycler(txn_kv, instance, thread_group,
+                              std::make_shared<TxnLazyCommitter>(txn_kv));
+    ASSERT_EQ(recycler.init(), 0);
+
+    doris::TabletSchemaCloudPB schema;
+    schema.set_schema_version(1);
+
+    constexpr int64_t table_id = 21000;
+    constexpr int64_t index_id = 21001;
+    constexpr int64_t partition_id = 21002;
+    constexpr int64_t tablet_id = 21003;
+
+    auto accessor = recycler.accessor_map_.begin()->second;
+
+    // Create 5 tablet with its metadata and index keys
+    for (int i = 0; i < 5; ++i) {
+        ASSERT_EQ(create_tablet(txn_kv.get(), table_id, index_id, 
partition_id, tablet_id), 0);
+    }
+
+    // Create some committed rowsets for the tablet (in meta_rowset_key)
+    // These will be picked up by recycle_tablet when it scans for rowsets
+    for (int i = 0; i < 5; ++i) {
+        create_committed_rowset(txn_kv.get(), accessor.get(),
+                                "recycle_tablet_with_delete_file_failure", 
tablet_id, i, index_id,
+                                2, 1, true);
+    }
+
+    // Create partition version kv (required for lazy txn check)
+    ASSERT_EQ(create_partition_version_kv(txn_kv.get(), table_id, 
partition_id), 0);
+
+    // Inject failure: make delete_directory return -1
+    // This simulates the scenario where accessor fails to delete directory 
from object storage
+    std::atomic<int> delete_directory_call_count {0};
+    sp->set_call_back("MockAccessor::delete_prefix", 
[&delete_directory_call_count](auto&& args) {
+        auto* ret = try_any_cast_ret<int>(args);
+        delete_directory_call_count++;
+        ret->first = -1;    // Return error
+        ret->second = true; // Override return value
+    });
+    sp->enable_processing();
+
+    // First recycle attempt - should fail to recycle tablet due to 
delete_directory failure
+    int ret = recycler.recycle_tablets(table_id, index_id, ctx);
+    EXPECT_EQ(ret, -1) << "First recycle attempt should failed";
+    // recycle_tablets may return -1 or 0 depending on implementation,
+    // but the key point is that tablet should NOT be fully recycled
+    EXPECT_GT(delete_directory_call_count.load(), 0) << "delete_directory 
should have been called";
+
+    // Verify tablet metadata still exists (because recycle_tablet failed)
+    {
+        std::unique_ptr<Transaction> txn;
+        ASSERT_EQ(txn_kv->create_txn(&txn), TxnErrorCode::TXN_OK);
+        std::string tablet_key =
+                meta_tablet_key({::instance_id, table_id, index_id, 
partition_id, tablet_id});
+        std::string val;
+        // Tablet key should still exist because recycle failed
+        TxnErrorCode err = txn->get(tablet_key, &val);
+        EXPECT_EQ(err, TxnErrorCode::TXN_OK)
+                << "Tablet key should still exist after failed recycle 
attempt";
+
+        // Check tablet index key also still exists
+        std::string tablet_idx_key = meta_tablet_idx_key({::instance_id, 
tablet_id});
+        err = txn->get(tablet_idx_key, &val);
+        EXPECT_EQ(err, TxnErrorCode::TXN_OK)
+                << "Tablet index key should still exist after failed recycle 
attempt";
+    }
+
+    // Clear the sync point to allow delete_directory to succeed
+    sp->clear_all_call_backs();
+    sp->disable_processing();
+
+    // Second recycle attempt - should succeed without hanging at 
check_lazy_txn_finished
+    // This verifies the fix: empty keys are properly filtered, so KV entries 
are consistent
+    ret = recycler.recycle_tablets(table_id, index_id, ctx);
+    EXPECT_EQ(ret, 0) << "Second recycle attempt should succeed";
+
+    // Verify all related kv have been deleted
+    {
+        std::unique_ptr<Transaction> txn;
+        ASSERT_EQ(txn_kv->create_txn(&txn), TxnErrorCode::TXN_OK);
+
+        // Check tablet meta key is deleted
+        std::string tablet_key =
+                meta_tablet_key({::instance_id, table_id, index_id, 
partition_id, tablet_id});
+        std::string val;
+        TxnErrorCode err = txn->get(tablet_key, &val);
+        EXPECT_EQ(err, TxnErrorCode::TXN_KEY_NOT_FOUND)
+                << "Tablet key should be deleted after successful recycle";
+
+        // Check tablet index key is deleted
+        std::string tablet_idx_key = meta_tablet_idx_key({::instance_id, 
tablet_id});
+        err = txn->get(tablet_idx_key, &val);
+        EXPECT_EQ(err, TxnErrorCode::TXN_KEY_NOT_FOUND)
+                << "Tablet index key should be deleted after successful 
recycle";
+
+        // Check restore job key is deleted
+        std::string restore_job_key = job_restore_tablet_key({::instance_id, 
tablet_id});
+        err = txn->get(restore_job_key, &val);
+        EXPECT_EQ(err, TxnErrorCode::TXN_KEY_NOT_FOUND)
+                << "Restore job key should be deleted after successful 
recycle";
+
+        // Check no recycle rowset keys remain
+        std::string recyc_rs_key_begin = recycle_rowset_key({::instance_id, 
tablet_id, ""});
+        std::string recyc_rs_key_end = recycle_rowset_key({::instance_id, 
tablet_id + 1, ""});
+        std::unique_ptr<RangeGetIterator> it;
+        ASSERT_EQ(txn->get(recyc_rs_key_begin, recyc_rs_key_end, &it), 
TxnErrorCode::TXN_OK);
+        EXPECT_EQ(it->size(), 0) << "All recycle rowset keys should be 
deleted";
+    }
+}
 } // namespace doris::cloud


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to