This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 085bf949fe2 branch-4.0: [fix](recycler) Prevent KV deletion on
Accessor failure and handle invalid keys #60059 (#60135)
085bf949fe2 is described below
commit 085bf949fe2d37b6cb70f659abf026f4676ccb17
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Thu Jan 22 16:24:17 2026 +0800
branch-4.0: [fix](recycler) Prevent KV deletion on Accessor failure and
handle invalid keys #60059 (#60135)
Cherry-picked from #60059
Co-authored-by: Yixuan Wang <[email protected]>
---
cloud/src/recycler/recycler.cpp | 4 ++
cloud/test/mock_accessor.h | 2 +-
cloud/test/recycler_test.cpp | 145 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 150 insertions(+), 1 deletion(-)
diff --git a/cloud/src/recycler/recycler.cpp b/cloud/src/recycler/recycler.cpp
index 3e0ee296f33..c5b72d53457 100644
--- a/cloud/src/recycler/recycler.cpp
+++ b/cloud/src/recycler/recycler.cpp
@@ -2799,6 +2799,10 @@ int InstanceRecycler::recycle_tablets(int64_t table_id,
int64_t index_id,
return -1;
}
if (tablet_keys.empty() && tablet_idx_keys.empty()) return 0;
+ if (!tablet_keys.empty() &&
+ std::ranges::all_of(tablet_keys, [](const auto& k) { return
k.first.empty(); })) {
+ return -1;
+ }
// sort the vector using key's order
std::sort(tablet_keys.begin(), tablet_keys.end(),
[](const auto& prev, const auto& last) { return prev.first <
last.first; });
diff --git a/cloud/test/mock_accessor.h b/cloud/test/mock_accessor.h
index 01953ebd3a4..6c187b7ca99 100644
--- a/cloud/test/mock_accessor.h
+++ b/cloud/test/mock_accessor.h
@@ -109,7 +109,7 @@ inline auto MockAccessor::get_prefix_range(const
std::string& path_prefix) {
}
inline int MockAccessor::delete_prefix_impl(const std::string& path_prefix) {
- TEST_SYNC_POINT("MockAccessor::delete_prefix");
+ TEST_SYNC_POINT_RETURN_WITH_VALUE("MockAccessor::delete_prefix", (int)0);
LOG(INFO) << "delete object of prefix=" << path_prefix;
std::lock_guard lock(mtx_);
diff --git a/cloud/test/recycler_test.cpp b/cloud/test/recycler_test.cpp
index 269a23eb44b..976ca7a18d9 100644
--- a/cloud/test/recycler_test.cpp
+++ b/cloud/test/recycler_test.cpp
@@ -8406,4 +8406,149 @@ TEST(RecyclerTest,
abort_job_for_related_rowset_when_tablet_recycled) {
<< "Should return 0 when tablet is already recycled (parallel
recycle scenario)";
}
+TEST(RecyclerTest, recycle_tablet_with_delete_file_failure) {
+ // Test case: recycle_tablet with delete_files failure should not cause
+ // the next recycle to hang due to improperly deleted KV entries.
+ //
+ // Bug scenario:
+ // 1. recycle_tablet calls accessor->delete_files, but it fails (returns
-1)
+ // 2. recycle_tablet returns empty key (line 2784)
+ // 3. The empty key is filtered out by SyncExecutor (line 2740)
+ // 4. Old bug: tablet_keys with empty keys caused use_range_remove logic
issue (line 2801-2810)
+ // 5. restore_job_keys were still deleted (line 2857-2859) despite tablet
not being recycled
+ // 6. Next recycle attempt would hang at check_lazy_txn_finished (line
2760-2763)
+ //
+ // After fix: Empty keys are filtered out by SyncExecutor, so tablet meta
keys
+ // won't be deleted if recycle_tablet fails.
+
+ auto* sp = SyncPoint::get_instance();
+ DORIS_CLOUD_DEFER {
+ sp->clear_all_call_backs();
+ sp->clear_trace();
+ sp->disable_processing();
+ };
+
+ auto txn_kv = std::make_shared<MemTxnKv>();
+ ASSERT_EQ(txn_kv->init(), 0);
+
+ InstanceInfoPB instance;
+ instance.set_instance_id(::instance_id);
+ auto obj_info = instance.add_obj_info();
+ obj_info->set_id("recycle_tablet_with_delete_file_failure");
+ obj_info->set_ak(config::test_s3_ak);
+ obj_info->set_sk(config::test_s3_sk);
+ obj_info->set_endpoint(config::test_s3_endpoint);
+ obj_info->set_region(config::test_s3_region);
+ obj_info->set_bucket(config::test_s3_bucket);
+ obj_info->set_prefix("recycle_tablet_with_delete_file_failure");
+
+ InstanceRecycler recycler(txn_kv, instance, thread_group,
+ std::make_shared<TxnLazyCommitter>(txn_kv));
+ ASSERT_EQ(recycler.init(), 0);
+
+ doris::TabletSchemaCloudPB schema;
+ schema.set_schema_version(1);
+
+ constexpr int64_t table_id = 21000;
+ constexpr int64_t index_id = 21001;
+ constexpr int64_t partition_id = 21002;
+ constexpr int64_t tablet_id = 21003;
+
+ auto accessor = recycler.accessor_map_.begin()->second;
+
+ // Create 5 tablet with its metadata and index keys
+ for (int i = 0; i < 5; ++i) {
+ ASSERT_EQ(create_tablet(txn_kv.get(), table_id, index_id,
partition_id, tablet_id), 0);
+ }
+
+ // Create some committed rowsets for the tablet (in meta_rowset_key)
+ // These will be picked up by recycle_tablet when it scans for rowsets
+ for (int i = 0; i < 5; ++i) {
+ create_committed_rowset(txn_kv.get(), accessor.get(),
+ "recycle_tablet_with_delete_file_failure",
tablet_id, i, index_id,
+ 2, 1, true);
+ }
+
+ // Create partition version kv (required for lazy txn check)
+ ASSERT_EQ(create_partition_version_kv(txn_kv.get(), table_id,
partition_id), 0);
+
+ // Inject failure: make delete_directory return -1
+ // This simulates the scenario where accessor fails to delete directory
from object storage
+ std::atomic<int> delete_directory_call_count {0};
+ sp->set_call_back("MockAccessor::delete_prefix",
[&delete_directory_call_count](auto&& args) {
+ auto* ret = try_any_cast_ret<int>(args);
+ delete_directory_call_count++;
+ ret->first = -1; // Return error
+ ret->second = true; // Override return value
+ });
+ sp->enable_processing();
+
+ // First recycle attempt - should fail to recycle tablet due to
delete_directory failure
+ int ret = recycler.recycle_tablets(table_id, index_id, ctx);
+ EXPECT_EQ(ret, -1) << "First recycle attempt should failed";
+ // recycle_tablets may return -1 or 0 depending on implementation,
+ // but the key point is that tablet should NOT be fully recycled
+ EXPECT_GT(delete_directory_call_count.load(), 0) << "delete_directory
should have been called";
+
+ // Verify tablet metadata still exists (because recycle_tablet failed)
+ {
+ std::unique_ptr<Transaction> txn;
+ ASSERT_EQ(txn_kv->create_txn(&txn), TxnErrorCode::TXN_OK);
+ std::string tablet_key =
+ meta_tablet_key({::instance_id, table_id, index_id,
partition_id, tablet_id});
+ std::string val;
+ // Tablet key should still exist because recycle failed
+ TxnErrorCode err = txn->get(tablet_key, &val);
+ EXPECT_EQ(err, TxnErrorCode::TXN_OK)
+ << "Tablet key should still exist after failed recycle
attempt";
+
+ // Check tablet index key also still exists
+ std::string tablet_idx_key = meta_tablet_idx_key({::instance_id,
tablet_id});
+ err = txn->get(tablet_idx_key, &val);
+ EXPECT_EQ(err, TxnErrorCode::TXN_OK)
+ << "Tablet index key should still exist after failed recycle
attempt";
+ }
+
+ // Clear the sync point to allow delete_directory to succeed
+ sp->clear_all_call_backs();
+ sp->disable_processing();
+
+ // Second recycle attempt - should succeed without hanging at
check_lazy_txn_finished
+ // This verifies the fix: empty keys are properly filtered, so KV entries
are consistent
+ ret = recycler.recycle_tablets(table_id, index_id, ctx);
+ EXPECT_EQ(ret, 0) << "Second recycle attempt should succeed";
+
+ // Verify all related kv have been deleted
+ {
+ std::unique_ptr<Transaction> txn;
+ ASSERT_EQ(txn_kv->create_txn(&txn), TxnErrorCode::TXN_OK);
+
+ // Check tablet meta key is deleted
+ std::string tablet_key =
+ meta_tablet_key({::instance_id, table_id, index_id,
partition_id, tablet_id});
+ std::string val;
+ TxnErrorCode err = txn->get(tablet_key, &val);
+ EXPECT_EQ(err, TxnErrorCode::TXN_KEY_NOT_FOUND)
+ << "Tablet key should be deleted after successful recycle";
+
+ // Check tablet index key is deleted
+ std::string tablet_idx_key = meta_tablet_idx_key({::instance_id,
tablet_id});
+ err = txn->get(tablet_idx_key, &val);
+ EXPECT_EQ(err, TxnErrorCode::TXN_KEY_NOT_FOUND)
+ << "Tablet index key should be deleted after successful
recycle";
+
+ // Check restore job key is deleted
+ std::string restore_job_key = job_restore_tablet_key({::instance_id,
tablet_id});
+ err = txn->get(restore_job_key, &val);
+ EXPECT_EQ(err, TxnErrorCode::TXN_KEY_NOT_FOUND)
+ << "Restore job key should be deleted after successful
recycle";
+
+ // Check no recycle rowset keys remain
+ std::string recyc_rs_key_begin = recycle_rowset_key({::instance_id,
tablet_id, ""});
+ std::string recyc_rs_key_end = recycle_rowset_key({::instance_id,
tablet_id + 1, ""});
+ std::unique_ptr<RangeGetIterator> it;
+ ASSERT_EQ(txn->get(recyc_rs_key_begin, recyc_rs_key_end, &it),
TxnErrorCode::TXN_OK);
+ EXPECT_EQ(it->size(), 0) << "All recycle rowset keys should be
deleted";
+ }
+}
} // namespace doris::cloud
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]