This is an automated email from the ASF dual-hosted git repository.

zhangchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 840503e8d58 [improve](cloud-mow) reduce ms update_delete_bitmap kv 
conflict (#47375)
840503e8d58 is described below

commit 840503e8d584b53635b92ead247ac02b3772d48b
Author: meiyi <me...@selectdb.com>
AuthorDate: Thu Jun 5 14:51:56 2025 +0800

    [improve](cloud-mow) reduce ms update_delete_bitmap kv conflict (#47375)
    
    Problem Summary:
    
    1. `update_delete_bitmap` may be split into several transactions so that
    the delete bitmap size does not exceed the fdb transaction limit
    2. multiple compaction jobs will change the initiators of the lock_info,
    which will cause txn_conflict of `update_delete_bitmap`.
    3. for an update split across multiple transactions, the txn_conflict
    error is more likely to happen; even after some retries, the
    `update_delete_bitmap` will fail
    4. the root cause is that multiple compactions should not conflict with
    each other; pr 48024 solves it
    5. but branch-3.0 does not contain pr 48024, so change the lock_id check
    to a snapshot read to avoid txn_conflict. if lock_id has changed, the final
    `commit_txn` or `commit_job` can handle it
---
 cloud/src/meta-service/meta_service.cpp     | 14 +++++++++-----
 cloud/src/meta-service/meta_service.h       |  2 ++
 cloud/src/meta-service/meta_service_job.cpp |  3 ---
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/cloud/src/meta-service/meta_service.cpp 
b/cloud/src/meta-service/meta_service.cpp
index 36b32063a96..45e64f475f6 100644
--- a/cloud/src/meta-service/meta_service.cpp
+++ b/cloud/src/meta-service/meta_service.cpp
@@ -1749,10 +1749,12 @@ void 
MetaServiceImpl::get_tablet_stats(::google::protobuf::RpcController* contro
 static bool check_delete_bitmap_lock(MetaServiceCode& code, std::string& msg, 
std::stringstream& ss,
                                      std::unique_ptr<Transaction>& txn, 
int64_t table_id,
                                      int64_t lock_id, int64_t lock_initiator, 
std::string& lock_key,
-                                     DeleteBitmapUpdateLockPB& lock_info) {
+                                     DeleteBitmapUpdateLockPB& lock_info,
+                                     bool snapshot_read = false) {
     std::string lock_val;
-    LOG(INFO) << "check_delete_bitmap_lock, table_id=" << table_id << " key=" 
<< hex(lock_key);
-    auto err = txn->get(lock_key, &lock_val);
+    LOG(INFO) << "check_delete_bitmap_lock, table_id=" << table_id << " key=" 
<< hex(lock_key)
+              << ", lock_id=" << lock_id << ", snapshot_read=" << 
snapshot_read;
+    auto err = txn->get(lock_key, &lock_val, snapshot_read);
     
TEST_SYNC_POINT_CALLBACK("check_delete_bitmap_lock.inject_get_lock_key_err", 
&err);
     if (err == TxnErrorCode::TXN_KEY_NOT_FOUND) {
         msg = "lock id key not found";
@@ -1963,12 +1965,13 @@ void 
MetaServiceImpl::update_delete_bitmap(google::protobuf::RpcController* cont
     }
 
     bool unlock = request->has_unlock() ? request->unlock() : false;
+    bool snapshot_read = request->lock_id() == 
COMPACTION_DELETE_BITMAP_LOCK_ID;
     if (!unlock) {
         // 1. Check whether the lock expires
         std::string lock_key = 
meta_delete_bitmap_update_lock_key({instance_id, table_id, -1});
         DeleteBitmapUpdateLockPB lock_info;
         if (!check_delete_bitmap_lock(code, msg, ss, txn, table_id, 
request->lock_id(),
-                                      request->initiator(), lock_key, 
lock_info)) {
+                                      request->initiator(), lock_key, 
lock_info, snapshot_read)) {
             LOG(WARNING) << "failed to check delete bitmap lock, table_id=" << 
table_id
                          << " request lock_id=" << request->lock_id()
                          << " request initiator=" << request->initiator() << " 
msg " << msg;
@@ -2079,7 +2082,8 @@ void 
MetaServiceImpl::update_delete_bitmap(google::protobuf::RpcController* cont
                         meta_delete_bitmap_update_lock_key({instance_id, 
table_id, -1});
                 DeleteBitmapUpdateLockPB lock_info;
                 if (!check_delete_bitmap_lock(code, msg, ss, txn, table_id, 
request->lock_id(),
-                                              request->initiator(), lock_key, 
lock_info)) {
+                                              request->initiator(), lock_key, 
lock_info,
+                                              snapshot_read)) {
                     LOG(WARNING) << "failed to check delete bitmap lock, 
table_id=" << table_id
                                  << " request lock_id=" << request->lock_id()
                                  << " request initiator=" << 
request->initiator() << " msg " << msg;
diff --git a/cloud/src/meta-service/meta_service.h 
b/cloud/src/meta-service/meta_service.h
index 6df09bd2c20..57f88d51dfe 100644
--- a/cloud/src/meta-service/meta_service.h
+++ b/cloud/src/meta-service/meta_service.h
@@ -39,6 +39,8 @@ namespace doris::cloud {
 class Transaction;
 
 constexpr std::string_view BUILT_IN_STORAGE_VAULT_NAME = 
"built_in_storage_vault";
+static constexpr int COMPACTION_DELETE_BITMAP_LOCK_ID = -1;
+static constexpr int SCHEMA_CHANGE_DELETE_BITMAP_LOCK_ID = -2;
 
 void internal_get_rowset(Transaction* txn, int64_t start, int64_t end,
                          const std::string& instance_id, int64_t tablet_id, 
MetaServiceCode& code,
diff --git a/cloud/src/meta-service/meta_service_job.cpp 
b/cloud/src/meta-service/meta_service_job.cpp
index 29f1c9993fd..3dd89afbb61 100644
--- a/cloud/src/meta-service/meta_service_job.cpp
+++ b/cloud/src/meta-service/meta_service_job.cpp
@@ -46,9 +46,6 @@ static inline constexpr size_t get_file_name_offset(const T 
(&s)[S], size_t i =
 
 namespace doris::cloud {
 
-static constexpr int COMPACTION_DELETE_BITMAP_LOCK_ID = -1;
-static constexpr int SCHEMA_CHANGE_DELETE_BITMAP_LOCK_ID = -2;
-
 // check compaction input_versions are valid during schema change.
 // If the schema change job doesnt have alter version, it dont need to check
 // because the schema change job is come from old version BE.


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to