This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git

The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 813fa6e9e60 [improvement](drop tablet) reduce shard lock wait time 
(#38469)
813fa6e9e60 is described below

commit 813fa6e9e609e71396826bc81fc6f6fb92dcedc6
Author: yujun <yu.jun.re...@gmail.com>
AuthorDate: Mon Jul 29 22:03:26 2024 +0800

    [improvement](drop tablet) reduce shard lock wait time (#38469)
    
    drop tablet's code:
    
    ```
        lock_guard(tablet_manager.shard.lock);
    
        delete tablet from tablet map
    
         {
                lock_guard(tablet.meta_lock)
                ...
         }
    
         ...
    ```
    
    Sometimes, tablet meta lock may held by other threads for a long time,
    it will cause shard.lock also wait for a long time.
    
    For creating tablet thread, it also need to acquire the shard.lock, so
    creating tablet thread will also wait for a long time, then creating
    partition will be timeout.
    
    Fix:   shard.lock   don't wait the time of the tablet meta lock.
---
 be/src/olap/tablet_manager.cpp | 50 +++++++++++++++++++++++-------------------
 be/src/olap/tablet_manager.h   |  4 ++--
 2 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp
index a234ab93a47..6696dcf2e68 100644
--- a/be/src/olap/tablet_manager.cpp
+++ b/be/src/olap/tablet_manager.cpp
@@ -226,7 +226,7 @@ Status TabletManager::_add_tablet_to_map_unlocked(TTabletId 
tablet_id,
         // If the new tablet is fresher than the existing one, then replace
         // the existing tablet with the new one.
         // Use default replica_id to ignore whether replica_id is match when 
drop tablet.
-        Status status = _drop_tablet_unlocked(tablet_id, /* replica_id */ 0, 
keep_files, false);
+        Status status = _drop_tablet(tablet_id, /* replica_id */ 0, 
keep_files, false, true);
         COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "DropOldTablet", "AddTablet"),
                        static_cast<int64_t>(watch.reset()));
         RETURN_NOT_OK_STATUS_WITH_WARN(
@@ -438,7 +438,7 @@ TabletSharedPtr 
TabletManager::_internal_create_tablet_unlocked(
     }
     // something is wrong, we need clear environment
     if (is_tablet_added) {
-        Status status = _drop_tablet_unlocked(new_tablet_id, 
request.replica_id, false, false);
+        Status status = _drop_tablet(new_tablet_id, request.replica_id, false, 
false, true);
         COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "DropTablet", 
parent_timer_name),
                        static_cast<int64_t>(watch.reset()));
         if (!status.ok()) {
@@ -522,14 +522,12 @@ TabletSharedPtr 
TabletManager::_create_tablet_meta_and_dir_unlocked(
 
 Status TabletManager::drop_tablet(TTabletId tablet_id, TReplicaId replica_id,
                                   bool is_drop_table_or_partition) {
-    auto& shard = _get_tablets_shard(tablet_id);
-    std::lock_guard wrlock(shard.lock);
-    return _drop_tablet_unlocked(tablet_id, replica_id, false, 
is_drop_table_or_partition);
+    return _drop_tablet(tablet_id, replica_id, false, 
is_drop_table_or_partition, false);
 }
 
 // Drop specified tablet.
-Status TabletManager::_drop_tablet_unlocked(TTabletId tablet_id, TReplicaId 
replica_id,
-                                            bool keep_files, bool 
is_drop_table_or_partition) {
+Status TabletManager::_drop_tablet(TTabletId tablet_id, TReplicaId replica_id, 
bool keep_files,
+                                   bool is_drop_table_or_partition, bool 
had_held_shard_lock) {
     LOG(INFO) << "begin drop tablet. tablet_id=" << tablet_id << ", 
replica_id=" << replica_id
               << ", is_drop_table_or_partition=" << is_drop_table_or_partition;
     DorisMetrics::instance()->drop_tablet_requests_total->increment(1);
@@ -538,23 +536,31 @@ Status TabletManager::_drop_tablet_unlocked(TTabletId 
tablet_id, TReplicaId repl
     Defer defer {[&]() { unregister_transition_tablet(tablet_id, "drop 
tablet"); }};
 
     // Fetch tablet which need to be dropped
-    TabletSharedPtr to_drop_tablet = _get_tablet_unlocked(tablet_id);
-    if (to_drop_tablet == nullptr) {
-        LOG(WARNING) << "fail to drop tablet because it does not exist. "
-                     << "tablet_id=" << tablet_id;
-        return Status::OK();
-    }
+    TabletSharedPtr to_drop_tablet;
+    {
+        std::unique_lock<std::shared_mutex> 
wlock(_get_tablets_shard_lock(tablet_id),
+                                                  std::defer_lock);
+        if (!had_held_shard_lock) {
+            wlock.lock();
+        }
+        to_drop_tablet = _get_tablet_unlocked(tablet_id);
+        if (to_drop_tablet == nullptr) {
+            LOG(WARNING) << "fail to drop tablet because it does not exist. "
+                         << "tablet_id=" << tablet_id;
+            return Status::OK();
+        }
 
-    // We should compare replica id to avoid dropping new cloned tablet.
-    // Iff request replica id is 0, FE may be an older release, then we drop 
this tablet as before.
-    if (to_drop_tablet->replica_id() != replica_id && replica_id != 0) {
-        return Status::Aborted("replica_id not match({} vs {})", 
to_drop_tablet->replica_id(),
-                               replica_id);
-    }
+        // We should compare replica id to avoid dropping new cloned tablet.
+        // Iff request replica id is 0, FE may be an older release, then we 
drop this tablet as before.
+        if (to_drop_tablet->replica_id() != replica_id && replica_id != 0) {
+            return Status::Aborted("replica_id not match({} vs {})", 
to_drop_tablet->replica_id(),
+                                   replica_id);
+        }
 
-    _remove_tablet_from_partition(to_drop_tablet);
-    tablet_map_t& tablet_map = _get_tablet_map(tablet_id);
-    tablet_map.erase(tablet_id);
+        _remove_tablet_from_partition(to_drop_tablet);
+        tablet_map_t& tablet_map = _get_tablet_map(tablet_id);
+        tablet_map.erase(tablet_id);
+    }
 
     to_drop_tablet->clear_cache();
 
diff --git a/be/src/olap/tablet_manager.h b/be/src/olap/tablet_manager.h
index 809a2237356..42623cf05f2 100644
--- a/be/src/olap/tablet_manager.h
+++ b/be/src/olap/tablet_manager.h
@@ -194,8 +194,8 @@ private:
 
     bool _check_tablet_id_exist_unlocked(TTabletId tablet_id);
 
-    Status _drop_tablet_unlocked(TTabletId tablet_id, TReplicaId replica_id, 
bool keep_files,
-                                 bool is_drop_table_or_partition);
+    Status _drop_tablet(TTabletId tablet_id, TReplicaId replica_id, bool 
keep_files,
+                        bool is_drop_table_or_partition, bool 
had_held_shard_lock);
 
     TabletSharedPtr _get_tablet_unlocked(TTabletId tablet_id);
     TabletSharedPtr _get_tablet_unlocked(TTabletId tablet_id, bool 
include_deleted,


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to