This is an automated email from the ASF dual-hosted git repository. dataroaring pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new 813fa6e9e60 [improvement](drop tablet) reduce shard lock wait time (#38469) 813fa6e9e60 is described below commit 813fa6e9e609e71396826bc81fc6f6fb92dcedc6 Author: yujun <yu.jun.re...@gmail.com> AuthorDate: Mon Jul 29 22:03:26 2024 +0800 [improvement](drop tablet) reduce shard lock wait time (#38469) drop tablet's code: ``` lock_guard(tablet_manager.shard.lock); delete tablet from tablet map { lock_guard(tablet.meta_lock) ... } ... ``` Sometimes, tablet meta lock may held by other threads for a long time, it will cause shard.lock also wait for a long time. For creating tablet thread, it also need to acquire the shard.lock, so creating tablet thread will also wait for a long time, then creating partition will be timeout. Fix: shard.lock don't wait the time of the tablet meta lock. --- be/src/olap/tablet_manager.cpp | 50 +++++++++++++++++++++++------------------- be/src/olap/tablet_manager.h | 4 ++-- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp index a234ab93a47..6696dcf2e68 100644 --- a/be/src/olap/tablet_manager.cpp +++ b/be/src/olap/tablet_manager.cpp @@ -226,7 +226,7 @@ Status TabletManager::_add_tablet_to_map_unlocked(TTabletId tablet_id, // If the new tablet is fresher than the existing one, then replace // the existing tablet with the new one. // Use default replica_id to ignore whether replica_id is match when drop tablet. - Status status = _drop_tablet_unlocked(tablet_id, /* replica_id */ 0, keep_files, false); + Status status = _drop_tablet(tablet_id, /* replica_id */ 0, keep_files, false, true); COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "DropOldTablet", "AddTablet"), static_cast<int64_t>(watch.reset())); RETURN_NOT_OK_STATUS_WITH_WARN( @@ -438,7 +438,7 @@ TabletSharedPtr TabletManager::_internal_create_tablet_unlocked( } // something is wrong, we need clear environment if (is_tablet_added) { - Status status = _drop_tablet_unlocked(new_tablet_id, request.replica_id, false, false); + Status status = _drop_tablet(new_tablet_id, request.replica_id, false, false, true); COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "DropTablet", parent_timer_name), static_cast<int64_t>(watch.reset())); if (!status.ok()) { @@ -522,14 +522,12 @@ TabletSharedPtr TabletManager::_create_tablet_meta_and_dir_unlocked( Status TabletManager::drop_tablet(TTabletId tablet_id, TReplicaId replica_id, bool is_drop_table_or_partition) { - auto& shard = _get_tablets_shard(tablet_id); - std::lock_guard wrlock(shard.lock); - return _drop_tablet_unlocked(tablet_id, replica_id, false, is_drop_table_or_partition); + return _drop_tablet(tablet_id, replica_id, false, is_drop_table_or_partition, false); } // Drop specified tablet. -Status TabletManager::_drop_tablet_unlocked(TTabletId tablet_id, TReplicaId replica_id, - bool keep_files, bool is_drop_table_or_partition) { +Status TabletManager::_drop_tablet(TTabletId tablet_id, TReplicaId replica_id, bool keep_files, + bool is_drop_table_or_partition, bool had_held_shard_lock) { LOG(INFO) << "begin drop tablet. tablet_id=" << tablet_id << ", replica_id=" << replica_id << ", is_drop_table_or_partition=" << is_drop_table_or_partition; DorisMetrics::instance()->drop_tablet_requests_total->increment(1); @@ -538,23 +536,31 @@ Status TabletManager::_drop_tablet_unlocked(TTabletId tablet_id, TReplicaId repl Defer defer {[&]() { unregister_transition_tablet(tablet_id, "drop tablet"); }}; // Fetch tablet which need to be dropped - TabletSharedPtr to_drop_tablet = _get_tablet_unlocked(tablet_id); - if (to_drop_tablet == nullptr) { - LOG(WARNING) << "fail to drop tablet because it does not exist. " - << "tablet_id=" << tablet_id; - return Status::OK(); - } + TabletSharedPtr to_drop_tablet; + { + std::unique_lock<std::shared_mutex> wlock(_get_tablets_shard_lock(tablet_id), + std::defer_lock); + if (!had_held_shard_lock) { + wlock.lock(); + } + to_drop_tablet = _get_tablet_unlocked(tablet_id); + if (to_drop_tablet == nullptr) { + LOG(WARNING) << "fail to drop tablet because it does not exist. " + << "tablet_id=" << tablet_id; + return Status::OK(); + } - // We should compare replica id to avoid dropping new cloned tablet. - // Iff request replica id is 0, FE may be an older release, then we drop this tablet as before. - if (to_drop_tablet->replica_id() != replica_id && replica_id != 0) { - return Status::Aborted("replica_id not match({} vs {})", to_drop_tablet->replica_id(), - replica_id); - } + // We should compare replica id to avoid dropping new cloned tablet. + // Iff request replica id is 0, FE may be an older release, then we drop this tablet as before. + if (to_drop_tablet->replica_id() != replica_id && replica_id != 0) { + return Status::Aborted("replica_id not match({} vs {})", to_drop_tablet->replica_id(), + replica_id); + } - _remove_tablet_from_partition(to_drop_tablet); - tablet_map_t& tablet_map = _get_tablet_map(tablet_id); - tablet_map.erase(tablet_id); + _remove_tablet_from_partition(to_drop_tablet); + tablet_map_t& tablet_map = _get_tablet_map(tablet_id); + tablet_map.erase(tablet_id); + } to_drop_tablet->clear_cache(); diff --git a/be/src/olap/tablet_manager.h b/be/src/olap/tablet_manager.h index 809a2237356..42623cf05f2 100644 --- a/be/src/olap/tablet_manager.h +++ b/be/src/olap/tablet_manager.h @@ -194,8 +194,8 @@ private: bool _check_tablet_id_exist_unlocked(TTabletId tablet_id); - Status _drop_tablet_unlocked(TTabletId tablet_id, TReplicaId replica_id, bool keep_files, - bool is_drop_table_or_partition); + Status _drop_tablet(TTabletId tablet_id, TReplicaId replica_id, bool keep_files, + bool is_drop_table_or_partition, bool had_held_shard_lock); TabletSharedPtr _get_tablet_unlocked(TTabletId tablet_id); TabletSharedPtr _get_tablet_unlocked(TTabletId tablet_id, bool include_deleted, --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org