This is an automated email from the ASF dual-hosted git repository. dataroaring pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 73c8dbcf1b5 [fix](clone) fix stale tablet report miss the new cloning replica (#38695) 73c8dbcf1b5 is described below commit 73c8dbcf1b5bda5f0d87318aec9fa85ef40e8e39 Author: yujun <yu.jun.re...@gmail.com> AuthorDate: Sun Aug 4 10:52:32 2024 +0800 [fix](clone) fix stale tablet report miss the new cloning replica (#38695) BUG: 1. BE begins collecting a tablet report; 2. BE clones a new replica A; 3. FE handles this BE's tablet report from step 1. But the report is stale: it doesn't include replica A, so FE marks replica A as bad. Only about 1 minute later, when BE reports tablets again and the new report contains replica A, will FE change replica A from bad back to good. Fix: If BE clones a new replica, it should increase its report version and tell FE to update it. Then, when FE handles the stale tablet report, it compares it against BE's report version, finds that the tablet report is stale, and discards it. --- be/src/agent/task_worker_pool.cpp | 18 +++++++++++++----- be/src/olap/task/engine_clone_task.cpp | 11 ++++++----- be/src/olap/task/engine_clone_task.h | 7 +++++-- .../main/java/org/apache/doris/master/MasterImpl.java | 5 +++++ 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp index efd15d0711b..7bbd602f571 100644 --- a/be/src/agent/task_worker_pool.cpp +++ b/be/src/agent/task_worker_pool.cpp @@ -104,6 +104,10 @@ std::unordered_map<TTaskType::type, std::unordered_set<int64_t>> s_task_signatur std::atomic_ulong s_report_version(time(nullptr) * 10000); +void increase_report_version() { + s_report_version.fetch_add(1, std::memory_order_relaxed); +} + // FIXME(plat1ko): Paired register and remove task info bool register_task_info(const TTaskType::type task_type, int64_t signature) { if (task_type == TTaskType::type::PUSH_STORAGE_POLICY || @@ -214,7 +218,7 @@ void alter_tablet(StorageEngine& engine, const TAgentTaskRequest& 
agent_task_req } if (status.ok()) { - s_report_version.fetch_add(1, std::memory_order_relaxed); + increase_report_version(); } // Return result to fe @@ -290,7 +294,7 @@ void alter_cloud_tablet(CloudStorageEngine& engine, const TAgentTaskRequest& age } if (status.ok()) { - s_report_version.fetch_add(1, std::memory_order_relaxed); + increase_report_version(); } // Return result to fe @@ -1534,7 +1538,7 @@ void create_tablet_callback(StorageEngine& engine, const TAgentTaskRequest& req) .tag("tablet_id", create_tablet_req.tablet_id) .error(status); } else { - s_report_version.fetch_add(1, std::memory_order_relaxed); + increase_report_version(); // get path hash of the created tablet TabletSharedPtr tablet; { @@ -1629,7 +1633,7 @@ void push_callback(StorageEngine& engine, const TAgentTaskRequest& req) { .tag("signature", req.signature) .tag("tablet_id", push_req.tablet_id) .tag("push_type", push_req.push_type); - ++s_report_version; + increase_report_version(); finish_task_request.__set_finish_tablet_infos(tablet_infos); } else { LOG_WARNING("failed to execute push task") @@ -1675,7 +1679,7 @@ void cloud_push_callback(CloudStorageEngine& engine, const TAgentTaskRequest& re .tag("signature", req.signature) .tag("tablet_id", push_req.tablet_id) .tag("push_type", push_req.push_type); - ++s_report_version; + increase_report_version(); auto& tablet_info = finish_task_request.finish_tablet_infos.emplace_back(); // Just need tablet_id tablet_info.tablet_id = push_req.tablet_id; @@ -1972,6 +1976,10 @@ void clone_callback(StorageEngine& engine, const TMasterInfo& master_info, LOG_INFO("successfully clone tablet") .tag("signature", req.signature) .tag("tablet_id", clone_req.tablet_id); + if (engine_task.is_new_tablet()) { + increase_report_version(); + finish_task_request.__set_report_version(s_report_version); + } finish_task_request.__set_finish_tablet_infos(tablet_infos); } diff --git a/be/src/olap/task/engine_clone_task.cpp b/be/src/olap/task/engine_clone_task.cpp index 
40b789cf873..2b7388aa7c7 100644 --- a/be/src/olap/task/engine_clone_task.cpp +++ b/be/src/olap/task/engine_clone_task.cpp @@ -190,7 +190,7 @@ Status EngineCloneTask::_do_clone() { tablet->replica_id(), false)); tablet.reset(); } - bool is_new_tablet = tablet == nullptr; + _is_new_tablet = tablet == nullptr; // try to incremental clone Versions missed_versions; // try to repair a tablet with missing version @@ -228,7 +228,7 @@ Status EngineCloneTask::_do_clone() { if (missed_versions.empty()) { LOG(INFO) << "missed version size = 0, skip clone and return success. tablet_id=" << _clone_req.tablet_id << " replica_id=" << _clone_req.replica_id; - RETURN_IF_ERROR(_set_tablet_info(is_new_tablet)); + RETURN_IF_ERROR(_set_tablet_info()); return Status::OK(); } @@ -307,10 +307,11 @@ Status EngineCloneTask::_do_clone() { TabletMeta::construct_header_file_path(tablet_dir, _clone_req.tablet_id); RETURN_IF_ERROR(io::global_local_filesystem()->delete_file(header_path)); } - return _set_tablet_info(is_new_tablet); + + return _set_tablet_info(); } -Status EngineCloneTask::_set_tablet_info(bool is_new_tablet) { +Status EngineCloneTask::_set_tablet_info() { // Get clone tablet info TTabletInfo tablet_info; tablet_info.__set_tablet_id(_clone_req.tablet_id); @@ -320,7 +321,7 @@ Status EngineCloneTask::_set_tablet_info(bool is_new_tablet) { if (_clone_req.__isset.version && tablet_info.version < _clone_req.version) { // if it is a new tablet and clone failed, then remove the tablet // if it is incremental clone, then must not drop the tablet - if (is_new_tablet) { + if (_is_new_tablet) { // we need to check if this cloned table's version is what we expect. // if not, maybe this is a stale remaining table which is waiting for drop. // we drop it. 
diff --git a/be/src/olap/task/engine_clone_task.h b/be/src/olap/task/engine_clone_task.h index 71dc3a817b8..3161b803c82 100644 --- a/be/src/olap/task/engine_clone_task.h +++ b/be/src/olap/task/engine_clone_task.h @@ -55,6 +55,8 @@ public: std::vector<TTabletInfo>* tablet_infos); ~EngineCloneTask() override = default; + bool is_new_tablet() const { return _is_new_tablet; } + private: Status _do_clone(); @@ -71,7 +73,7 @@ private: const std::vector<Version>& missing_versions, bool* allow_incremental_clone); - Status _set_tablet_info(bool is_new_tablet); + Status _set_tablet_info(); // Download tablet files from Status _download_files(DataDir* data_dir, const std::string& remote_url_prefix, @@ -95,6 +97,7 @@ private: int64_t _copy_size; int64_t _copy_time_ms; std::vector<PendingRowsetGuard> _pending_rs_guards; + bool _is_new_tablet = false; }; // EngineTask -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java b/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java index 54a05da9549..0eef0c684d6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java +++ b/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java @@ -542,6 +542,11 @@ public class MasterImpl { private void finishClone(AgentTask task, TFinishTaskRequest request) { CloneTask cloneTask = (CloneTask) task; if (cloneTask.getTaskVersion() == CloneTask.VERSION_2) { + if (request.isSetReportVersion()) { + long reportVersion = request.getReportVersion(); + Env.getCurrentSystemInfo().updateBackendReportVersion( + task.getBackendId(), reportVersion, task.getDbId(), task.getTableId()); + } Env.getCurrentEnv().getTabletScheduler().finishCloneTask(cloneTask, request); } else { LOG.warn("invalid clone task, ignore it. 
{}", task); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org