This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git

The following commit(s) were added to refs/heads/master by this push:
     new 73c8dbcf1b5 [fix](clone) fix stale tablet report miss the new cloning 
replica (#38695)
73c8dbcf1b5 is described below

commit 73c8dbcf1b5bda5f0d87318aec9fa85ef40e8e39
Author: yujun <yu.jun.re...@gmail.com>
AuthorDate: Sun Aug 4 10:52:32 2024 +0800

    [fix](clone) fix stale tablet report miss the new cloning replica (#38695)
    
    BUG:
    1. BE begins collecting a tablet report;
    2. BE clones a new replica A;
    3. FE handles this BE's tablet report from step 1. But it is stale: it
    does not include replica A, so FE marks replica A as bad;
    
    Only 1 min later, when BE reports its tablets again, does the new report
    contain replica A; only after that will FE change replica A from bad
    to good.
    
    Fix:
    If BE clones a new replica, it should increase its report version and
    tell FE to update it. Then, when FE handles the stale tablet report, it
    will compare BE's report version, find that the tablet report is stale,
    and discard it.
---
 be/src/agent/task_worker_pool.cpp                      | 18 +++++++++++++-----
 be/src/olap/task/engine_clone_task.cpp                 | 11 ++++++-----
 be/src/olap/task/engine_clone_task.h                   |  7 +++++--
 .../main/java/org/apache/doris/master/MasterImpl.java  |  5 +++++
 4 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/be/src/agent/task_worker_pool.cpp 
b/be/src/agent/task_worker_pool.cpp
index efd15d0711b..7bbd602f571 100644
--- a/be/src/agent/task_worker_pool.cpp
+++ b/be/src/agent/task_worker_pool.cpp
@@ -104,6 +104,10 @@ std::unordered_map<TTaskType::type, 
std::unordered_set<int64_t>> s_task_signatur
 
 std::atomic_ulong s_report_version(time(nullptr) * 10000);
 
+void increase_report_version() {
+    s_report_version.fetch_add(1, std::memory_order_relaxed);
+}
+
 // FIXME(plat1ko): Paired register and remove task info
 bool register_task_info(const TTaskType::type task_type, int64_t signature) {
     if (task_type == TTaskType::type::PUSH_STORAGE_POLICY ||
@@ -214,7 +218,7 @@ void alter_tablet(StorageEngine& engine, const 
TAgentTaskRequest& agent_task_req
     }
 
     if (status.ok()) {
-        s_report_version.fetch_add(1, std::memory_order_relaxed);
+        increase_report_version();
     }
 
     // Return result to fe
@@ -290,7 +294,7 @@ void alter_cloud_tablet(CloudStorageEngine& engine, const 
TAgentTaskRequest& age
     }
 
     if (status.ok()) {
-        s_report_version.fetch_add(1, std::memory_order_relaxed);
+        increase_report_version();
     }
 
     // Return result to fe
@@ -1534,7 +1538,7 @@ void create_tablet_callback(StorageEngine& engine, const 
TAgentTaskRequest& req)
                 .tag("tablet_id", create_tablet_req.tablet_id)
                 .error(status);
     } else {
-        s_report_version.fetch_add(1, std::memory_order_relaxed);
+        increase_report_version();
         // get path hash of the created tablet
         TabletSharedPtr tablet;
         {
@@ -1629,7 +1633,7 @@ void push_callback(StorageEngine& engine, const 
TAgentTaskRequest& req) {
                 .tag("signature", req.signature)
                 .tag("tablet_id", push_req.tablet_id)
                 .tag("push_type", push_req.push_type);
-        ++s_report_version;
+        increase_report_version();
         finish_task_request.__set_finish_tablet_infos(tablet_infos);
     } else {
         LOG_WARNING("failed to execute push task")
@@ -1675,7 +1679,7 @@ void cloud_push_callback(CloudStorageEngine& engine, 
const TAgentTaskRequest& re
                 .tag("signature", req.signature)
                 .tag("tablet_id", push_req.tablet_id)
                 .tag("push_type", push_req.push_type);
-        ++s_report_version;
+        increase_report_version();
         auto& tablet_info = 
finish_task_request.finish_tablet_infos.emplace_back();
         // Just need tablet_id
         tablet_info.tablet_id = push_req.tablet_id;
@@ -1972,6 +1976,10 @@ void clone_callback(StorageEngine& engine, const 
TMasterInfo& master_info,
         LOG_INFO("successfully clone tablet")
                 .tag("signature", req.signature)
                 .tag("tablet_id", clone_req.tablet_id);
+        if (engine_task.is_new_tablet()) {
+            increase_report_version();
+            finish_task_request.__set_report_version(s_report_version);
+        }
         finish_task_request.__set_finish_tablet_infos(tablet_infos);
     }
 
diff --git a/be/src/olap/task/engine_clone_task.cpp 
b/be/src/olap/task/engine_clone_task.cpp
index 40b789cf873..2b7388aa7c7 100644
--- a/be/src/olap/task/engine_clone_task.cpp
+++ b/be/src/olap/task/engine_clone_task.cpp
@@ -190,7 +190,7 @@ Status EngineCloneTask::_do_clone() {
                                                               
tablet->replica_id(), false));
         tablet.reset();
     }
-    bool is_new_tablet = tablet == nullptr;
+    _is_new_tablet = tablet == nullptr;
     // try to incremental clone
     Versions missed_versions;
     // try to repair a tablet with missing version
@@ -228,7 +228,7 @@ Status EngineCloneTask::_do_clone() {
         if (missed_versions.empty()) {
             LOG(INFO) << "missed version size = 0, skip clone and return 
success. tablet_id="
                       << _clone_req.tablet_id << " replica_id=" << 
_clone_req.replica_id;
-            RETURN_IF_ERROR(_set_tablet_info(is_new_tablet));
+            RETURN_IF_ERROR(_set_tablet_info());
             return Status::OK();
         }
 
@@ -307,10 +307,11 @@ Status EngineCloneTask::_do_clone() {
                 TabletMeta::construct_header_file_path(tablet_dir, 
_clone_req.tablet_id);
         
RETURN_IF_ERROR(io::global_local_filesystem()->delete_file(header_path));
     }
-    return _set_tablet_info(is_new_tablet);
+
+    return _set_tablet_info();
 }
 
-Status EngineCloneTask::_set_tablet_info(bool is_new_tablet) {
+Status EngineCloneTask::_set_tablet_info() {
     // Get clone tablet info
     TTabletInfo tablet_info;
     tablet_info.__set_tablet_id(_clone_req.tablet_id);
@@ -320,7 +321,7 @@ Status EngineCloneTask::_set_tablet_info(bool 
is_new_tablet) {
     if (_clone_req.__isset.version && tablet_info.version < 
_clone_req.version) {
         // if it is a new tablet and clone failed, then remove the tablet
         // if it is incremental clone, then must not drop the tablet
-        if (is_new_tablet) {
+        if (_is_new_tablet) {
             // we need to check if this cloned table's version is what we 
expect.
             // if not, maybe this is a stale remaining table which is waiting 
for drop.
             // we drop it.
diff --git a/be/src/olap/task/engine_clone_task.h 
b/be/src/olap/task/engine_clone_task.h
index 71dc3a817b8..3161b803c82 100644
--- a/be/src/olap/task/engine_clone_task.h
+++ b/be/src/olap/task/engine_clone_task.h
@@ -55,6 +55,8 @@ public:
                     std::vector<TTabletInfo>* tablet_infos);
     ~EngineCloneTask() override = default;
 
+    bool is_new_tablet() const { return _is_new_tablet; }
+
 private:
     Status _do_clone();
 
@@ -71,7 +73,7 @@ private:
                                         const std::vector<Version>& 
missing_versions,
                                         bool* allow_incremental_clone);
 
-    Status _set_tablet_info(bool is_new_tablet);
+    Status _set_tablet_info();
 
     // Download tablet files from
     Status _download_files(DataDir* data_dir, const std::string& 
remote_url_prefix,
@@ -95,6 +97,7 @@ private:
     int64_t _copy_size;
     int64_t _copy_time_ms;
     std::vector<PendingRowsetGuard> _pending_rs_guards;
+    bool _is_new_tablet = false;
 }; // EngineTask
 
-} // namespace doris
\ No newline at end of file
+} // namespace doris
diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java 
b/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java
index 54a05da9549..0eef0c684d6 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java
@@ -542,6 +542,11 @@ public class MasterImpl {
     private void finishClone(AgentTask task, TFinishTaskRequest request) {
         CloneTask cloneTask = (CloneTask) task;
         if (cloneTask.getTaskVersion() == CloneTask.VERSION_2) {
+            if (request.isSetReportVersion()) {
+                long reportVersion = request.getReportVersion();
+                Env.getCurrentSystemInfo().updateBackendReportVersion(
+                        task.getBackendId(), reportVersion, task.getDbId(), 
task.getTableId());
+            }
             
Env.getCurrentEnv().getTabletScheduler().finishCloneTask(cloneTask, request);
         } else {
             LOG.warn("invalid clone task, ignore it. {}", task);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to