This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 3fa4413e3bf branch-2.1: [fix](restore) Lock tablet before modify 
segment files #45711 (#48047)
3fa4413e3bf is described below

commit 3fa4413e3bf71514cb03836a7701f356199e68ec
Author: walter <maoch...@selectdb.com>
AuthorDate: Fri Feb 21 08:33:10 2025 +0800

    branch-2.1: [fix](restore) Lock tablet before modify segment files #45711 
(#48047)
    
    cherry pick from #45711
---
 be/src/olap/tablet.h               |  3 +-
 be/src/runtime/snapshot_loader.cpp | 99 +++++++++++++++++++++++---------------
 2 files changed, 61 insertions(+), 41 deletions(-)

diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h
index 4da655c38ab..362ffcd2e06 100644
--- a/be/src/olap/tablet.h
+++ b/be/src/olap/tablet.h
@@ -215,6 +215,7 @@ public:
     std::mutex& get_push_lock() { return _ingest_lock; }
     std::mutex& get_base_compaction_lock() { return _base_compaction_lock; }
     std::mutex& get_cumulative_compaction_lock() { return 
_cumulative_compaction_lock; }
+    std::shared_mutex& get_meta_store_lock() { return _meta_store_lock; }
 
     std::shared_timed_mutex& get_migration_lock() { return _migration_lock; }
 
@@ -729,7 +730,7 @@ private:
     std::shared_ptr<CumulativeCompactionPolicy> _cumulative_compaction_policy;
     std::string_view _cumulative_compaction_type;
 
-    // use a seperate thread to check all tablets paths existance
+    // use a separate thread to check all tablets paths existence
     std::atomic<bool> _is_tablet_path_exists;
 
     int64_t _last_missed_version;
diff --git a/be/src/runtime/snapshot_loader.cpp 
b/be/src/runtime/snapshot_loader.cpp
index 5d8eae4ca80..2a4860ba9c1 100644
--- a/be/src/runtime/snapshot_loader.cpp
+++ b/be/src/runtime/snapshot_loader.cpp
@@ -793,50 +793,69 @@ Status SnapshotLoader::move(const std::string& 
snapshot_path, TabletSharedPtr ta
         return Status::InternalError(err_msg);
     }
 
-    if (overwrite) {
-        std::vector<std::string> snapshot_files;
-        RETURN_IF_ERROR(_get_existing_files_from_local(snapshot_path, 
&snapshot_files));
-
-        // 1. simply delete the old dir and replace it with the snapshot dir
-        try {
-            // This remove seems soft enough, because we already get
-            // tablet id and schema hash from this path, which
-            // means this path is a valid path.
-            std::filesystem::remove_all(tablet_path);
-            VLOG_CRITICAL << "remove dir: " << tablet_path;
-            std::filesystem::create_directory(tablet_path);
-            VLOG_CRITICAL << "re-create dir: " << tablet_path;
-        } catch (const std::filesystem::filesystem_error& e) {
-            std::stringstream ss;
-            ss << "failed to move tablet path: " << tablet_path << ". err: " 
<< e.what();
-            LOG(WARNING) << ss.str();
-            return Status::InternalError(ss.str());
-        }
+    if (!overwrite) {
+        LOG(FATAL) << "only support overwrite now";
+        __builtin_unreachable();
+    }
 
-        // link files one by one
-        // files in snapshot dir will be moved in snapshot clean process
-        std::vector<std::string> linked_files;
-        for (auto& file : snapshot_files) {
-            auto full_src_path = fmt::format("{}/{}", snapshot_path, file);
-            auto full_dest_path = fmt::format("{}/{}", tablet_path, file);
-            if (link(full_src_path.c_str(), full_dest_path.c_str()) != 0) {
-                LOG(WARNING) << "failed to link file from " << full_src_path 
<< " to "
-                             << full_dest_path << ", err: " << 
std::strerror(errno);
-
-                // clean the already linked files
-                for (auto& linked_file : linked_files) {
-                    remove(linked_file.c_str());
-                }
+    // Medium migration/clone/checkpoint/compaction may change or check the
+    // files and tablet meta, so we need to take these locks.
+    std::unique_lock migration_lock(tablet->get_migration_lock(), 
std::try_to_lock);
+    std::unique_lock base_compact_lock(tablet->get_base_compaction_lock(), 
std::try_to_lock);
+    std::unique_lock 
cumu_compact_lock(tablet->get_cumulative_compaction_lock(), std::try_to_lock);
+    std::unique_lock cold_compact_lock(tablet->get_cold_compaction_lock(), 
std::try_to_lock);
+    std::unique_lock build_idx_lock(tablet->get_build_inverted_index_lock(), 
std::try_to_lock);
+    std::unique_lock meta_store_lock(tablet->get_meta_store_lock(), 
std::try_to_lock);
+    if (!migration_lock.owns_lock() || !base_compact_lock.owns_lock() ||
+        !cumu_compact_lock.owns_lock() || !cold_compact_lock.owns_lock() ||
+        !build_idx_lock.owns_lock() || !meta_store_lock.owns_lock()) {
+        // This error should be retryable
+        auto status = Status::ObtainLockFailed("failed to get tablet locks, 
tablet: {}", tablet_id);
+        LOG(WARNING) << status << ", snapshot path: " << snapshot_path
+                     << ", tablet path: " << tablet_path;
+        return status;
+    }
 
-                return Status::InternalError("move tablet failed");
+    std::vector<std::string> snapshot_files;
+    RETURN_IF_ERROR(_get_existing_files_from_local(snapshot_path, 
&snapshot_files));
+
+    // FIXME: the below logic will demage the tablet files if failed in the 
middle.
+
+    // 1. simply delete the old dir and replace it with the snapshot dir
+    try {
+        // This remove seems soft enough, because we already get
+        // tablet id and schema hash from this path, which
+        // means this path is a valid path.
+        std::filesystem::remove_all(tablet_path);
+        VLOG_CRITICAL << "remove dir: " << tablet_path;
+        std::filesystem::create_directory(tablet_path);
+        VLOG_CRITICAL << "re-create dir: " << tablet_path;
+    } catch (const std::filesystem::filesystem_error& e) {
+        std::stringstream ss;
+        ss << "failed to move tablet path: " << tablet_path << ". err: " << 
e.what();
+        LOG(WARNING) << ss.str();
+        return Status::InternalError(ss.str());
+    }
+
+    // link files one by one
+    // files in snapshot dir will be moved in snapshot clean process
+    std::vector<std::string> linked_files;
+    for (auto& file : snapshot_files) {
+        auto full_src_path = fmt::format("{}/{}", snapshot_path, file);
+        auto full_dest_path = fmt::format("{}/{}", tablet_path, file);
+        if (link(full_src_path.c_str(), full_dest_path.c_str()) != 0) {
+            LOG(WARNING) << "failed to link file from " << full_src_path << " 
to " << full_dest_path
+                         << ", err: " << std::strerror(errno);
+
+            // clean the already linked files
+            for (auto& linked_file : linked_files) {
+                remove(linked_file.c_str());
             }
-            linked_files.push_back(full_dest_path);
-            VLOG_CRITICAL << "link file from " << full_src_path << " to " << 
full_dest_path;
-        }
 
-    } else {
-        LOG(FATAL) << "only support overwrite now";
-        __builtin_unreachable();
+            return Status::InternalError("move tablet failed");
+        }
+        linked_files.push_back(full_dest_path);
+        VLOG_CRITICAL << "link file from " << full_src_path << " to " << 
full_dest_path;
     }
 
     // snapshot loader not need to change tablet uid


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to