This is an automated email from the ASF dual-hosted git repository.

w41ter pushed a commit to branch lock_tablet_before_modify_segment_files
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 83d5b552643ec1c229266f50db4500c0c5304b59
Author: w41ter <maoch...@selectdb.com>
AuthorDate: Fri Dec 20 06:29:05 2024 +0000

    [fix](restore) Lock tablet before modify segment files
---
 be/src/olap/tablet.cpp             |  4 +-
 be/src/olap/tablet.h               |  5 +-
 be/src/runtime/snapshot_loader.cpp | 97 +++++++++++++++++++++++---------------
 3 files changed, 63 insertions(+), 43 deletions(-)

diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp
index c7919b3f8dc..1758166e76e 100644
--- a/be/src/olap/tablet.cpp
+++ b/be/src/olap/tablet.cpp
@@ -2766,7 +2766,7 @@ void Tablet::check_table_size_correctness() {
     const std::vector<RowsetMetaSharedPtr>& all_rs_metas = 
_tablet_meta->all_rs_metas();
     for (const auto& rs_meta : all_rs_metas) {
         int64_t total_segment_size = get_segment_file_size(rs_meta);
-        int64_t total_inverted_index_size = 
get_inverted_index_file_szie(rs_meta);
+        int64_t total_inverted_index_size = 
get_inverted_index_file_size(rs_meta);
         if (rs_meta->data_disk_size() != total_segment_size ||
             rs_meta->index_disk_size() != total_inverted_index_size ||
             rs_meta->data_disk_size() + rs_meta->index_disk_size() != 
rs_meta->total_disk_size()) {
@@ -2817,7 +2817,7 @@ int64_t Tablet::get_segment_file_size(const 
RowsetMetaSharedPtr& rs_meta) {
     return total_segment_size;
 }
 
-int64_t Tablet::get_inverted_index_file_szie(const RowsetMetaSharedPtr& 
rs_meta) {
+int64_t Tablet::get_inverted_index_file_size(const RowsetMetaSharedPtr& 
rs_meta) {
     const auto& fs = rs_meta->fs();
     if (!fs) {
         LOG(WARNING) << "get fs failed, resource_id={}" << 
rs_meta->resource_id();
diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h
index d00476f0441..afe043bf151 100644
--- a/be/src/olap/tablet.h
+++ b/be/src/olap/tablet.h
@@ -214,6 +214,7 @@ public:
     std::mutex& get_push_lock() { return _ingest_lock; }
     std::mutex& get_base_compaction_lock() { return _base_compaction_lock; }
     std::mutex& get_cumulative_compaction_lock() { return 
_cumulative_compaction_lock; }
+    std::shared_mutex& get_meta_store_lock() { return _meta_store_lock; }
 
     std::shared_timed_mutex& get_migration_lock() { return _migration_lock; }
 
@@ -531,7 +532,7 @@ private:
     void check_table_size_correctness();
     std::string get_segment_path(const RowsetMetaSharedPtr& rs_meta, int64_t 
seg_id);
     int64_t get_segment_file_size(const RowsetMetaSharedPtr& rs_meta);
-    int64_t get_inverted_index_file_szie(const RowsetMetaSharedPtr& rs_meta);
+    int64_t get_inverted_index_file_size(const RowsetMetaSharedPtr& rs_meta);
 
 public:
     static const int64_t K_INVALID_CUMULATIVE_POINT = -1;
@@ -588,7 +589,7 @@ private:
     std::shared_ptr<CumulativeCompactionPolicy> _cumulative_compaction_policy;
     std::string_view _cumulative_compaction_type;
 
-    // use a seperate thread to check all tablets paths existance
+    // use a separate thread to check all tablets paths existence
     std::atomic<bool> _is_tablet_path_exists;
 
     int64_t _last_missed_version;
diff --git a/be/src/runtime/snapshot_loader.cpp 
b/be/src/runtime/snapshot_loader.cpp
index b492a929fca..c5b27c82305 100644
--- a/be/src/runtime/snapshot_loader.cpp
+++ b/be/src/runtime/snapshot_loader.cpp
@@ -765,49 +765,68 @@ Status SnapshotLoader::move(const std::string& 
snapshot_path, TabletSharedPtr ta
         return Status::InternalError(err_msg);
     }
 
-    if (overwrite) {
-        std::vector<std::string> snapshot_files;
-        RETURN_IF_ERROR(_get_existing_files_from_local(snapshot_path, 
&snapshot_files));
-
-        // 1. simply delete the old dir and replace it with the snapshot dir
-        try {
-            // This remove seems soft enough, because we already get
-            // tablet id and schema hash from this path, which
-            // means this path is a valid path.
-            std::filesystem::remove_all(tablet_path);
-            VLOG_CRITICAL << "remove dir: " << tablet_path;
-            std::filesystem::create_directory(tablet_path);
-            VLOG_CRITICAL << "re-create dir: " << tablet_path;
-        } catch (const std::filesystem::filesystem_error& e) {
-            std::stringstream ss;
-            ss << "failed to move tablet path: " << tablet_path << ". err: " 
<< e.what();
-            LOG(WARNING) << ss.str();
-            return Status::InternalError(ss.str());
-        }
+    if (!overwrite) {
+        throw Exception(Status::FatalError("only support overwrite now"));
+    }
 
-        // link files one by one
-        // files in snapshot dir will be moved in snapshot clean process
-        std::vector<std::string> linked_files;
-        for (auto& file : snapshot_files) {
-            auto full_src_path = fmt::format("{}/{}", snapshot_path, file);
-            auto full_dest_path = fmt::format("{}/{}", tablet_path, file);
-            if (link(full_src_path.c_str(), full_dest_path.c_str()) != 0) {
-                LOG(WARNING) << "failed to link file from " << full_src_path 
<< " to "
-                             << full_dest_path << ", err: " << 
std::strerror(errno);
-
-                // clean the already linked files
-                for (auto& linked_file : linked_files) {
-                    remove(linked_file.c_str());
-                }
+    // Medium migration/clone/checkpoint/compaction may change or check the
+    // files and tablet meta, so we need to take these locks.
+    std::unique_lock migration_lock(tablet->get_migration_lock(), 
std::try_to_lock);
+    std::unique_lock base_compact_lock(tablet->get_base_compaction_lock(), 
std::try_to_lock);
+    std::unique_lock 
cumu_compact_lock(tablet->get_cumulative_compaction_lock(), std::try_to_lock);
+    std::unique_lock cold_compact_lock(tablet->get_cold_compaction_lock(), 
std::try_to_lock);
+    std::unique_lock build_idx_lock(tablet->get_build_inverted_index_lock(), 
std::try_to_lock);
+    std::unique_lock meta_store_lock(tablet->get_meta_store_lock(), 
std::try_to_lock);
+    if (!migration_lock.owns_lock() || !base_compact_lock.owns_lock() ||
+        !cumu_compact_lock.owns_lock() || !cold_compact_lock.owns_lock() ||
+        !build_idx_lock.owns_lock() || !meta_store_lock.owns_lock()) {
+        // This error should be retryable
+        auto status = Status::ObtainLockFailed("failed to get tablet locks, 
tablet: {}", tablet_id);
+        LOG(WARNING) << status << ", snapshot path: " << snapshot_path
+                     << ", tablet path: " << tablet_path;
+        return status;
+    }
 
-                return Status::InternalError("move tablet failed");
+    std::vector<std::string> snapshot_files;
+    RETURN_IF_ERROR(_get_existing_files_from_local(snapshot_path, 
&snapshot_files));
+
+    // FIXME: the below logic will demage the tablet files if failed in the 
middle.
+
+    // 1. simply delete the old dir and replace it with the snapshot dir
+    try {
+        // This remove seems soft enough, because we already get
+        // tablet id and schema hash from this path, which
+        // means this path is a valid path.
+        std::filesystem::remove_all(tablet_path);
+        VLOG_CRITICAL << "remove dir: " << tablet_path;
+        std::filesystem::create_directory(tablet_path);
+        VLOG_CRITICAL << "re-create dir: " << tablet_path;
+    } catch (const std::filesystem::filesystem_error& e) {
+        std::stringstream ss;
+        ss << "failed to move tablet path: " << tablet_path << ". err: " << 
e.what();
+        LOG(WARNING) << ss.str();
+        return Status::InternalError(ss.str());
+    }
+
+    // link files one by one
+    // files in snapshot dir will be moved in snapshot clean process
+    std::vector<std::string> linked_files;
+    for (auto& file : snapshot_files) {
+        auto full_src_path = fmt::format("{}/{}", snapshot_path, file);
+        auto full_dest_path = fmt::format("{}/{}", tablet_path, file);
+        if (link(full_src_path.c_str(), full_dest_path.c_str()) != 0) {
+            LOG(WARNING) << "failed to link file from " << full_src_path << " 
to " << full_dest_path
+                         << ", err: " << std::strerror(errno);
+
+            // clean the already linked files
+            for (auto& linked_file : linked_files) {
+                remove(linked_file.c_str());
             }
-            linked_files.push_back(full_dest_path);
-            VLOG_CRITICAL << "link file from " << full_src_path << " to " << 
full_dest_path;
-        }
 
-    } else {
-        throw Exception(Status::FatalError("only support overwrite now"));
+            return Status::InternalError("move tablet failed");
+        }
+        linked_files.push_back(full_dest_path);
+        VLOG_CRITICAL << "link file from " << full_src_path << " to " << 
full_dest_path;
     }
 
     // snapshot loader not need to change tablet uid


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to