This is an automated email from the ASF dual-hosted git repository. dataroaring pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new 3fa4413e3bf branch-2.1: [fix](restore) Lock tablet before modify segment files #45711 (#48047) 3fa4413e3bf is described below commit 3fa4413e3bf71514cb03836a7701f356199e68ec Author: walter <maoch...@selectdb.com> AuthorDate: Fri Feb 21 08:33:10 2025 +0800 branch-2.1: [fix](restore) Lock tablet before modify segment files #45711 (#48047) cherry pick from #45711 --- be/src/olap/tablet.h | 3 +- be/src/runtime/snapshot_loader.cpp | 99 +++++++++++++++++++++++--------------- 2 files changed, 61 insertions(+), 41 deletions(-) diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h index 4da655c38ab..362ffcd2e06 100644 --- a/be/src/olap/tablet.h +++ b/be/src/olap/tablet.h @@ -215,6 +215,7 @@ public: std::mutex& get_push_lock() { return _ingest_lock; } std::mutex& get_base_compaction_lock() { return _base_compaction_lock; } std::mutex& get_cumulative_compaction_lock() { return _cumulative_compaction_lock; } + std::shared_mutex& get_meta_store_lock() { return _meta_store_lock; } std::shared_timed_mutex& get_migration_lock() { return _migration_lock; } @@ -729,7 +730,7 @@ private: std::shared_ptr<CumulativeCompactionPolicy> _cumulative_compaction_policy; std::string_view _cumulative_compaction_type; - // use a seperate thread to check all tablets paths existance + // use a separate thread to check all tablets paths existence std::atomic<bool> _is_tablet_path_exists; int64_t _last_missed_version; diff --git a/be/src/runtime/snapshot_loader.cpp b/be/src/runtime/snapshot_loader.cpp index 5d8eae4ca80..2a4860ba9c1 100644 --- a/be/src/runtime/snapshot_loader.cpp +++ b/be/src/runtime/snapshot_loader.cpp @@ -793,50 +793,69 @@ Status SnapshotLoader::move(const std::string& snapshot_path, TabletSharedPtr ta return Status::InternalError(err_msg); } - if (overwrite) { - std::vector<std::string> snapshot_files; - RETURN_IF_ERROR(_get_existing_files_from_local(snapshot_path, &snapshot_files)); - - // 1. simply delete the old dir and replace it with the snapshot dir - try { - // This remove seems soft enough, because we already get - // tablet id and schema hash from this path, which - // means this path is a valid path. - std::filesystem::remove_all(tablet_path); - VLOG_CRITICAL << "remove dir: " << tablet_path; - std::filesystem::create_directory(tablet_path); - VLOG_CRITICAL << "re-create dir: " << tablet_path; - } catch (const std::filesystem::filesystem_error& e) { - std::stringstream ss; - ss << "failed to move tablet path: " << tablet_path << ". err: " << e.what(); - LOG(WARNING) << ss.str(); - return Status::InternalError(ss.str()); - } + if (!overwrite) { + LOG(FATAL) << "only support overwrite now"; + __builtin_unreachable(); + } - // link files one by one - // files in snapshot dir will be moved in snapshot clean process - std::vector<std::string> linked_files; - for (auto& file : snapshot_files) { - auto full_src_path = fmt::format("{}/{}", snapshot_path, file); - auto full_dest_path = fmt::format("{}/{}", tablet_path, file); - if (link(full_src_path.c_str(), full_dest_path.c_str()) != 0) { - LOG(WARNING) << "failed to link file from " << full_src_path << " to " - << full_dest_path << ", err: " << std::strerror(errno); - - // clean the already linked files - for (auto& linked_file : linked_files) { - remove(linked_file.c_str()); - } + // Medium migration/clone/checkpoint/compaction may change or check the + // files and tablet meta, so we need to take these locks. + std::unique_lock migration_lock(tablet->get_migration_lock(), std::try_to_lock); + std::unique_lock base_compact_lock(tablet->get_base_compaction_lock(), std::try_to_lock); + std::unique_lock cumu_compact_lock(tablet->get_cumulative_compaction_lock(), std::try_to_lock); + std::unique_lock cold_compact_lock(tablet->get_cold_compaction_lock(), std::try_to_lock); + std::unique_lock build_idx_lock(tablet->get_build_inverted_index_lock(), std::try_to_lock); + std::unique_lock meta_store_lock(tablet->get_meta_store_lock(), std::try_to_lock); + if (!migration_lock.owns_lock() || !base_compact_lock.owns_lock() || + !cumu_compact_lock.owns_lock() || !cold_compact_lock.owns_lock() || + !build_idx_lock.owns_lock() || !meta_store_lock.owns_lock()) { + // This error should be retryable + auto status = Status::ObtainLockFailed("failed to get tablet locks, tablet: {}", tablet_id); + LOG(WARNING) << status << ", snapshot path: " << snapshot_path + << ", tablet path: " << tablet_path; + return status; + } - return Status::InternalError("move tablet failed"); + std::vector<std::string> snapshot_files; + RETURN_IF_ERROR(_get_existing_files_from_local(snapshot_path, &snapshot_files)); + + // FIXME: the below logic will demage the tablet files if failed in the middle. + + // 1. simply delete the old dir and replace it with the snapshot dir + try { + // This remove seems soft enough, because we already get + // tablet id and schema hash from this path, which + // means this path is a valid path. + std::filesystem::remove_all(tablet_path); + VLOG_CRITICAL << "remove dir: " << tablet_path; + std::filesystem::create_directory(tablet_path); + VLOG_CRITICAL << "re-create dir: " << tablet_path; + } catch (const std::filesystem::filesystem_error& e) { + std::stringstream ss; + ss << "failed to move tablet path: " << tablet_path << ". err: " << e.what(); + LOG(WARNING) << ss.str(); + return Status::InternalError(ss.str()); + } + + // link files one by one + // files in snapshot dir will be moved in snapshot clean process + std::vector<std::string> linked_files; + for (auto& file : snapshot_files) { + auto full_src_path = fmt::format("{}/{}", snapshot_path, file); + auto full_dest_path = fmt::format("{}/{}", tablet_path, file); + if (link(full_src_path.c_str(), full_dest_path.c_str()) != 0) { + LOG(WARNING) << "failed to link file from " << full_src_path << " to " << full_dest_path + << ", err: " << std::strerror(errno); + + // clean the already linked files + for (auto& linked_file : linked_files) { + remove(linked_file.c_str()); } - linked_files.push_back(full_dest_path); - VLOG_CRITICAL << "link file from " << full_src_path << " to " << full_dest_path; - } - } else { - LOG(FATAL) << "only support overwrite now"; - __builtin_unreachable(); + return Status::InternalError("move tablet failed"); + } + linked_files.push_back(full_dest_path); + VLOG_CRITICAL << "link file from " << full_src_path << " to " << full_dest_path; } // snapshot loader not need to change tablet uid --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org