This is an automated email from the ASF dual-hosted git repository. dataroaring pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push: new d4c1908902 [enhancement](baddisk) record bad disk in be_custom.conf to handle (#… (#24759) d4c1908902 is described below commit d4c1908902918a0dc8742324ae6372d5ee85ecff Author: Yongqiang YANG <98214048+dataroar...@users.noreply.github.com> AuthorDate: Fri Sep 22 09:37:39 2023 +0800 [enhancement](baddisk) record bad disk in be_custom.conf to handle (#… (#24759) --- be/src/common/config.cpp | 9 ++-- be/src/common/config.h | 1 + be/src/olap/data_dir.cpp | 5 +- be/src/olap/options.cpp | 26 ++++++++- be/src/olap/options.h | 3 ++ be/src/olap/storage_engine.cpp | 35 ++++++++++++ be/src/olap/storage_engine.h | 10 ++++ be/src/olap/tablet_manager.cpp | 35 ------------ be/src/olap/tablet_manager.h | 2 - be/src/service/doris_main.cpp | 14 ++++- be/src/vec/exec/vset_operation_node.cpp | 2 +- be/src/vec/runtime/vdata_stream_recvr.cpp | 1 - be/test/common/config_test.cpp | 1 + be/test/olap/options_test.cpp | 72 +++++++++++++++++++++++++ be/test/olap/storage_engine_test.cpp | 89 +++++++++++++++++++++++++++++++ be/test/testutil/run_all_tests.cpp | 7 ++- 16 files changed, 261 insertions(+), 51 deletions(-) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index e0e765da77..f370fae5f1 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -274,6 +274,7 @@ DEFINE_mInt32(tablet_lookup_cache_clean_interval, "30"); DEFINE_mInt32(disk_stat_monitor_interval, "5"); DEFINE_mInt32(unused_rowset_monitor_interval, "30"); DEFINE_String(storage_root_path, "${DORIS_HOME}/storage"); +DEFINE_mString(broken_storage_path, ""); // Config is used to check incompatible old format hdr_ format // whether doris uses strict way. When config is true, process will log fatal @@ -1316,9 +1317,9 @@ void Properties::set_force(const std::string& key, const std::string& val) { } Status Properties::dump(const std::string& conffile) { - RETURN_IF_ERROR(io::global_local_filesystem()->delete_file(conffile)); + std::string conffile_tmp = conffile + ".tmp"; io::FileWriterPtr file_writer; - RETURN_IF_ERROR(io::global_local_filesystem()->create_file(conffile, &file_writer)); + RETURN_IF_ERROR(io::global_local_filesystem()->create_file(conffile_tmp, &file_writer)); RETURN_IF_ERROR(file_writer->append("# THIS IS AN AUTO GENERATED CONFIG FILE.\n")); RETURN_IF_ERROR(file_writer->append( "# You can modify this file manually, and the configurations in this file\n")); @@ -1331,7 +1332,9 @@ Status Properties::dump(const std::string& conffile) { RETURN_IF_ERROR(file_writer->append("\n")); } - return file_writer->close(); + RETURN_IF_ERROR(file_writer->close()); + + return io::global_local_filesystem()->rename(conffile_tmp, conffile); } template <typename T> diff --git a/be/src/common/config.h b/be/src/common/config.h index 14c58fec10..3eddafea40 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -319,6 +319,7 @@ DECLARE_mInt32(tablet_lookup_cache_clean_interval); DECLARE_mInt32(disk_stat_monitor_interval); DECLARE_mInt32(unused_rowset_monitor_interval); DECLARE_String(storage_root_path); +DECLARE_mString(broken_storage_path); // Config is used to check incompatible old format hdr_ format // whether doris uses strict way. When config is true, process will log fatal diff --git a/be/src/olap/data_dir.cpp b/be/src/olap/data_dir.cpp index 237e761cbb..96b231571c 100644 --- a/be/src/olap/data_dir.cpp +++ b/be/src/olap/data_dir.cpp @@ -240,9 +240,8 @@ void DataDir::health_check() { if (!res) { LOG(WARNING) << "store read/write test file occur IO Error. path=" << _path << ", err: " << res; - if (res.is_io_error()) { - _is_used = false; - } + StorageEngine::instance()->add_broken_path(_path); + _is_used = !res.is<IO_ERROR>(); } } disks_state->set_value(_is_used ? 1 : 0); diff --git a/be/src/olap/options.cpp b/be/src/olap/options.cpp index 9ea21928d8..f424a8d28e 100644 --- a/be/src/olap/options.cpp +++ b/be/src/olap/options.cpp @@ -156,11 +156,20 @@ Status parse_conf_store_paths(const string& config_path, std::vector<StorePath>* // deal with the case that user add `;` to the tail path_vec.pop_back(); } + + std::set<std::string> real_paths; for (auto& item : path_vec) { StorePath path; auto res = parse_root_path(item, &path); if (res.ok()) { - paths->emplace_back(std::move(path)); + auto success = real_paths.emplace(path.path).second; + if (success) { + paths->emplace_back(std::move(path)); + } else { + LOG(WARNING) << "a duplicated path is found " << path.path; + return Status::Error<INVALID_ARGUMENT>("a duplicated path is found, path={}", + path.path); + } } else { LOG(WARNING) << "failed to parse store path " << item << ", res=" << res; } @@ -172,6 +181,21 @@ Status parse_conf_store_paths(const string& config_path, std::vector<StorePath>* return Status::OK(); } +void parse_conf_broken_store_paths(const string& config_path, std::set<std::string>* paths) { + std::vector<string> path_vec = strings::Split(config_path, ";", strings::SkipWhitespace()); + if (path_vec.empty()) { + return; + } + if (path_vec.back().empty()) { + // deal with the case that user add `;` to the tail + path_vec.pop_back(); + } + for (auto& item : path_vec) { + paths->emplace(item); + } + return; +} + /** format: * [ * {"path": "storage1", "total_size":53687091200,"query_limit": "10737418240"}, diff --git a/be/src/olap/options.h b/be/src/olap/options.h index c82875f832..ebc2301dc6 100644 --- a/be/src/olap/options.h +++ b/be/src/olap/options.h @@ -47,6 +47,8 @@ Status parse_root_path(const std::string& root_path, StorePath* path); Status parse_conf_store_paths(const std::string& config_path, std::vector<StorePath>* path); +void parse_conf_broken_store_paths(const std::string& config_path, std::set<std::string>* paths); + struct CachePath { io::FileCacheSettings init_settings() const; CachePath(std::string path, int64_t total_bytes, int64_t query_limit_bytes) @@ -62,6 +64,7 @@ Status parse_conf_cache_paths(const std::string& config_path, std::vector<CacheP struct EngineOptions { // list paths that tablet will be put into. std::vector<StorePath> store_paths; + std::set<std::string> broken_paths; // BE's UUID. It will be reset every time BE restarts. UniqueId backend_uid {0, 0}; }; diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp index 83f5705b6a..1d53269c46 100644 --- a/be/src/olap/storage_engine.cpp +++ b/be/src/olap/storage_engine.cpp @@ -135,6 +135,8 @@ StorageEngine::StorageEngine(const EngineOptions& options) // std::lock_guard<std::mutex> lock(_gc_mutex); return _unused_rowsets.size(); }); + + _broken_paths = options.broken_paths; } StorageEngine::~StorageEngine() { @@ -1313,4 +1315,37 @@ void StorageEngine::evict_querying_rowset(RowsetId rs_id) { _querying_rowsets.erase(rs_id); } +bool StorageEngine::add_broken_path(std::string path) { + std::lock_guard<std::mutex> lock(_broken_paths_mutex); + auto success = _broken_paths.emplace(path).second; + if (success) { + _persist_broken_paths(); + } + return success; +} + +bool StorageEngine::remove_broken_path(std::string path) { + std::lock_guard<std::mutex> lock(_broken_paths_mutex); + auto count = _broken_paths.erase(path); + if (count > 0) { + _persist_broken_paths(); + } + return count > 0; +} + +Status StorageEngine::_persist_broken_paths() { + std::string config_value; + for (const std::string& path : _broken_paths) { + config_value += path + ";"; + } + + if (config_value.length() > 0) { + auto st = config::set_config("broken_storage_path", config_value, true); + LOG(INFO) << "persist broken_storae_path " << config_value << st; + return st; + } + + return Status::OK(); +} + } // namespace doris diff --git a/be/src/olap/storage_engine.h b/be/src/olap/storage_engine.h index 0de5ff2668..784837fa06 100644 --- a/be/src/olap/storage_engine.h +++ b/be/src/olap/storage_engine.h @@ -235,6 +235,11 @@ public: void evict_querying_rowset(RowsetId rs_id); + bool add_broken_path(std::string path); + bool remove_broken_path(std::string path); + + std::set<string> get_broken_paths() { return _broken_paths; } + private: // Instance should be inited from `static open()` // MUST NOT be called in other circumstances. @@ -339,6 +344,8 @@ private: void _async_publish_callback(); + Status _persist_broken_paths(); + private: struct CompactionCandidate { CompactionCandidate(uint32_t nicumulative_compaction_, int64_t tablet_id_, uint32_t index_) @@ -373,6 +380,9 @@ private: std::mutex _store_lock; std::mutex _trash_sweep_lock; std::map<std::string, DataDir*> _store_map; + std::set<std::string> _broken_paths; + std::mutex _broken_paths_mutex; + uint32_t _available_storage_medium_type_count; int32_t _effective_cluster_id; diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp index aa9ef37041..28137309c0 100644 --- a/be/src/olap/tablet_manager.cpp +++ b/be/src/olap/tablet_manager.cpp @@ -590,41 +590,6 @@ Status TabletManager::_drop_tablet_unlocked(TTabletId tablet_id, TReplicaId repl return Status::OK(); } -Status TabletManager::drop_tablets_on_error_root_path( - const std::vector<TabletInfo>& tablet_info_vec) { - SCOPED_CONSUME_MEM_TRACKER(_mem_tracker); - Status res = Status::OK(); - if (tablet_info_vec.empty()) { // This is a high probability event - return res; - } - std::vector<std::set<size_t>> local_tmp_vector(_tablets_shards_size); - for (size_t idx = 0; idx < tablet_info_vec.size(); ++idx) { - local_tmp_vector[tablet_info_vec[idx].tablet_id & _tablets_shards_mask].insert(idx); - } - for (int32 i = 0; i < _tablets_shards_size; ++i) { - if (local_tmp_vector[i].empty()) { - continue; - } - std::lock_guard<std::shared_mutex> wrlock(_tablets_shards[i].lock); - for (size_t idx : local_tmp_vector[i]) { - const TabletInfo& tablet_info = tablet_info_vec[idx]; - TTabletId tablet_id = tablet_info.tablet_id; - VLOG_NOTICE << "drop_tablet begin. tablet_id=" << tablet_id; - TabletSharedPtr dropped_tablet = _get_tablet_unlocked(tablet_id); - if (dropped_tablet == nullptr) { - LOG(WARNING) << "dropping tablet not exist, " - << " tablet=" << tablet_id; - continue; - } else { - _remove_tablet_from_partition(dropped_tablet); - tablet_map_t& tablet_map = _get_tablet_map(tablet_id); - tablet_map.erase(tablet_id); - } - } - } - return res; -} - TabletSharedPtr TabletManager::get_tablet(TTabletId tablet_id, bool include_deleted, string* err) { std::shared_lock rdlock(_get_tablets_shard_lock(tablet_id)); return _get_tablet_unlocked(tablet_id, include_deleted, err); diff --git a/be/src/olap/tablet_manager.h b/be/src/olap/tablet_manager.h index ae7d27bded..4f6b327819 100644 --- a/be/src/olap/tablet_manager.h +++ b/be/src/olap/tablet_manager.h @@ -73,8 +73,6 @@ public: // If `is_drop_table_or_partition` is true, we need to remove all remote rowsets in this tablet. Status drop_tablet(TTabletId tablet_id, TReplicaId replica_id, bool is_drop_table_or_partition); - Status drop_tablets_on_error_root_path(const std::vector<TabletInfo>& tablet_info_vec); - TabletSharedPtr find_best_tablet_to_compaction( CompactionType compaction_type, DataDir* data_dir, const std::unordered_set<TTabletId>& tablet_submitted_compaction, uint32_t* score, diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp index bc32f50fe3..89683ac7b6 100644 --- a/be/src/service/doris_main.cpp +++ b/be/src/service/doris_main.cpp @@ -353,9 +353,20 @@ int main(int argc, char** argv) { LOG(FATAL) << "parse config storage path failed, path=" << doris::config::storage_root_path; exit(-1); } + std::set<std::string> broken_paths; + doris::parse_conf_broken_store_paths(doris::config::broken_storage_path, &broken_paths); + auto it = paths.begin(); for (; it != paths.end();) { - if (!doris::check_datapath_rw(it->path)) { + if (broken_paths.count(it->path) > 0) { + if (doris::config::ignore_broken_disk) { + LOG(WARNING) << "ignore broken disk, path = " << it->path; + it = paths.erase(it); + } else { + LOG(FATAL) << "a broken disk is found " << it->path; + exit(-1); + } + } else if (!doris::check_datapath_rw(it->path)) { if (doris::config::ignore_broken_disk) { LOG(WARNING) << "read write test file failed, path=" << it->path; it = paths.erase(it); @@ -462,6 +473,7 @@ int main(int argc, char** argv) { // init and open storage engine doris::EngineOptions options; options.store_paths = paths; + options.broken_paths = broken_paths; options.backend_uid = doris::UniqueId::gen_uid(); doris::StorageEngine* engine = nullptr; auto st = doris::StorageEngine::open(options, &engine); diff --git a/be/src/vec/exec/vset_operation_node.cpp b/be/src/vec/exec/vset_operation_node.cpp index 8e1a85e781..9196d40668 100644 --- a/be/src/vec/exec/vset_operation_node.cpp +++ b/be/src/vec/exec/vset_operation_node.cpp @@ -16,7 +16,6 @@ // under the License. #include "vec/exec/vset_operation_node.h" -#include "vec/utils/template_helpers.hpp" #include <fmt/format.h> #include <gen_cpp/Exprs_types.h> @@ -47,6 +46,7 @@ #include "vec/exec/join/join_op.h" #include "vec/exprs/vexpr.h" #include "vec/exprs/vexpr_context.h" +#include "vec/utils/template_helpers.hpp" namespace doris { class DescriptorTbl; diff --git a/be/src/vec/runtime/vdata_stream_recvr.cpp b/be/src/vec/runtime/vdata_stream_recvr.cpp index fcd7d014c6..ec602f143e 100644 --- a/be/src/vec/runtime/vdata_stream_recvr.cpp +++ b/be/src/vec/runtime/vdata_stream_recvr.cpp @@ -227,7 +227,6 @@ void VDataStreamRecvr::SenderQueue::add_block(Block* block, bool use_move) { _pending_closures.emplace_back(iter->second.get(), monotonicStopWatch); iter->second->wait(l); } - } void VDataStreamRecvr::SenderQueue::decrement_senders(int be_number) { diff --git a/be/test/common/config_test.cpp b/be/test/common/config_test.cpp index c1484c9475..afeff7010c 100644 --- a/be/test/common/config_test.cpp +++ b/be/test/common/config_test.cpp @@ -33,6 +33,7 @@ using namespace config; class ConfigTest : public testing::Test { void SetUp() override { config::Register::_s_field_map->clear(); } + void TearDown() override { config::Register::_s_field_map->clear(); } }; TEST_F(ConfigTest, DumpAllConfigs) { diff --git a/be/test/olap/options_test.cpp b/be/test/olap/options_test.cpp index 5bab61bdeb..f954862fc2 100644 --- a/be/test/olap/options_test.cpp +++ b/be/test/olap/options_test.cpp @@ -127,4 +127,76 @@ TEST_F(OptionsTest, parse_root_path) { } } +TEST_F(OptionsTest, parse_conf_store_path) { + std::string path_prefix = std::filesystem::absolute("./test_run").string(); + std::string path1 = path_prefix + "/palo"; + std::string path2 = path_prefix + "/palo.ssd"; + + { + std::vector<StorePath> paths; + std::string config_path = path1; + auto st = parse_conf_store_paths(config_path, &paths); + EXPECT_EQ(Status::OK(), st); + EXPECT_EQ(paths.size(), 1); + EXPECT_STREQ(paths[0].path.c_str(), config_path.c_str()); + EXPECT_EQ(paths[0].capacity_bytes, -1); + EXPECT_EQ(paths[0].storage_medium, TStorageMedium::HDD); + } + { + std::vector<StorePath> paths; + std::string config_path = path1 + ";"; + auto st = parse_conf_store_paths(config_path, &paths); + EXPECT_EQ(Status::OK(), st); + EXPECT_EQ(paths.size(), 1); + EXPECT_STREQ(paths[0].path.c_str(), path1.c_str()); + EXPECT_EQ(paths[0].capacity_bytes, -1); + EXPECT_EQ(paths[0].storage_medium, TStorageMedium::HDD); + } + { + std::vector<StorePath> paths; + std::string config_path = path1 + ";" + path1; + auto st = parse_conf_store_paths(config_path, &paths); + EXPECT_EQ(Status::Error<ErrorCode::INVALID_ARGUMENT>("a duplicated path is found, path={}", + path1), + st); + } + { + std::vector<StorePath> paths; + std::string config_path = path1 + ";" + path2 + ";"; + auto st = parse_conf_store_paths(config_path, &paths); + EXPECT_EQ(Status::OK(), st); + EXPECT_EQ(paths.size(), 2); + EXPECT_STREQ(paths[0].path.c_str(), path1.c_str()); + EXPECT_EQ(paths[0].capacity_bytes, -1); + EXPECT_EQ(paths[0].storage_medium, TStorageMedium::HDD); + EXPECT_STREQ(paths[1].path.c_str(), path2.c_str()); + EXPECT_EQ(paths[1].capacity_bytes, -1); + EXPECT_EQ(paths[1].storage_medium, TStorageMedium::SSD); + } +} + +TEST_F(OptionsTest, parse_broken_path) { + { + std::string broken_paths = "path1"; + std::set<std::string> parsed_paths; + parse_conf_broken_store_paths(broken_paths, &parsed_paths); + EXPECT_EQ(parsed_paths.size(), 1); + } + { + std::string broken_paths = "path1;path1;"; + std::set<std::string> parsed_paths; + parse_conf_broken_store_paths(broken_paths, &parsed_paths); + EXPECT_EQ(parsed_paths.size(), 1); + EXPECT_EQ(parsed_paths.count("path1"), 1); + } + { + std::string broken_paths = "path1;path2;"; + std::set<std::string> parsed_paths; + parse_conf_broken_store_paths(broken_paths, &parsed_paths); + EXPECT_EQ(parsed_paths.size(), 2); + EXPECT_EQ(parsed_paths.count("path1"), 1); + EXPECT_EQ(parsed_paths.count("path2"), 1); + } +} + } // namespace doris diff --git a/be/test/olap/storage_engine_test.cpp b/be/test/olap/storage_engine_test.cpp new file mode 100644 index 0000000000..ebf572ef5f --- /dev/null +++ b/be/test/olap/storage_engine_test.cpp @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/storage_engine.h" + +#include <gmock/gmock-actions.h> +#include <gmock/gmock-matchers.h> +#include <gtest/gtest-message.h> +#include <gtest/gtest-test-part.h> + +#include <filesystem> + +#include "common/status.h" +#include "gtest/gtest_pred_impl.h" +#include "testutil/test_util.h" + +using ::testing::_; +using ::testing::Return; +using ::testing::SetArgPointee; +using std::string; + +namespace doris { +using namespace config; + +class StorageEngineTest : public testing::Test { +public: + virtual void SetUp() { + EngineOptions options; + + _storage_engine.reset(new StorageEngine(options)); + } + + virtual void TearDown() {} + + std::unique_ptr<StorageEngine> _storage_engine; +}; + +TEST_F(StorageEngineTest, TestBrokenDisk) { + DEFINE_mString(broken_storage_path, ""); + std::string path = config::custom_config_dir + "/be_custom.conf"; + + std::error_code ec; + { + _storage_engine->add_broken_path("broken_path1"); + EXPECT_EQ(std::filesystem::exists(path, ec), true); + EXPECT_EQ(_storage_engine->get_broken_paths().count("broken_path1"), 1); + EXPECT_EQ(broken_storage_path, "broken_path1;"); + } + + { + _storage_engine->add_broken_path("broken_path2"); + EXPECT_EQ(std::filesystem::exists(path, ec), true); + EXPECT_EQ(_storage_engine->get_broken_paths().count("broken_path1"), 1); + EXPECT_EQ(_storage_engine->get_broken_paths().count("broken_path2"), 1); + EXPECT_EQ(broken_storage_path, "broken_path1;broken_path2;"); + } + + { + _storage_engine->add_broken_path("broken_path2"); + EXPECT_EQ(std::filesystem::exists(path, ec), true); + EXPECT_EQ(_storage_engine->get_broken_paths().count("broken_path1"), 1); + EXPECT_EQ(_storage_engine->get_broken_paths().count("broken_path2"), 1); + EXPECT_EQ(broken_storage_path, "broken_path1;broken_path2;"); + } + + { + _storage_engine->remove_broken_path("broken_path2"); + EXPECT_EQ(std::filesystem::exists(path, ec), true); + EXPECT_EQ(_storage_engine->get_broken_paths().count("broken_path1"), 1); + EXPECT_EQ(_storage_engine->get_broken_paths().count("broken_path2"), 0); + EXPECT_EQ(broken_storage_path, "broken_path1;"); + } +} + +} // namespace doris diff --git a/be/test/testutil/run_all_tests.cpp b/be/test/testutil/run_all_tests.cpp index ab3b0b1ea0..fa1237fb80 100644 --- a/be/test/testutil/run_all_tests.cpp +++ b/be/test/testutil/run_all_tests.cpp @@ -43,10 +43,9 @@ int main(int argc, char** argv) { doris::StoragePageCache::create_global_cache(1 << 30, 10, 0); doris::SegmentLoader::create_global_instance(1000); std::string conf = std::string(getenv("DORIS_HOME")) + "/conf/be.conf"; - if (!doris::config::init(conf.c_str(), false)) { - fprintf(stderr, "error read config file. \n"); - return -1; - } + auto st = doris::config::init(conf.c_str(), false); + LOG(INFO) << "init config " << st; + doris::init_glog("be-test"); ::testing::InitGoogleTest(&argc, argv); doris::CpuInfo::init(); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org