This is an automated email from the ASF dual-hosted git repository.

weizuo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 5c1cd058f2 [Feature] Add interface to check tablet segment lost 
(#10711)
5c1cd058f2 is described below

commit 5c1cd058f29e0840a8cbea3adbf8319c520f98f8
Author: weizuo93 <wei...@apache.org>
AuthorDate: Tue Aug 2 09:40:04 2022 +0800

    [Feature] Add interface to check tablet segment lost (#10711)
    
    Co-authored-by: weizuo <wei...@xiaomi.com>
---
 be/src/http/CMakeLists.txt                         |  1 +
 be/src/http/action/check_tablet_segment_action.cpp | 70 ++++++++++++++++++++++
 be/src/http/action/check_tablet_segment_action.h   | 36 +++++++++++
 be/src/olap/rowset/beta_rowset.cpp                 | 17 ++++++
 be/src/olap/rowset/beta_rowset.h                   |  2 +
 be/src/olap/rowset/rowset.cpp                      |  5 ++
 be/src/olap/rowset/rowset.h                        |  5 ++
 be/src/olap/tablet.cpp                             | 15 +++++
 be/src/olap/tablet.h                               |  2 +
 be/src/olap/tablet_manager.cpp                     | 26 ++++++++
 be/src/olap/tablet_manager.h                       |  2 +
 be/src/service/http_service.cpp                    |  7 +++
 be/test/testutil/mock_rowset.h                     |  2 +
 .../http-actions/check-tablet-segment-action.md    | 53 ++++++++++++++++
 .../http-actions/check-tablet-segment-action.md    | 53 ++++++++++++++++
 15 files changed, 296 insertions(+)

diff --git a/be/src/http/CMakeLists.txt b/be/src/http/CMakeLists.txt
index b956d0982b..22fe2665b2 100644
--- a/be/src/http/CMakeLists.txt
+++ b/be/src/http/CMakeLists.txt
@@ -52,4 +52,5 @@ add_library(Webserver STATIC
   action/config_action.cpp
   action/check_rpc_channel_action.cpp
   action/reset_rpc_channel_action.cpp
+  action/check_tablet_segment_action.cpp
 )
diff --git a/be/src/http/action/check_tablet_segment_action.cpp 
b/be/src/http/action/check_tablet_segment_action.cpp
new file mode 100644
index 0000000000..6e822ace7d
--- /dev/null
+++ b/be/src/http/action/check_tablet_segment_action.cpp
@@ -0,0 +1,70 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "http/action/check_tablet_segment_action.h"
+
+#include <string>
+
+#include "http/http_channel.h"
+#include "http/http_headers.h"
+#include "http/http_request.h"
+#include "http/http_status.h"
+#include "olap/storage_engine.h"
+#include "service/backend_options.h"
+
+namespace doris {
+
+// Content-Type value attached to every JSON reply sent by this handler.
+const static std::string HEADER_JSON = "application/json";
+
+// Stores the value returned by BackendOptions::get_localhost() so that it can
+// be echoed in the "host" field of the reply.
+CheckTabletSegmentAction::CheckTabletSegmentAction()
+        : _host(BackendOptions::get_localhost()) {}
+
+// Checks every tablet known to the tablet manager for segments that cannot be
+// opened and replies with a JSON summary.
+//
+// Query parameter `repair`:
+//   - "true":            the repair flag is forwarded to the tablet manager.
+//   - "false" or absent: check only.
+//   - any other value:   a reply with status "Fail" is sent and no check runs.
+//
+// Both the failure reply and the success reply are sent with HttpStatus::OK,
+// so clients have to inspect the "status" field of the JSON body.
+void CheckTabletSegmentAction::handle(HttpRequest* req) {
+    const std::string is_repair = req->param("repair");
+    const bool repair = (is_repair == "true");
+    if (!repair && !is_repair.empty() && is_repair != "false") {
+        EasyJson result_ej;
+        result_ej["status"] = "Fail";
+        result_ej["msg"] = "Parameter 'repair' must be set to 'true' or 'false'";
+        req->add_output_header(HttpHeaders::CONTENT_TYPE, HEADER_JSON.c_str());
+        HttpChannel::send_reply(req, HttpStatus::OK, result_ej.ToString());
+        return;
+    }
+
+    LOG(INFO) << "start to check tablet segment.";
+    const std::set<int64_t> bad_tablets =
+            StorageEngine::instance()->tablet_manager()->check_all_tablet_segment(repair);
+    LOG(INFO) << "finish to check tablet segment.";
+
+    EasyJson result_ej;
+    result_ej["status"] = "Success";
+    result_ej["msg"] = "Succeed to check all tablet segment";
+    result_ej["num"] = bad_tablets.size();
+    EasyJson tablets = result_ej.Set("bad_tablets", EasyJson::kArray);
+    for (int64_t tablet_id : bad_tablets) {
+        tablets.PushBack<int64_t>(tablet_id);
+    }
+    // "set_bad" is emitted as the string "true"/"false", not as a JSON boolean.
+    result_ej["set_bad"] = repair ? "true" : "false";
+    result_ej["host"] = _host;
+    req->add_output_header(HttpHeaders::CONTENT_TYPE, HEADER_JSON.c_str());
+    HttpChannel::send_reply(req, HttpStatus::OK, result_ej.ToString());
+}
+
+} // namespace doris
diff --git a/be/src/http/action/check_tablet_segment_action.h 
b/be/src/http/action/check_tablet_segment_action.h
new file mode 100644
index 0000000000..353c372a13
--- /dev/null
+++ b/be/src/http/action/check_tablet_segment_action.h
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+
+#include "http/http_handler.h"
+#include "util/easy_json.h"
+
+namespace doris {
+
+// HTTP handler that checks all tablets on this backend for lost segments and
+// reports the ids of the affected tablets as JSON.
+class CheckTabletSegmentAction : public HttpHandler {
+public:
+    // Captures the value returned by BackendOptions::get_localhost().
+    CheckTabletSegmentAction();
+
+    // Serves one request. The optional query parameter `repair` accepts
+    // "true" or "false".
+    void handle(HttpRequest* req) override;
+
+    // Host value echoed in the "host" field of the reply. Const so that it can
+    // be read through a const handler.
+    std::string host() const { return _host; }
+
+private:
+    std::string _host;
+};
+} // namespace doris
diff --git a/be/src/olap/rowset/beta_rowset.cpp 
b/be/src/olap/rowset/beta_rowset.cpp
index 1a7c26ec0e..07e321ed0e 100644
--- a/be/src/olap/rowset/beta_rowset.cpp
+++ b/be/src/olap/rowset/beta_rowset.cpp
@@ -234,4 +234,21 @@ bool BetaRowset::check_file_exist() {
     return true;
 }
 
+// Returns true only if every segment of this rowset can be opened through the
+// file system attached to the rowset meta.
+//
+// Returns false when the rowset meta has no file system, or as soon as one
+// segment fails to open. In the latter case the failing path is logged and the
+// remaining segments are not checked.
+bool BetaRowset::check_current_rowset_segment() {
+    auto fs = _rowset_meta->fs();
+    if (!fs) {
+        return false;
+    }
+    for (int seg_id = 0; seg_id < num_segments(); ++seg_id) {
+        auto seg_path = segment_file_path(seg_id);
+        // The segment is opened only to validate it; the handle is dropped at
+        // the end of each iteration.
+        std::shared_ptr<segment_v2::Segment> segment;
+        auto s = segment_v2::Segment::open(fs, seg_path, seg_id, _schema, &segment);
+        if (!s.ok()) {
+            LOG(WARNING) << "segment can not be opened. file=" << seg_path;
+            return false;
+        }
+    }
+    return true;
+}
+
 } // namespace doris
diff --git a/be/src/olap/rowset/beta_rowset.h b/be/src/olap/rowset/beta_rowset.h
index 95aa52b317..cc098da20f 100644
--- a/be/src/olap/rowset/beta_rowset.h
+++ b/be/src/olap/rowset/beta_rowset.h
@@ -87,6 +87,8 @@ protected:
 
     void do_close() override;
 
+    // Returns true when every segment file of this rowset can be opened.
+    bool check_current_rowset_segment() override;
+
 private:
     friend class RowsetFactory;
     friend class BetaRowsetReader;
diff --git a/be/src/olap/rowset/rowset.cpp b/be/src/olap/rowset/rowset.cpp
index 42fdb077a5..230fc83ffe 100644
--- a/be/src/olap/rowset/rowset.cpp
+++ b/be/src/olap/rowset/rowset.cpp
@@ -76,4 +76,9 @@ void Rowset::make_visible(Version version) {
     make_visible_extra(version);
 }
 
+// Runs the subclass-specific segment check while holding _lock and hands its
+// result back to the caller.
+bool Rowset::check_rowset_segment() {
+    std::lock_guard<std::mutex> guard(_lock);
+    const bool segments_ok = check_current_rowset_segment();
+    return segments_ok;
+}
+
 } // namespace doris
diff --git a/be/src/olap/rowset/rowset.h b/be/src/olap/rowset/rowset.h
index 68a4b21a80..bda106d462 100644
--- a/be/src/olap/rowset/rowset.h
+++ b/be/src/olap/rowset/rowset.h
@@ -264,6 +264,8 @@ public:
         return Status::OK();
     }
 
+    // Calls check_current_rowset_segment() while holding _lock and returns its result.
+    bool check_rowset_segment();
+
 protected:
     friend class RowsetFactory;
 
@@ -284,7 +286,10 @@ protected:
     // allow subclass to add custom logic when rowset is being published
     virtual void make_visible_extra(Version version) {}
 
+    // Subclass hook behind check_rowset_segment(), which calls it with _lock
+    // held. Returns true when the segment check passes.
+    virtual bool check_current_rowset_segment() = 0;
+
     TabletSchemaSPtr _schema;
+
     std::string _tablet_path;
     RowsetMetaSharedPtr _rowset_meta;
     // init in constructor
diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp
index 2ecbd4d3cd..6553a3fc6d 100644
--- a/be/src/olap/tablet.cpp
+++ b/be/src/olap/tablet.cpp
@@ -1422,6 +1422,10 @@ void Tablet::build_tablet_report_info(TTabletInfo* 
tablet_info,
         tablet_info->__set_used(false);
     }
 
+    if (tablet_state() == TABLET_SHUTDOWN) {
+        tablet_info->__set_used(false);
+    }
+
     // the report version is the largest continuous version, same logic as in 
FE side
     tablet_info->version = cversion.second;
     // Useless but it is a required filed in TTabletInfo
@@ -1939,4 +1943,15 @@ void Tablet::update_self_owned_remote_rowsets(
     }
 }
 
+// Returns true when every rowset in _rs_version_map passes
+// Rowset::check_rowset_segment(). Stops at the first failing rowset, logs the
+// tablet id and returns false.
+// NOTE(review): _rs_version_map is iterated without taking any lock inside this
+// function -- confirm that callers protect it against concurrent modification.
+bool Tablet::check_all_rowset_segment() {
+    for (const auto& version_rowset : _rs_version_map) {
+        // Bind by reference; a copy of the pointer object is not needed here.
+        const RowsetSharedPtr& rowset = version_rowset.second;
+        if (!rowset->check_rowset_segment()) {
+            LOG(WARNING) << "Tablet Segment Check. find a bad tablet, tablet_id=" << tablet_id();
+            return false;
+        }
+    }
+    return true;
+}
+
 } // namespace doris
diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h
index 422c1d9cde..2fb368930d 100644
--- a/be/src/olap/tablet.h
+++ b/be/src/olap/tablet.h
@@ -318,6 +318,8 @@ public:
     void record_unused_remote_rowset(const RowsetId& rowset_id, const 
io::ResourceId& resource,
                                      int64_t num_segments);
 
+    // Returns false as soon as one rowset of this tablet fails check_rowset_segment().
+    bool check_all_rowset_segment();
+
 private:
     Status _init_once_action();
     void _print_missed_versions(const std::vector<Version>& missed_versions) 
const;
diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp
index cb7fe6c1bd..900bb2ef18 100644
--- a/be/src/olap/tablet_manager.cpp
+++ b/be/src/olap/tablet_manager.cpp
@@ -1318,4 +1318,30 @@ void 
TabletManager::get_all_tablets_storage_format(TCheckStorageFormatResult* re
     result->__isset.v2_tablets = true;
 }
 
+// Checks the rowset segments of every tablet in every shard and returns the ids
+// of the tablets for which Tablet::check_all_rowset_segment() reported failure.
+//
+// When `repair` is true, each bad tablet that is not already in TABLET_SHUTDOWN
+// is switched to that state, its meta is saved, and it is appended to
+// _shutdown_tablets.
+//
+// Each shard's lock is taken exclusively for the whole scan of that shard.
+// NOTE(review): the per-rowset check opens segment files while that lock is
+// held -- confirm this is acceptable on nodes with many tablets.
+std::set<int64_t> TabletManager::check_all_tablet_segment(bool repair) {
+    std::set<int64_t> bad_tablets;
+    for (const auto& tablets_shard : _tablets_shards) {
+        std::lock_guard<std::shared_mutex> wrlock(tablets_shard.lock);
+        for (const auto& item : tablets_shard.tablet_map) {
+            const TabletSharedPtr& tablet = item.second;
+            if (tablet->check_all_rowset_segment()) {
+                continue;
+            }
+            bad_tablets.insert(tablet->tablet_id());
+            // This function does not remove a bad tablet from tablet_map, so a
+            // repeated request can see it again. Skip tablets that are already
+            // shut down so they are not queued in _shutdown_tablets twice.
+            if (!repair || tablet->tablet_state() == TABLET_SHUTDOWN) {
+                continue;
+            }
+            tablet->set_tablet_state(TABLET_SHUTDOWN);
+            tablet->save_meta();
+            {
+                std::lock_guard<std::shared_mutex> shutdown_tablets_wrlock(
+                        _shutdown_tablets_lock);
+                _shutdown_tablets.push_back(tablet);
+            }
+            // Leading space keeps "tablet_id=" from running into the sentence.
+            LOG(WARNING) << "There are some segments lost, set tablet to shutdown state."
+                         << " tablet_id=" << tablet->tablet_id()
+                         << ", tablet_path=" << tablet->tablet_path();
+        }
+    }
+    return bad_tablets;
+}
+
 } // end namespace doris
diff --git a/be/src/olap/tablet_manager.h b/be/src/olap/tablet_manager.h
index 159b8baefd..5765228ae1 100644
--- a/be/src/olap/tablet_manager.h
+++ b/be/src/olap/tablet_manager.h
@@ -138,6 +138,8 @@ public:
 
     void get_all_tablets_storage_format(TCheckStorageFormatResult* result);
 
+    // Checks all tablets for segments that cannot be opened and returns the ids
+    // of the bad tablets. If `repair` is true, bad tablets are also set to
+    // TABLET_SHUTDOWN.
+    std::set<int64_t> check_all_tablet_segment(bool repair);
+
 private:
     // Add a tablet pointer to StorageEngine
     // If force, drop the existing tablet add this new one
diff --git a/be/src/service/http_service.cpp b/be/src/service/http_service.cpp
index ce9919f3b4..777ed02de8 100644
--- a/be/src/service/http_service.cpp
+++ b/be/src/service/http_service.cpp
@@ -18,6 +18,7 @@
 #include "service/http_service.h"
 
 #include "http/action/check_rpc_channel_action.h"
+#include "http/action/check_tablet_segment_action.h"
 #include "http/action/checksum_action.h"
 #include "http/action/compaction_action.h"
 #include "http/action/config_action.h"
@@ -166,6 +167,12 @@ Status HttpService::start() {
     ResetRPCChannelAction* reset_rpc_channel_action = _pool.add(new 
ResetRPCChannelAction(_env));
     _ev_http_server->register_handler(HttpMethod::GET, 
"/api/reset_rpc_channel/{endpoints}",
                                       reset_rpc_channel_action);
+
+    CheckTabletSegmentAction* check_tablet_segment_action =
+            _pool.add(new CheckTabletSegmentAction());
+    _ev_http_server->register_handler(HttpMethod::POST, 
"/api/check_tablet_segment_lost",
+                                      check_tablet_segment_action);
+
     _ev_http_server->start();
     return Status::OK();
 }
diff --git a/be/test/testutil/mock_rowset.h b/be/test/testutil/mock_rowset.h
index 71a17876fa..17865ce3d8 100644
--- a/be/test/testutil/mock_rowset.h
+++ b/be/test/testutil/mock_rowset.h
@@ -90,6 +90,8 @@ protected:
         // Do nothing.
     }
 
+    // Mock implementation: always reports the segment check as passed.
+    bool check_current_rowset_segment() override { return true; }
+
 private:
     bool is_mem_rowset_;
 };
diff --git 
a/docs/en/docs/admin-manual/http-actions/check-tablet-segment-action.md 
b/docs/en/docs/admin-manual/http-actions/check-tablet-segment-action.md
new file mode 100644
index 0000000000..1a46baedcc
--- /dev/null
+++ b/docs/en/docs/admin-manual/http-actions/check-tablet-segment-action.md
@@ -0,0 +1,53 @@
+---
+{
+    "title": "CHECK ALL TABLET SEGMENT LOST",
+    "language": "en"
+}
+---
+
+<!-- 
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# CHECK ALL TABLET SEGMENT LOST
+   
+Abnormal situations on a BE node may cause segment files to be lost while the metadata still shows the tablet as normal. Such an abnormal replica is not detected by FE and cannot be repaired automatically.
+When such a tablet is queried, the error `failed to initialize storage reader` is reported. This interface checks all tablets on the current BE node and finds those that have lost segments.
+
+```
+curl -X POST http://be_host:webserver_port/api/check_tablet_segment_lost?repair=xxx
+```
+
+When the parameter `repair` is set to `true`, tablets with lost segments are set to the `SHUTDOWN` state and treated as bad replicas, which FE can then detect and repair. When it is `false` or omitted, the tablets with lost segments are only returned and nothing else is done. Any other value is rejected with a `Fail` status.
+
+The response lists all tablets on the current BE node that have lost segments:
+
+```
+{
+    "status": "Success",
+    "msg": "Succeed to check all tablet segment",
+    "num": 3,
+    "bad_tablets": [
+        11190,
+        11210,
+        11216
+    ],
+    "set_bad": "true",
+    "host": "172.3.0.101"
+}
+```
\ No newline at end of file
diff --git 
a/docs/zh-CN/docs/admin-manual/http-actions/check-tablet-segment-action.md 
b/docs/zh-CN/docs/admin-manual/http-actions/check-tablet-segment-action.md
new file mode 100644
index 0000000000..b58c75d136
--- /dev/null
+++ b/docs/zh-CN/docs/admin-manual/http-actions/check-tablet-segment-action.md
@@ -0,0 +1,53 @@
+---
+{
+    "title": "CHECK ALL TABLET SEGMENT LOST",
+    "language": "zh-CN"
+}
+---
+
+<!-- 
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# CHECK ALL TABLET SEGMENT LOST
+   
+在BE节点上,可能会因为一些异常情况导致数据文件丢失,但是元数据显示正常,这种副本异常不会被FE检测到,也不能被自动修复。
+当用户查询时,会报错`failed to initialize storage reader`。该接口的功能是检测出当前BE节点上所有存在文件丢失的tablet。
+
+```
+curl -X POST http://be_host:webserver_port/api/check_tablet_segment_lost?repair=xxx
+```
+
+当参数`repair`设置为`true`时,存在文件丢失的tablet都会被设为`SHUTDOWN`状态,该副本会被作为坏副本处理,进而能够被FE检测和修复。否则,只会返回所有存在文件丢失的tablet,并不做任何处理。
+
+返回值是当前BE节点上所有存在文件丢失的tablet:
+
+```
+{
+    "status": "Success",
+    "msg": "Succeed to check all tablet segment",
+    "num": 3,
+    "bad_tablets": [
+        11190,
+        11210,
+        11216
+    ],
+    "set_bad": "true",
+    "host": "172.3.0.101"
+}
+```


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to