This is an automated email from the ASF dual-hosted git repository.

gavinchou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d177d5b0048 [chore](cloud) Support starting both meta-service and 
recycler within single process (#40223)
d177d5b0048 is described below

commit d177d5b004876c11a06fa2975794286436ec9e22
Author: Gavin Chou <gavineaglec...@gmail.com>
AuthorDate: Wed Sep 4 21:27:17 2024 +0800

    [chore](cloud) Support starting both meta-service and recycler within 
single process (#40223)
    
    e.g. the following will start both meta-service and recycler within a
    single process.
    ```
    ./bin/start.sh --daemon
    ```
    the log file will be meta_service.INFO*
    
    and it has the same effect as `./bin/start.sh --meta-service --recycler
    --daemon`
    
    doc PR https://github.com/apache/doris-website/pull/1073
---
 cloud/script/start.sh                   | 27 ++++++++++-----
 cloud/src/common/config.h               |  1 +
 cloud/src/main.cpp                      | 60 +++++++++++++++++++--------------
 cloud/src/recycler/checker.cpp          |  2 ++
 cloud/src/recycler/recycler.cpp         |  5 +++
 cloud/src/recycler/recycler_service.cpp |  2 +-
 cloud/test/recycler_test.cpp            |  1 +
 7 files changed, 63 insertions(+), 35 deletions(-)

diff --git a/cloud/script/start.sh b/cloud/script/start.sh
index 28e986166ae..582c80c2e6f 100644
--- a/cloud/script/start.sh
+++ b/cloud/script/start.sh
@@ -122,7 +122,10 @@ fi
 
 echo "LIBHDFS3_CONF=${LIBHDFS3_CONF}"
 
-export 
JEMALLOC_CONF="percpu_arena:percpu,background_thread:true,metadata_thp:auto,muzzy_decay_ms:5000,dirty_decay_ms:5000,oversize_threshold:0,prof:false,lg_prof_interval:-1"
+# to enable dumping jeprof heap stats periodically, change `prof:false` to 
`prof:true`
+# to control the dump interval change `lg_prof_interval` to a specific value, 
it is pow/exponent of 2 in size of bytes, default 34 means 2 ** 34 = 16GB
+# to control the dump path, change `prof_prefix` to a specific path, e.g. 
/doris_cloud/log/ms_, by default it dumps at the path where the start command 
called
+export 
JEMALLOC_CONF="percpu_arena:percpu,background_thread:true,metadata_thp:auto,muzzy_decay_ms:5000,dirty_decay_ms:5000,oversize_threshold:0,prof_prefix:ms_,prof:false,lg_prof_interval:34"
 
 if [[ "${RUN_VERSION}" -eq 1 ]]; then
     "${bin}" --version
@@ -131,14 +134,22 @@ fi
 
 mkdir -p "${DORIS_HOME}/log"
 echo "starts ${process} with args: $*"
+out_file=${DORIS_HOME}/log/${process}.out
 if [[ "${RUN_DAEMON}" -eq 1 ]]; then
-    date >>"${DORIS_HOME}/log/${process}.out"
-    nohup "${bin}" "$@" >>"${DORIS_HOME}/log/${process}.out" 2>&1 &
-    # wait for log flush
-    sleep 1.5
-    tail -n10 "${DORIS_HOME}/log/${process}.out" | grep 'working directory' 
-B1 -A10
-    echo "please check process log for more details"
-    echo ""
+    # append 10 blank lines to ensure the following tail -n10 works correctly
+    printf "\n\n\n\n\n\n\n\n\n\n" >>"${out_file}"
+    echo "$(date +'%F %T') try to start ${process}" >>"${out_file}"
+    nohup "${bin}" "$@" >>"${out_file}" 2>&1 &
+    echo "wait and check ${process} start successfully"
+    sleep 3
+    tail -n10 "${out_file}" | grep 'successfully started brpc'
+    ret=$?
+    if [[ ${ret} -ne 0 ]]; then
+        echo "${process} may not start successfully please check process log 
for more details"
+        exit 1
+    fi
+    echo "${process} start successfully"
+    exit 0
 elif [[ "${RUN_CONSOLE}" -eq 1 ]]; then
     export DORIS_LOG_TO_STDERR=1
     date
diff --git a/cloud/src/common/config.h b/cloud/src/common/config.h
index e31a60a0d69..b1db41a6eb7 100644
--- a/cloud/src/common/config.h
+++ b/cloud/src/common/config.h
@@ -77,6 +77,7 @@ CONF_mInt32(scan_instances_interval_seconds, "60"); // 1min
 CONF_mInt32(check_object_interval_seconds, "43200"); // 12hours
 
 CONF_mInt64(check_recycle_task_interval_seconds, "600"); // 10min
+CONF_mInt64(recycler_sleep_before_scheduling_seconds, "60");
 // log a warning if a recycle task takes longer than this duration
 CONF_mInt64(recycle_task_threshold_seconds, "10800"); // 3h
 
diff --git a/cloud/src/main.cpp b/cloud/src/main.cpp
index 9356a3546d0..74e6a8daaf1 100644
--- a/cloud/src/main.cpp
+++ b/cloud/src/main.cpp
@@ -161,13 +161,13 @@ DECLARE_int64(socket_max_unwritten_bytes);
 int main(int argc, char** argv) {
     if (argc > 1) {
         if (auto ret = args.parse(argc - 1, argv + 1); !ret.empty()) {
-            std::cerr << ret << std::endl;
+            std::cerr << "parse arguments error: " << ret << std::endl;
             help();
             return -1;
         }
     }
 
-    if (argc < 2 || args.get<bool>(ARG_HELP)) {
+    if (args.get<bool>(ARG_HELP)) {
         help();
         return 0;
     }
@@ -177,21 +177,16 @@ int main(int argc, char** argv) {
         return 0;
     }
 
-    // FIXME(gavin): do we need to enable running both MS and recycler within
-    //               single process
-    if (!(args.get<bool>(ARG_META_SERVICE) ^ args.get<bool>(ARG_RECYCLER))) {
-        std::cerr << "only one of --meta-service and --recycler must be 
specified" << std::endl;
-        return 1;
-    }
-
-    // There may be more roles to play
+    // There may be more roles to play in the future; if multiple roles are 
specified,
+    // use meta_service as the process name
     std::string process_name = args.get<bool>(ARG_META_SERVICE) ? 
"meta_service"
                                : args.get<bool>(ARG_RECYCLER)   ? "recycler"
-                                                                : "";
-    if (process_name.empty()) {
-        std::cerr << "failed to determine prcess name with given args" << 
std::endl;
-        return 1;
-    }
+                                                                : 
"meta_service";
+
+    using namespace std::chrono;
+
+    auto start = steady_clock::now();
+    auto end = start;
 
     auto pid_file_fd_holder = gen_pidfile("doris_cloud");
     if (pid_file_fd_holder == nullptr) {
@@ -215,11 +210,19 @@ int main(int argc, char** argv) {
     }
 
     // We can invoke glog from now on
-
     std::string msg;
+    LOG(INFO) << "try to start doris_cloud";
     LOG(INFO) << build_info();
     std::cout << build_info() << std::endl;
 
+    if (!args.get<bool>(ARG_META_SERVICE) && !args.get<bool>(ARG_RECYCLER)) {
+        std::get<0>(args.args()[ARG_META_SERVICE]) = true;
+        std::get<0>(args.args()[ARG_RECYCLER]) = true;
+        LOG(INFO) << "meta_service and recycler are both not specified, "
+                     "run doris_cloud as meta_service and recycler by default";
+        std::cout << "run doris_cloud as meta_service and recycler by default" 
<< std::endl;
+    }
+
     brpc::Server server;
     brpc::FLAGS_max_body_size = config::brpc_max_body_size;
     brpc::FLAGS_socket_max_unwritten_bytes = 
config::brpc_socket_max_unwritten_bytes;
@@ -238,19 +241,22 @@ int main(int argc, char** argv) {
         return 1;
     }
     LOG(INFO) << "begin to init txn kv";
+    auto start_init_kv = steady_clock::now();
     int ret = txn_kv->init();
     if (ret != 0) {
         LOG(WARNING) << "failed to init txnkv, ret=" << ret;
         return 1;
     }
-    LOG(INFO) << "successfully init txn kv";
+    end = steady_clock::now();
+    LOG(INFO) << "successfully init txn kv, elapsed milliseconds: "
+              << duration_cast<milliseconds>(end - start_init_kv).count();
 
     if (init_global_encryption_key_info_map(txn_kv.get()) != 0) {
         LOG(WARNING) << "failed to init global encryption key map";
         return -1;
     }
 
-    std::unique_ptr<MetaServer> meta_server;
+    std::unique_ptr<MetaServer> meta_server; // meta-service
     std::unique_ptr<Recycler> recycler;
     std::thread periodiccally_log_thread;
     std::mutex periodiccally_log_thread_lock;
@@ -269,7 +275,8 @@ int main(int argc, char** argv) {
         msg = "meta-service started";
         LOG(INFO) << msg;
         std::cout << msg << std::endl;
-    } else if (args.get<bool>(ARG_RECYCLER)) {
+    }
+    if (args.get<bool>(ARG_RECYCLER)) {
         recycler = std::make_unique<Recycler>(txn_kv);
         int ret = recycler->start(&server);
         if (ret != 0) {
@@ -284,15 +291,12 @@ int main(int argc, char** argv) {
         auto periodiccally_log = [&]() {
             while (periodiccally_log_thread_run) {
                 std::unique_lock<std::mutex> lck 
{periodiccally_log_thread_lock};
-                periodiccally_log_thread_cv.wait_for(
-                        lck, 
std::chrono::milliseconds(config::periodically_log_ms));
+                periodiccally_log_thread_cv.wait_for(lck,
+                                                     
milliseconds(config::periodically_log_ms));
                 LOG(INFO) << "Periodically log for recycler";
             }
         };
         periodiccally_log_thread = std::thread {periodiccally_log};
-    } else {
-        std::cerr << "cloud starts without doing anything and exits" << 
std::endl;
-        return -1;
     }
     // start service
     brpc::ServerOptions options;
@@ -309,7 +313,11 @@ int main(int argc, char** argv) {
                      << ", errmsg=" << strerror_r(errno, buf, 64) << ", port=" 
<< port;
         return -1;
     }
-    LOG(INFO) << "successfully started brpc listening on port=" << port;
+    end = steady_clock::now();
+    msg = "successfully started brpc listening on port=" + 
std::to_string(port) +
+          " time_elapsed_ms=" + std::to_string(duration_cast<milliseconds>(end 
- start).count());
+    LOG(INFO) << msg;
+    std::cout << msg << std::endl;
 
     server.RunUntilAskedToQuit(); // Wait for signals
     server.ClearServices();
@@ -326,7 +334,7 @@ int main(int argc, char** argv) {
             periodiccally_log_thread_run = false;
             // immediately notify the log thread to quickly exit in case it 
block the
             // whole procedure
-            periodiccally_log_thread_cv.notify_one();
+            periodiccally_log_thread_cv.notify_all();
         }
         periodiccally_log_thread.join();
     }
diff --git a/cloud/src/recycler/checker.cpp b/cloud/src/recycler/checker.cpp
index 49421f97ca0..c3e9f69ed9d 100644
--- a/cloud/src/recycler/checker.cpp
+++ b/cloud/src/recycler/checker.cpp
@@ -79,6 +79,8 @@ int Checker::start() {
 
     // launch instance scanner
     auto scanner_func = [this]() {
+        std::this_thread::sleep_for(
+                
std::chrono::seconds(config::recycler_sleep_before_scheduling_seconds));
         while (!stopped()) {
             std::vector<InstanceInfoPB> instances;
             get_all_instances(txn_kv_.get(), instances);
diff --git a/cloud/src/recycler/recycler.cpp b/cloud/src/recycler/recycler.cpp
index 9db16a18c13..76d4a7ca767 100644
--- a/cloud/src/recycler/recycler.cpp
+++ b/cloud/src/recycler/recycler.cpp
@@ -189,6 +189,11 @@ Recycler::~Recycler() {
 }
 
 void Recycler::instance_scanner_callback() {
+    // sleep 60 seconds before scheduling for the launch procedure to complete:
+    // some bad hdfs connection may cause some log to stdout stderr
+    // which may pollute .out file and affect the script to check success
+    std::this_thread::sleep_for(
+            
std::chrono::seconds(config::recycler_sleep_before_scheduling_seconds));
     while (!stopped()) {
         std::vector<InstanceInfoPB> instances;
         get_all_instances(txn_kv_.get(), instances);
diff --git a/cloud/src/recycler/recycler_service.cpp 
b/cloud/src/recycler/recycler_service.cpp
index 3c1a5b2ab48..08e937a4106 100644
--- a/cloud/src/recycler/recycler_service.cpp
+++ b/cloud/src/recycler/recycler_service.cpp
@@ -448,7 +448,7 @@ void 
RecyclerServiceImpl::http(::google::protobuf::RpcController* controller,
     }
 
     status_code = 404;
-    msg = "not found";
+    msg = "http path " + uri.path() + " not found, it may be not implemented";
     response_body = msg;
 }
 
diff --git a/cloud/test/recycler_test.cpp b/cloud/test/recycler_test.cpp
index d767c1bd8b7..14687354839 100644
--- a/cloud/test/recycler_test.cpp
+++ b/cloud/test/recycler_test.cpp
@@ -64,6 +64,7 @@ int main(int argc, char** argv) {
 
     using namespace std::chrono;
     current_time = 
duration_cast<seconds>(system_clock::now().time_since_epoch()).count();
+    config::recycler_sleep_before_scheduling_seconds = 0; // we dont have to 
wait in UT
 
     ::testing::InitGoogleTest(&argc, argv);
     auto s3_producer_pool = 
std::make_shared<SimpleThreadPool>(config::recycle_pool_parallelism);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to