This is an automated email from the ASF dual-hosted git repository.

englefly pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new e2e52c5c9bd [opt](doris compose)  stop command wait fe / be dead 
(#50745)
e2e52c5c9bd is described below

commit e2e52c5c9bd141ae98c006641a61604cb93c7625
Author: yujun <yu...@selectdb.com>
AuthorDate: Wed May 14 15:17:24 2025 +0800

    [opt](doris compose)  stop command wait fe / be dead (#50745)
    
    ### What problem does this PR solve?
    
    In docker suites, when stopping a be / fe, the master fe's heartbeat may be
    delayed while waiting for an rpc timeout, so even if the be is dead, the fe
    may not detect it as dead until the next heartbeat.
    
    So the regression's stop command needs to check that the fe / be is dead
    from the master fe's perspective.
---
 docker/runtime/doris-compose/Dockerfile            |  67 +++++++++----
 docker/runtime/doris-compose/Readme.md             |   4 +-
 docker/runtime/doris-compose/cluster.py            |   2 +-
 docker/runtime/doris-compose/command.py            | 111 ++++++++++++++++-----
 docker/runtime/doris-compose/database.py           |   6 +-
 docker/runtime/doris-compose/format-code.sh        |   4 +
 .../doris/regression/suite/SuiteCluster.groovy     |  13 +--
 .../suites/clone_p0/test_decommission_mtmv.groovy  |   2 +-
 .../suites/demo_p0/docker_action.groovy            |   1 +
 .../test_min_load_replica_num_complicate.groovy    |   2 +-
 10 files changed, 151 insertions(+), 61 deletions(-)

diff --git a/docker/runtime/doris-compose/Dockerfile 
b/docker/runtime/doris-compose/Dockerfile
index 0d36acfe9de..04b86a30ed1 100644
--- a/docker/runtime/doris-compose/Dockerfile
+++ b/docker/runtime/doris-compose/Dockerfile
@@ -25,36 +25,59 @@
 ARG JDK_IMAGE=openjdk:17-jdk-slim
 #ARG JDK_IMAGE=openjdk:8u342-jdk
 
+# user can download a doris release package, extract it, then build its image 
used arg `OUTPUT_PATH`
+# for example:
+#
+# ```
+# cd ~/tmp
+# wget 
https://apache-doris-releases.oss-accelerate.aliyuncs.com/apache-doris-3.0.5-bin-x64.tar.gz
+# tar xvf apache-doris-3.0.5-bin-x64.tar.gz  # after extract, there will be a 
directory ./apache-doris-3.0.5-bin-x64/{fe, be, ms}
+#
+# docker build \
+#     --build-arg OUTPUT_PATH=./apache-doris-3.0.5-bin-x64 \
+#     -f ~/workspace/doris/docker/runtime/doris-compose/Dockerfile \
+#     -t my-doris:v3.0.5 \
+#     .
+# ```
+ARG OUTPUT_PATH=./output
+
 #### END ARG ####
 
 FROM ${JDK_IMAGE}
 
 # set environment variables
-ENV JACOCO_VERSION 0.8.8
-
-RUN mkdir -p /opt/apache-doris/coverage
-
-RUN  sed -i s@/deb.debian.org/@/mirrors.aliyun.com/@g /etc/apt/sources.list
-RUN  apt-get clean
+ENV JACOCO_VERSION=0.8.8
 
-RUN apt-get update && \
-    apt-get install -y default-mysql-client python lsof tzdata curl unzip 
patchelf jq procps util-linux gosu && \
-    ln -fs /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
-    dpkg-reconfigure -f noninteractive tzdata && \
-    apt-get clean
+RUN sed -i s@/deb.debian.org/@/mirrors.aliyun.com/@g /etc/apt/sources.list \
+    && apt-get clean \
+    && apt-get update \
+    && apt-get install -y --no-install-recommends \
+        default-mysql-client \
+        python \
+        lsof \
+        tzdata \
+        curl \
+        unzip \
+        patchelf \
+        jq \
+        procps \
+        util-linux \
+        gosu \
+    && ln -fs /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
+    && dpkg-reconfigure -f noninteractive tzdata \
+    && apt-get clean
 
-RUN curl -f 
https://repo1.maven.org/maven2/org/jacoco/jacoco/${JACOCO_VERSION}/jacoco-${JACOCO_VERSION}.zip
 -o jacoco.zip && \
-    mkdir /jacoco && \
-    unzip jacoco.zip -d /jacoco
+RUN mkdir -p /opt/apache-doris/{fdb,coverage}
 
-# cloud
-COPY --chmod=777 README.md cloud/output* /opt/apache-doris/cloud/
-RUN mkdir /opt/apache-doris/fdb
-RUN if [ -d /opt/apache-doris/cloud/bin ]; then                            \
-        sed -i 's/\<chmod\>/echo/g' /opt/apache-doris/cloud/bin/start.sh ; \
-    fi
+RUN curl -f 
https://repo1.maven.org/maven2/org/jacoco/jacoco/${JACOCO_VERSION}/jacoco-${JACOCO_VERSION}.zip
 -o jacoco.zip \
+    && mkdir /jacoco \
+    && unzip jacoco.zip -d /jacoco
 
-# fe and be
-COPY --chmod=777 output /opt/apache-doris/
+# COPY need refine ARG after FROM
+ARG OUTPUT_PATH
+COPY --chmod=777 ${OUTPUT_PATH} /opt/apache-doris/
 # in docker, run 'chmod 755 doris_be' first time cost 1min, remove it.
 RUN sed -i 's/\<chmod\>/echo/g' /opt/apache-doris/be/bin/start_be.sh
+RUN if [ -d /opt/apache-doris/ms/bin ]; then                            \
+        sed -i 's/\<chmod\>/echo/g' /opt/apache-doris/ms/bin/start.sh ; \
+    fi
\ No newline at end of file
diff --git a/docker/runtime/doris-compose/Readme.md 
b/docker/runtime/doris-compose/Readme.md
index ca358f93bda..98b703be59e 100644
--- a/docker/runtime/doris-compose/Readme.md
+++ b/docker/runtime/doris-compose/Readme.md
@@ -38,10 +38,10 @@ Make sure BuildKit configured in the machine. if not follow 
[docker-with-BuildKi
 ### 2. The doris image should contains
 
 ```shell
-/opt/apache-doris/{fe, be, cloud}
+/opt/apache-doris/{fe, be, ms}
 ```
 
-If don't create cloud cluster, the image no need to contains the cloud pkg.
+If don't create cloud cluster, the image no need to contains the ms pkg.
 
 If build doris use `sh build.sh --fe --be --cloud` **without do any change on 
their origin conf or shells**, then its `output/` satisfy with all above, then 
run command in doris root directory will generate such a image. If you want to 
pack a product that is not the `output/` directory, you can modify `Dockerfile` 
by yourself.
 
diff --git a/docker/runtime/doris-compose/cluster.py 
b/docker/runtime/doris-compose/cluster.py
index 88f597a6283..552b9c038c9 100644
--- a/docker/runtime/doris-compose/cluster.py
+++ b/docker/runtime/doris-compose/cluster.py
@@ -690,7 +690,7 @@ class CLOUD(Node):
         return cfg
 
     def docker_home_dir(self):
-        return os.path.join(DOCKER_DORIS_PATH, "cloud")
+        return os.path.join(DOCKER_DORIS_PATH, "ms")
 
     def get_default_named_ports(self):
         return {
diff --git a/docker/runtime/doris-compose/command.py 
b/docker/runtime/doris-compose/command.py
index d31e853c2ca..f3e3754464e 100644
--- a/docker/runtime/doris-compose/command.py
+++ b/docker/runtime/doris-compose/command.py
@@ -30,7 +30,7 @@ import time
 LOG = utils.get_logger()
 
 
-def wait_ready_service(wait_timeout, cluster, fe_ids, be_ids):
+def wait_service(need_alive, wait_timeout, cluster, fe_ids, be_ids):
     if wait_timeout == 0:
         return
     if wait_timeout == -1:
@@ -39,28 +39,37 @@ def wait_ready_service(wait_timeout, cluster, fe_ids, 
be_ids):
     while True:
         db_mgr = database.get_db_mgr(cluster.name,
                                      cluster.get_all_node_net_infos(), False)
-        dead_frontends = []
+        failed_frontends = []
         for id in fe_ids:
             fe = cluster.get_node(CLUSTER.Node.TYPE_FE, id)
             fe_state = db_mgr.get_fe(id)
-            if not fe_state or not fe_state.alive or not utils.is_socket_avail(
-                    fe.get_ip(), fe.meta["ports"]["query_port"]):
-                dead_frontends.append(id)
-        dead_backends = []
+            fe_alive = fe_state and fe_state.alive
+            if fe_alive and need_alive:
+                # if need alive, check port available,
+                # if need dead, don't check port available because it take 
some time for the disconnect socket
+                fe_alive = utils.is_socket_avail(
+                    fe.get_ip(), fe.meta["ports"]["query_port"])
+            if fe_alive != need_alive:
+                failed_frontends.append(id)
+        failed_backends = []
         for id in be_ids:
             be = cluster.get_node(CLUSTER.Node.TYPE_BE, id)
             be_state = db_mgr.get_be(id)
-            if not be_state or not be_state.alive or not utils.is_socket_avail(
-                    be.get_ip(), be.meta["ports"]["webserver_port"]):
-                dead_backends.append(id)
-        if not dead_frontends and not dead_backends:
+            be_alive = be_state and be_state.alive
+            if be_alive and need_alive:
+                be_alive = utils.is_socket_avail(
+                    be.get_ip(), be.meta["ports"]["webserver_port"])
+            if be_alive != need_alive:
+                failed_backends.append(id)
+        if not failed_frontends and not failed_backends:
             break
         if time.time() >= expire_ts:
             err = ""
-            if dead_frontends:
-                err += "dead fe: " + str(dead_frontends) + ". "
-            if dead_backends:
-                err += "dead be: " + str(dead_backends) + ". "
+            failed_status = "dead" if need_alive else "alive"
+            if failed_frontends:
+                err += failed_status + " fe: " + str(failed_frontends) + ". "
+            if failed_backends:
+                err += failed_status + " be: " + str(failed_backends) + ". "
             raise Exception(err)
         time.sleep(1)
 
@@ -189,10 +198,11 @@ class Command(object):
 
 class SimpleCommand(Command):
 
-    def __init__(self, command, help):
+    def __init__(self, command, help, options=[]):
         super().__init__(command)
         self.command = command
         self.help = help
+        self.options = options
 
     def add_parser(self, args_parsers):
         help = self.help + " If none of --fe-id, --be-id, --ms-id, 
--recycle-id, --fdb-id is specific, "\
@@ -210,18 +220,21 @@ class SimpleCommand(Command):
             args.fdb_id)
         utils.exec_docker_compose_command(cluster.get_compose_file(),
                                           self.command,
+                                          options=self.options,
                                           nodes=related_nodes)
         show_cmd = self.command[0].upper() + self.command[1:]
+
+        if for_all:
+            related_nodes = cluster.get_all_nodes()
+
         LOG.info(
             utils.render_green("{} succ, total related node num {}".format(
                 show_cmd, related_node_num)))
 
-        if for_all:
-            related_nodes = cluster.get_all_nodes()
         return cluster, related_nodes
 
 
-class NeedStartCommand(SimpleCommand):
+class StartBaseCommand(SimpleCommand):
 
     def add_parser(self, args_parsers):
         parser = super().add_parser(args_parsers)
@@ -240,7 +253,47 @@ class NeedStartCommand(SimpleCommand):
         fe_ids = [node.id for node in related_nodes if node.is_fe()]
         be_ids = [node.id for node in related_nodes if node.is_be()]
         if not cluster.is_host_network():
-            wait_ready_service(args.wait_timeout, cluster, fe_ids, be_ids)
+            wait_service(True, args.wait_timeout, cluster, fe_ids, be_ids)
+        return cluster, related_nodes
+
+
+class StartCommand(StartBaseCommand):
+
+    def __init__(self, command):
+        super().__init__(command, "Start the doris containers. "),
+
+
+class RestartCommand(StartBaseCommand):
+
+    def __init__(self, command):
+        super().__init__(command, "Restart the doris containers. ",
+                         ["-t", "1"]),
+
+
+class StopCommand(SimpleCommand):
+
+    def __init__(self, command):
+        super().__init__(command, "Stop the doris containers. ", ["-t", "1"]),
+
+    def add_parser(self, args_parsers):
+        parser = super().add_parser(args_parsers)
+        parser.add_argument(
+            "--wait-timeout",
+            type=int,
+            default=0,
+            help=
+            "Specify wait seconds for fe/be close for service: 0 not wait 
(default), "\
+            "> 0 max wait seconds, -1 wait unlimited."
+        )
+        return parser
+
+    def run(self, args):
+        cluster, related_nodes = super().run(args)
+        fe_ids = [node.id for node in related_nodes if node.is_fe()]
+        be_ids = [node.id for node in related_nodes if node.is_be()]
+        if not cluster.is_host_network():
+            wait_service(False, args.wait_timeout, cluster, fe_ids, be_ids)
+        return cluster, related_nodes
 
 
 class UpCommand(Command):
@@ -727,8 +780,8 @@ class UpCommand(Command):
                     db_mgr.create_default_storage_vault(cloud_store_config)
 
             if not cluster.is_host_network():
-                wait_ready_service(args.wait_timeout, cluster, add_fe_ids,
-                                   add_be_ids)
+                wait_service(True, args.wait_timeout, cluster, add_fe_ids,
+                             add_be_ids)
             LOG.info(
                 utils.render_green(
                     "Up cluster {} succ, related node num {}".format(
@@ -829,6 +882,7 @@ class DownCommand(Command):
     def run(self, args):
         cluster_name = args.NAME
         cluster = None
+        stop_grace = False
 
         try:
             cluster = CLUSTER.Cluster.load(cluster_name)
@@ -840,6 +894,7 @@ class DownCommand(Command):
                 args.recycle_id,
                 args.fdb_id,
                 ignore_not_exists=True)
+            stop_grace = cluster.coverage_dir
         except Exception as e:
             for_all = not args.fe_id and not args.be_id and not args.ms_id and 
not args.recycle_id
             related_nodes = []
@@ -854,8 +909,12 @@ class DownCommand(Command):
             compose_file = CLUSTER.get_compose_file(cluster_name)
             if os.path.exists(compose_file):
                 try:
-                    utils.exec_docker_compose_command(
-                        compose_file, "down", ["-v", "--remove-orphans"])
+                    options = ["-v", "--remove-orphans"]
+                    if not stop_grace:
+                        options.extend(["-t", "1"])
+                    utils.exec_docker_compose_command(compose_file,
+                                                      "down",
+                                                      options=options)
                 except Exception as e:
                     LOG.warn("down cluster has exception: " + str(e))
             try:
@@ -1423,9 +1482,9 @@ class AddRWPermCommand(Command):
 ALL_COMMANDS = [
     UpCommand("up"),
     DownCommand("down"),
-    NeedStartCommand("start", "Start the doris containers. "),
-    SimpleCommand("stop", "Stop the doris containers. "),
-    NeedStartCommand("restart", "Restart the doris containers. "),
+    StartCommand("start"),
+    StopCommand("stop"),
+    RestartCommand("restart"),
     SimpleCommand("pause", "Pause the doris containers. "),
     SimpleCommand("unpause", "Unpause the doris containers. "),
     GenConfCommand("config"),
diff --git a/docker/runtime/doris-compose/database.py 
b/docker/runtime/doris-compose/database.py
index 1a5b2d57642..f46635033b8 100644
--- a/docker/runtime/doris-compose/database.py
+++ b/docker/runtime/doris-compose/database.py
@@ -281,10 +281,12 @@ class DBManager(object):
                         dict(zip(fields, row)) for row in cursor.fetchall()
                     ]
             except Exception as e:
-                LOG.warn(f"Error occurred: {e}")
+                LOG.warn(
+                    f"Error occurred: fe {self.fe_ip}:{self.fe_port}, sql 
`{sql}`, err {e}"
+                )
                 if "timed out" in str(e).lower() and attempt < retries - 1:
                     LOG.warn(
-                        f"Query timed out. Retrying {attempt + 1}/{retries}..."
+                        f"Query timed out, fe {self.fe_ip}:{self.fe_port}. 
Retrying {attempt + 1}/{retries}..."
                     )
                     self._reset_conn()
                 else:
diff --git a/docker/runtime/doris-compose/format-code.sh 
b/docker/runtime/doris-compose/format-code.sh
index 0626662e641..6ec88624f26 100644
--- a/docker/runtime/doris-compose/format-code.sh
+++ b/docker/runtime/doris-compose/format-code.sh
@@ -17,3 +17,7 @@
 
 yapf -i *.py
 shfmt -w resource/*.sh
+
+# format docker file
+# go install github.com/reteps/dockerfmt@latest
+dockerfmt Dockerfile -w
diff --git 
a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy
 
b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy
index 15802a02888..bf43a57fbf7 100644
--- 
a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy
+++ 
b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy
@@ -544,6 +544,7 @@ class SuiteCluster {
     }
 
     int START_WAIT_TIMEOUT = 120
+    int STOP_WAIT_TIMEOUT = 60
 
     // if not specific fe indices, then start all frontends
     void startFrontends(int... indices) {
@@ -557,13 +558,13 @@ class SuiteCluster {
 
     // if not specific fe indices, then stop all frontends
     void stopFrontends(int... indices) {
-        runFrontendsCmd(60, 'stop', indices)
+        runFrontendsCmd(STOP_WAIT_TIMEOUT + 5, "stop --wait-timeout 
${STOP_WAIT_TIMEOUT}".toString(), indices)
         waitHbChanged()
     }
 
     // if not specific be indices, then stop all backends
     void stopBackends(int... indices) {
-        runBackendsCmd(60, 'stop', indices)
+        runBackendsCmd(STOP_WAIT_TIMEOUT + 5, "stop --wait-timeout 
${STOP_WAIT_TIMEOUT}".toString(), indices)
         waitHbChanged()
     }
 
@@ -580,7 +581,7 @@ class SuiteCluster {
     // if not specific ms indices, then restart all ms
     void restartMs(int... indices) {
         runMsCmd(START_WAIT_TIMEOUT + 5, "restart --wait-timeout 
${START_WAIT_TIMEOUT}".toString(), indices)
-    } 
+    }
 
     // if not specific recycler indices, then restart all recyclers
     void restartRecyclers(int... indices) {
@@ -588,7 +589,7 @@ class SuiteCluster {
     }
 
     // if not specific fe indices, then drop all frontends
-    void dropFrontends(boolean clean=false, int... indices) {
+    void dropFrontends(boolean clean, int... indices) {
         def cmd = 'down'
         if (clean) {
             cmd += ' --clean'
@@ -597,7 +598,7 @@ class SuiteCluster {
     }
 
     // if not specific be indices, then decommission all backends
-    void decommissionBackends(boolean clean=false, int... indices) {
+    void decommissionBackends(boolean clean, int... indices) {
         def cmd = 'down'
         if (clean) {
             cmd += ' --clean'
@@ -606,7 +607,7 @@ class SuiteCluster {
     }
 
     // if not specific be indices, then drop force all backends
-    void dropForceBackends(boolean clean=false, int... indices) {
+    void dropForceBackends(boolean clean, int... indices) {
         def cmd = 'down --drop-force'
         if (clean) {
             cmd += ' --clean'
diff --git a/regression-test/suites/clone_p0/test_decommission_mtmv.groovy 
b/regression-test/suites/clone_p0/test_decommission_mtmv.groovy
index b29d5c13c94..3efd5bc4497 100644
--- a/regression-test/suites/clone_p0/test_decommission_mtmv.groovy
+++ b/regression-test/suites/clone_p0/test_decommission_mtmv.groovy
@@ -73,7 +73,7 @@ suite('test_decommission_mtmv', 'docker') {
 
         def decommissionBeIdx = 1
         def decommissionBe = cluster.getBeByIndex(decommissionBeIdx)
-        cluster.decommissionBackends(decommissionBeIdx)
+        cluster.decommissionBackends(true, decommissionBeIdx)
 
         def backends = sql_return_maparray  'show backends'
         assertEquals(3, backends.size())
diff --git a/regression-test/suites/demo_p0/docker_action.groovy 
b/regression-test/suites/demo_p0/docker_action.groovy
index 502be030b08..7bea52989b5 100644
--- a/regression-test/suites/demo_p0/docker_action.groovy
+++ b/regression-test/suites/demo_p0/docker_action.groovy
@@ -39,6 +39,7 @@ import org.apache.doris.regression.suite.ClusterOptions
 // 1. Need add 'docker' to suite's group, and don't add 'nonConcurrent' to it;
 // 2. In docker closure:
 //    a. Don't use 'Awaitility.await()...until(f)', but use 
'dockerAwaitUntil(..., f)';
+//    b. Don't use java Thread, but use regress framework's ThreadAction(see 
example demo_p0/thread_action.groovy);
 // 3. No need to use code ` if (isCloudMode()) { return } `  in docker suites,
 // instead should use `ClusterOptions.cloudMode = true/false` is enough.
 // Because when run docker suite without an external doris cluster, if suite 
use code `isCloudMode()`, it need specific -runMode=cloud/not_cloud.
diff --git 
a/regression-test/suites/load/insert/test_min_load_replica_num_complicate.groovy
 
b/regression-test/suites/load/insert/test_min_load_replica_num_complicate.groovy
index d6c9c71539b..79b1d3c30d1 100644
--- 
a/regression-test/suites/load/insert/test_min_load_replica_num_complicate.groovy
+++ 
b/regression-test/suites/load/insert/test_min_load_replica_num_complicate.groovy
@@ -119,7 +119,7 @@ suite('test_min_load_replica_num_complicate', 'docker') {
 
                     futures.add(thread {
                         sql '''admin set frontend config 
("disable_tablet_scheduler" = "false")'''
-                        cluster.decommissionBackends(clean = true, 
originBackends.get(0).index)
+                        cluster.decommissionBackends(true, 
originBackends.get(0).index)
                         cluster.clearFrontendDebugPoints()
                         cluster.clearBackendDebugPoints()
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to