This is an automated email from the ASF dual-hosted git repository.

gavinchou pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new a4a9e801cf4 [Fix](cloud) Fix cluster status inconsistent with bes and 
add config disable auto (#40799)
a4a9e801cf4 is described below

commit a4a9e801cf477746cb3665eaa904d8ea20ceee6c
Author: deardeng <565620...@qq.com>
AuthorDate: Sat Sep 14 19:37:54 2024 +0800

    [Fix](cloud) Fix cluster status inconsistent with bes and add config 
disable auto (#40799)
    
    1. Add a config switch to enable or disable auto start in cloud mode
    2. Fix inconsistent cluster status in BE tags when adding a BE node
    3. Fix the docker compose error `[INVALID_ARGUMENT]cloud_instance_id in fe.conf
    and be.conf are not same, fe: , be: reg_cloud_instance`
---
 .../main/java/org/apache/doris/common/Config.java  |  9 ++++++++
 .../doris/cloud/catalog/CloudClusterChecker.java   | 18 ++++++++++++---
 .../doris/cloud/system/CloudSystemInfoService.java | 20 +++++++++++++----
 .../main/java/org/apache/doris/system/Backend.java |  2 +-
 .../java/org/apache/doris/system/HeartbeatMgr.java |  4 +++-
 .../cloud_p0/multi_cluster/test_auto_start.groovy  | 26 +++++++++++++++++++++-
 6 files changed, 69 insertions(+), 10 deletions(-)

diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java 
b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
index 968a307c842..282fbf3a7bc 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
@@ -3026,6 +3026,15 @@ public class Config extends ConfigBase {
     @ConfField(mutable = true, description = 
{"存算分离模式下,当tablet分布的be异常,是否立即映射tablet到新的be上,默认true"})
     public static boolean enable_immediate_be_assign = true;
 
+    @ConfField(mutable = true, description = {"存算分离模式下是否启用自动启停功能,默认true",
+        "Whether to enable the automatic start-stop feature in cloud model, 
default is true."})
+    public static boolean enable_auto_start_for_cloud_cluster = true;
+
+    @ConfField(mutable = true, description = 
{"存算分离模式下自动启停等待cluster唤醒退避重试次数,默认300次大约5分钟",
+        "The automatic start-stop wait time for cluster wake-up backoff retry 
count in the cloud "
+            + "model is set to 300 times, which is approximately 5 minutes by 
default."})
+    public static int auto_start_wait_to_resume_times = 300;
+
     // ATTN: DONOT add any config not related to cloud mode here
     // ATTN: DONOT add any config not related to cloud mode here
     // ATTN: DONOT add any config not related to cloud mode here
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java
 
b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java
index 567dc4b3124..0dfcf322a0c 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java
@@ -219,7 +219,19 @@ public class CloudClusterChecker extends MasterDaemon {
             if (LOG.isDebugEnabled()) {
                 LOG.debug("current cluster status {} {}", 
currentClusterStatus, newClusterStatus);
             }
-            if (!currentClusterStatus.equals(newClusterStatus)) {
+            boolean needChange = false;
+            // ATTN: found bug, In the same cluster, the cluster status in the 
tags of BE nodes is inconsistent.
+            // Using a set to collect the cluster statuses from the BE nodes.
+            Set<String> clusterStatusInMem = new HashSet<>();
+            for (Backend backend : currentBes) {
+                String beClusterStatus = 
backend.getTagMap().get(Tag.CLOUD_CLUSTER_STATUS);
+                clusterStatusInMem.add(beClusterStatus == null ? "NOT_SET" : 
beClusterStatus);
+            }
+            if (clusterStatusInMem.size() != 1) {
+                LOG.warn("cluster {}, multi be nodes cluster status 
inconsistent, fix it {}", cid, clusterStatusInMem);
+                needChange = true;
+            }
+            if (!currentClusterStatus.equals(newClusterStatus) || needChange) {
                 // cluster's status changed
                 LOG.info("cluster_status corresponding to cluster_id has been 
changed,"
                         + " cluster_id : {} , current_cluster_status : {}, 
new_cluster_status :{}",
@@ -426,8 +438,8 @@ public class CloudClusterChecker extends MasterDaemon {
             }
             return nodeMap;
         });
-        LOG.info("diffFrontends nodes: {}, current: {}, toAdd: {}, toDel: {}",
-                expectedFes, currentFes, toAdd, toDel);
+        LOG.info("diffFrontends nodes: {}, current: {}, toAdd: {}, toDel: {}, 
enable auto start: {}",
+                expectedFes, currentFes, toAdd, toDel, 
Config.enable_auto_start_for_cloud_cluster);
         if (toAdd.isEmpty() && toDel.isEmpty()) {
             if (LOG.isDebugEnabled()) {
                 LOG.debug("runAfterCatalogReady getObserverFes nothing todo");
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
 
b/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
index 606f52369e5..a91892870d6 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
@@ -55,6 +55,7 @@ import java.util.Comparator;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Optional;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.atomic.AtomicLong;
@@ -567,10 +568,18 @@ public class CloudSystemInfoService extends 
SystemInfoService {
         }
     }
 
+    public Set<String> getClusterStatus(List<Backend> backends) {
+        // ATTN: found bug, In the same cluster, the cluster status in the 
tags of BE nodes is inconsistent.
+        // Using a set to collect the cluster statuses from the BE nodes.
+        return 
backends.stream().map(Backend::getCloudClusterStatus).collect(Collectors.toSet());
+    }
+
     public String getCloudStatusByIdNoLock(final String clusterId) {
-        return clusterIdToBackend.getOrDefault(clusterId, new ArrayList<>())
-            .stream().map(Backend::getCloudClusterStatus).findFirst()
-            .orElse(String.valueOf(Cloud.ClusterStatus.UNKNOWN));
+        List<Backend> bes = clusterIdToBackend.getOrDefault(clusterId, new 
ArrayList<>());
+        Optional<String> hasNormal = 
bes.stream().map(Backend::getCloudClusterStatus)
+                .filter(status -> 
status.equals(String.valueOf(Cloud.ClusterStatus.NORMAL))).findAny();
+        return hasNormal.orElseGet(() -> 
bes.stream().map(Backend::getCloudClusterStatus).findFirst()
+            .orElse(String.valueOf(Cloud.ClusterStatus.NORMAL)));
     }
 
     public void updateClusterNameToId(final String newName,
@@ -949,6 +958,9 @@ public class CloudSystemInfoService extends 
SystemInfoService {
         if (Config.isNotCloudMode()) {
             return null;
         }
+        if (!Config.enable_auto_start_for_cloud_cluster) {
+            return null;
+        }
         clusterName = getClusterNameAutoStart(clusterName);
         if (Strings.isNullOrEmpty(clusterName)) {
             LOG.warn("auto start in cloud mode, but clusterName empty {}", 
clusterName);
@@ -999,7 +1011,7 @@ public class CloudSystemInfoService extends 
SystemInfoService {
             }
         }
         // wait 5 mins
-        int retryTimes = 5 * 60;
+        int retryTimes = Config.auto_start_wait_to_resume_times < 0 ? 300 : 
Config.auto_start_wait_to_resume_times;
         int retryTime = 0;
         StopWatch stopWatch = new StopWatch();
         stopWatch.start();
diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java 
b/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java
index 876e6ca40b4..01bf800e97e 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java
@@ -188,7 +188,7 @@ public class Backend implements Writable {
     }
 
     public String getCloudClusterStatus() {
-        return tagMap.getOrDefault(Tag.CLOUD_CLUSTER_STATUS, 
String.valueOf(Cloud.ClusterStatus.UNKNOWN));
+        return tagMap.getOrDefault(Tag.CLOUD_CLUSTER_STATUS, 
String.valueOf(Cloud.ClusterStatus.NORMAL));
     }
 
     public void setCloudClusterStatus(final String clusterStatus) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java 
b/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java
index 5dd8dd9fca1..ee85a242239 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java
@@ -99,7 +99,9 @@ public class HeartbeatMgr extends MasterDaemon {
             // Set cloud_instance_id and meta_service_endpoint even if there 
are empty
             // Be can knowns that fe is working in cloud mode.
             // Set the cloud instance ID for cloud deployment identification
-            tMasterInfo.setCloudInstanceId(Config.cloud_instance_id);
+            if (!Strings.isNullOrEmpty(Config.cloud_instance_id)) {
+                tMasterInfo.setCloudInstanceId(Config.cloud_instance_id);
+            }
             // Set the endpoint for the metadata service in cloud mode
             tMasterInfo.setMetaServiceEndpoint(Config.meta_service_endpoint);
         }
diff --git 
a/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy 
b/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy
index 2ce9a9d8f4b..d6db6364d38 100644
--- a/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy
+++ b/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy
@@ -22,7 +22,7 @@ import org.awaitility.Awaitility;
 import org.apache.doris.regression.util.Http
 import static java.util.concurrent.TimeUnit.SECONDS;
 
-suite('test_auto_start_in_cloud', 'multi_cluster') {
+suite('test_auto_start_in_cloud', 'multi_cluster, docker') {
     if (!isCloudMode()) {
         return;
     }
@@ -168,5 +168,29 @@ suite('test_auto_start_in_cloud', 'multi_cluster') {
 
         future1.get()
         future2.get()
+
+        tag = getCloudBeTagByName(clusterName)
+        logger.info("tag check = {}", tag) 
+        jsonObject = jsonSlurper.parseText(tag)
+        String cluster_status = jsonObject.cloud_cluster_status
+        assertEquals("NORMAL", cluster_status)
+
+        // add 1 nodes, check it status NORMAL
+        cluster.addBackend(1, null)
+        dockerAwaitUntil(5) {
+            result = sql """SHOW BACKENDS"""
+            result.size() == 4
+        }
+
+        def bes = sql_return_maparray "SHOW BACKENDS"
+        bes.each {
+            tag = it.Tag
+            if (!tag.contains(clusterName)) {
+                return
+            }
+            jsonObject = jsonSlurper.parseText(tag)
+            cluster_status = jsonObject.cloud_cluster_status
+            assertEquals("NORMAL", cluster_status)
+        }
     }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to