This is an automated email from the ASF dual-hosted git repository. gavinchou pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new a4a9e801cf4 [Fix](cloud) Fix cluster status inconsistent with bes and add config disable auto (#40799) a4a9e801cf4 is described below commit a4a9e801cf477746cb3665eaa904d8ea20ceee6c Author: deardeng <565620...@qq.com> AuthorDate: Sat Sep 14 19:37:54 2024 +0800 [Fix](cloud) Fix cluster status inconsistent with bes and add config disable auto (#40799) 1. add switch for auto start in cloud 2. fix inconsistent with cluster status in be tag when add be node 3. fix docker compose, `[INVALID_ARGUMENT]cloud_instance_id in fe.conf and be.conf are not same, fe: , be: reg_cloud_instance` --- .../main/java/org/apache/doris/common/Config.java | 9 ++++++++ .../doris/cloud/catalog/CloudClusterChecker.java | 18 ++++++++++++--- .../doris/cloud/system/CloudSystemInfoService.java | 20 +++++++++++++---- .../main/java/org/apache/doris/system/Backend.java | 2 +- .../java/org/apache/doris/system/HeartbeatMgr.java | 4 +++- .../cloud_p0/multi_cluster/test_auto_start.groovy | 26 +++++++++++++++++++++- 6 files changed, 69 insertions(+), 10 deletions(-) diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index 968a307c842..282fbf3a7bc 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -3026,6 +3026,15 @@ public class Config extends ConfigBase { @ConfField(mutable = true, description = {"存算分离模式下,当tablet分布的be异常,是否立即映射tablet到新的be上,默认true"}) public static boolean enable_immediate_be_assign = true; + @ConfField(mutable = true, description = {"存算分离模式下是否启用自动启停功能,默认true", + "Whether to enable the automatic start-stop feature in cloud model, default is true."}) + public static boolean enable_auto_start_for_cloud_cluster = true; + + @ConfField(mutable = true, description = {"存算分离模式下自动启停等待cluster唤醒退避重试次数,默认300次大约5分钟", + "The automatic start-stop wait time for cluster wake-up backoff retry count in the cloud " + + "model is set to 300 times, which is approximately 5 minutes by default."}) + public static int auto_start_wait_to_resume_times = 300; + // ATTN: DONOT add any config not related to cloud mode here // ATTN: DONOT add any config not related to cloud mode here // ATTN: DONOT add any config not related to cloud mode here diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java index 567dc4b3124..0dfcf322a0c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java @@ -219,7 +219,19 @@ public class CloudClusterChecker extends MasterDaemon { if (LOG.isDebugEnabled()) { LOG.debug("current cluster status {} {}", currentClusterStatus, newClusterStatus); } - if (!currentClusterStatus.equals(newClusterStatus)) { + boolean needChange = false; + // ATTN: found bug, In the same cluster, the cluster status in the tags of BE nodes is inconsistent. + // Using a set to collect the cluster statuses from the BE nodes. + Set<String> clusterStatusInMem = new HashSet<>(); + for (Backend backend : currentBes) { + String beClusterStatus = backend.getTagMap().get(Tag.CLOUD_CLUSTER_STATUS); + clusterStatusInMem.add(beClusterStatus == null ? "NOT_SET" : beClusterStatus); + } + if (clusterStatusInMem.size() != 1) { + LOG.warn("cluster {}, multi be nodes cluster status inconsistent, fix it {}", cid, clusterStatusInMem); + needChange = true; + } + if (!currentClusterStatus.equals(newClusterStatus) || needChange) { // cluster's status changed LOG.info("cluster_status corresponding to cluster_id has been changed," + " cluster_id : {} , current_cluster_status : {}, new_cluster_status :{}", @@ -426,8 +438,8 @@ public class CloudClusterChecker extends MasterDaemon { } return nodeMap; }); - LOG.info("diffFrontends nodes: {}, current: {}, toAdd: {}, toDel: {}", - expectedFes, currentFes, toAdd, toDel); + LOG.info("diffFrontends nodes: {}, current: {}, toAdd: {}, toDel: {}, enable auto start: {}", + expectedFes, currentFes, toAdd, toDel, Config.enable_auto_start_for_cloud_cluster); if (toAdd.isEmpty() && toDel.isEmpty()) { if (LOG.isDebugEnabled()) { LOG.debug("runAfterCatalogReady getObserverFes nothing todo"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java index 606f52369e5..a91892870d6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java @@ -55,6 +55,7 @@ import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicLong; @@ -567,10 +568,18 @@ public class CloudSystemInfoService extends SystemInfoService { } } + public Set<String> getClusterStatus(List<Backend> backends) { + // ATTN: found bug, In the same cluster, the cluster status in the tags of BE nodes is inconsistent. + // Using a set to collect the cluster statuses from the BE nodes. + return backends.stream().map(Backend::getCloudClusterStatus).collect(Collectors.toSet()); + } + public String getCloudStatusByIdNoLock(final String clusterId) { - return clusterIdToBackend.getOrDefault(clusterId, new ArrayList<>()) - .stream().map(Backend::getCloudClusterStatus).findFirst() - .orElse(String.valueOf(Cloud.ClusterStatus.UNKNOWN)); + List<Backend> bes = clusterIdToBackend.getOrDefault(clusterId, new ArrayList<>()); + Optional<String> hasNormal = bes.stream().map(Backend::getCloudClusterStatus) + .filter(status -> status.equals(String.valueOf(Cloud.ClusterStatus.NORMAL))).findAny(); + return hasNormal.orElseGet(() -> bes.stream().map(Backend::getCloudClusterStatus).findFirst() + .orElse(String.valueOf(Cloud.ClusterStatus.NORMAL))); } public void updateClusterNameToId(final String newName, @@ -949,6 +958,9 @@ public class CloudSystemInfoService extends SystemInfoService { if (Config.isNotCloudMode()) { return null; } + if (!Config.enable_auto_start_for_cloud_cluster) { + return null; + } clusterName = getClusterNameAutoStart(clusterName); if (Strings.isNullOrEmpty(clusterName)) { LOG.warn("auto start in cloud mode, but clusterName empty {}", clusterName); @@ -999,7 +1011,7 @@ public class CloudSystemInfoService extends SystemInfoService { } } // wait 5 mins - int retryTimes = 5 * 60; + int retryTimes = Config.auto_start_wait_to_resume_times < 0 ? 300 : Config.auto_start_wait_to_resume_times; int retryTime = 0; StopWatch stopWatch = new StopWatch(); stopWatch.start(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java b/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java index 876e6ca40b4..01bf800e97e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java +++ b/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java @@ -188,7 +188,7 @@ public class Backend implements Writable { } public String getCloudClusterStatus() { - return tagMap.getOrDefault(Tag.CLOUD_CLUSTER_STATUS, String.valueOf(Cloud.ClusterStatus.UNKNOWN)); + return tagMap.getOrDefault(Tag.CLOUD_CLUSTER_STATUS, String.valueOf(Cloud.ClusterStatus.NORMAL)); } public void setCloudClusterStatus(final String clusterStatus) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java b/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java index 5dd8dd9fca1..ee85a242239 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java @@ -99,7 +99,9 @@ public class HeartbeatMgr extends MasterDaemon { // Set cloud_instance_id and meta_service_endpoint even if there are empty // Be can knowns that fe is working in cloud mode. // Set the cloud instance ID for cloud deployment identification - tMasterInfo.setCloudInstanceId(Config.cloud_instance_id); + if (!Strings.isNullOrEmpty(Config.cloud_instance_id)) { + tMasterInfo.setCloudInstanceId(Config.cloud_instance_id); + } // Set the endpoint for the metadata service in cloud mode tMasterInfo.setMetaServiceEndpoint(Config.meta_service_endpoint); } diff --git a/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy b/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy index 2ce9a9d8f4b..d6db6364d38 100644 --- a/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy +++ b/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy @@ -22,7 +22,7 @@ import org.awaitility.Awaitility; import org.apache.doris.regression.util.Http import static java.util.concurrent.TimeUnit.SECONDS; -suite('test_auto_start_in_cloud', 'multi_cluster') { +suite('test_auto_start_in_cloud', 'multi_cluster, docker') { if (!isCloudMode()) { return; } @@ -168,5 +168,29 @@ suite('test_auto_start_in_cloud', 'multi_cluster') { future1.get() future2.get() + + tag = getCloudBeTagByName(clusterName) + logger.info("tag check = {}", tag) + jsonObject = jsonSlurper.parseText(tag) + String cluster_status = jsonObject.cloud_cluster_status + assertEquals("NORMAL", cluster_status) + + // add 1 nodes, check it status NORMAL + cluster.addBackend(1, null) + dockerAwaitUntil(5) { + result = sql """SHOW BACKENDS""" + result.size() == 4 + } + + def bes = sql_return_maparray "SHOW BACKENDS" + bes.each { + tag = it.Tag + if (!tag.contains(clusterName)) { + return + } + jsonObject = jsonSlurper.parseText(tag) + cluster_status = jsonObject.cloud_cluster_status + assertEquals("NORMAL", cluster_status) + } } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org