This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.1 by this push:
new e99be9f98ba [improvement](fe) Add virtual compute group switch metric
#63036 (#64386)
e99be9f98ba is described below
commit e99be9f98baa8c0bdcb2d4922cbe4f3d93d91820
Author: Luwei <[email protected]>
AuthorDate: Wed Jun 17 16:55:54 2026 +0800
[improvement](fe) Add virtual compute group switch metric #63036 (#64386)
pick #63036
Problem Summary: Add an FE cloud metric that records virtual compute
group active-standby switch events. The metric is keyed by the virtual,
source, and destination compute group ids rather than their names, so a
compute group rename updates the exposed name labels in place without
leaving a stale series under the old name.
Prometheus output example:
```text
doris_fe_virtual_compute_group_switch_total{virtual_compute_group_id="id1",virtual_compute_group_name="v_group_1",src_compute_group_id="id2",src_compute_group_name="p_group_1",dst_compute_group_id="id3",dst_compute_group_name="p_group_2"} 1
```
The metric value is the accumulated switch count for the labeled virtual
compute group switch path.
Add FE metric doris_fe_virtual_compute_group_switch_total for virtual
compute group active-standby switches.
- Test:
- Unit Test: bash run-fe-ut.sh --run
org.apache.doris.cloud.system.CloudSystemInfoServiceTest
- Unit Test: bash run-fe-ut.sh --run org.apache.doris.metric.MetricsTest
- Manual test: git diff --check
- FE checkstyle: bash -lc "export DORIS_HOME=$PWD && source env.sh && cd
fe && ${MVN_CMD} -pl fe-core -DskipTests checkstyle:check"
- Behavior changed: Yes. Add a new FE metric for virtual compute group
active-standby switches.
- Does this need documentation: No
---
.../doris/cloud/system/CloudSystemInfoService.java | 3 +
.../java/org/apache/doris/metric/CloudMetrics.java | 5 ++
.../java/org/apache/doris/metric/MetricRepo.java | 39 +++++++++++++
.../cloud/system/CloudSystemInfoServiceTest.java | 66 ++++++++++++++++++++++
.../java/org/apache/doris/metric/MetricsTest.java | 36 ++++++++++++
5 files changed, 149 insertions(+)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
b/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
index a60d5d46e3f..5660d9697e7 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
@@ -966,6 +966,9 @@ public class CloudSystemInfoService extends
SystemInfoService {
if (acg == null || System.currentTimeMillis() -
acg.getUnavailableSince()
> policy.getFailoverFailureThreshold() *
Config.heartbeat_interval_second * 1000) {
switchActiveStandby(cg, acgName, scgName);
+ String acgId = acg == null ?
clusterNameToId.get(acgName) : acg.getId();
+
MetricRepo.increaseVirtualComputeGroupSwitch(cg.getId(), cg.getName(), acgId,
+ acgName, scg.getId(), scgName);
policy.setActiveComputeGroup(scgName);
policy.setStandbyComputeGroup(acgName);
cg.setNeedRebuildFileCache(true);
diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/CloudMetrics.java
b/fe/fe-core/src/main/java/org/apache/doris/metric/CloudMetrics.java
index a14f988cf63..7076a036967 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/metric/CloudMetrics.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/metric/CloudMetrics.java
@@ -66,6 +66,7 @@ public class CloudMetrics {
protected static AutoMappedMetric<LongCounterMetric>
CLUSTER_CLOUD_GLOBAL_BALANCE_NUM;
protected static AutoMappedMetric<LongCounterMetric>
CLUSTER_CLOUD_SMOOTH_UPGRADE_BALANCE_NUM;
protected static AutoMappedMetric<LongCounterMetric>
CLUSTER_CLOUD_WARM_UP_CACHE_BALANCE_NUM;
+ protected static AutoMappedMetric<LongCounterMetric>
VIRTUAL_COMPUTE_GROUP_SWITCH_COUNTER;
protected static void init() {
if (Config.isNotCloudMode()) {
@@ -179,5 +180,9 @@ public class CloudMetrics {
CLUSTER_CLOUD_WARM_UP_CACHE_BALANCE_NUM = new AutoMappedMetric<>(name
-> new LongCounterMetric(
"cloud_warm_up_balance_num", MetricUnit.NOUNIT,
"current cluster cloud warm up cache sync edit log number"));
+
+ VIRTUAL_COMPUTE_GROUP_SWITCH_COUNTER = new AutoMappedMetric<>(name ->
new LongCounterMetric(
+ "virtual_compute_group_switch_total", MetricUnit.NOUNIT,
+ "virtual compute group active standby switch count"));
}
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java
b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java
index cacb8e5adb7..f02b8141f93 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java
@@ -1914,6 +1914,33 @@ public final class MetricRepo {
CloudMetrics.CLUSTER_QUERY_LATENCY_HISTO.getOrAdd(key).update(elapseMs);
}
+ public static void increaseVirtualComputeGroupSwitch(String
virtualComputeGroupId, String virtualComputeGroupName,
+ String
srcComputeGroupId, String srcComputeGroupName,
+ String
dstComputeGroupId, String dstComputeGroupName) {
+ if (!MetricRepo.isInit || Config.isNotCloudMode() ||
Strings.isNullOrEmpty(virtualComputeGroupId)
+ || Strings.isNullOrEmpty(virtualComputeGroupName) ||
Strings.isNullOrEmpty(srcComputeGroupId)
+ || Strings.isNullOrEmpty(srcComputeGroupName) ||
Strings.isNullOrEmpty(dstComputeGroupId)
+ || Strings.isNullOrEmpty(dstComputeGroupName)) {
+ return;
+ }
+ String key = virtualComputeGroupId +
CloudMetrics.CLOUD_CLUSTER_DELIMITER + srcComputeGroupId
+ + CloudMetrics.CLOUD_CLUSTER_DELIMITER + dstComputeGroupId;
+ LongCounterMetric counter =
CloudMetrics.VIRTUAL_COMPUTE_GROUP_SWITCH_COUNTER.getOrAdd(key);
+ List<MetricLabel> labels = new ArrayList<>();
+ counter.increase(1L);
+ labels.add(new MetricLabel("virtual_compute_group_id",
virtualComputeGroupId));
+ labels.add(new MetricLabel("virtual_compute_group_name",
virtualComputeGroupName));
+ labels.add(new MetricLabel("src_compute_group_id", srcComputeGroupId));
+ labels.add(new MetricLabel("src_compute_group_name",
srcComputeGroupName));
+ labels.add(new MetricLabel("dst_compute_group_id", dstComputeGroupId));
+ labels.add(new MetricLabel("dst_compute_group_name",
dstComputeGroupName));
+ if (!counter.getLabels().isEmpty()) {
+
MetricRepo.DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(counter.getName(),
counter.getLabels());
+ }
+ counter.setLabels(labels);
+ MetricRepo.DORIS_METRIC_REGISTER.addMetrics(counter);
+ }
+
public static void unregisterCloudMetrics(String clusterId, String
clusterName, List<Backend> backends) {
if (!MetricRepo.isInit || Config.isNotCloudMode() ||
Strings.isNullOrEmpty(clusterId)) {
return;
@@ -1968,6 +1995,18 @@ public final class MetricRepo {
+ clusterName);
// Meta-service RPC latency is keyed by method name only, so it is
not removed by cluster.
+ String delimiter = CloudMetrics.CLOUD_CLUSTER_DELIMITER;
+ for (String key : new ArrayList<>(
+
CloudMetrics.VIRTUAL_COMPUTE_GROUP_SWITCH_COUNTER.getMetrics().keySet())) {
+ if (key.startsWith(clusterId + delimiter) ||
key.contains(delimiter + clusterId + delimiter)
+ || key.endsWith(delimiter + clusterId)) {
+ LongCounterMetric switchCounter =
CloudMetrics.VIRTUAL_COMPUTE_GROUP_SWITCH_COUNTER.getOrAdd(key);
+
CloudMetrics.VIRTUAL_COMPUTE_GROUP_SWITCH_COUNTER.remove(key);
+ MetricRepo.DORIS_METRIC_REGISTER
+ .removeMetricsByNameAndLabels(switchCounter.getName(),
switchCounter.getLabels());
+ }
+ }
+
for (Backend backend : backends) {
List<MetricLabel> backendLabels = new ArrayList<>();
backendLabels.add(new MetricLabel("cluster_id", clusterId));
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/cloud/system/CloudSystemInfoServiceTest.java
b/fe/fe-core/src/test/java/org/apache/doris/cloud/system/CloudSystemInfoServiceTest.java
index 11b288dfc2a..83f62b351ee 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/cloud/system/CloudSystemInfoServiceTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/cloud/system/CloudSystemInfoServiceTest.java
@@ -19,8 +19,12 @@ package org.apache.doris.cloud.system;
import org.apache.doris.analysis.UserIdentity;
import org.apache.doris.catalog.Env;
+import org.apache.doris.cloud.catalog.CloudEnv;
import org.apache.doris.cloud.catalog.ComputeGroup;
+import org.apache.doris.cloud.proto.Cloud;
+import org.apache.doris.cloud.rpc.MetaServiceProxy;
import org.apache.doris.common.Config;
+import org.apache.doris.metric.MetricRepo;
import org.apache.doris.qe.ConnectContext;
import org.apache.doris.resource.Tag;
import org.apache.doris.system.Backend;
@@ -28,6 +32,8 @@ import org.apache.doris.system.Backend;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.mockito.MockedStatic;
+import org.mockito.Mockito;
import java.util.ArrayList;
import java.util.List;
@@ -40,6 +46,7 @@ public class CloudSystemInfoServiceTest {
public void setUp() {
// Enable cloud mode for testing
Config.cloud_unique_id = "test_cloud_unique_id";
+ Config.meta_service_endpoint = "127.0.0.1:5000";
}
@Test
@@ -267,6 +274,65 @@ public class CloudSystemInfoServiceTest {
Assert.assertEquals(pcgName2, res);
}
+ @Test
+ public void testGetPhysicalClusterSwitchActiveStandbyMetric() throws
Exception {
+ infoService = new CloudSystemInfoService();
+
+ String vcgName = "v_cluster_1";
+ String vcgId = "id1";
+ String pcgName1 = "p_cluster_1";
+ String pcgName2 = "p_cluster_2";
+
+ ComputeGroup vcg = new ComputeGroup(vcgId, vcgName,
ComputeGroup.ComputeTypeEnum.VIRTUAL);
+ ComputeGroup.Policy policy = new ComputeGroup.Policy();
+ policy.setActiveComputeGroup(pcgName1);
+ policy.setStandbyComputeGroup(pcgName2);
+ policy.setUnhealthyNodeThresholdPercent(100);
+ vcg.setPolicy(policy);
+
+ ComputeGroup pcg2 = new ComputeGroup("id3", pcgName2,
ComputeGroup.ComputeTypeEnum.COMPUTE);
+ infoService.addComputeGroup(vcgId, vcg);
+ infoService.clusterNameToId.put(pcgName1, "id2");
+ infoService.addComputeGroup("id3", pcg2);
+
+ List<Backend> toAdd2 = new ArrayList<>();
+ for (int i = 0; i < 3; ++i) {
+ Backend b = new Backend(i + 4, "", i);
+ Map<String, String> newTagMap = Tag.DEFAULT_BACKEND_TAG.toMap();
+ newTagMap.put(Tag.CLOUD_CLUSTER_NAME, pcgName2);
+ newTagMap.put(Tag.CLOUD_CLUSTER_ID, "id3");
+ b.setTagMap(newTagMap);
+ b.setAlive(true);
+ toAdd2.add(b);
+ }
+ infoService.updateCloudClusterMapNoLock(toAdd2, new ArrayList<>());
+ Assert.assertNull(infoService.getComputeGroupByName(pcgName1));
+ Assert.assertTrue(infoService.isComputeGroupAvailable(pcgName2,
policy.getUnhealthyNodeThresholdPercent()));
+
+ CloudEnv cloudEnv = Mockito.mock(CloudEnv.class);
+ Mockito.when(cloudEnv.getCloudInstanceId()).thenReturn("instance_id");
+ MetaServiceProxy metaServiceProxy =
Mockito.mock(MetaServiceProxy.class);
+ Cloud.AlterClusterResponse response =
Cloud.AlterClusterResponse.newBuilder()
+ .setStatus(Cloud.MetaServiceResponseStatus.newBuilder()
+ .setCode(Cloud.MetaServiceCode.OK)
+ .setMsg("OK"))
+ .build();
+
Mockito.when(metaServiceProxy.alterCluster(Mockito.any())).thenReturn(response);
+
+ try (MockedStatic<Env> mockedEnv = Mockito.mockStatic(Env.class);
+ MockedStatic<MetaServiceProxy> mockedMetaServiceProxy =
Mockito.mockStatic(MetaServiceProxy.class);
+ MockedStatic<MetricRepo> mockedMetricRepo =
Mockito.mockStatic(MetricRepo.class)) {
+ mockedEnv.when(Env::getCurrentEnv).thenReturn(cloudEnv);
+
mockedMetaServiceProxy.when(MetaServiceProxy::getInstance).thenReturn(metaServiceProxy);
+
+ String res = infoService.getPhysicalCluster(vcgName);
+
+ Assert.assertEquals(pcgName2, res);
+ mockedMetricRepo.verify(() ->
+ MetricRepo.increaseVirtualComputeGroupSwitch(vcgId,
vcgName, "id2", pcgName1, "id3", pcgName2));
+ }
+ }
+
// active has 1 alive be and 2 dead be, standby has 3 alive be
@Test
public void testGetPhysicalClusterActive1AliveBe2DeadBe() {
diff --git a/fe/fe-core/src/test/java/org/apache/doris/metric/MetricsTest.java
b/fe/fe-core/src/test/java/org/apache/doris/metric/MetricsTest.java
index b9994bb5706..164d767b66e 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/metric/MetricsTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/metric/MetricsTest.java
@@ -17,6 +17,7 @@
package org.apache.doris.metric;
+import org.apache.doris.common.Config;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.util.JsonUtil;
import org.apache.doris.metric.Metric.MetricUnit;
@@ -160,6 +161,41 @@ public class MetricsTest {
Assert.assertFalse(prometheusResult.contains("doris_fe_disabled_latency_ms"));
}
+ @Test
+ public void testVirtualComputeGroupSwitchMetricRename() {
+ String originCloudUniqueId = Config.cloud_unique_id;
+ AutoMappedMetric<LongCounterMetric> originMetric =
CloudMetrics.VIRTUAL_COMPUTE_GROUP_SWITCH_COUNTER;
+ try {
+ Config.cloud_unique_id = "test_cloud_unique_id";
+ CloudMetrics.VIRTUAL_COMPUTE_GROUP_SWITCH_COUNTER = new
AutoMappedMetric<>(name -> new LongCounterMetric(
+ "virtual_compute_group_switch_total", MetricUnit.NOUNIT,
+ "virtual compute group active standby switch count"));
+
+ MetricRepo.increaseVirtualComputeGroupSwitch("virtual_id",
"virtual_name",
+ "src_id", "src_old_name", "dst_id", "dst_name");
+ MetricRepo.increaseVirtualComputeGroupSwitch("virtual_id",
"virtual_name",
+ "src_id", "src_new_name", "dst_id", "dst_name");
+
+ MetricVisitor visitor = new PrometheusMetricVisitor();
+ MetricRepo.DORIS_METRIC_REGISTER.accept(visitor);
+ String metricResult = visitor.finish();
+ Assert.assertTrue(metricResult.contains("# TYPE
doris_fe_virtual_compute_group_switch_total counter"));
+
Assert.assertTrue(metricResult.contains("src_compute_group_name=\"src_new_name\""));
+
Assert.assertTrue(metricResult.contains("doris_fe_virtual_compute_group_switch_total"
+ + "{virtual_compute_group_id=\"virtual_id\",
virtual_compute_group_name=\"virtual_name\", "
+ + "src_compute_group_id=\"src_id\",
src_compute_group_name=\"src_new_name\", "
+ + "dst_compute_group_id=\"dst_id\",
dst_compute_group_name=\"dst_name\"} 2"));
+
Assert.assertFalse(metricResult.contains("src_compute_group_name=\"src_old_name\""));
+ } finally {
+
MetricRepo.DORIS_METRIC_REGISTER.removeMetrics("virtual_compute_group_switch_total");
+ if (CloudMetrics.VIRTUAL_COMPUTE_GROUP_SWITCH_COUNTER != null) {
+
CloudMetrics.VIRTUAL_COMPUTE_GROUP_SWITCH_COUNTER.getMetrics().clear();
+ }
+ CloudMetrics.VIRTUAL_COMPUTE_GROUP_SWITCH_COUNTER = originMetric;
+ Config.cloud_unique_id = originCloudUniqueId;
+ }
+ }
+
@Test
public void testGc() {
PrometheusMetricVisitor visitor = new PrometheusMetricVisitor();
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]