This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 6f34e43b20a branch-3.0: [fix](sc) retry on network error #54419 (#54488)
6f34e43b20a is described below
commit 6f34e43b20af2e92a031edd4988d57df9c11a3cd
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Tue Aug 12 10:50:19 2025 +0800
branch-3.0: [fix](sc) retry on network error #54419 (#54488)
Cherry-picked from #54419
Co-authored-by: Yongqiang YANG <[email protected]>
Co-authored-by: Yongqiang YANG <[email protected]>
---
.../src/main/java/org/apache/doris/common/Config.java | 10 +++++-----
.../src/main/java/org/apache/doris/alter/AlterJobV2.java | 11 +++++++++++
.../src/main/java/org/apache/doris/alter/RollupJobV2.java | 9 +--------
.../main/java/org/apache/doris/alter/SchemaChangeJobV2.java | 13 +++----------
.../src/main/java/org/apache/doris/task/AgentBatchTask.java | 4 ++++
.../test_schema_change_with_compaction11.groovy | 3 ++-
6 files changed, 26 insertions(+), 24 deletions(-)
diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
index 94728fe3f9d..51240074645 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
@@ -3295,12 +3295,12 @@ public class Config extends ConfigBase {
"Maximal concurrent num of get tablet stat job."})
public static int max_get_tablet_stat_task_threads_num = 4;
- @ConfField(mutable = true, description = {"存算分离模式下schema change失败是否重试",
- "Whether to enable retry when schema change failed in cloud model, default is true."})
- public static boolean enable_schema_change_retry_in_cloud_mode = true;
+ @ConfField(mutable = true, description = {"schema change job 失败是否重试",
+ "Whether to enable retry when a schema change job fails, default is true."})
+ public static boolean enable_schema_change_retry = true;
- @ConfField(mutable = true, description = {"存算分离模式下schema change重试次数",
- "Max retry times when schema change failed in cloud model, default is 3."})
+ @ConfField(mutable = true, description = {"schema change job 重试次数",
+ "Max retry times when a schema change job fails, default is 3."})
public static int schema_change_max_retry_time = 3;
@ConfField(mutable = true, description = {"是否允许使用ShowCacheHotSpotStmt语句",
diff --git a/fe/fe-core/src/main/java/org/apache/doris/alter/AlterJobV2.java b/fe/fe-core/src/main/java/org/apache/doris/alter/AlterJobV2.java
index b1ccf230526..d13cacae06d 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/alter/AlterJobV2.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/alter/AlterJobV2.java
@@ -30,6 +30,7 @@ import org.apache.doris.common.util.DebugPointUtil;
import org.apache.doris.persist.gson.GsonUtils;
import org.apache.doris.qe.ConnectContext;
import org.apache.doris.task.AgentTask;
+import org.apache.doris.thrift.TStatusCode;
import com.google.common.base.Strings;
import com.google.common.collect.Maps;
@@ -263,6 +264,16 @@ public abstract class AlterJobV2 implements Writable {
return cancelImpl(errMsg);
}
+ protected int getRetryTimes(AgentTask task) {
+ int maxFailedTimes = 0;
+ if (Config.enable_schema_change_retry && task.getErrorCode() != null
+ && (task.getErrorCode().equals(TStatusCode.DELETE_BITMAP_LOCK_ERROR)
+ || task.getErrorCode().equals(TStatusCode.NETWORK_ERROR))) {
+ maxFailedTimes = Config.schema_change_max_retry_time;
+ }
+ return maxFailedTimes;
+ }
+
/**
* should be call before executing the job.
* return false if table is not stable.
diff --git a/fe/fe-core/src/main/java/org/apache/doris/alter/RollupJobV2.java b/fe/fe-core/src/main/java/org/apache/doris/alter/RollupJobV2.java
index e5d97d1f97b..f9a3ba16190 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/alter/RollupJobV2.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/alter/RollupJobV2.java
@@ -64,7 +64,6 @@ import org.apache.doris.task.AgentTaskExecutor;
import org.apache.doris.task.AgentTaskQueue;
import org.apache.doris.task.AlterReplicaTask;
import org.apache.doris.task.CreateReplicaTask;
-import org.apache.doris.thrift.TStatusCode;
import org.apache.doris.thrift.TStorageFormat;
import org.apache.doris.thrift.TStorageMedium;
import org.apache.doris.thrift.TStorageType;
@@ -543,13 +542,7 @@ public class RollupJobV2 extends AlterJobV2 implements GsonPostProcessable {
List<AgentTask> tasks = rollupBatchTask.getUnfinishedTasks(2000);
ensureCloudClusterExist(tasks);
for (AgentTask task : tasks) {
- int maxFailedTimes = 0;
- if (Config.isCloudMode() && Config.enable_schema_change_retry_in_cloud_mode) {
- if (task.getErrorCode() != null && task.getErrorCode()
- .equals(TStatusCode.DELETE_BITMAP_LOCK_ERROR)) {
- maxFailedTimes = Config.schema_change_max_retry_time;
- }
- }
+ int maxFailedTimes = getRetryTimes(task);
if (task.getFailedTimes() > maxFailedTimes) {
task.setFinished(true);
AgentTaskQueue.removeTask(task.getBackendId(), TTaskType.ALTER, task.getSignature());
diff --git a/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeJobV2.java b/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeJobV2.java
index 28d58e4f6fa..2346cdea92a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeJobV2.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeJobV2.java
@@ -57,7 +57,6 @@ import org.apache.doris.task.AgentTaskExecutor;
import org.apache.doris.task.AgentTaskQueue;
import org.apache.doris.task.AlterReplicaTask;
import org.apache.doris.task.CreateReplicaTask;
-import org.apache.doris.thrift.TStatusCode;
import org.apache.doris.thrift.TStorageFormat;
import org.apache.doris.thrift.TStorageMedium;
import org.apache.doris.thrift.TStorageType;
@@ -587,19 +586,13 @@ public class SchemaChangeJobV2 extends AlterJobV2 {
List<AgentTask> tasks = schemaChangeBatchTask.getUnfinishedTasks(2000);
ensureCloudClusterExist(tasks);
for (AgentTask task : tasks) {
- int maxFailedTimes = 0;
- if (Config.isCloudMode() && Config.enable_schema_change_retry_in_cloud_mode) {
- if (task.getErrorCode() != null && task.getErrorCode()
- .equals(TStatusCode.DELETE_BITMAP_LOCK_ERROR)) {
- maxFailedTimes = Config.schema_change_max_retry_time;
- }
- }
+ int maxFailedTimes = getRetryTimes(task);
if (task.getFailedTimes() > maxFailedTimes) {
task.setFinished(true);
if (!FeConstants.runningUnitTest) {
AgentTaskQueue.removeTask(task.getBackendId(), TTaskType.ALTER, task.getSignature());
- LOG.warn("schema change task failed, failedTimes: {}, maxFailedTimes: {}, err: {}",
- task.getFailedTimes(), maxFailedTimes, task.getErrorMsg());
+ LOG.warn("schema change task failed, job: {}, failedTimes: {}, maxFailedTimes: {}, err: {}",
+ jobId, task.getFailedTimes(), maxFailedTimes, task.getErrorMsg());
List<Long> failedBackends = failedTabletBackends.get(task.getTabletId());
if (failedBackends == null) {
failedBackends = Lists.newArrayList();
diff --git a/fe/fe-core/src/main/java/org/apache/doris/task/AgentBatchTask.java b/fe/fe-core/src/main/java/org/apache/doris/task/AgentBatchTask.java
index ead255ace36..a821aa143ef 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/task/AgentBatchTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/task/AgentBatchTask.java
@@ -49,6 +49,7 @@ import org.apache.doris.thrift.TPushReq;
import org.apache.doris.thrift.TPushStoragePolicyReq;
import org.apache.doris.thrift.TReleaseSnapshotRequest;
import org.apache.doris.thrift.TSnapshotRequest;
+import org.apache.doris.thrift.TStatusCode;
import org.apache.doris.thrift.TStorageMediumMigrateReq;
import org.apache.doris.thrift.TTaskType;
import org.apache.doris.thrift.TUpdateTabletMetaInfoReq;
@@ -222,6 +223,9 @@ public class AgentBatchTask implements Runnable {
List<AgentTask> tasks = this.backendIdToTasks.get(backendId);
for (AgentTask task : tasks) {
task.failedWithMsg(errMsg);
+ if (errMsg.contains("Socket is closed")) {
+ task.setErrorCode(TStatusCode.NETWORK_ERROR);
+ }
}
}
}
diff --git a/regression-test/suites/cloud_p0/schema_change/compaction11/test_schema_change_with_compaction11.groovy b/regression-test/suites/cloud_p0/schema_change/compaction11/test_schema_change_with_compaction11.groovy
index 4c1c772da37..0e765e46b7b 100644
--- a/regression-test/suites/cloud_p0/schema_change/compaction11/test_schema_change_with_compaction11.groovy
+++ b/regression-test/suites/cloud_p0/schema_change/compaction11/test_schema_change_with_compaction11.groovy
@@ -144,8 +144,9 @@ suite('test_schema_change_with_compaction11', 'docker') {
GetDebugPoint().disableDebugPointForAllBEs(injectName)
}
int max_try_time = 3000
+ def result = null
while (max_try_time--){
- def result = getJobState("date")
+ result = getJobState("date")
if (result == "FINISHED" || result == "CANCELLED") {
sleep(3000)
break
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]