This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 6f34e43b20a branch-3.0: [fix](sc) retry on network error #54419 
(#54488)
6f34e43b20a is described below

commit 6f34e43b20af2e92a031edd4988d57df9c11a3cd
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Tue Aug 12 10:50:19 2025 +0800

    branch-3.0: [fix](sc) retry on network error #54419 (#54488)
    
    Cherry-picked from #54419
    
    Co-authored-by: Yongqiang YANG <[email protected]>
    Co-authored-by: Yongqiang YANG <[email protected]>
---
 .../src/main/java/org/apache/doris/common/Config.java       | 10 +++++-----
 .../src/main/java/org/apache/doris/alter/AlterJobV2.java    | 11 +++++++++++
 .../src/main/java/org/apache/doris/alter/RollupJobV2.java   |  9 +--------
 .../main/java/org/apache/doris/alter/SchemaChangeJobV2.java | 13 +++----------
 .../src/main/java/org/apache/doris/task/AgentBatchTask.java |  4 ++++
 .../test_schema_change_with_compaction11.groovy             |  3 ++-
 6 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java 
b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
index 94728fe3f9d..51240074645 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
@@ -3295,12 +3295,12 @@ public class Config extends ConfigBase {
         "Maximal concurrent num of get tablet stat job."})
     public static int max_get_tablet_stat_task_threads_num = 4;
 
-    @ConfField(mutable = true, description = {"存算分离模式下schema change失败是否重试",
-            "Whether to enable retry when schema change failed in cloud model, 
default is true."})
-    public static boolean enable_schema_change_retry_in_cloud_mode = true;
+    @ConfField(mutable = true, description = {"schema change job 失败是否重试",
+            "Whether to enable retry when a schema change job fails, default 
is true."})
+    public static boolean enable_schema_change_retry = true;
 
-    @ConfField(mutable = true, description = {"存算分离模式下schema change重试次数",
-            "Max retry times when schema change failed in cloud model, default 
is 3."})
+    @ConfField(mutable = true, description = {"schema change job 重试次数",
+            "Max retry times when a schema change job fails, default is 3."})
     public static int schema_change_max_retry_time = 3;
 
     @ConfField(mutable = true, description = {"是否允许使用ShowCacheHotSpotStmt语句",
diff --git a/fe/fe-core/src/main/java/org/apache/doris/alter/AlterJobV2.java 
b/fe/fe-core/src/main/java/org/apache/doris/alter/AlterJobV2.java
index b1ccf230526..d13cacae06d 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/alter/AlterJobV2.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/alter/AlterJobV2.java
@@ -30,6 +30,7 @@ import org.apache.doris.common.util.DebugPointUtil;
 import org.apache.doris.persist.gson.GsonUtils;
 import org.apache.doris.qe.ConnectContext;
 import org.apache.doris.task.AgentTask;
+import org.apache.doris.thrift.TStatusCode;
 
 import com.google.common.base.Strings;
 import com.google.common.collect.Maps;
@@ -263,6 +264,16 @@ public abstract class AlterJobV2 implements Writable {
         return cancelImpl(errMsg);
     }
 
+    protected int getRetryTimes(AgentTask task) {
+        int maxFailedTimes = 0;
+        if (Config.enable_schema_change_retry && task.getErrorCode() != null
+                && 
(task.getErrorCode().equals(TStatusCode.DELETE_BITMAP_LOCK_ERROR)
+                    || task.getErrorCode().equals(TStatusCode.NETWORK_ERROR))) 
{
+            maxFailedTimes = Config.schema_change_max_retry_time;
+        }
+        return maxFailedTimes;
+    }
+
     /**
     * should be call before executing the job.
     * return false if table is not stable.
diff --git a/fe/fe-core/src/main/java/org/apache/doris/alter/RollupJobV2.java 
b/fe/fe-core/src/main/java/org/apache/doris/alter/RollupJobV2.java
index e5d97d1f97b..f9a3ba16190 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/alter/RollupJobV2.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/alter/RollupJobV2.java
@@ -64,7 +64,6 @@ import org.apache.doris.task.AgentTaskExecutor;
 import org.apache.doris.task.AgentTaskQueue;
 import org.apache.doris.task.AlterReplicaTask;
 import org.apache.doris.task.CreateReplicaTask;
-import org.apache.doris.thrift.TStatusCode;
 import org.apache.doris.thrift.TStorageFormat;
 import org.apache.doris.thrift.TStorageMedium;
 import org.apache.doris.thrift.TStorageType;
@@ -543,13 +542,7 @@ public class RollupJobV2 extends AlterJobV2 implements 
GsonPostProcessable {
             List<AgentTask> tasks = rollupBatchTask.getUnfinishedTasks(2000);
             ensureCloudClusterExist(tasks);
             for (AgentTask task : tasks) {
-                int maxFailedTimes = 0;
-                if (Config.isCloudMode() && 
Config.enable_schema_change_retry_in_cloud_mode) {
-                    if (task.getErrorCode() != null && task.getErrorCode()
-                            .equals(TStatusCode.DELETE_BITMAP_LOCK_ERROR)) {
-                        maxFailedTimes = Config.schema_change_max_retry_time;
-                    }
-                }
+                int maxFailedTimes = getRetryTimes(task);
                 if (task.getFailedTimes() > maxFailedTimes) {
                     task.setFinished(true);
                     AgentTaskQueue.removeTask(task.getBackendId(), 
TTaskType.ALTER, task.getSignature());
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeJobV2.java 
b/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeJobV2.java
index 28d58e4f6fa..2346cdea92a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeJobV2.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeJobV2.java
@@ -57,7 +57,6 @@ import org.apache.doris.task.AgentTaskExecutor;
 import org.apache.doris.task.AgentTaskQueue;
 import org.apache.doris.task.AlterReplicaTask;
 import org.apache.doris.task.CreateReplicaTask;
-import org.apache.doris.thrift.TStatusCode;
 import org.apache.doris.thrift.TStorageFormat;
 import org.apache.doris.thrift.TStorageMedium;
 import org.apache.doris.thrift.TStorageType;
@@ -587,19 +586,13 @@ public class SchemaChangeJobV2 extends AlterJobV2 {
             List<AgentTask> tasks = 
schemaChangeBatchTask.getUnfinishedTasks(2000);
             ensureCloudClusterExist(tasks);
             for (AgentTask task : tasks) {
-                int maxFailedTimes = 0;
-                if (Config.isCloudMode() && 
Config.enable_schema_change_retry_in_cloud_mode) {
-                    if (task.getErrorCode() != null && task.getErrorCode()
-                            .equals(TStatusCode.DELETE_BITMAP_LOCK_ERROR)) {
-                        maxFailedTimes = Config.schema_change_max_retry_time;
-                    }
-                }
+                int maxFailedTimes = getRetryTimes(task);
                 if (task.getFailedTimes() > maxFailedTimes) {
                     task.setFinished(true);
                     if (!FeConstants.runningUnitTest) {
                         AgentTaskQueue.removeTask(task.getBackendId(), 
TTaskType.ALTER, task.getSignature());
-                        LOG.warn("schema change task failed, failedTimes: {}, 
maxFailedTimes: {}, err: {}",
-                                task.getFailedTimes(), maxFailedTimes, 
task.getErrorMsg());
+                        LOG.warn("schema change task failed, job: {}, 
failedTimes: {}, maxFailedTimes: {}, err: {}",
+                                 jobId, task.getFailedTimes(), maxFailedTimes, 
task.getErrorMsg());
                         List<Long> failedBackends = 
failedTabletBackends.get(task.getTabletId());
                         if (failedBackends == null) {
                             failedBackends = Lists.newArrayList();
diff --git a/fe/fe-core/src/main/java/org/apache/doris/task/AgentBatchTask.java 
b/fe/fe-core/src/main/java/org/apache/doris/task/AgentBatchTask.java
index ead255ace36..a821aa143ef 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/task/AgentBatchTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/task/AgentBatchTask.java
@@ -49,6 +49,7 @@ import org.apache.doris.thrift.TPushReq;
 import org.apache.doris.thrift.TPushStoragePolicyReq;
 import org.apache.doris.thrift.TReleaseSnapshotRequest;
 import org.apache.doris.thrift.TSnapshotRequest;
+import org.apache.doris.thrift.TStatusCode;
 import org.apache.doris.thrift.TStorageMediumMigrateReq;
 import org.apache.doris.thrift.TTaskType;
 import org.apache.doris.thrift.TUpdateTabletMetaInfoReq;
@@ -222,6 +223,9 @@ public class AgentBatchTask implements Runnable {
                     List<AgentTask> tasks = 
this.backendIdToTasks.get(backendId);
                     for (AgentTask task : tasks) {
                         task.failedWithMsg(errMsg);
+                        if (errMsg.contains("Socket is closed")) {
+                            task.setErrorCode(TStatusCode.NETWORK_ERROR);
+                        }
                     }
                 }
             }
diff --git 
a/regression-test/suites/cloud_p0/schema_change/compaction11/test_schema_change_with_compaction11.groovy
 
b/regression-test/suites/cloud_p0/schema_change/compaction11/test_schema_change_with_compaction11.groovy
index 4c1c772da37..0e765e46b7b 100644
--- 
a/regression-test/suites/cloud_p0/schema_change/compaction11/test_schema_change_with_compaction11.groovy
+++ 
b/regression-test/suites/cloud_p0/schema_change/compaction11/test_schema_change_with_compaction11.groovy
@@ -144,8 +144,9 @@ suite('test_schema_change_with_compaction11', 'docker') {
                 GetDebugPoint().disableDebugPointForAllBEs(injectName)
             }
             int max_try_time = 3000
+            def result = null
             while (max_try_time--){
-                def result = getJobState("date")
+                result = getJobState("date")
                 if (result == "FINISHED" || result == "CANCELLED") {
                     sleep(3000)
                     break


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to