This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 5f5cb635df3 [chore](tablet scheduler) update sched ctx err #27514 (#31189)
5f5cb635df3 is described below

commit 5f5cb635df332ffc74c22eac38834e3c9c4e2ca7
Author: yujun <yu.jun.re...@gmail.com>
AuthorDate: Tue Apr 30 14:18:04 2024 +0800

    [chore](tablet scheduler) update sched ctx err #27514 (#31189)
---
 .../org/apache/doris/clone/BeLoadRebalancer.java   |  6 ++---
 .../org/apache/doris/clone/SchedException.java     |  1 +
 .../org/apache/doris/clone/TabletSchedCtx.java     | 12 +++++++++
 .../org/apache/doris/clone/TabletScheduler.java    | 30 ++++++++++++----------
 4 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/BeLoadRebalancer.java b/fe/fe-core/src/main/java/org/apache/doris/clone/BeLoadRebalancer.java
index 93d884af5ce..c81271eb890 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/BeLoadRebalancer.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/BeLoadRebalancer.java
@@ -288,7 +288,7 @@ public class BeLoadRebalancer extends Rebalancer {
             }
         }
         if (!setSource) {
-            throw new SchedException(Status.UNRECOVERABLE, "unable to take src 
slot");
+            throw new SchedException(Status.UNRECOVERABLE, 
SubCode.DIAGNOSE_IGNORE, "unable to take src backend slot");
         }
 
         // Select a low load backend as destination.
@@ -331,7 +331,7 @@ public class BeLoadRebalancer extends Rebalancer {
         }
 
         if (candidates.isEmpty()) {
-            throw new SchedException(Status.UNRECOVERABLE, "unable to find low 
backend");
+            throw new SchedException(Status.UNRECOVERABLE, 
SubCode.DIAGNOSE_IGNORE, "unable to find low dest backend");
         }
 
         List<BePathLoadStatPair> candFitPaths = Lists.newArrayList();
@@ -368,7 +368,7 @@ public class BeLoadRebalancer extends Rebalancer {
         }
 
         throw new SchedException(Status.SCHEDULE_FAILED, SubCode.WAITING_SLOT,
-                "unable to find low backend");
+                "beload waiting for dest backend slot");
     }
 
 }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/SchedException.java b/fe/fe-core/src/main/java/org/apache/doris/clone/SchedException.java
index a343e6543c3..cb7080299ec 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/SchedException.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/SchedException.java
@@ -31,6 +31,7 @@ public class SchedException extends Exception {
         NONE,
         WAITING_DECOMMISSION,
         WAITING_SLOT,
+        DIAGNOSE_IGNORE,
     }
 
     private Status status;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java
index b9606f09d0a..912fc1bb316 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletSchedCtx.java
@@ -424,6 +424,18 @@ public class TabletSchedCtx implements 
Comparable<TabletSchedCtx> {
         this.errMsg = errMsg;
     }
 
+    public String getErrMsg() {
+        return errMsg;
+    }
+
+    public SubCode getSchedFailedCode() {
+        return schedFailedCode;
+    }
+
+    public void setSchedFailedCode(SubCode code) {
+        schedFailedCode = code;
+    }
+
     public CloneTask getCloneTask() {
         return cloneTask;
     }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
index ede3e541675..4a22878424d 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/clone/TabletScheduler.java
@@ -390,15 +390,6 @@ public class TabletScheduler extends MasterDaemon {
         AgentBatchTask batchTask = new AgentBatchTask();
         for (TabletSchedCtx tabletCtx : currentBatch) {
             try {
-                if (Config.disable_tablet_scheduler) {
-                    // do not schedule more tablet is tablet scheduler is 
disabled.
-                    throw new SchedException(Status.FINISHED, "tablet 
scheduler is disabled");
-                }
-                if (Config.disable_balance && tabletCtx.getType() == 
Type.BALANCE) {
-                    finalizeTabletCtx(tabletCtx, 
TabletSchedCtx.State.CANCELLED, Status.UNRECOVERABLE,
-                            "config disable balance");
-                    continue;
-                }
                 scheduleTablet(tabletCtx, batchTask);
             } catch (SchedException e) {
                 tabletCtx.setErrMsg(e.getMessage());
@@ -422,6 +413,7 @@ public class TabletScheduler extends MasterDaemon {
                     Preconditions.checkState(e.getStatus() == 
Status.UNRECOVERABLE, e.getStatus());
                     // discard
                     stat.counterTabletScheduledDiscard.incrementAndGet();
+                    tabletCtx.setSchedFailedCode(e.getSubCode());
                     finalizeTabletCtx(tabletCtx, 
TabletSchedCtx.State.CANCELLED, e.getStatus(), e.getMessage());
                 }
                 continue;
@@ -429,6 +421,8 @@ public class TabletScheduler extends MasterDaemon {
                 LOG.warn("got unexpected exception, discard this schedule. 
tablet: {}",
                         tabletCtx.getTabletId(), e);
                 stat.counterTabletScheduledFailed.incrementAndGet();
+                tabletCtx.setSchedFailedCode(SubCode.NONE);
+                tabletCtx.setErrMsg(e.getMessage());
                 finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.UNEXPECTED, 
Status.UNRECOVERABLE, e.getMessage());
                 continue;
             }
@@ -470,6 +464,14 @@ public class TabletScheduler extends MasterDaemon {
      * Try to schedule a single tablet.
      */
     private void scheduleTablet(TabletSchedCtx tabletCtx, AgentBatchTask 
batchTask) throws SchedException {
+        if (Config.disable_tablet_scheduler) {
+            // do not schedule more tablets if tablet scheduler is disabled.
+            throw new SchedException(Status.UNRECOVERABLE, 
SubCode.DIAGNOSE_IGNORE, "tablet scheduler is disabled");
+        }
+        if (Config.disable_balance && tabletCtx.getType() == Type.BALANCE) {
+            throw new SchedException(Status.UNRECOVERABLE, 
SubCode.DIAGNOSE_IGNORE, "balance is disabled");
+        }
+
         long currentTime = System.currentTimeMillis();
         tabletCtx.setLastSchedTime(currentTime);
         tabletCtx.setLastVisitedTime(currentTime);
@@ -1447,7 +1449,7 @@ public class TabletScheduler extends MasterDaemon {
 
         if (hasBePath) {
             throw new SchedException(Status.SCHEDULE_FAILED, 
SubCode.WAITING_SLOT,
-                    "unable to find dest path which can be fit in");
+                    "scheduler waiting for dest backend slot");
         } else {
             throw new SchedException(Status.UNRECOVERABLE,
                     "unable to find dest path which can be fit in");
@@ -1647,8 +1649,9 @@ public class TabletScheduler extends MasterDaemon {
             updateDestPathHash(tabletCtx);
             finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.FINISHED, 
Status.FINISHED, "finished");
         } else {
-            finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.CANCELLED, 
Status.UNRECOVERABLE,
-                    request.getTaskStatus().getErrorMsgs().get(0));
+            String errMsg = request.getTaskStatus().getErrorMsgs().get(0);
+            tabletCtx.setErrMsg(errMsg);
+            finalizeTabletCtx(tabletCtx, TabletSchedCtx.State.CANCELLED, 
Status.UNRECOVERABLE, errMsg);
         }
 
         return true;
@@ -1776,6 +1779,7 @@ public class TabletScheduler extends MasterDaemon {
             // Set "resetReplicaState" to true because
             // the timeout task should also be considered as UNRECOVERABLE,
             // so need to reset replica state.
+            t.setErrMsg("timeout");
             finalizeTabletCtx(t, TabletSchedCtx.State.CANCELLED, 
Status.UNRECOVERABLE, "timeout");
             stat.counterCloneTaskTimeout.incrementAndGet();
         });
@@ -1946,7 +1950,7 @@ public class TabletScheduler extends MasterDaemon {
                     LOG.debug("path hash is not set.", new Exception());
                 }
                 throw new SchedException(Status.SCHEDULE_FAILED, 
SubCode.WAITING_SLOT,
-                        "path hash is not set");
+                        "backend " + beId + " path hash is not set");
             }
 
             Slot slot = pathSlots.get(pathHash);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to