This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new ee7f9a4f268 [fix](oom) avoid oom when a lot of tablets fail on load (#36944)
ee7f9a4f268 is described below

commit ee7f9a4f2688c1800a8ed9b7e510cc5530b39274
Author: Yongqiang YANG <98214048+dataroar...@users.noreply.github.com>
AuthorDate: Thu Jun 27 22:12:42 2024 +0800

    [fix](oom) avoid oom when a lot of tablets fail on load (#36944)
    
    pick #36873
---
 .../doris/transaction/DatabaseTransactionMgr.java  | 62 +++++++++++++++++-----
 1 file changed, 49 insertions(+), 13 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java
index 3996664708a..368415c3abd 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java
@@ -1159,6 +1159,45 @@ public class DatabaseTransactionMgr {
         return true;
     }
 
+    /**
+     * Collects per-tablet publish result messages for one publish check, grouped by outcome
+     * (quorum success with failed replicas, timeout success, failure).
+     *
+     * <p>Each category retains at most {@code MAX_LOGS_PER_CATEGORY} messages, so a transaction
+     * with a very large number of problematic tablets cannot accumulate an unbounded list of
+     * long strings. Messages arriving after the cap is reached are not stored; they are only
+     * counted, and {@link #log()} prints how many were left out of each category.
+     */
+    private class TabletsPublishResultLogs {
+        // Maximum number of messages retained per category.
+        private static final int MAX_LOGS_PER_CATEGORY = 16;
+
+        public final List<String> quorumSuccLogs = Lists.newArrayList();
+        public final List<String> timeoutSuccLogs = Lists.newArrayList();
+        public final List<String> failedLogs = Lists.newArrayList();
+
+        // Number of messages dropped per category because its list was already full.
+        private long omittedQuorumSuccLogs = 0;
+        private long omittedTimeoutSuccLogs = 0;
+        private long omittedFailedLogs = 0;
+
+        /** Records a quorum-success message, or counts it as omitted once the cap is reached. */
+        public void addQuorumSuccLog(String log) {
+            if (!addBounded(quorumSuccLogs, log)) {
+                omittedQuorumSuccLogs++;
+            }
+        }
+
+        /** Records a timeout-success message, or counts it as omitted once the cap is reached. */
+        public void addTimeoutSuccLog(String log) {
+            if (!addBounded(timeoutSuccLogs, log)) {
+                omittedTimeoutSuccLogs++;
+            }
+        }
+
+        /** Records a failure message, or counts it as omitted once the cap is reached. */
+        public void addFailedLog(String log) {
+            if (!addBounded(failedLogs, log)) {
+                omittedFailedLogs++;
+            }
+        }
+
+        /**
+         * Writes every retained message at INFO level: failures first, then timeout successes,
+         * then quorum successes. A category that dropped messages gets one extra summary line.
+         */
+        public void log() {
+            logCategory("failed", failedLogs, omittedFailedLogs);
+            logCategory("timeout succ", timeoutSuccLogs, omittedTimeoutSuccLogs);
+            logCategory("quorum succ", quorumSuccLogs, omittedQuorumSuccLogs);
+        }
+
+        // Appends log to bucket unless the bucket is full; returns whether it was stored.
+        private boolean addBounded(List<String> bucket, String log) {
+            if (bucket.size() >= MAX_LOGS_PER_CATEGORY) {
+                return false;
+            }
+            bucket.add(log);
+            return true;
+        }
+
+        // Emits one category's retained messages plus, if any were dropped, the dropped count.
+        private void logCategory(String category, List<String> bucket, long omitted) {
+            for (String log : bucket) {
+                LOG.info(log);
+            }
+            if (omitted > 0) {
+                LOG.info("{} more publish version {} logs omitted", omitted, category);
+            }
+        }
+    }
+
     private PublishResult finishCheckQuorumReplicas(TransactionState 
transactionState,
             List<Pair<OlapTable, Partition>> relatedTblPartitions,
             Set<Long> errorReplicaIds) {
@@ -1173,7 +1212,7 @@ public class DatabaseTransactionMgr {
         List<Replica> tabletSuccReplicas = Lists.newArrayList();
         List<Replica> tabletWriteFailedReplicas = Lists.newArrayList();
         List<Replica> tabletVersionFailedReplicas = Lists.newArrayList();
-        List<String> logs = Lists.newArrayList();
+        TabletsPublishResultLogs logs = new TabletsPublishResultLogs();
 
         Map<Long, PublishVersionTask> publishTasks = 
transactionState.getPublishVersionTasks();
         PublishResult publishResult = PublishResult.QUORUM_SUCC;
@@ -1224,9 +1263,9 @@ public class DatabaseTransactionMgr {
                         if (hasFailedReplica) {
                             String writeDetail = 
getTabletWriteDetail(tabletSuccReplicas,
                                     tabletWriteFailedReplicas, 
tabletVersionFailedReplicas);
-                            logs.add(String.format("publish version quorum 
succ for transaction %s on tablet %s"
-                                    + " with version %s, and has failed 
replicas, load require replica num %s. "
-                                    + "table %s, partition: [ id=%s, commit 
version=%s ], tablet detail: %s",
+                            logs.addQuorumSuccLog(String.format("publish 
version quorum succ for transaction %s "
+                                    + "on tablet %s with version %s, and has 
failed replicas, load require replica "
+                                    + "num %s. table %s, partition: [ id=%s, 
commit version=%s ], tablet detail: %s",
                                     transactionState, tablet.getId(), 
newVersion, loadRequiredReplicaNum, tableId,
                                     partitionId, 
partition.getCommittedVersion(), writeDetail));
                         }
@@ -1248,9 +1287,9 @@ public class DatabaseTransactionMgr {
                         // that are being published exists on a few replicas we should go
                         // ahead, otherwise data may be lost and the
                         // publish task hangs forever.
-                        logs.add(String.format("publish version timeout succ 
for transaction %s on tablet %s "
-                                + "with version %s, and has failed replicas, 
load require replica num %s. "
-                                + "table %s, partition %s, tablet detail: %s",
+                        logs.addTimeoutSuccLog(String.format("publish version 
timeout succ for transaction %s "
+                                + "on tablet %s with version %s, and has 
failed replicas, load require replica num %s."
+                                + " table %s, partition %s, tablet detail: %s",
                                 transactionState, tablet.getId(), newVersion,
                                 loadRequiredReplicaNum, tableId, partitionId, 
writeDetail));
                     } else {
@@ -1261,8 +1300,8 @@ public class DatabaseTransactionMgr {
                                 tablet.getId(), healthReplicaNum, 
loadRequiredReplicaNum, tableId,
                                 partitionId, newVersion);
                         transactionState.setErrorMsg(errMsg);
-                        logs.add(String.format("publish version failed for 
transaction %s on tablet %s with version"
-                                + " %s, and has failed replicas, load required 
replica num %s. table %s, "
+                        logs.addQuorumSuccLog(String.format("publish version 
failed for transaction %s on tablet %s "
+                                + " with version %s, and has failed replicas, 
load required replica num %s. table %s, "
                                 + "partition %s, tablet detail: %s",
                                 transactionState, tablet.getId(), newVersion,
                                 loadRequiredReplicaNum, tableId, partitionId, 
writeDetail));
@@ -1275,10 +1314,7 @@ public class DatabaseTransactionMgr {
                 || now - transactionState.getLastPublishLogTime() > 
Config.publish_fail_log_interval_second * 1000L;
         if (needLog) {
             transactionState.setLastPublishLogTime(now);
-            for (String log : logs) {
-                LOG.info("{}. publish times {}, whole txn publish result {}",
-                        log, transactionState.getPublishCount(), 
publishResult.name());
-            }
+            logs.log();
         }
 
         return publishResult;


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to