This is an automated email from the ASF dual-hosted git repository. dataroaring pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new ee7f9a4f268 [fix](oom) avoid oom when a lot of tablets fail on load (#36944) ee7f9a4f268 is described below commit ee7f9a4f2688c1800a8ed9b7e510cc5530b39274 Author: Yongqiang YANG <98214048+dataroar...@users.noreply.github.com> AuthorDate: Thu Jun 27 22:12:42 2024 +0800 [fix](oom) avoid oom when a lot of tablets fail on load (#36944) pick #36873 --- .../doris/transaction/DatabaseTransactionMgr.java | 62 +++++++++++++++++----- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java index 3996664708a..368415c3abd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java @@ -1159,6 +1159,45 @@ public class DatabaseTransactionMgr { return true; } + private class TabletsPublishResultLogs { + public List<String> quorumSuccLogs = Lists.newArrayList(); + public List<String> timeoutSuccLogs = Lists.newArrayList(); + public List<String> failedLogs = Lists.newArrayList(); + + public void addQuorumSuccLog(String log) { + if (quorumSuccLogs.size() < 16) { + quorumSuccLogs.add(log); + } + } + + public void addTimeoutSuccLog(String log) { + if (timeoutSuccLogs.size() < 16) { + timeoutSuccLogs.add(log); + } + } + + public void addFailedLog(String log) { + if (failedLogs.size() < 16) { + failedLogs.add(log); + } + } + + public void log() { + // log failed logs + for (String log : failedLogs) { + LOG.info(log); + } + // log timeout succ logs + for (String log : timeoutSuccLogs) { + LOG.info(log); + } + // log quorum succ logs + for (String log : quorumSuccLogs) { + LOG.info(log); + } + } + } + private PublishResult finishCheckQuorumReplicas(TransactionState transactionState, List<Pair<OlapTable, Partition>> relatedTblPartitions, Set<Long> errorReplicaIds) { @@ -1173,7 +1212,7 @@ public class DatabaseTransactionMgr { List<Replica> tabletSuccReplicas = Lists.newArrayList(); List<Replica> tabletWriteFailedReplicas = Lists.newArrayList(); List<Replica> tabletVersionFailedReplicas = Lists.newArrayList(); - List<String> logs = Lists.newArrayList(); + TabletsPublishResultLogs logs = new TabletsPublishResultLogs(); Map<Long, PublishVersionTask> publishTasks = transactionState.getPublishVersionTasks(); PublishResult publishResult = PublishResult.QUORUM_SUCC; @@ -1224,9 +1263,9 @@ public class DatabaseTransactionMgr { if (hasFailedReplica) { String writeDetail = getTabletWriteDetail(tabletSuccReplicas, tabletWriteFailedReplicas, tabletVersionFailedReplicas); - logs.add(String.format("publish version quorum succ for transaction %s on tablet %s" - + " with version %s, and has failed replicas, load require replica num %s. " - + "table %s, partition: [ id=%s, commit version=%s ], tablet detail: %s", + logs.addQuorumSuccLog(String.format("publish version quorum succ for transaction %s " + + "on tablet %s with version %s, and has failed replicas, load require replica " + + "num %s. table %s, partition: [ id=%s, commit version=%s ], tablet detail: %s", transactionState, tablet.getId(), newVersion, loadRequiredReplicaNum, tableId, partitionId, partition.getCommittedVersion(), writeDetail)); } @@ -1248,9 +1287,9 @@ public class DatabaseTransactionMgr { // that are being publised exists on a few replicas we should go // ahead, otherwise data may be lost and thre // publish task hangs forever. - logs.add(String.format("publish version timeout succ for transaction %s on tablet %s " - + "with version %s, and has failed replicas, load require replica num %s. " - + "table %s, partition %s, tablet detail: %s", + logs.addTimeoutSuccLog(String.format("publish version timeout succ for transaction %s " + + "on tablet %s with version %s, and has failed replicas, load require replica num %s." + + " table %s, partition %s, tablet detail: %s", transactionState, tablet.getId(), newVersion, loadRequiredReplicaNum, tableId, partitionId, writeDetail)); } else { @@ -1261,8 +1300,8 @@ public class DatabaseTransactionMgr { tablet.getId(), healthReplicaNum, loadRequiredReplicaNum, tableId, partitionId, newVersion); transactionState.setErrorMsg(errMsg); - logs.add(String.format("publish version failed for transaction %s on tablet %s with version" - + " %s, and has failed replicas, load required replica num %s. table %s, " + logs.addQuorumSuccLog(String.format("publish version failed for transaction %s on tablet %s " + + " with version %s, and has failed replicas, load required replica num %s. table %s, " + "partition %s, tablet detail: %s", transactionState, tablet.getId(), newVersion, loadRequiredReplicaNum, tableId, partitionId, writeDetail)); @@ -1275,10 +1314,7 @@ public class DatabaseTransactionMgr { || now - transactionState.getLastPublishLogTime() > Config.publish_fail_log_interval_second * 1000L; if (needLog) { transactionState.setLastPublishLogTime(now); - for (String log : logs) { - LOG.info("{}. publish times {}, whole txn publish result {}", - log, transactionState.getPublishCount(), publishResult.name()); - } + logs.log(); } return publishResult; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org