This is an automated email from the ASF dual-hosted git repository. gavinchou pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
     new 1bebb90d71f [enhancement](metrics) add some table metrics (#49930)
1bebb90d71f is described below

commit 1bebb90d71fda46eef6fe18e7f5d7676338446ae
Author: yagagagaga <zhangmi...@selectdb.com>
AuthorDate: Fri May 9 16:45:22 2025 +0800

[enhancement](metrics) add some table metrics (#49930)

add these metrics in Doris' FE:

```properties
# HELP doris_fe_min_tablet_size_bytes minimal local data size of all (internal) tables' tablets
# TYPE doris_fe_min_tablet_size_bytes gauge
doris_fe_min_tablet_size_bytes 0
# HELP doris_fe_min_partition_size_bytes minimal local data size of all (internal) tables' partitions
# TYPE doris_fe_min_partition_size_bytes gauge
doris_fe_min_partition_size_bytes 0
# HELP doris_fe_min_table_size_bytes minimal local data size of all (internal) tables
# TYPE doris_fe_min_table_size_bytes gauge
doris_fe_min_table_size_bytes 1360
# HELP doris_fe_max_tablet_size_bytes maximal local data size of all (internal) tables' tablets
# TYPE doris_fe_max_tablet_size_bytes gauge
doris_fe_max_tablet_size_bytes 4739
# HELP doris_fe_max_partition_size_bytes maximal local data size of all (internal) tables' partitions
# TYPE doris_fe_max_partition_size_bytes gauge
doris_fe_max_partition_size_bytes 9431
# HELP doris_fe_max_table_size_bytes maximal local data size of all (internal) tables
# TYPE doris_fe_max_table_size_bytes gauge
doris_fe_max_table_size_bytes 9431
# HELP doris_fe_avg_tablet_size_bytes average local data size of all (internal) tables' tablets
# TYPE doris_fe_avg_tablet_size_bytes gauge
doris_fe_avg_tablet_size_bytes 830
# HELP doris_fe_avg_partition_size_bytes average local data size of all (internal) tables' partitions
# TYPE doris_fe_avg_partition_size_bytes gauge
doris_fe_avg_partition_size_bytes 1541
# HELP doris_fe_avg_table_size_bytes average local data size of all (internal) tables
# TYPE doris_fe_avg_table_size_bytes gauge
doris_fe_avg_table_size_bytes 5395
```
---
.../apache/doris/catalog/CloudTabletStatMgr.java | 71 ++++++++++++++++++++-- .../org/apache/doris/catalog/TabletStatMgr.java | 67 +++++++++++++++++++- .../java/org/apache/doris/metric/MetricRepo.java | 37 +++++++++++ 3 files changed, 168 insertions(+), 7 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/CloudTabletStatMgr.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/CloudTabletStatMgr.java index 3babb0e001a..a7c30c38cb5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/CloudTabletStatMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/CloudTabletStatMgr.java @@ -27,6 +27,7 @@ import org.apache.doris.cloud.rpc.MetaServiceProxy; import org.apache.doris.common.Config; import org.apache.doris.common.Pair; import org.apache.doris.common.util.MasterDaemon; +import org.apache.doris.metric.MetricRepo; import org.apache.doris.rpc.RpcException; import org.apache.logging.log4j.LogManager; @@ -148,19 +149,28 @@ public class CloudTabletStatMgr extends MasterDaemon { // after update replica in all backends, update index row num start = System.currentTimeMillis(); - + Pair<String, Long> maxTabletSize = Pair.of(/* tablet id= */null, /* byte size= */0L); + Pair<String, Long> maxPartitionSize = Pair.of(/* partition id= */null, /* byte size= */0L); + Pair<String, Long> maxTableSize = Pair.of(/* table id= */null, /* byte size= */0L); + Pair<String, Long> minTabletSize = Pair.of(/* tablet id= */null, /* byte size= */Long.MAX_VALUE); + Pair<String, Long> minPartitionSize = Pair.of(/* partition id= */null, /* byte size= */Long.MAX_VALUE); + Pair<String, Long> minTableSize = Pair.of(/* tablet id= */null, /* byte size= */Long.MAX_VALUE); + Long totalTableSize = 0L; + Long tabletCount = 0L; + Long partitionCount = 0L; + Long tableCount = 0L; Map<Pair<Long, Long>, OlapTable.Statistics> newCloudTableStatsMap = new HashMap<>(); for (Long dbId : dbIds) { Database db = Env.getCurrentInternalCatalog().getDbNullable(dbId); if (db == 
null) { continue; } - List<Table> tableList = db.getTables(); for (Table table : tableList) { if (!table.isManagedTable()) { continue; } + tableCount++; OlapTable olapTable = (OlapTable) table; Long tableDataSize = 0L; @@ -178,10 +188,15 @@ public class CloudTabletStatMgr extends MasterDaemon { continue; } try { - for (Partition partition : olapTable.getAllPartitions()) { + List<Partition> allPartitions = olapTable.getAllPartitions(); + partitionCount += allPartitions.size(); + for (Partition partition : allPartitions) { + Long partitionDataSize = 0L; for (MaterializedIndex index : partition.getMaterializedIndices(IndexExtState.VISIBLE)) { long indexRowCount = 0L; - for (Tablet tablet : index.getTablets()) { + List<Tablet> tablets = index.getTablets(); + tabletCount += tablets.size(); + for (Tablet tablet : tablets) { long tabletDataSize = 0L; long tabletRowsetCount = 0L; @@ -219,6 +234,13 @@ public class CloudTabletStatMgr extends MasterDaemon { } tableDataSize += tabletDataSize; + partitionDataSize += tabletDataSize; + if (maxTabletSize.second <= tabletDataSize) { + maxTabletSize = Pair.of("" + tablet.getId(), tabletDataSize); + } + if (minTabletSize.second >= tabletDataSize) { + minTabletSize = Pair.of("" + tablet.getId(), tabletDataSize); + } tableRowCount += tabletRowCount; indexRowCount += tabletRowCount; @@ -231,7 +253,19 @@ public class CloudTabletStatMgr extends MasterDaemon { index.setRowCountReported(true); index.setRowCount(indexRowCount); } // end for indices + if (maxPartitionSize.second <= partitionDataSize) { + maxPartitionSize = Pair.of("" + partition.getId(), partitionDataSize); + } + if (minPartitionSize.second >= partitionDataSize) { + minPartitionSize = Pair.of("" + partition.getId(), partitionDataSize); + } } // end for partitions + if (maxTableSize.second <= tableDataSize) { + maxTableSize = Pair.of("" + table.getId(), tableDataSize); + } + if (minTableSize.second >= tableDataSize) { + minTableSize = Pair.of("" + table.getId(), 
tableDataSize); + } // this is only one thread to update table statistics, readLock is enough olapTable.setStatistics(new OlapTable.Statistics(db.getName(), @@ -243,15 +277,42 @@ public class CloudTabletStatMgr extends MasterDaemon { } finally { table.readUnlock(); } - + totalTableSize += tableDataSize; newCloudTableStatsMap.put(Pair.of(dbId, table.getId()), new OlapTable.Statistics(db.getName(), table.getName(), tableDataSize, tableTotalReplicaDataSize, 0L, tableReplicaCount, tableRowCount, tableRowsetCount, tableSegmentCount, 0L, 0L, 0L, 0L)); } } this.cloudTableStatsMap = newCloudTableStatsMap; + + MetricRepo.GAUGE_MAX_TABLE_SIZE_BYTES.setValue(maxTableSize.second); + MetricRepo.GAUGE_MAX_PARTITION_SIZE_BYTES.setValue(maxPartitionSize.second); + MetricRepo.GAUGE_MAX_TABLET_SIZE_BYTES.setValue(maxTabletSize.second); + long minTableSizeTmp = minTableSize.second == Long.MAX_VALUE ? 0 : minTableSize.second; + MetricRepo.GAUGE_MIN_TABLE_SIZE_BYTES.setValue(minTableSizeTmp); + long minPartitionSizeTmp = minPartitionSize.second == Long.MAX_VALUE ? 0 : minPartitionSize.second; + MetricRepo.GAUGE_MIN_PARTITION_SIZE_BYTES.setValue(minPartitionSizeTmp); + long minTabletSizeTmp = minTabletSize.second == Long.MAX_VALUE ? 0 : minTabletSize.second; + MetricRepo.GAUGE_MIN_TABLET_SIZE_BYTES.setValue(minTabletSizeTmp); + long avgTableSize = totalTableSize / Math.max(1, tableCount); // avoid ArithmeticException: / by zero + MetricRepo.GAUGE_AVG_TABLE_SIZE_BYTES.setValue(avgTableSize); + long avgPartitionSize = totalTableSize / Math.max(1, partitionCount); // avoid ArithmeticException: / by zero + MetricRepo.GAUGE_AVG_PARTITION_SIZE_BYTES.setValue(avgPartitionSize); + long avgTabletSize = totalTableSize / Math.max(1, tabletCount); // avoid ArithmeticException: / by zero + MetricRepo.GAUGE_AVG_TABLET_SIZE_BYTES.setValue(avgTabletSize); LOG.info("finished to update index row num of all databases. 
cost: {} ms", (System.currentTimeMillis() - start)); + LOG.info("Olap table num=" + tableCount + ", partition num=" + partitionCount + ", tablet num=" + tabletCount + + ", max tablet byte size=" + maxTabletSize.second + "(tablet_id=" + maxTableSize.first + ")" + + ", min tablet byte size=" + minTabletSizeTmp + "(tablet_id=" + minTabletSize.first + ")" + + ", avg tablet byte size=" + avgTabletSize + + ", max partition byte size=" + maxPartitionSize.second + "(partition_id=" + maxPartitionSize.first + + ")" + + ", min partition byte size=" + minPartitionSizeTmp + "(partition_id=" + minPartitionSize.first + ")" + + ", avg partition byte size=" + avgPartitionSize + + ", max table byte size=" + maxTableSize.second + "(table_id=" + maxTableSize.first + ")" + + ", min table byte size=" + minTableSizeTmp + "(table_id=" + minTableSize.first + ")" + + ", avg table byte size=" + avgTableSize); } private void updateTabletStat(GetTabletStatsResponse response) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/TabletStatMgr.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/TabletStatMgr.java index 14dc88eb509..ed5349e830d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/TabletStatMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/TabletStatMgr.java @@ -22,6 +22,7 @@ import org.apache.doris.common.AnalysisException; import org.apache.doris.common.ClientPool; import org.apache.doris.common.Config; import org.apache.doris.common.MarkedCountDownLatch; +import org.apache.doris.common.Pair; import org.apache.doris.common.Status; import org.apache.doris.common.ThreadPoolManager; import org.apache.doris.common.util.MasterDaemon; @@ -117,6 +118,16 @@ public class TabletStatMgr extends MasterDaemon { // after update replica in all backends, update index row num start = System.currentTimeMillis(); + Pair<String, Long> maxTabletSize = Pair.of(/* tablet id= */null, /* byte size= */0L); + Pair<String, Long> maxPartitionSize = Pair.of(/* 
partition id= */null, /* byte size= */0L); + Pair<String, Long> maxTableSize = Pair.of(/* table id= */null, /* byte size= */0L); + Pair<String, Long> minTabletSize = Pair.of(/* tablet id= */null, /* byte size= */Long.MAX_VALUE); + Pair<String, Long> minPartitionSize = Pair.of(/* partition id= */null, /* byte size= */Long.MAX_VALUE); + Pair<String, Long> minTableSize = Pair.of(/* tablet id= */null, /* byte size= */Long.MAX_VALUE); + Long totalTableSize = 0L; + Long tabletCount = 0L; + Long partitionCount = 0L; + Long tableCount = 0L; List<Long> dbIds = Env.getCurrentInternalCatalog().getDbIds(); for (Long dbId : dbIds) { Database db = Env.getCurrentInternalCatalog().getDbNullable(dbId); @@ -129,6 +140,7 @@ public class TabletStatMgr extends MasterDaemon { if (!table.isManagedTable()) { continue; } + tableCount++; OlapTable olapTable = (OlapTable) table; Long tableDataSize = 0L; @@ -149,12 +161,17 @@ public class TabletStatMgr extends MasterDaemon { continue; } try { - for (Partition partition : olapTable.getAllPartitions()) { + List<Partition> allPartitions = olapTable.getAllPartitions(); + partitionCount += allPartitions.size(); + for (Partition partition : allPartitions) { + Long partitionDataSize = 0L; long version = partition.getVisibleVersion(); for (MaterializedIndex index : partition.getMaterializedIndices(IndexExtState.VISIBLE)) { long indexRowCount = 0L; boolean indexReported = true; - for (Tablet tablet : index.getTablets()) { + List<Tablet> tablets = index.getTablets(); + tabletCount += tablets.size(); + for (Tablet tablet : tablets) { Long tabletDataSize = 0L; Long tabletRemoteDataSize = 0L; @@ -203,7 +220,14 @@ public class TabletStatMgr extends MasterDaemon { } tableDataSize += tabletDataSize; + partitionDataSize += tabletDataSize; tableRemoteDataSize += tabletRemoteDataSize; + if (maxTabletSize.second <= tabletDataSize) { + maxTabletSize = Pair.of("" + tablet.getId(), tabletDataSize); + } + if (minTabletSize.second >= tabletDataSize) { + minTabletSize 
= Pair.of("" + tablet.getId(), tabletDataSize); + } // When all BEs are down, avoid set Long.MAX_VALUE to index and table row count. Use 0. if (tabletRowCount == Long.MAX_VALUE) { @@ -220,7 +244,19 @@ public class TabletStatMgr extends MasterDaemon { olapTable.getName(), olapTable.getIndexNameById(index.getId()), indexReported, indexRowCount); } // end for indices + if (maxPartitionSize.second <= partitionDataSize) { + maxPartitionSize = Pair.of("" + partition.getId(), partitionDataSize); + } + if (minPartitionSize.second >= partitionDataSize) { + minPartitionSize = Pair.of("" + partition.getId(), partitionDataSize); + } } // end for partitions + if (maxTableSize.second <= tableDataSize) { + maxTableSize = Pair.of("" + table.getId(), tableDataSize); + } + if (minTableSize.second >= tableDataSize) { + minTableSize = Pair.of("" + table.getId(), tableDataSize); + } // this is only one thread to update table statistics, readLock is enough olapTable.setStatistics(new OlapTable.Statistics(db.getName(), table.getName(), @@ -236,10 +272,37 @@ public class TabletStatMgr extends MasterDaemon { } finally { table.readUnlock(); } + totalTableSize += tableDataSize; } } + MetricRepo.GAUGE_MAX_TABLE_SIZE_BYTES.setValue(maxTableSize.second); + MetricRepo.GAUGE_MAX_PARTITION_SIZE_BYTES.setValue(maxPartitionSize.second); + MetricRepo.GAUGE_MAX_TABLET_SIZE_BYTES.setValue(maxTabletSize.second); + long minTableSizeTmp = minTableSize.second == Long.MAX_VALUE ? 0 : minTableSize.second; + MetricRepo.GAUGE_MIN_TABLE_SIZE_BYTES.setValue(minTableSizeTmp); + long minPartitionSizeTmp = minPartitionSize.second == Long.MAX_VALUE ? 0 : minPartitionSize.second; + MetricRepo.GAUGE_MIN_PARTITION_SIZE_BYTES.setValue(minPartitionSizeTmp); + long minTabletSizeTmp = minTabletSize.second == Long.MAX_VALUE ? 
0 : minTabletSize.second; + MetricRepo.GAUGE_MIN_TABLET_SIZE_BYTES.setValue(minTabletSizeTmp); + long avgTableSize = totalTableSize / Math.max(1, tableCount); // avoid ArithmeticException: / by zero + MetricRepo.GAUGE_AVG_TABLE_SIZE_BYTES.setValue(avgTableSize); + long avgPartitionSize = totalTableSize / Math.max(1, partitionCount); // avoid ArithmeticException: / by zero + MetricRepo.GAUGE_AVG_PARTITION_SIZE_BYTES.setValue(avgPartitionSize); + long avgTabletSize = totalTableSize / Math.max(1, tabletCount); // avoid ArithmeticException: / by zero + MetricRepo.GAUGE_AVG_TABLET_SIZE_BYTES.setValue(avgTabletSize); LOG.info("finished to update index row num of all databases. cost: {} ms", (System.currentTimeMillis() - start)); + LOG.info("Olap table num=" + tableCount + ", partition num=" + partitionCount + ", tablet num=" + tabletCount + + ", max tablet byte size=" + maxTabletSize.second + "(tablet_id=" + maxTableSize.first + ")" + + ", min tablet byte size=" + minTabletSizeTmp + "(tablet_id=" + minTabletSize.first + ")" + + ", avg tablet byte size=" + avgTabletSize + + ", max partition byte size=" + maxPartitionSize.second + "(partition_id=" + maxPartitionSize.first + + ")" + + ", min partition byte size=" + minPartitionSizeTmp + "(partition_id=" + minPartitionSize.first + ")" + + ", avg partition byte size=" + avgPartitionSize + + ", max table byte size=" + maxTableSize.second + "(table_id=" + maxTableSize.first + ")" + + ", min table byte size=" + minTableSizeTmp + "(table_id=" + minTableSize.first + ")" + + ", avg table byte size=" + avgTableSize); } public void waitForTabletStatUpdate() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java index 7d70239b0ae..83ed5fcc0aa 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java @@ -167,6 +167,16 @@ public final class MetricRepo { 
public static GaugeMetric<Integer> GAUGE_CATALOG_NUM; public static GaugeMetric<Integer> GAUGE_INTERNAL_DATABASE_NUM; public static GaugeMetric<Integer> GAUGE_INTERNAL_TABLE_NUM; + // Table/Partition/Tablet DataSize + public static GaugeMetricImpl<Long> GAUGE_MAX_TABLE_SIZE_BYTES; + public static GaugeMetricImpl<Long> GAUGE_MAX_PARTITION_SIZE_BYTES; + public static GaugeMetricImpl<Long> GAUGE_MAX_TABLET_SIZE_BYTES; + public static GaugeMetricImpl<Long> GAUGE_MIN_TABLE_SIZE_BYTES; + public static GaugeMetricImpl<Long> GAUGE_MIN_PARTITION_SIZE_BYTES; + public static GaugeMetricImpl<Long> GAUGE_MIN_TABLET_SIZE_BYTES; + public static GaugeMetricImpl<Long> GAUGE_AVG_TABLE_SIZE_BYTES; + public static GaugeMetricImpl<Long> GAUGE_AVG_PARTITION_SIZE_BYTES; + public static GaugeMetricImpl<Long> GAUGE_AVG_TABLET_SIZE_BYTES; // Agent task public static LongCounterMetric COUNTER_AGENT_TASK_REQUEST_TOTAL; @@ -613,6 +623,33 @@ public final class MetricRepo { }; DORIS_METRIC_REGISTER.addMetrics(GAUGE_INTERNAL_TABLE_NUM); + GAUGE_MAX_TABLE_SIZE_BYTES = new GaugeMetricImpl<>("max_table_size_bytes", MetricUnit.BYTES, "", 0L); + DORIS_METRIC_REGISTER.addMetrics(GAUGE_MAX_TABLE_SIZE_BYTES); + + GAUGE_MAX_PARTITION_SIZE_BYTES = new GaugeMetricImpl<>("max_partition_size_bytes", MetricUnit.BYTES, "", 0L); + DORIS_METRIC_REGISTER.addMetrics(GAUGE_MAX_PARTITION_SIZE_BYTES); + + GAUGE_MAX_TABLET_SIZE_BYTES = new GaugeMetricImpl<>("max_tablet_size_bytes", MetricUnit.BYTES, "", 0L); + DORIS_METRIC_REGISTER.addMetrics(GAUGE_MAX_TABLET_SIZE_BYTES); + + GAUGE_MIN_TABLE_SIZE_BYTES = new GaugeMetricImpl<>("min_table_size_bytes", MetricUnit.BYTES, "", 0L); + DORIS_METRIC_REGISTER.addMetrics(GAUGE_MIN_TABLE_SIZE_BYTES); + + GAUGE_MIN_PARTITION_SIZE_BYTES = new GaugeMetricImpl<>("min_partition_size_bytes", MetricUnit.BYTES, "", 0L); + DORIS_METRIC_REGISTER.addMetrics(GAUGE_MIN_PARTITION_SIZE_BYTES); + + GAUGE_MIN_TABLET_SIZE_BYTES = new GaugeMetricImpl<>("min_tablet_size_bytes", MetricUnit.BYTES, "", 
0L); + DORIS_METRIC_REGISTER.addMetrics(GAUGE_MIN_TABLET_SIZE_BYTES); + + GAUGE_AVG_TABLE_SIZE_BYTES = new GaugeMetricImpl<>("avg_table_size_bytes", MetricUnit.BYTES, "", 0L); + DORIS_METRIC_REGISTER.addMetrics(GAUGE_AVG_TABLE_SIZE_BYTES); + + GAUGE_AVG_PARTITION_SIZE_BYTES = new GaugeMetricImpl<>("avg_partition_size_bytes", MetricUnit.BYTES, "", 0L); + DORIS_METRIC_REGISTER.addMetrics(GAUGE_AVG_PARTITION_SIZE_BYTES); + + GAUGE_AVG_TABLET_SIZE_BYTES = new GaugeMetricImpl<>("avg_tablet_size_bytes", MetricUnit.BYTES, "", 0L); + DORIS_METRIC_REGISTER.addMetrics(GAUGE_AVG_TABLET_SIZE_BYTES); + COUNTER_AGENT_TASK_REQUEST_TOTAL = new LongCounterMetric("agent_task_request_total", MetricUnit.NOUNIT, "total agent batch task request send to BE"); DORIS_METRIC_REGISTER.addMetrics(COUNTER_AGENT_TASK_REQUEST_TOTAL); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org