This is an automated email from the ASF dual-hosted git repository. englefly pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new e7ea82bcd6d [feat](nereids) support partition level column stats (#35875) e7ea82bcd6d is described below commit e7ea82bcd6d7c7639e4e74266afe4d3451116d1d Author: minghong <engle...@gmail.com> AuthorDate: Mon Jun 24 10:36:03 2024 +0800 [feat](nereids) support partition level column stats (#35875) ## Proposed changes now partition level column stats are collected. StatsDrive job will merge the partition level col stats as single column stats, and then use the merged col stats for stats deriviation, instead of using the table level column stats. Issue Number: close #xxx <!--Describe your changes.--> --- .../doris/nereids/stats/StatsCalculator.java | 82 +++++++++++++++++++--- .../PartitionColumnStatisticBuilder.java | 21 ++++++ .../nereids_p0/stats/partition_col_stats.groovy | 45 ++++++++++++ 3 files changed, 139 insertions(+), 9 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java index e42c58ca6b5..a96ec287f76 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java @@ -130,6 +130,8 @@ import org.apache.doris.statistics.ColStatsMeta; import org.apache.doris.statistics.ColumnStatistic; import org.apache.doris.statistics.ColumnStatisticBuilder; import org.apache.doris.statistics.Histogram; +import org.apache.doris.statistics.PartitionColumnStatistic; +import org.apache.doris.statistics.PartitionColumnStatisticBuilder; import org.apache.doris.statistics.StatisticConstants; import org.apache.doris.statistics.StatisticRange; import org.apache.doris.statistics.Statistics; @@ -724,7 +726,7 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> { } } - private ColumnStatistic getColumnStatistic(TableIf table, String colName, long idxId) { + private ColumnStatistic getColumnStatistic(TableIf table, String colName, long idxId, List<String> partitionNames) { ConnectContext connectContext = ConnectContext.get(); if (connectContext != null && connectContext.getSessionVariable().internalSession) { return ColumnStatistic.UNKNOWN; @@ -751,8 +753,45 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> { return ColumnStatistic.UNKNOWN; } } else { + if (!partitionNames.isEmpty()) { + PartitionColumnStatisticBuilder builder = new PartitionColumnStatisticBuilder(); + boolean hasUnknown = false; + // check if there is any unknown stats to avoid unnecessary partition column stats merge. + for (String partitionName : partitionNames) { + PartitionColumnStatistic pcolStats = Env.getCurrentEnv().getStatisticsCache() + .getPartitionColumnStatistics( + catalogId, dbId, table.getId(), idxId, partitionName, colName); + if (pcolStats.isUnKnown) { + hasUnknown = true; + break; + } + } + if (!hasUnknown) { + boolean isFirst = true; + // try to merge partition column stats + for (String partitionName : partitionNames) { + PartitionColumnStatistic pcolStats = Env.getCurrentEnv().getStatisticsCache() + .getPartitionColumnStatistics( + catalogId, dbId, table.getId(), idxId, partitionName, colName); + if (pcolStats.isUnKnown) { + hasUnknown = true; + break; + } + if (isFirst) { + builder = new PartitionColumnStatisticBuilder(pcolStats); + isFirst = false; + } else { + builder.merge(pcolStats); + } + } + if (!hasUnknown) { + return builder.toColumnStatistics(); + } + } + } + // if any partition-col-stats is unknown, fall back to table level col stats return Env.getCurrentEnv().getStatisticsCache().getColumnStatistics( - catalogId, dbId, table.getId(), idxId, colName); + catalogId, dbId, table.getId(), idxId, colName); } } @@ -785,19 +824,25 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> { Set<SlotReference> slotSet = slotSetBuilder.build(); Map<Expression, ColumnStatisticBuilder> columnStatisticBuilderMap = new HashMap<>(); TableIf table = catalogRelation.getTable(); - boolean isOlapTable = table instanceof OlapTable; AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager(); TableStatsMeta tableMeta = analysisManager.findTableStatsStatus(table.getId()); long tableUpdatedRows = tableMeta == null ? 0 : tableMeta.updatedRows.get(); - double rowCount = catalogRelation.getTable().getRowCountForNereids(); boolean hasUnknownKeyCol = false; long idxId = -1; + List<String> selectedPartitionNames; if (catalogRelation instanceof OlapScan) { OlapScan olapScan = (OlapScan) catalogRelation; if (olapScan.getTable().getBaseIndexId() != olapScan.getSelectedIndexId()) { idxId = olapScan.getSelectedIndexId(); } + selectedPartitionNames = new ArrayList<>(olapScan.getSelectedPartitionIds().size()); + olapScan.getSelectedPartitionIds().forEach(id -> { + selectedPartitionNames.add(olapScan.getTable().getPartition(id).getName()); + }); + } else { + selectedPartitionNames = new ArrayList<>(); } + double rowCount = 0.0; for (SlotReference slotReference : slotSet) { boolean usedAsKey = false; if (ConnectContext.get() != null && slotReference.getColumn().isPresent() @@ -812,19 +857,33 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> { if (colName == null) { throw new RuntimeException(String.format("Invalid slot: %s", slotReference.getExprId())); } + // compute delta row long deltaRowCount = 0; - if (isOlapTable) { + if (catalogRelation instanceof OlapScan) { OlapTable olapTable = (OlapTable) table; - ColStatsMeta colMeta = tableMeta == null ? null : tableMeta.findColumnStatsMeta( - olapTable.getIndexNameById(idxId == -1 ? olapTable.getBaseIndexId() : idxId), colName); - deltaRowCount = colMeta == null ? 0 : tableUpdatedRows - colMeta.updatedRows; + if (tableMeta != null) { + ColStatsMeta colMeta = tableMeta.findColumnStatsMeta( + olapTable.getIndexNameById(idxId == -1 ? olapTable.getBaseIndexId() : idxId), colName); + if (colMeta != null) { + if (((OlapScan) catalogRelation).getSelectedPartitionIds().isEmpty()) { + deltaRowCount = tableUpdatedRows - colMeta.updatedRows; + } else { + // sum partition delta row + for (long partitionId : ((OlapScan) catalogRelation).getSelectedPartitionIds()) { + deltaRowCount += tableMeta.partitionUpdateRows.getOrDefault(partitionId, 0L) + - colMeta.partitionUpdateRows.getOrDefault(partitionId, 0L); + } + } + } + } + } ColumnStatistic cache; if (!FeConstants.enableInternalSchemaDb || shouldIgnoreThisCol) { cache = ColumnStatistic.UNKNOWN; } else { - cache = getColumnStatistic(table, colName, idxId); + cache = getColumnStatistic(table, colName, idxId, selectedPartitionNames); } ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache); if (cache.avgSizeByte <= 0) { @@ -859,6 +918,11 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> { hasUnknownKeyCol = true; } } + if (rowCount <= 0.0) { + // if we failed to get rowCount from column stats, then try to get it from TableIf + rowCount = catalogRelation.getTable().getRowCountForNereids(); + } + if (hasUnknownKeyCol && ConnectContext.get() != null && ConnectContext.get().getStatementContext() != null) { ConnectContext.get().getStatementContext().setHasUnknownColStats(true); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/PartitionColumnStatisticBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/PartitionColumnStatisticBuilder.java index ecb0624d4e8..fe26396f212 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/PartitionColumnStatisticBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/PartitionColumnStatisticBuilder.java @@ -155,4 +155,25 @@ public class PartitionColumnStatisticBuilder { dataSize, minValue, maxValue, minExpr, maxExpr, isUnknown, updatedTime); } + + public PartitionColumnStatisticBuilder merge(PartitionColumnStatistic other) { + count += other.count; + ndv.merge(other.ndv); + numNulls += other.numNulls; + if (minValue > other.minValue) { + minValue = other.minValue; + minExpr = other.minExpr; + } + if (maxValue < other.maxValue) { + maxValue = other.maxValue; + maxExpr = other.maxExpr; + } + isUnknown = isUnknown && other.isUnKnown; + return this; + } + + public ColumnStatistic toColumnStatistics() { + return new ColumnStatistic(count, ndv.estimateCardinality(), null, + avgSizeByte, numNulls, dataSize, minValue, maxValue, minExpr, maxExpr, isUnknown, updatedTime); + } } diff --git a/regression-test/suites/nereids_p0/stats/partition_col_stats.groovy b/regression-test/suites/nereids_p0/stats/partition_col_stats.groovy new file mode 100644 index 00000000000..89a32a80d91 --- /dev/null +++ b/regression-test/suites/nereids_p0/stats/partition_col_stats.groovy @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("partition_col_stats") { + multi_sql """ + set global enable_partition_analyze=true; + drop table if exists pt; + CREATE TABLE `pt` ( + `k1` int(11) NULL COMMENT "", + `k2` int(11) NULL COMMENT "", + `k3` int(11) NULL COMMENT "" + ) + PARTITION BY RANGE(`k1`) + (PARTITION p1 VALUES LESS THAN ("3"), + PARTITION p2 VALUES [("3"), ("7")), + PARTITION p3 VALUES [("7"), ("10"))) + DISTRIBUTED BY HASH(`k1`) BUCKETS 10 + PROPERTIES ('replication_num' = '1'); + + insert into pt values (1, 2, 2), (1, 3, 3), (1, 4, 1), (1, 4, 1), (4, 4, 4), (5,5,5),(6,6,6); + analyze table pt with sync; + """ + //run this sql to make stats be cached + sql "select * from pt where k1<3;" + sleep(10) + explain{ + sql "physical plan select * from pt where k1<3;" + contains("stats=4") + } + +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org