This is an automated email from the ASF dual-hosted git repository. morrysnow pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push: new 7cc003ed5a3 [opt](nereids) clear min/max column stats if table is partially analyzed (#35533) 7cc003ed5a3 is described below commit 7cc003ed5a3982c3fc208fde78ae37ca1ed7a8a7 Author: minghong <engle...@gmail.com> AuthorDate: Wed May 29 11:54:01 2024 +0800 [opt](nereids) clear min/max column stats if table is partially analyzed (#35533) cherry picked from master PR #33685 commit 3d14f663a6a30292a547fd56e557cde55593c4b6 If a user queries newly loaded data (data that has not been analyzed yet), the optimizer may generate an inefficient plan because the newly loaded data falls outside the min-max range recorded in the column stats. In this PR, we ignore min-max if there is newly loaded data. --- .../doris/nereids/stats/StatsCalculator.java | 65 ++++++++++++++-------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java index f9b767f40c8..d711b99655b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java @@ -114,6 +114,7 @@ import org.apache.doris.nereids.trees.plans.physical.PhysicalWindow; import org.apache.doris.nereids.trees.plans.visitor.DefaultPlanVisitor; import org.apache.doris.nereids.types.DataType; import org.apache.doris.qe.ConnectContext; +import org.apache.doris.statistics.AnalysisManager; import org.apache.doris.statistics.ColumnStatistic; import org.apache.doris.statistics.ColumnStatisticBuilder; import org.apache.doris.statistics.Histogram; @@ -121,8 +122,10 @@ import org.apache.doris.statistics.StatisticConstants; import org.apache.doris.statistics.StatisticRange; import org.apache.doris.statistics.Statistics; import org.apache.doris.statistics.StatisticsBuilder; +import org.apache.doris.statistics.TableStatsMeta; import 
com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableSet; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import org.apache.commons.collections.CollectionUtils; @@ -620,10 +623,20 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> { // 2. Consider the influence of runtime filter // 3. Get NDV and column data size from StatisticManger, StatisticManager doesn't support it now. private Statistics computeCatalogRelation(CatalogRelation catalogRelation) { - Set<SlotReference> slotSet = catalogRelation.getOutput().stream().filter(SlotReference.class::isInstance) - .map(s -> (SlotReference) s).collect(Collectors.toSet()); - Map<Expression, ColumnStatistic> columnStatisticMap = new HashMap<>(); + List<Slot> output = catalogRelation.getOutput(); + ImmutableSet.Builder<SlotReference> slotSetBuilder = ImmutableSet.builderWithExpectedSize(output.size()); + for (Slot slot : output) { + if (slot instanceof SlotReference) { + slotSetBuilder.add((SlotReference) slot); + } + } + Set<SlotReference> slotSet = slotSetBuilder.build(); + Map<Expression, ColumnStatisticBuilder> columnStatisticBuilderMap = new HashMap<>(); TableIf table = catalogRelation.getTable(); + AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager(); + TableStatsMeta tableMeta = analysisManager.findTableStatsStatus(table.getId()); + // rows newly updated after last analyze + long deltaRowCount = tableMeta == null ? 
0 : tableMeta.updatedRows.get(); double rowCount = catalogRelation.getTable().getRowCountForNereids(); boolean hasUnknownCol = false; long idxId = -1; @@ -633,6 +646,10 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> { idxId = olapScan.getSelectedIndexId(); } } + if (deltaRowCount > 0 && LOG.isDebugEnabled()) { + LOG.debug("{} is partially analyzed, clear min/max values in column stats", + catalogRelation.getTable().getName()); + } for (SlotReference slotReference : slotSet) { String colName = slotReference.getColumn().isPresent() ? slotReference.getColumn().get().getName() @@ -649,40 +666,44 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> { } else { cache = getColumnStatistic(table, colName, idxId); } + ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache); if (cache.avgSizeByte <= 0) { - cache = new ColumnStatisticBuilder(cache) - .setAvgSizeByte(slotReference.getColumn().get().getType().getSlotSize()) - .build(); + colStatsBuilder.setAvgSizeByte(slotReference.getColumn().get().getType().getSlotSize()); } if (!cache.isUnKnown) { - rowCount = Math.max(rowCount, cache.count); + rowCount = Math.max(rowCount, cache.count + deltaRowCount); } else { hasUnknownCol = true; } if (ConnectContext.get() != null && ConnectContext.get().getSessionVariable().enableStats) { - columnStatisticMap.put(slotReference, cache); + if (deltaRowCount > 0) { + // clear min-max to avoid error estimation + // for example, after yesterday data loaded, user send query about yesterday immediately. 
+ // since yesterday data are not analyzed, the max date is before yesterday, and hence optimizer + // estimates the filter result is zero + colStatsBuilder.setMinExpr(null).setMinValue(Double.NEGATIVE_INFINITY) + .setMaxExpr(null).setMaxValue(Double.POSITIVE_INFINITY); + } + columnStatisticBuilderMap.put(slotReference, colStatsBuilder); } else { - columnStatisticMap.put(slotReference, ColumnStatistic.UNKNOWN); + columnStatisticBuilderMap.put(slotReference, new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN)); hasUnknownCol = true; } } if (hasUnknownCol && ConnectContext.get() != null && ConnectContext.get().getStatementContext() != null) { ConnectContext.get().getStatementContext().setHasUnknownColStats(true); } - Statistics stats = new Statistics(rowCount, columnStatisticMap); - stats = normalizeCatalogRelationColumnStatsRowCount(stats); - return stats; - } - - private Statistics normalizeCatalogRelationColumnStatsRowCount(Statistics stats) { - for (Expression slot : stats.columnStatistics().keySet()) { - ColumnStatistic colStats = stats.findColumnStatistics(slot); - Preconditions.checkArgument(colStats != null, - "can not find col stats for %s in table", slot.toSql()); - stats.addColumnStats(slot, - new ColumnStatisticBuilder(colStats).setCount(stats.getRowCount()).build()); + return normalizeCatalogRelationColumnStatsRowCount(rowCount, columnStatisticBuilderMap); + } + + private Statistics normalizeCatalogRelationColumnStatsRowCount(double rowCount, + Map<Expression, ColumnStatisticBuilder> columnStatisticBuilderMap) { + Map<Expression, ColumnStatistic> columnStatisticMap = new HashMap<>(); + for (Expression slot : columnStatisticBuilderMap.keySet()) { + columnStatisticMap.put(slot, + columnStatisticBuilderMap.get(slot).setCount(rowCount).build()); } - return stats; + return new Statistics(rowCount, columnStatisticMap); } private Statistics computeTopN(TopN topN) { --------------------------------------------------------------------- To unsubscribe, e-mail: 
commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org