This is an automated email from the ASF dual-hosted git repository.

morrysnow pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 7cc003ed5a3 [opt](nereids) clear min/max column stats if table is 
partially analyzed (#35533)
7cc003ed5a3 is described below

commit 7cc003ed5a3982c3fc208fde78ae37ca1ed7a8a7
Author: minghong <engle...@gmail.com>
AuthorDate: Wed May 29 11:54:01 2024 +0800

    [opt](nereids) clear min/max column stats if table is partially analyzed 
(#35533)
    
    cherry picked from master PR #33685
    commit 3d14f663a6a30292a547fd56e557cde55593c4b6
    
    If a user queries newly loaded data that has not been analyzed yet, the 
optimizer may generate an inefficient plan because the new data falls outside 
the min-max range recorded in the column stats.
    
    In this PR, we ignore the column min/max values if the table has newly loaded data.
---
 .../doris/nereids/stats/StatsCalculator.java       | 65 ++++++++++++++--------
 1 file changed, 43 insertions(+), 22 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
index f9b767f40c8..d711b99655b 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
@@ -114,6 +114,7 @@ import 
org.apache.doris.nereids.trees.plans.physical.PhysicalWindow;
 import org.apache.doris.nereids.trees.plans.visitor.DefaultPlanVisitor;
 import org.apache.doris.nereids.types.DataType;
 import org.apache.doris.qe.ConnectContext;
+import org.apache.doris.statistics.AnalysisManager;
 import org.apache.doris.statistics.ColumnStatistic;
 import org.apache.doris.statistics.ColumnStatisticBuilder;
 import org.apache.doris.statistics.Histogram;
@@ -121,8 +122,10 @@ import org.apache.doris.statistics.StatisticConstants;
 import org.apache.doris.statistics.StatisticRange;
 import org.apache.doris.statistics.Statistics;
 import org.apache.doris.statistics.StatisticsBuilder;
+import org.apache.doris.statistics.TableStatsMeta;
 
 import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import org.apache.commons.collections.CollectionUtils;
@@ -620,10 +623,20 @@ public class StatsCalculator extends 
DefaultPlanVisitor<Statistics, Void> {
     //       2. Consider the influence of runtime filter
     //       3. Get NDV and column data size from StatisticManger, 
StatisticManager doesn't support it now.
     private Statistics computeCatalogRelation(CatalogRelation catalogRelation) 
{
-        Set<SlotReference> slotSet = 
catalogRelation.getOutput().stream().filter(SlotReference.class::isInstance)
-                .map(s -> (SlotReference) s).collect(Collectors.toSet());
-        Map<Expression, ColumnStatistic> columnStatisticMap = new HashMap<>();
+        List<Slot> output = catalogRelation.getOutput();
+        ImmutableSet.Builder<SlotReference> slotSetBuilder = 
ImmutableSet.builderWithExpectedSize(output.size());
+        for (Slot slot : output) {
+            if (slot instanceof SlotReference) {
+                slotSetBuilder.add((SlotReference) slot);
+            }
+        }
+        Set<SlotReference> slotSet = slotSetBuilder.build();
+        Map<Expression, ColumnStatisticBuilder> columnStatisticBuilderMap = 
new HashMap<>();
         TableIf table = catalogRelation.getTable();
+        AnalysisManager analysisManager = 
Env.getCurrentEnv().getAnalysisManager();
+        TableStatsMeta tableMeta = 
analysisManager.findTableStatsStatus(table.getId());
+        // rows newly updated after last analyze
+        long deltaRowCount = tableMeta == null ? 0 : 
tableMeta.updatedRows.get();
         double rowCount = catalogRelation.getTable().getRowCountForNereids();
         boolean hasUnknownCol = false;
         long idxId = -1;
@@ -633,6 +646,10 @@ public class StatsCalculator extends 
DefaultPlanVisitor<Statistics, Void> {
                 idxId = olapScan.getSelectedIndexId();
             }
         }
+        if (deltaRowCount > 0 && LOG.isDebugEnabled()) {
+            LOG.debug("{} is partially analyzed, clear min/max values in 
column stats",
+                    catalogRelation.getTable().getName());
+        }
         for (SlotReference slotReference : slotSet) {
             String colName = slotReference.getColumn().isPresent()
                     ? slotReference.getColumn().get().getName()
@@ -649,40 +666,44 @@ public class StatsCalculator extends 
DefaultPlanVisitor<Statistics, Void> {
             } else {
                 cache = getColumnStatistic(table, colName, idxId);
             }
+            ColumnStatisticBuilder colStatsBuilder = new 
ColumnStatisticBuilder(cache);
             if (cache.avgSizeByte <= 0) {
-                cache = new ColumnStatisticBuilder(cache)
-                        
.setAvgSizeByte(slotReference.getColumn().get().getType().getSlotSize())
-                        .build();
+                
colStatsBuilder.setAvgSizeByte(slotReference.getColumn().get().getType().getSlotSize());
             }
             if (!cache.isUnKnown) {
-                rowCount = Math.max(rowCount, cache.count);
+                rowCount = Math.max(rowCount, cache.count + deltaRowCount);
             } else {
                 hasUnknownCol = true;
             }
             if (ConnectContext.get() != null && 
ConnectContext.get().getSessionVariable().enableStats) {
-                columnStatisticMap.put(slotReference, cache);
+                if (deltaRowCount > 0) {
+                    // clear min-max to avoid error estimation
+                    // for example, after yesterday data loaded, user send 
query about yesterday immediately.
+                    // since yesterday data are not analyzed, the max date is 
before yesterday, and hence optimizer
+                    // estimates the filter result is zero
+                    
colStatsBuilder.setMinExpr(null).setMinValue(Double.NEGATIVE_INFINITY)
+                            
.setMaxExpr(null).setMaxValue(Double.POSITIVE_INFINITY);
+                }
+                columnStatisticBuilderMap.put(slotReference, colStatsBuilder);
             } else {
-                columnStatisticMap.put(slotReference, ColumnStatistic.UNKNOWN);
+                columnStatisticBuilderMap.put(slotReference, new 
ColumnStatisticBuilder(ColumnStatistic.UNKNOWN));
                 hasUnknownCol = true;
             }
         }
         if (hasUnknownCol && ConnectContext.get() != null && 
ConnectContext.get().getStatementContext() != null) {
             
ConnectContext.get().getStatementContext().setHasUnknownColStats(true);
         }
-        Statistics stats = new Statistics(rowCount, columnStatisticMap);
-        stats = normalizeCatalogRelationColumnStatsRowCount(stats);
-        return stats;
-    }
-
-    private Statistics normalizeCatalogRelationColumnStatsRowCount(Statistics 
stats) {
-        for (Expression slot : stats.columnStatistics().keySet()) {
-            ColumnStatistic colStats = stats.findColumnStatistics(slot);
-            Preconditions.checkArgument(colStats != null,
-                    "can not find col stats for %s  in table", slot.toSql());
-            stats.addColumnStats(slot,
-                    new 
ColumnStatisticBuilder(colStats).setCount(stats.getRowCount()).build());
+        return normalizeCatalogRelationColumnStatsRowCount(rowCount, 
columnStatisticBuilderMap);
+    }
+
+    private Statistics normalizeCatalogRelationColumnStatsRowCount(double 
rowCount,
+            Map<Expression, ColumnStatisticBuilder> columnStatisticBuilderMap) 
{
+        Map<Expression, ColumnStatistic> columnStatisticMap = new HashMap<>();
+        for (Expression slot : columnStatisticBuilderMap.keySet()) {
+            columnStatisticMap.put(slot,
+                    
columnStatisticBuilderMap.get(slot).setCount(rowCount).build());
         }
-        return stats;
+        return new Statistics(rowCount, columnStatisticMap);
     }
 
     private Statistics computeTopN(TopN topN) {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to