This is an automated email from the ASF dual-hosted git repository.

morrysnow pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new e64f2e68e05 [opt](nereids) refine stats derive (#40654) (#40698) (#42050)
e64f2e68e05 is described below

commit e64f2e68e051d9f59624fc0e22659d287f353582
Author: xzj7019 <131111794+xzj7...@users.noreply.github.com>
AuthorDate: Fri Oct 18 16:18:10 2024 +0800

    [opt](nereids) refine stats derive (#40654) (#40698) (#42050)
    
    pick from master #40654 #40698
---
 .../doris/nereids/stats/ExpressionEstimation.java  | 23 +++++---
 .../doris/nereids/stats/StatsCalculator.java       |  4 +-
 .../org/apache/doris/nereids/types/StringType.java |  5 --
 .../nereids/types/coercion/CharacterType.java      |  3 ++
 .../doris/statistics/ColumnStatisticBuilder.java   | 18 +++++++
 .../org/apache/doris/statistics/Statistics.java    |  5 +-
 .../nereids/stats/ExpressionEstimationTest.java    | 63 ++++++++++++++++++++++
 7 files changed, 106 insertions(+), 15 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java
index 14d5ae8b63d..780c5922c6a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java
@@ -128,6 +128,10 @@ public class ExpressionEstimation extends ExpressionVisitor<ColumnStatistic, Sta
 
     @Override
     public ColumnStatistic visit(Expression expr, Statistics context) {
+        ColumnStatistic stats = context.findColumnStatistics(expr);
+        if (stats != null) {
+            return stats;
+        }
         List<Expression> childrenExpr = expr.children();
         if (CollectionUtils.isEmpty(childrenExpr)) {
             return ColumnStatistic.UNKNOWN;
@@ -135,26 +139,28 @@ public class ExpressionEstimation extends ExpressionVisitor<ColumnStatistic, Sta
         return expr.child(0).accept(this, context);
     }
 
-    //TODO: case-when need to re-implemented
     @Override
     public ColumnStatistic visitCaseWhen(CaseWhen caseWhen, Statistics context) {
         double ndv = caseWhen.getWhenClauses().size();
+        double width = 1;
         if (caseWhen.getDefaultValue().isPresent()) {
             ndv += 1;
         }
         for (WhenClause clause : caseWhen.getWhenClauses()) {
             ColumnStatistic colStats = ExpressionEstimation.estimate(clause.getResult(), context);
             ndv = Math.max(ndv, colStats.ndv);
+            width = Math.max(width, clause.getResult().getDataType().width());
         }
         if (caseWhen.getDefaultValue().isPresent()) {
             ColumnStatistic colStats = ExpressionEstimation.estimate(caseWhen.getDefaultValue().get(), context);
             ndv = Math.max(ndv, colStats.ndv);
+            width = Math.max(width, caseWhen.getDefaultValue().get().getDataType().width());
         }
         return new ColumnStatisticBuilder()
                 .setNdv(ndv)
                 .setMinValue(Double.NEGATIVE_INFINITY)
                 .setMaxValue(Double.POSITIVE_INFINITY)
-                .setAvgSizeByte(8)
+                .setAvgSizeByte(width)
                 .setNumNulls(0)
                 .build();
     }
@@ -162,15 +168,20 @@ public class ExpressionEstimation extends ExpressionVisitor<ColumnStatistic, Sta
     @Override
     public ColumnStatistic visitIf(If ifClause, Statistics context) {
         double ndv = 2;
+        double width = 1;
         ColumnStatistic colStatsThen = ExpressionEstimation.estimate(ifClause.child(1), context);
         ndv = Math.max(ndv, colStatsThen.ndv);
+        width = Math.max(width, ifClause.child(1).getDataType().width());
+
         ColumnStatistic colStatsElse = ExpressionEstimation.estimate(ifClause.child(2), context);
         ndv = Math.max(ndv, colStatsElse.ndv);
+        width = Math.max(width, ifClause.child(2).getDataType().width());
+
         return new ColumnStatisticBuilder()
                 .setNdv(ndv)
                 .setMinValue(Double.NEGATIVE_INFINITY)
                 .setMaxValue(Double.POSITIVE_INFINITY)
-                .setAvgSizeByte(8)
+                .setAvgSizeByte(width)
                 .setNumNulls(0)
                 .build();
     }
@@ -242,9 +253,9 @@ public class ExpressionEstimation extends ExpressionVisitor<ColumnStatistic, Sta
         return new ColumnStatisticBuilder()
                 .setMaxValue(literalVal)
                 .setMinValue(literalVal)
-                .setNdv(1)
-                .setNumNulls(1)
-                .setAvgSizeByte(1)
+                .setNdv(literal.isNullLiteral() ? 0 : 1)
+                .setNumNulls(literal.isNullLiteral() ? 1 : 0)
+                .setAvgSizeByte(literal.getDataType().width())
                 .setMinExpr(literal.toLegacyLiteral())
                 .setMaxExpr(literal.toLegacyLiteral())
                 .build();
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
index 1a983532a94..a7540622303 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
@@ -825,9 +825,7 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
                 cache = getColumnStatistic(table, colName, idxId);
             }
             ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache);
-            if (cache.avgSizeByte <= 0) {
-                colStatsBuilder.setAvgSizeByte(slotReference.getColumn().get().getType().getSlotSize());
-            }
+            colStatsBuilder.normalizeAvgSizeByte(slotReference);
             if (!cache.isUnKnown) {
                 rowCount = Math.max(rowCount, cache.count + deltaRowCount);
             } else {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/StringType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/StringType.java
index 935716e42bf..8e92f83274e 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/StringType.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/StringType.java
@@ -31,11 +31,6 @@ public class StringType extends CharacterType {
         super(-1);
     }
 
-    @Override
-    public int width() {
-        return len;
-    }
-
     @Override
     public Type toCatalogDataType() {
         return Type.STRING;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java
index c02ea39e39a..153147f8bca 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java
@@ -27,8 +27,11 @@ import org.apache.doris.nereids.types.StringType;
 public abstract class CharacterType extends PrimitiveType {
 
     private static final int WIDTH = 16;
+    public static final int DEFAULT_WIDTH = WIDTH;
 
     protected final int len;
+    // When defining SQL schemas, users often tend to set the length of string
+    // fields much longer than actually needed for storage.
 
     public CharacterType(int len) {
         this.len = len;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
index a512fbadbda..85daa8ea0fe 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java
@@ -18,6 +18,8 @@
 package org.apache.doris.statistics;
 
 import org.apache.doris.analysis.LiteralExpr;
+import org.apache.doris.nereids.trees.expressions.SlotReference;
+import org.apache.doris.nereids.types.coercion.CharacterType;
 
 public class ColumnStatisticBuilder {
     private double count;
@@ -170,4 +172,20 @@ public class ColumnStatisticBuilder {
                 isUnknown, updatedTime);
         return colStats;
     }
+
+    public void normalizeAvgSizeByte(SlotReference slot) {
+        if (isUnknown) {
+            return;
+        }
+        if (avgSizeByte > 0) {
+            return;
+        }
+        avgSizeByte = slot.getDataType().toCatalogDataType().getSlotSize();
+        // When defining SQL schemas, users often tend to set the length of string
+        // fields much longer than actually needed for storage.
+        if (slot.getDataType() instanceof CharacterType) {
+            avgSizeByte = Math.min(avgSizeByte,
+                    CharacterType.DEFAULT_WIDTH);
+        }
+    }
 }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java
index a907f6412f1..8a42245e895 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java
@@ -20,6 +20,7 @@ package org.apache.doris.statistics;
 import org.apache.doris.nereids.stats.StatsMathUtil;
 import org.apache.doris.nereids.trees.expressions.Expression;
 import org.apache.doris.nereids.trees.expressions.Slot;
+import org.apache.doris.nereids.types.coercion.CharacterType;
 
 import java.text.DecimalFormat;
 import java.util.HashMap;
@@ -143,7 +144,9 @@ public class Statistics {
         if (tupleSize <= 0) {
             double tempSize = 0.0;
             for (ColumnStatistic s : expressionToColumnStats.values()) {
-                tempSize += s.avgSizeByte;
+                if (s != null) {
+                    tempSize += Math.max(1, Math.min(CharacterType.DEFAULT_WIDTH, s.avgSizeByte));
+                }
             }
             tupleSize = Math.max(1, tempSize);
         }
diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/ExpressionEstimationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/ExpressionEstimationTest.java
index 1748802e4dd..32e7504a535 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/ExpressionEstimationTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/ExpressionEstimationTest.java
@@ -31,7 +31,14 @@ import org.apache.doris.nereids.trees.expressions.WhenClause;
 import org.apache.doris.nereids.trees.expressions.functions.agg.Max;
 import org.apache.doris.nereids.trees.expressions.functions.agg.Min;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.If;
+import org.apache.doris.nereids.trees.expressions.literal.BigIntLiteral;
 import org.apache.doris.nereids.trees.expressions.literal.BooleanLiteral;
+import org.apache.doris.nereids.trees.expressions.literal.DateTimeLiteral;
+import org.apache.doris.nereids.trees.expressions.literal.DateV2Literal;
+import org.apache.doris.nereids.trees.expressions.literal.DecimalLiteral;
+import org.apache.doris.nereids.trees.expressions.literal.DoubleLiteral;
+import org.apache.doris.nereids.trees.expressions.literal.NullLiteral;
+import org.apache.doris.nereids.trees.expressions.literal.VarcharLiteral;
 import org.apache.doris.nereids.types.DateType;
 import org.apache.doris.nereids.types.DoubleType;
 import org.apache.doris.nereids.types.IntegerType;
@@ -44,6 +51,7 @@ import org.apache.commons.math3.util.Precision;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 
+import java.math.BigDecimal;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -357,6 +365,7 @@ class ExpressionEstimationTest {
         CaseWhen caseWhen = new CaseWhen(whens);
         ColumnStatistic est = ExpressionEstimation.estimate(caseWhen, stats);
         Assertions.assertEquals(est.ndv, 100);
+        Assertions.assertEquals(est.avgSizeByte, 16);
     }
 
     @Test
@@ -383,5 +392,59 @@ class ExpressionEstimationTest {
         If ifClause = new If(BooleanLiteral.TRUE, a, b);
         ColumnStatistic est = ExpressionEstimation.estimate(ifClause, stats);
         Assertions.assertEquals(est.ndv, 100);
+        Assertions.assertEquals(est.avgSizeByte, 16);
+    }
+
+    @Test
+    public void testLiteral() {
+        Statistics stats = new Statistics(1000, new HashMap<>());
+
+        BigIntLiteral l1 = new BigIntLiteral(1000000);
+        ColumnStatistic est = ExpressionEstimation.estimate(l1, stats);
+        Assertions.assertEquals(est.ndv, 1);
+        Assertions.assertEquals(est.avgSizeByte, 8);
+        Assertions.assertEquals(est.numNulls, 0);
+
+        VarcharLiteral l2 = new VarcharLiteral("abcdefghij");
+        est = ExpressionEstimation.estimate(l2, stats);
+        Assertions.assertEquals(est.ndv, 1);
+        Assertions.assertEquals(est.avgSizeByte, 10);
+        Assertions.assertEquals(est.numNulls, 0);
+
+        DoubleLiteral l3 = new DoubleLiteral(0.01);
+        est = ExpressionEstimation.estimate(l3, stats);
+        Assertions.assertEquals(est.ndv, 1);
+        Assertions.assertEquals(est.avgSizeByte, 8);
+        Assertions.assertEquals(est.numNulls, 0);
+
+        DateV2Literal l4 = new DateV2Literal("2024-09-10");
+        est = ExpressionEstimation.estimate(l4, stats);
+        Assertions.assertEquals(est.ndv, 1);
+        Assertions.assertEquals(est.avgSizeByte, 4);
+        Assertions.assertEquals(est.numNulls, 0);
+
+        DateTimeLiteral l5 = new DateTimeLiteral("2024-09-10 00:00:00");
+        est = ExpressionEstimation.estimate(l5, stats);
+        Assertions.assertEquals(est.ndv, 1);
+        Assertions.assertEquals(est.avgSizeByte, 16);
+        Assertions.assertEquals(est.numNulls, 0);
+
+        BooleanLiteral l6 = BooleanLiteral.TRUE;
+        est = ExpressionEstimation.estimate(l6, stats);
+        Assertions.assertEquals(est.ndv, 1);
+        Assertions.assertEquals(est.avgSizeByte, 1);
+        Assertions.assertEquals(est.numNulls, 0);
+
+        DecimalLiteral l7 = new DecimalLiteral(BigDecimal.valueOf(2024.0928));
+        est = ExpressionEstimation.estimate(l7, stats);
+        Assertions.assertEquals(est.ndv, 1);
+        Assertions.assertEquals(est.avgSizeByte, 16);
+        Assertions.assertEquals(est.numNulls, 0);
+
+        NullLiteral l8 = new NullLiteral();
+        est = ExpressionEstimation.estimate(l8, stats);
+        Assertions.assertEquals(est.ndv, 0);
+        Assertions.assertEquals(est.avgSizeByte, 1);
+        Assertions.assertEquals(est.numNulls, 1);
     }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to