This is an automated email from the ASF dual-hosted git repository. morrysnow pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new e64f2e68e05 [opt](nereids) refine stats derive (#40654) (#40698) (#42050) e64f2e68e05 is described below commit e64f2e68e051d9f59624fc0e22659d287f353582 Author: xzj7019 <131111794+xzj7...@users.noreply.github.com> AuthorDate: Fri Oct 18 16:18:10 2024 +0800 [opt](nereids) refine stats derive (#40654) (#40698) (#42050) pick from master #40654 #40698 --- .../doris/nereids/stats/ExpressionEstimation.java | 23 +++++--- .../doris/nereids/stats/StatsCalculator.java | 4 +- .../org/apache/doris/nereids/types/StringType.java | 5 -- .../nereids/types/coercion/CharacterType.java | 3 ++ .../doris/statistics/ColumnStatisticBuilder.java | 18 +++++++ .../org/apache/doris/statistics/Statistics.java | 5 +- .../nereids/stats/ExpressionEstimationTest.java | 63 ++++++++++++++++++++++ 7 files changed, 106 insertions(+), 15 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java index 14d5ae8b63d..780c5922c6a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java @@ -128,6 +128,10 @@ public class ExpressionEstimation extends ExpressionVisitor<ColumnStatistic, Sta @Override public ColumnStatistic visit(Expression expr, Statistics context) { + ColumnStatistic stats = context.findColumnStatistics(expr); + if (stats != null) { + return stats; + } List<Expression> childrenExpr = expr.children(); if (CollectionUtils.isEmpty(childrenExpr)) { return ColumnStatistic.UNKNOWN; @@ -135,26 +139,28 @@ public class ExpressionEstimation extends ExpressionVisitor<ColumnStatistic, Sta return expr.child(0).accept(this, context); } - //TODO: case-when need to re-implemented @Override public ColumnStatistic visitCaseWhen(CaseWhen caseWhen, Statistics context) { double ndv = caseWhen.getWhenClauses().size(); + double width = 1; if (caseWhen.getDefaultValue().isPresent()) { ndv += 1; } for (WhenClause clause : caseWhen.getWhenClauses()) { ColumnStatistic colStats = ExpressionEstimation.estimate(clause.getResult(), context); ndv = Math.max(ndv, colStats.ndv); + width = Math.max(width, clause.getResult().getDataType().width()); } if (caseWhen.getDefaultValue().isPresent()) { ColumnStatistic colStats = ExpressionEstimation.estimate(caseWhen.getDefaultValue().get(), context); ndv = Math.max(ndv, colStats.ndv); + width = Math.max(width, caseWhen.getDefaultValue().get().getDataType().width()); } return new ColumnStatisticBuilder() .setNdv(ndv) .setMinValue(Double.NEGATIVE_INFINITY) .setMaxValue(Double.POSITIVE_INFINITY) - .setAvgSizeByte(8) + .setAvgSizeByte(width) .setNumNulls(0) .build(); } @@ -162,15 +168,20 @@ public class ExpressionEstimation extends ExpressionVisitor<ColumnStatistic, Sta @Override public ColumnStatistic visitIf(If ifClause, Statistics context) { double ndv = 2; + double width = 1; ColumnStatistic colStatsThen = ExpressionEstimation.estimate(ifClause.child(1), context); ndv = Math.max(ndv, colStatsThen.ndv); + width = Math.max(width, ifClause.child(1).getDataType().width()); + ColumnStatistic colStatsElse = ExpressionEstimation.estimate(ifClause.child(2), context); ndv = Math.max(ndv, colStatsElse.ndv); + width = Math.max(width, ifClause.child(2).getDataType().width()); + return new ColumnStatisticBuilder() .setNdv(ndv) .setMinValue(Double.NEGATIVE_INFINITY) .setMaxValue(Double.POSITIVE_INFINITY) - .setAvgSizeByte(8) + .setAvgSizeByte(width) .setNumNulls(0) .build(); } @@ -242,9 +253,9 @@ public class ExpressionEstimation extends ExpressionVisitor<ColumnStatistic, Sta return new ColumnStatisticBuilder() .setMaxValue(literalVal) .setMinValue(literalVal) - .setNdv(1) - .setNumNulls(1) - .setAvgSizeByte(1) + .setNdv(literal.isNullLiteral() ? 0 : 1) + .setNumNulls(literal.isNullLiteral() ? 1 : 0) + .setAvgSizeByte(literal.getDataType().width()) .setMinExpr(literal.toLegacyLiteral()) .setMaxExpr(literal.toLegacyLiteral()) .build(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java index 1a983532a94..a7540622303 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java @@ -825,9 +825,7 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> { cache = getColumnStatistic(table, colName, idxId); } ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache); - if (cache.avgSizeByte <= 0) { - colStatsBuilder.setAvgSizeByte(slotReference.getColumn().get().getType().getSlotSize()); - } + colStatsBuilder.normalizeAvgSizeByte(slotReference); if (!cache.isUnKnown) { rowCount = Math.max(rowCount, cache.count + deltaRowCount); } else { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/StringType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/StringType.java index 935716e42bf..8e92f83274e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/StringType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/StringType.java @@ -31,11 +31,6 @@ public class StringType extends CharacterType { super(-1); } - @Override - public int width() { - return len; - } - @Override public Type toCatalogDataType() { return Type.STRING; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java index c02ea39e39a..153147f8bca 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java @@ -27,8 +27,11 @@ import org.apache.doris.nereids.types.StringType; public abstract class CharacterType extends PrimitiveType { private static final int WIDTH = 16; + public static final int DEFAULT_WIDTH = WIDTH; protected final int len; + // When defining SQL schemas, users often tend to set the length of string + // fields much longer than actually needed for storage. public CharacterType(int len) { this.len = len; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java index a512fbadbda..85daa8ea0fe 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java @@ -18,6 +18,8 @@ package org.apache.doris.statistics; import org.apache.doris.analysis.LiteralExpr; +import org.apache.doris.nereids.trees.expressions.SlotReference; +import org.apache.doris.nereids.types.coercion.CharacterType; public class ColumnStatisticBuilder { private double count; @@ -170,4 +172,20 @@ public class ColumnStatisticBuilder { isUnknown, updatedTime); return colStats; } + + public void normalizeAvgSizeByte(SlotReference slot) { + if (isUnknown) { + return; + } + if (avgSizeByte > 0) { + return; + } + avgSizeByte = slot.getDataType().toCatalogDataType().getSlotSize(); + // When defining SQL schemas, users often tend to set the length of string \ + // fields much longer than actually needed for storage. + if (slot.getDataType() instanceof CharacterType) { + avgSizeByte = Math.min(avgSizeByte, + CharacterType.DEFAULT_WIDTH); + } + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java index a907f6412f1..8a42245e895 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java @@ -20,6 +20,7 @@ package org.apache.doris.statistics; import org.apache.doris.nereids.stats.StatsMathUtil; import org.apache.doris.nereids.trees.expressions.Expression; import org.apache.doris.nereids.trees.expressions.Slot; +import org.apache.doris.nereids.types.coercion.CharacterType; import java.text.DecimalFormat; import java.util.HashMap; @@ -143,7 +144,9 @@ public class Statistics { if (tupleSize <= 0) { double tempSize = 0.0; for (ColumnStatistic s : expressionToColumnStats.values()) { - tempSize += s.avgSizeByte; + if (s != null) { + tempSize += Math.max(1, Math.min(CharacterType.DEFAULT_WIDTH, s.avgSizeByte)); + } } tupleSize = Math.max(1, tempSize); } diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/ExpressionEstimationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/ExpressionEstimationTest.java index 1748802e4dd..32e7504a535 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/ExpressionEstimationTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/ExpressionEstimationTest.java @@ -31,7 +31,14 @@ import org.apache.doris.nereids.trees.expressions.WhenClause; import org.apache.doris.nereids.trees.expressions.functions.agg.Max; import org.apache.doris.nereids.trees.expressions.functions.agg.Min; import org.apache.doris.nereids.trees.expressions.functions.scalar.If; +import org.apache.doris.nereids.trees.expressions.literal.BigIntLiteral; import org.apache.doris.nereids.trees.expressions.literal.BooleanLiteral; +import org.apache.doris.nereids.trees.expressions.literal.DateTimeLiteral; +import org.apache.doris.nereids.trees.expressions.literal.DateV2Literal; +import org.apache.doris.nereids.trees.expressions.literal.DecimalLiteral; +import org.apache.doris.nereids.trees.expressions.literal.DoubleLiteral; +import org.apache.doris.nereids.trees.expressions.literal.NullLiteral; +import org.apache.doris.nereids.trees.expressions.literal.VarcharLiteral; import org.apache.doris.nereids.types.DateType; import org.apache.doris.nereids.types.DoubleType; import org.apache.doris.nereids.types.IntegerType; @@ -44,6 +51,7 @@ import org.apache.commons.math3.util.Precision; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import java.math.BigDecimal; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -357,6 +365,7 @@ class ExpressionEstimationTest { CaseWhen caseWhen = new CaseWhen(whens); ColumnStatistic est = ExpressionEstimation.estimate(caseWhen, stats); Assertions.assertEquals(est.ndv, 100); + Assertions.assertEquals(est.avgSizeByte, 16); } @Test @@ -383,5 +392,59 @@ class ExpressionEstimationTest { If ifClause = new If(BooleanLiteral.TRUE, a, b); ColumnStatistic est = ExpressionEstimation.estimate(ifClause, stats); Assertions.assertEquals(est.ndv, 100); + Assertions.assertEquals(est.avgSizeByte, 16); + } + + @Test + public void testLiteral() { + Statistics stats = new Statistics(1000, new HashMap<>()); + + BigIntLiteral l1 = new BigIntLiteral(1000000); + ColumnStatistic est = ExpressionEstimation.estimate(l1, stats); + Assertions.assertEquals(est.ndv, 1); + Assertions.assertEquals(est.avgSizeByte, 8); + Assertions.assertEquals(est.numNulls, 0); + + VarcharLiteral l2 = new VarcharLiteral("abcdefghij"); + est = ExpressionEstimation.estimate(l2, stats); + Assertions.assertEquals(est.ndv, 1); + Assertions.assertEquals(est.avgSizeByte, 10); + Assertions.assertEquals(est.numNulls, 0); + + DoubleLiteral l3 = new DoubleLiteral(0.01); + est = ExpressionEstimation.estimate(l3, stats); + Assertions.assertEquals(est.ndv, 1); + Assertions.assertEquals(est.avgSizeByte, 8); + Assertions.assertEquals(est.numNulls, 0); + + DateV2Literal l4 = new DateV2Literal("2024-09-10"); + est = ExpressionEstimation.estimate(l4, stats); + Assertions.assertEquals(est.ndv, 1); + Assertions.assertEquals(est.avgSizeByte, 4); + Assertions.assertEquals(est.numNulls, 0); + + DateTimeLiteral l5 = new DateTimeLiteral("2024-09-10 00:00:00"); + est = ExpressionEstimation.estimate(l5, stats); + Assertions.assertEquals(est.ndv, 1); + Assertions.assertEquals(est.avgSizeByte, 16); + Assertions.assertEquals(est.numNulls, 0); + + BooleanLiteral l6 = BooleanLiteral.TRUE; + est = ExpressionEstimation.estimate(l6, stats); + Assertions.assertEquals(est.ndv, 1); + Assertions.assertEquals(est.avgSizeByte, 1); + Assertions.assertEquals(est.numNulls, 0); + + DecimalLiteral l7 = new DecimalLiteral(BigDecimal.valueOf(2024.0928)); + est = ExpressionEstimation.estimate(l7, stats); + Assertions.assertEquals(est.ndv, 1); + Assertions.assertEquals(est.avgSizeByte, 16); + Assertions.assertEquals(est.numNulls, 0); + + NullLiteral l8 = new NullLiteral(); + est = ExpressionEstimation.estimate(l8, stats); + Assertions.assertEquals(est.ndv, 0); + Assertions.assertEquals(est.avgSizeByte, 1); + Assertions.assertEquals(est.numNulls, 1); } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org