This is an automated email from the ASF dual-hosted git repository. morrysnow pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 6dcc221e44d [opt](nereids) fix non-null selectivity computing (#42286) 6dcc221e44d is described below commit 6dcc221e44d1ebedac9e598e04a326f40e39c69c Author: xzj7019 <131111794+xzj7...@users.noreply.github.com> AuthorDate: Thu Oct 24 11:47:50 2024 +0800 [opt](nereids) fix non-null selectivity computing (#42286) Introduced by PR #40762, which doesn't update numNulls and other related column stats when they need to be normalized. This PR adds the updating trigger condition and makes the related case healthy. --- .../org/apache/doris/statistics/Statistics.java | 3 +- .../doris/nereids/stats/FilterEstimationTest.java | 72 +++++++++++++++++++++- .../doris/nereids/stats/StatsCalculatorTest.java | 4 +- 3 files changed, 75 insertions(+), 4 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java index 72000f3ce5a..6ad4297dcb1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java @@ -119,7 +119,8 @@ public class Statistics { // the following columnStatistic.isUnKnown() judgment is loop inside since current doris // supports partial stats deriving, i.e, allowing part of tables have stats and other parts don't, // or part of columns have stats but other parts don't, especially join and filter estimation. 
- if (!checkColumnStatsValid(columnStatistic, rowCount) && !columnStatistic.isUnKnown()) { + if (!columnStatistic.isUnKnown() && (!checkColumnStatsValid(columnStatistic, rowCount) + || isNumNullsDecreaseByProportion && columnStatistic.numNulls != 0)) { ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(columnStatistic); double ndv = Math.min(columnStatistic.ndv, rowCount); double numNulls = Math.min(columnStatistic.numNulls * factor, rowCount - ndv); diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java index 28fe50d16ec..9b0fdc3880d 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java @@ -36,9 +36,11 @@ import org.apache.doris.nereids.trees.expressions.SlotReference; import org.apache.doris.nereids.trees.expressions.functions.scalar.Left; import org.apache.doris.nereids.trees.expressions.literal.BigIntLiteral; import org.apache.doris.nereids.trees.expressions.literal.DateLiteral; +import org.apache.doris.nereids.trees.expressions.literal.DateTimeLiteral; import org.apache.doris.nereids.trees.expressions.literal.DoubleLiteral; import org.apache.doris.nereids.trees.expressions.literal.IntegerLiteral; import org.apache.doris.nereids.trees.expressions.literal.VarcharLiteral; +import org.apache.doris.nereids.types.DateTimeType; import org.apache.doris.nereids.types.DateType; import org.apache.doris.nereids.types.DoubleType; import org.apache.doris.nereids.types.IntegerType; @@ -1144,7 +1146,75 @@ class FilterEstimationTest { Statistics result = filterEstimation.estimate(and, stats); // result 1.0->2.0 bc happens because the calculation from normalization of // "Math.min(columnStatistic.numNulls * factor, rowCount - ndv);" - Assertions.assertEquals(result.getRowCount(), 2.0, 0.01); + 
Assertions.assertEquals(result.getRowCount(), 3.5, 0.01); + } + + /** + * dt BETWEEN "2020-05-25 00:00:00" and "2020-05-25 23:59:59" + * and day BETWEEN "2020-05-24" and "2020-05-26" + * and game="mus" and plat = "37wan"; + */ + @Test + void testMultiAndWithNull() { + SlotReference dt = new SlotReference("dt", DateTimeType.INSTANCE); + ColumnStatisticBuilder dtBuilder = new ColumnStatisticBuilder(1000000) + .setNdv(783813.0) + .setNumNulls(50833.0) + .setMaxValue(new DateTimeLiteral("2020-05-31 07:59:59").getDouble()) + .setMinValue(new DateTimeLiteral("2020-05-01 08:00:04").getDouble()); + DateLiteral dtMin = new DateTimeLiteral("2020-05-25 00:00:00"); + DateLiteral dtMax = new DateTimeLiteral("2020-05-25 23:59:59"); + GreaterThanEqual dtGreater = new GreaterThanEqual(dt, dtMin); + LessThan dtLess = new LessThan(dt, dtMax); + And dtAnd = new And(dtLess, dtGreater); + + SlotReference day = new SlotReference("day", DateType.INSTANCE); + ColumnStatisticBuilder dayBuilder = new ColumnStatisticBuilder(1000000) + .setNdv(31.0) + .setNumNulls(49699.0) + .setMaxValue(new DateLiteral("2020-05-31").getDouble()) + .setMinValue(new DateLiteral("2020-05-01").getDouble()); + DateLiteral dayMin = new DateLiteral("2020-05-24"); + DateLiteral dayMax = new DateLiteral("2020-05-26"); + GreaterThanEqual dayGreater = new GreaterThanEqual(day, dayMin); + LessThan dayLess = new LessThan(day, dayMax); + And dayAnd = new And(dayLess, dayGreater); + + SlotReference game = new SlotReference("game", new VarcharType(500)); + ColumnStatisticBuilder gameBuilder = new ColumnStatisticBuilder(1000000) + .setNdv(1.0) + .setNumNulls(49813.0) + .setMaxExpr(new StringLiteral("mus")) + .setMaxValue(new VarcharLiteral("mus").getDouble()) + .setMinExpr(new StringLiteral("mus")) + .setMinValue(new VarcharLiteral("mus").getDouble()); + VarcharLiteral mus = new VarcharLiteral("mus"); + EqualTo gameEqualTo = new EqualTo(game, mus); + + SlotReference plat = new SlotReference("plat", new VarcharType(500)); + 
ColumnStatisticBuilder platBuilder = new ColumnStatisticBuilder(1000000) + .setNdv(1.0) + .setNumNulls(49691.0) + .setMaxExpr(new StringLiteral("37wan")) + .setMaxValue(new VarcharLiteral("37wan").getDouble()) + .setMinExpr(new StringLiteral("37wan")) + .setMinValue(new VarcharLiteral("37wan").getDouble()); + VarcharLiteral wan = new VarcharLiteral("37wan"); + EqualTo wanEqualTo = new EqualTo(plat, wan); + And equalAnd = new And(gameEqualTo, wanEqualTo); + + And partialAnd = new And(dtAnd, dayAnd); + And allAnd = new And(partialAnd, equalAnd); + + Statistics stats = new Statistics(1000000, new HashMap<>()); + stats.addColumnStats(dt, dtBuilder.build()); + stats.addColumnStats(day, dayBuilder.build()); + stats.addColumnStats(game, gameBuilder.build()); + stats.addColumnStats(plat, platBuilder.build()); + + FilterEstimation filterEstimation = new FilterEstimation(); + Statistics result = filterEstimation.estimate(allAnd, stats); + Assertions.assertEquals(result.getRowCount(), 2109.16, 0.01); } /** diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/StatsCalculatorTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/StatsCalculatorTest.java index 49d295ea7c0..cf91eacb51c 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/StatsCalculatorTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/StatsCalculatorTest.java @@ -145,13 +145,13 @@ public class StatsCalculatorTest { GroupExpression groupExpression = new GroupExpression(logicalFilter, ImmutableList.of(childGroup)); Group ownerGroup = new Group(null, groupExpression, null); StatsCalculator.estimate(groupExpression, null); - Assertions.assertEquals(49.45, ownerGroup.getStatistics().getRowCount(), 0.001); + Assertions.assertEquals(49.945, ownerGroup.getStatistics().getRowCount(), 0.001); LogicalFilter<GroupPlan> logicalFilterOr = new LogicalFilter<>(or, groupPlan); GroupExpression groupExpressionOr = new GroupExpression(logicalFilterOr, 
ImmutableList.of(childGroup)); Group ownerGroupOr = new Group(null, groupExpressionOr, null); StatsCalculator.estimate(groupExpressionOr, null); - Assertions.assertEquals(1449.05, + Assertions.assertEquals(1448.555, ownerGroupOr.getStatistics().getRowCount(), 0.001); } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org