This is an automated email from the ASF dual-hosted git repository. englefly pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 142ab77549b [opt](nereids) optimize str-like-col range filter estimation (#34542) 142ab77549b is described below commit 142ab77549b084e53b2573e93a73281f10c6c87b Author: minghong <engle...@gmail.com> AuthorDate: Thu May 9 22:42:01 2024 +0800 [opt](nereids) optimize str-like-col range filter estimation (#34542) we have an order reserved mappping from string to double. for string column A, we have double values for A.min and A.max. when estimating A<"abc", A.min/max could be used to judge whether 'abc' is between A.min and A.max, but it cannot be used to do range estimation. suppose "abc" is mapped to double x. if we compute selectivity by formula "sel = (x-A.min)/(A.max-A.min)", we are likely to obtain extreme values. --- .../doris/nereids/stats/FilterEstimation.java | 20 +++-- .../org/apache/doris/nereids/types/TimeType.java | 3 +- .../org/apache/doris/nereids/types/TimeV2Type.java | 3 +- .../doris/nereids/types/coercion/DateLikeType.java | 2 +- .../doris/nereids/types/coercion/NumericType.java | 2 +- .../{TimeType.java => coercion/RangeScalable.java} | 37 +++------ .../doris/nereids/stats/FilterEstimationTest.java | 89 ++++++++++++++++++++++ .../data/nereids_ssb_shape_sf100_p0/shape/q2.2.out | 19 +++-- 8 files changed, 128 insertions(+), 47 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java index e13eed89547..45e6dcd2abc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java @@ -40,6 +40,7 @@ import org.apache.doris.nereids.trees.expressions.SlotReference; import org.apache.doris.nereids.trees.expressions.functions.Function; import org.apache.doris.nereids.trees.expressions.literal.Literal; import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.coercion.RangeScalable; import org.apache.doris.statistics.ColumnStatistic; import org.apache.doris.statistics.ColumnStatisticBuilder; import org.apache.doris.statistics.StatisticRange; @@ -494,6 +495,9 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo .setNdv(intersectRange.getDistinctValues()) .setNumNulls(0); double sel = leftRange.overlapPercentWith(rightRange); + if (!(leftExpr.getDataType() instanceof RangeScalable) && (sel != 0.0 && sel != 1.0)) { + sel = DEFAULT_INEQUALITY_COEFFICIENT; + } sel = getNotNullSelectivity(leftStats, sel); updatedStatistics = context.statistics.withSel(sel); leftColumnStatisticBuilder.setCount(updatedStatistics.getRowCount()); @@ -550,8 +554,9 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo } double leftOverlapPercent = leftRange.overlapPercentWith(rightRange); - // Left always greater than right - if (leftOverlapPercent == 0) { + + if (leftOverlapPercent == 0.0) { + // Left always greater than right return context.statistics.withRowCount(0.0); } StatisticRange leftAlwaysLessThanRightRange = new StatisticRange(leftStats.minValue, leftStats.minExpr, @@ -580,9 +585,14 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo .setNdv(rightStats.ndv * (rightAlwaysGreaterRangeFraction + rightOverlappingRangeFraction)) .setNumNulls(0) .build(); - double sel = leftAlwaysLessThanRightPercent - + leftOverlapPercent * rightOverlappingRangeFraction * DEFAULT_INEQUALITY_COEFFICIENT - + leftOverlapPercent * rightAlwaysGreaterRangeFraction; + double sel = DEFAULT_INEQUALITY_COEFFICIENT; + if (leftExpr.getDataType() instanceof RangeScalable) { + sel = leftAlwaysLessThanRightPercent + + leftOverlapPercent * rightOverlappingRangeFraction * DEFAULT_INEQUALITY_COEFFICIENT + + leftOverlapPercent * rightAlwaysGreaterRangeFraction; + } else if (leftOverlapPercent == 1.0) { + sel = 1.0; + } context.addKeyIfSlot(leftExpr); context.addKeyIfSlot(rightExpr); return context.statistics.withSel(sel) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java index 9fb438fd79c..1111038ed91 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java @@ -19,11 +19,12 @@ package org.apache.doris.nereids.types; import org.apache.doris.catalog.Type; import org.apache.doris.nereids.types.coercion.PrimitiveType; +import org.apache.doris.nereids.types.coercion.RangeScalable; /** * Datetime type in Nereids. */ -public class TimeType extends PrimitiveType { +public class TimeType extends PrimitiveType implements RangeScalable { public static final TimeType INSTANCE = new TimeType(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeV2Type.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeV2Type.java index ec625d0cd17..b436fe31d39 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeV2Type.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeV2Type.java @@ -20,11 +20,12 @@ package org.apache.doris.nereids.types; import org.apache.doris.catalog.ScalarType; import org.apache.doris.catalog.Type; import org.apache.doris.nereids.types.coercion.PrimitiveType; +import org.apache.doris.nereids.types.coercion.RangeScalable; /** * Datetime type in Nereids. */ -public class TimeV2Type extends PrimitiveType { +public class TimeV2Type extends PrimitiveType implements RangeScalable { public static final TimeV2Type INSTANCE = new TimeV2Type(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java index 22ea99f00bc..1f8130215b0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java @@ -33,7 +33,7 @@ import java.time.LocalDateTime; /** * date like type. */ -public abstract class DateLikeType extends PrimitiveType { +public abstract class DateLikeType extends PrimitiveType implements RangeScalable { protected LocalDate toLocalDate(double d) { // d = (year * 10000 + month * 100 + day) * 1000000L; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/NumericType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/NumericType.java index 7bbf8303dba..5ce248283ef 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/NumericType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/NumericType.java @@ -24,7 +24,7 @@ import org.apache.doris.nereids.types.DoubleType; /** * Abstract class for all numeric type in Nereids. */ -public class NumericType extends PrimitiveType { +public class NumericType extends PrimitiveType implements RangeScalable { public static final NumericType INSTANCE = new NumericType(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/RangeScalable.java similarity index 57% copy from fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java copy to fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/RangeScalable.java index 9fb438fd79c..8209e2f2b6e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/RangeScalable.java @@ -15,35 +15,16 @@ // specific language governing permissions and limitations // under the License. -package org.apache.doris.nereids.types; - -import org.apache.doris.catalog.Type; -import org.apache.doris.nereids.types.coercion.PrimitiveType; +package org.apache.doris.nereids.types.coercion; /** - * Datetime type in Nereids. + * numeric type/ date related type are range scalable + * RangeScalable Column can be estimated by filter like "A < 10" more accurate. + * For example, for a given relation R, which contains 10 rows. R.A in (1, 100), + * the selectivity of filter "A<10" is "(10-1) / (100 -1)" + * But for string column A, the filter selectivity of "A<'abc'" can not be estimated by range, although we could + * have an order reserved mapping from string value to double. + * */ -public class TimeType extends PrimitiveType { - - public static final TimeType INSTANCE = new TimeType(); - - private static final int WIDTH = 8; - - private TimeType() { - } - - @Override - public Type toCatalogDataType() { - return Type.TIME; - } - - @Override - public boolean equals(Object o) { - return o instanceof TimeType; - } - - @Override - public int width() { - return WIDTH; - } +public interface RangeScalable { } diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java index 3b718062635..687a4d7a54a 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java @@ -18,6 +18,7 @@ package org.apache.doris.nereids.stats; import org.apache.doris.analysis.IntLiteral; +import org.apache.doris.analysis.StringLiteral; import org.apache.doris.nereids.trees.expressions.And; import org.apache.doris.nereids.trees.expressions.Cast; import org.apache.doris.nereids.trees.expressions.EqualTo; @@ -35,9 +36,11 @@ import org.apache.doris.nereids.trees.expressions.SlotReference; import org.apache.doris.nereids.trees.expressions.literal.DateLiteral; import org.apache.doris.nereids.trees.expressions.literal.DoubleLiteral; import org.apache.doris.nereids.trees.expressions.literal.IntegerLiteral; +import org.apache.doris.nereids.trees.expressions.literal.VarcharLiteral; import org.apache.doris.nereids.types.DateType; import org.apache.doris.nereids.types.DoubleType; import org.apache.doris.nereids.types.IntegerType; +import org.apache.doris.nereids.types.VarcharType; import org.apache.doris.statistics.ColumnStatistic; import org.apache.doris.statistics.ColumnStatisticBuilder; import org.apache.doris.statistics.Statistics; @@ -1139,4 +1142,90 @@ class FilterEstimationTest { Statistics resultEq = estimator.estimate(eq, statsBuilder.build()); Assertions.assertEquals(7, resultNse.getRowCount() - resultEq.getRowCount()); } + + /** + * for string literal, min-max range is only used for coverage, not for percentage + */ + @Test + public void testStringRangeColToLiteral() { + SlotReference a = new SlotReference("a", new VarcharType(25)); + ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder() + .setNdv(100) + .setAvgSizeByte(25) + .setNumNulls(0) + .setMaxExpr(new StringLiteral("2022-01-01")) + .setMaxValue(new VarcharLiteral("2022-01-01").getDouble()) + .setMinExpr(new StringLiteral("2020-01-01")) + .setMinValue(new VarcharLiteral("2020-01-01").getDouble()) + .setCount(100); + StatisticsBuilder statsBuilder = new StatisticsBuilder(); + statsBuilder.setRowCount(100); + statsBuilder.putColumnStatistics(a, columnStatisticBuilder.build()); + Statistics baseStats = statsBuilder.build(); + VarcharLiteral year2030 = new VarcharLiteral("2030-01-01"); + Statistics filter2030 = new FilterEstimation().estimate(new LessThan(a, year2030), baseStats); + Assertions.assertEquals(100, filter2030.getRowCount()); + + VarcharLiteral year2000 = new VarcharLiteral("2000-01-01"); + Statistics filter2k = new FilterEstimation().estimate(new LessThan(year2000, a), baseStats); + Assertions.assertEquals(100, filter2k.getRowCount()); + + VarcharLiteral year2021 = new VarcharLiteral("2021-12-01"); + Statistics filter2021 = new FilterEstimation().estimate(new GreaterThan(a, year2021), baseStats); + Assertions.assertEquals(50, filter2021.getRowCount()); + } + + @Test + public void testStringRangeColToCol() { + SlotReference a = new SlotReference("a", new VarcharType(25)); + ColumnStatisticBuilder columnStatisticBuilderA = new ColumnStatisticBuilder() + .setNdv(100) + .setAvgSizeByte(25) + .setNumNulls(0) + .setMaxExpr(new StringLiteral("2022-01-01")) + .setMaxValue(new VarcharLiteral("2022-01-01").getDouble()) + .setMinExpr(new StringLiteral("2020-01-01")) + .setMinValue(new VarcharLiteral("2020-01-01").getDouble()) + .setCount(100); + + SlotReference b = new SlotReference("b", new VarcharType(25)); + ColumnStatisticBuilder columnStatisticBuilderB = new ColumnStatisticBuilder() + .setNdv(100) + .setAvgSizeByte(25) + .setNumNulls(0) + .setMaxExpr(new StringLiteral("2012-01-01")) + .setMaxValue(new VarcharLiteral("2012-01-01").getDouble()) + .setMinExpr(new StringLiteral("2010-01-01")) + .setMinValue(new VarcharLiteral("2010-01-01").getDouble()) + .setCount(100); + + SlotReference c = new SlotReference("c", new VarcharType(25)); + ColumnStatisticBuilder columnStatisticBuilderC = new ColumnStatisticBuilder() + .setNdv(100) + .setAvgSizeByte(25) + .setNumNulls(0) + .setMaxExpr(new StringLiteral("2021-01-01")) + .setMaxValue(new VarcharLiteral("2021-01-01").getDouble()) + .setMinExpr(new StringLiteral("2010-01-01")) + .setMinValue(new VarcharLiteral("2010-01-01").getDouble()) + .setCount(100); + + StatisticsBuilder statsBuilder = new StatisticsBuilder(); + statsBuilder.setRowCount(100); + statsBuilder.putColumnStatistics(a, columnStatisticBuilderA.build()); + statsBuilder.putColumnStatistics(b, columnStatisticBuilderB.build()); + statsBuilder.putColumnStatistics(c, columnStatisticBuilderC.build()); + Statistics baseStats = statsBuilder.build(); + + // (2020-2022) > (2010,2012), sel=1 + Statistics agrtb = new FilterEstimation().estimate(new GreaterThan(a, b), baseStats); + Assertions.assertEquals(100, agrtb.getRowCount()); + // (2020-2022) < (2010,2012), sel=0 + Statistics alessb = new FilterEstimation().estimate(new LessThan(a, b), baseStats); + Assertions.assertEquals(0, alessb.getRowCount()); + + // (2020-2022) > (2010-2021), sel = DEFAULT (0.5) + Statistics agrtc = new FilterEstimation().estimate(new GreaterThan(a, c), baseStats); + Assertions.assertEquals(50, agrtc.getRowCount()); + } } diff --git a/regression-test/data/nereids_ssb_shape_sf100_p0/shape/q2.2.out b/regression-test/data/nereids_ssb_shape_sf100_p0/shape/q2.2.out index 9bf2e90dbbf..f57f1958aab 100644 --- a/regression-test/data/nereids_ssb_shape_sf100_p0/shape/q2.2.out +++ b/regression-test/data/nereids_ssb_shape_sf100_p0/shape/q2.2.out @@ -10,19 +10,18 @@ PhysicalResultSink --------------PhysicalProject ----------------hashJoin[INNER_JOIN] hashCondition=((lineorder.lo_orderdate = dates.d_datekey)) otherCondition=() build RFs:RF2 d_datekey->[lo_orderdate] ------------------PhysicalProject ---------------------hashJoin[INNER_JOIN] hashCondition=((lineorder.lo_suppkey = supplier.s_suppkey)) otherCondition=() build RFs:RF1 s_suppkey->[lo_suppkey] -----------------------PhysicalProject -------------------------hashJoin[INNER_JOIN] hashCondition=((lineorder.lo_partkey = part.p_partkey)) otherCondition=() build RFs:RF0 p_partkey->[lo_partkey] +--------------------hashJoin[INNER_JOIN] hashCondition=((lineorder.lo_partkey = part.p_partkey)) otherCondition=() build RFs:RF1 p_partkey->[lo_partkey] +----------------------hashJoin[INNER_JOIN] hashCondition=((lineorder.lo_suppkey = supplier.s_suppkey)) otherCondition=() build RFs:RF0 s_suppkey->[lo_suppkey] +------------------------PhysicalProject +--------------------------PhysicalOlapScan[lineorder] apply RFs: RF0 RF1 RF2 +------------------------PhysicalDistribute[DistributionSpecReplicated] --------------------------PhysicalProject -----------------------------PhysicalOlapScan[lineorder] apply RFs: RF0 RF1 RF2 ---------------------------PhysicalDistribute[DistributionSpecReplicated] -----------------------------PhysicalProject -------------------------------filter((part.p_brand <= 'MFGR#2228') and (part.p_brand >= 'MFGR#2221')) ---------------------------------PhysicalOlapScan[part] +----------------------------filter((supplier.s_region = 'ASIA')) +------------------------------PhysicalOlapScan[supplier] ----------------------PhysicalDistribute[DistributionSpecReplicated] ------------------------PhysicalProject ---------------------------filter((supplier.s_region = 'ASIA')) -----------------------------PhysicalOlapScan[supplier] +--------------------------filter((part.p_brand <= 'MFGR#2228') and (part.p_brand >= 'MFGR#2221')) +----------------------------PhysicalOlapScan[part] ------------------PhysicalDistribute[DistributionSpecReplicated] --------------------PhysicalProject ----------------------PhysicalOlapScan[dates] --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org