This is an automated email from the ASF dual-hosted git repository.

englefly pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 142ab77549b [opt](nereids) optimize str-like-col range filter 
estimation (#34542)
142ab77549b is described below

commit 142ab77549b084e53b2573e93a73281f10c6c87b
Author: minghong <engle...@gmail.com>
AuthorDate: Thu May 9 22:42:01 2024 +0800

    [opt](nereids) optimize str-like-col range filter estimation (#34542)
    
    we have an order reserved mappping from string to double.
    for string column A, we have double values for A.min and A.max.
    when estimating A<"abc", A.min/max could be used to judge whether 'abc' is 
between A.min and A.max, but it cannot be used to do range estimation. suppose 
"abc" is mapped to double x. if we compute selectivity by formula "sel = 
(x-A.min)/(A.max-A.min)", we are likely to obtain extreme values.
---
 .../doris/nereids/stats/FilterEstimation.java      | 20 +++--
 .../org/apache/doris/nereids/types/TimeType.java   |  3 +-
 .../org/apache/doris/nereids/types/TimeV2Type.java |  3 +-
 .../doris/nereids/types/coercion/DateLikeType.java |  2 +-
 .../doris/nereids/types/coercion/NumericType.java  |  2 +-
 .../{TimeType.java => coercion/RangeScalable.java} | 37 +++------
 .../doris/nereids/stats/FilterEstimationTest.java  | 89 ++++++++++++++++++++++
 .../data/nereids_ssb_shape_sf100_p0/shape/q2.2.out | 19 +++--
 8 files changed, 128 insertions(+), 47 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
index e13eed89547..45e6dcd2abc 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java
@@ -40,6 +40,7 @@ import 
org.apache.doris.nereids.trees.expressions.SlotReference;
 import org.apache.doris.nereids.trees.expressions.functions.Function;
 import org.apache.doris.nereids.trees.expressions.literal.Literal;
 import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.coercion.RangeScalable;
 import org.apache.doris.statistics.ColumnStatistic;
 import org.apache.doris.statistics.ColumnStatisticBuilder;
 import org.apache.doris.statistics.StatisticRange;
@@ -494,6 +495,9 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
                     .setNdv(intersectRange.getDistinctValues())
                     .setNumNulls(0);
             double sel = leftRange.overlapPercentWith(rightRange);
+            if (!(leftExpr.getDataType() instanceof RangeScalable) && (sel != 
0.0 && sel != 1.0)) {
+                sel = DEFAULT_INEQUALITY_COEFFICIENT;
+            }
             sel = getNotNullSelectivity(leftStats, sel);
             updatedStatistics = context.statistics.withSel(sel);
             
leftColumnStatisticBuilder.setCount(updatedStatistics.getRowCount());
@@ -550,8 +554,9 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
         }
 
         double leftOverlapPercent = leftRange.overlapPercentWith(rightRange);
-        // Left always greater than right
-        if (leftOverlapPercent == 0) {
+
+        if (leftOverlapPercent == 0.0) {
+            // Left always greater than right
             return context.statistics.withRowCount(0.0);
         }
         StatisticRange leftAlwaysLessThanRightRange = new 
StatisticRange(leftStats.minValue, leftStats.minExpr,
@@ -580,9 +585,14 @@ public class FilterEstimation extends 
ExpressionVisitor<Statistics, EstimationCo
                 .setNdv(rightStats.ndv * (rightAlwaysGreaterRangeFraction + 
rightOverlappingRangeFraction))
                 .setNumNulls(0)
                 .build();
-        double sel = leftAlwaysLessThanRightPercent
-                + leftOverlapPercent * rightOverlappingRangeFraction * 
DEFAULT_INEQUALITY_COEFFICIENT
-                + leftOverlapPercent * rightAlwaysGreaterRangeFraction;
+        double sel = DEFAULT_INEQUALITY_COEFFICIENT;
+        if (leftExpr.getDataType() instanceof RangeScalable) {
+            sel = leftAlwaysLessThanRightPercent
+                    + leftOverlapPercent * rightOverlappingRangeFraction * 
DEFAULT_INEQUALITY_COEFFICIENT
+                    + leftOverlapPercent * rightAlwaysGreaterRangeFraction;
+        } else if (leftOverlapPercent == 1.0) {
+            sel = 1.0;
+        }
         context.addKeyIfSlot(leftExpr);
         context.addKeyIfSlot(rightExpr);
         return context.statistics.withSel(sel)
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java
index 9fb438fd79c..1111038ed91 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java
@@ -19,11 +19,12 @@ package org.apache.doris.nereids.types;
 
 import org.apache.doris.catalog.Type;
 import org.apache.doris.nereids.types.coercion.PrimitiveType;
+import org.apache.doris.nereids.types.coercion.RangeScalable;
 
 /**
  * Datetime type in Nereids.
  */
-public class TimeType extends PrimitiveType {
+public class TimeType extends PrimitiveType implements RangeScalable {
 
     public static final TimeType INSTANCE = new TimeType();
 
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeV2Type.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeV2Type.java
index ec625d0cd17..b436fe31d39 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeV2Type.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeV2Type.java
@@ -20,11 +20,12 @@ package org.apache.doris.nereids.types;
 import org.apache.doris.catalog.ScalarType;
 import org.apache.doris.catalog.Type;
 import org.apache.doris.nereids.types.coercion.PrimitiveType;
+import org.apache.doris.nereids.types.coercion.RangeScalable;
 
 /**
  * Datetime type in Nereids.
  */
-public class TimeV2Type extends PrimitiveType {
+public class TimeV2Type extends PrimitiveType implements RangeScalable {
 
     public static final TimeV2Type INSTANCE = new TimeV2Type();
 
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java
index 22ea99f00bc..1f8130215b0 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java
@@ -33,7 +33,7 @@ import java.time.LocalDateTime;
 /**
  * date like type.
  */
-public abstract class DateLikeType extends PrimitiveType {
+public abstract class DateLikeType extends PrimitiveType implements 
RangeScalable {
 
     protected LocalDate toLocalDate(double d) {
         // d = (year * 10000 + month * 100 + day) * 1000000L;
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/NumericType.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/NumericType.java
index 7bbf8303dba..5ce248283ef 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/NumericType.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/NumericType.java
@@ -24,7 +24,7 @@ import org.apache.doris.nereids.types.DoubleType;
 /**
  * Abstract class for all numeric type in Nereids.
  */
-public class NumericType extends PrimitiveType {
+public class NumericType extends PrimitiveType implements RangeScalable {
 
     public static final NumericType INSTANCE = new NumericType();
 
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/RangeScalable.java
similarity index 57%
copy from fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java
copy to 
fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/RangeScalable.java
index 9fb438fd79c..8209e2f2b6e 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/TimeType.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/RangeScalable.java
@@ -15,35 +15,16 @@
 // specific language governing permissions and limitations
 // under the License.
 
-package org.apache.doris.nereids.types;
-
-import org.apache.doris.catalog.Type;
-import org.apache.doris.nereids.types.coercion.PrimitiveType;
+package org.apache.doris.nereids.types.coercion;
 
 /**
- * Datetime type in Nereids.
+ * numeric type/ date related type are range scalable
+ * RangeScalable Column can be estimated by filter like "A < 10" more accurate.
+ * For example, for a given relation R, which contains 10 rows. R.A in (1, 
100),
+ * the selectivity of filter "A<10" is "(10-1) / (100 -1)"
+ * But for string column A, the filter selectivity of "A<'abc'" can not be 
estimated by range, although we could
+ * have an order reserved mapping from string value to double.
+ *
  */
-public class TimeType extends PrimitiveType {
-
-    public static final TimeType INSTANCE = new TimeType();
-
-    private static final int WIDTH = 8;
-
-    private TimeType() {
-    }
-
-    @Override
-    public Type toCatalogDataType() {
-        return Type.TIME;
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        return o instanceof TimeType;
-    }
-
-    @Override
-    public int width() {
-        return WIDTH;
-    }
+public interface RangeScalable {
 }
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
index 3b718062635..687a4d7a54a 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java
@@ -18,6 +18,7 @@
 package org.apache.doris.nereids.stats;
 
 import org.apache.doris.analysis.IntLiteral;
+import org.apache.doris.analysis.StringLiteral;
 import org.apache.doris.nereids.trees.expressions.And;
 import org.apache.doris.nereids.trees.expressions.Cast;
 import org.apache.doris.nereids.trees.expressions.EqualTo;
@@ -35,9 +36,11 @@ import 
org.apache.doris.nereids.trees.expressions.SlotReference;
 import org.apache.doris.nereids.trees.expressions.literal.DateLiteral;
 import org.apache.doris.nereids.trees.expressions.literal.DoubleLiteral;
 import org.apache.doris.nereids.trees.expressions.literal.IntegerLiteral;
+import org.apache.doris.nereids.trees.expressions.literal.VarcharLiteral;
 import org.apache.doris.nereids.types.DateType;
 import org.apache.doris.nereids.types.DoubleType;
 import org.apache.doris.nereids.types.IntegerType;
+import org.apache.doris.nereids.types.VarcharType;
 import org.apache.doris.statistics.ColumnStatistic;
 import org.apache.doris.statistics.ColumnStatisticBuilder;
 import org.apache.doris.statistics.Statistics;
@@ -1139,4 +1142,90 @@ class FilterEstimationTest {
         Statistics resultEq = estimator.estimate(eq, statsBuilder.build());
         Assertions.assertEquals(7, resultNse.getRowCount() - 
resultEq.getRowCount());
     }
+
+    /**
+     * for string literal, min-max range is only used for coverage, not for 
percentage
+     */
+    @Test
+    public void testStringRangeColToLiteral() {
+        SlotReference a = new SlotReference("a", new VarcharType(25));
+        ColumnStatisticBuilder columnStatisticBuilder = new 
ColumnStatisticBuilder()
+                .setNdv(100)
+                .setAvgSizeByte(25)
+                .setNumNulls(0)
+                .setMaxExpr(new StringLiteral("2022-01-01"))
+                .setMaxValue(new VarcharLiteral("2022-01-01").getDouble())
+                .setMinExpr(new StringLiteral("2020-01-01"))
+                .setMinValue(new VarcharLiteral("2020-01-01").getDouble())
+                .setCount(100);
+        StatisticsBuilder statsBuilder = new StatisticsBuilder();
+        statsBuilder.setRowCount(100);
+        statsBuilder.putColumnStatistics(a, columnStatisticBuilder.build());
+        Statistics baseStats = statsBuilder.build();
+        VarcharLiteral year2030 = new VarcharLiteral("2030-01-01");
+        Statistics filter2030 = new FilterEstimation().estimate(new 
LessThan(a, year2030), baseStats);
+        Assertions.assertEquals(100, filter2030.getRowCount());
+
+        VarcharLiteral year2000 = new VarcharLiteral("2000-01-01");
+        Statistics filter2k = new FilterEstimation().estimate(new 
LessThan(year2000, a), baseStats);
+        Assertions.assertEquals(100, filter2k.getRowCount());
+
+        VarcharLiteral year2021 = new VarcharLiteral("2021-12-01");
+        Statistics filter2021 = new FilterEstimation().estimate(new 
GreaterThan(a, year2021), baseStats);
+        Assertions.assertEquals(50, filter2021.getRowCount());
+    }
+
+    @Test
+    public void testStringRangeColToCol() {
+        SlotReference a = new SlotReference("a", new VarcharType(25));
+        ColumnStatisticBuilder columnStatisticBuilderA = new 
ColumnStatisticBuilder()
+                .setNdv(100)
+                .setAvgSizeByte(25)
+                .setNumNulls(0)
+                .setMaxExpr(new StringLiteral("2022-01-01"))
+                .setMaxValue(new VarcharLiteral("2022-01-01").getDouble())
+                .setMinExpr(new StringLiteral("2020-01-01"))
+                .setMinValue(new VarcharLiteral("2020-01-01").getDouble())
+                .setCount(100);
+
+        SlotReference b = new SlotReference("b", new VarcharType(25));
+        ColumnStatisticBuilder columnStatisticBuilderB = new 
ColumnStatisticBuilder()
+                .setNdv(100)
+                .setAvgSizeByte(25)
+                .setNumNulls(0)
+                .setMaxExpr(new StringLiteral("2012-01-01"))
+                .setMaxValue(new VarcharLiteral("2012-01-01").getDouble())
+                .setMinExpr(new StringLiteral("2010-01-01"))
+                .setMinValue(new VarcharLiteral("2010-01-01").getDouble())
+                .setCount(100);
+
+        SlotReference c = new SlotReference("c", new VarcharType(25));
+        ColumnStatisticBuilder columnStatisticBuilderC = new 
ColumnStatisticBuilder()
+                .setNdv(100)
+                .setAvgSizeByte(25)
+                .setNumNulls(0)
+                .setMaxExpr(new StringLiteral("2021-01-01"))
+                .setMaxValue(new VarcharLiteral("2021-01-01").getDouble())
+                .setMinExpr(new StringLiteral("2010-01-01"))
+                .setMinValue(new VarcharLiteral("2010-01-01").getDouble())
+                .setCount(100);
+
+        StatisticsBuilder statsBuilder = new StatisticsBuilder();
+        statsBuilder.setRowCount(100);
+        statsBuilder.putColumnStatistics(a, columnStatisticBuilderA.build());
+        statsBuilder.putColumnStatistics(b, columnStatisticBuilderB.build());
+        statsBuilder.putColumnStatistics(c, columnStatisticBuilderC.build());
+        Statistics baseStats = statsBuilder.build();
+
+        // (2020-2022) > (2010,2012), sel=1
+        Statistics agrtb = new FilterEstimation().estimate(new GreaterThan(a, 
b), baseStats);
+        Assertions.assertEquals(100, agrtb.getRowCount());
+        // (2020-2022) < (2010,2012), sel=0
+        Statistics alessb = new FilterEstimation().estimate(new LessThan(a, 
b), baseStats);
+        Assertions.assertEquals(0, alessb.getRowCount());
+
+        // (2020-2022) > (2010-2021), sel = DEFAULT (0.5)
+        Statistics agrtc = new FilterEstimation().estimate(new GreaterThan(a, 
c), baseStats);
+        Assertions.assertEquals(50, agrtc.getRowCount());
+    }
 }
diff --git a/regression-test/data/nereids_ssb_shape_sf100_p0/shape/q2.2.out 
b/regression-test/data/nereids_ssb_shape_sf100_p0/shape/q2.2.out
index 9bf2e90dbbf..f57f1958aab 100644
--- a/regression-test/data/nereids_ssb_shape_sf100_p0/shape/q2.2.out
+++ b/regression-test/data/nereids_ssb_shape_sf100_p0/shape/q2.2.out
@@ -10,19 +10,18 @@ PhysicalResultSink
 --------------PhysicalProject
 ----------------hashJoin[INNER_JOIN] hashCondition=((lineorder.lo_orderdate = 
dates.d_datekey)) otherCondition=() build RFs:RF2 d_datekey->[lo_orderdate]
 ------------------PhysicalProject
---------------------hashJoin[INNER_JOIN] hashCondition=((lineorder.lo_suppkey 
= supplier.s_suppkey)) otherCondition=() build RFs:RF1 s_suppkey->[lo_suppkey]
-----------------------PhysicalProject
-------------------------hashJoin[INNER_JOIN] 
hashCondition=((lineorder.lo_partkey = part.p_partkey)) otherCondition=() build 
RFs:RF0 p_partkey->[lo_partkey]
+--------------------hashJoin[INNER_JOIN] hashCondition=((lineorder.lo_partkey 
= part.p_partkey)) otherCondition=() build RFs:RF1 p_partkey->[lo_partkey]
+----------------------hashJoin[INNER_JOIN] 
hashCondition=((lineorder.lo_suppkey = supplier.s_suppkey)) otherCondition=() 
build RFs:RF0 s_suppkey->[lo_suppkey]
+------------------------PhysicalProject
+--------------------------PhysicalOlapScan[lineorder] apply RFs: RF0 RF1 RF2
+------------------------PhysicalDistribute[DistributionSpecReplicated]
 --------------------------PhysicalProject
-----------------------------PhysicalOlapScan[lineorder] apply RFs: RF0 RF1 RF2
---------------------------PhysicalDistribute[DistributionSpecReplicated]
-----------------------------PhysicalProject
-------------------------------filter((part.p_brand <= 'MFGR#2228') and 
(part.p_brand >= 'MFGR#2221'))
---------------------------------PhysicalOlapScan[part]
+----------------------------filter((supplier.s_region = 'ASIA'))
+------------------------------PhysicalOlapScan[supplier]
 ----------------------PhysicalDistribute[DistributionSpecReplicated]
 ------------------------PhysicalProject
---------------------------filter((supplier.s_region = 'ASIA'))
-----------------------------PhysicalOlapScan[supplier]
+--------------------------filter((part.p_brand <= 'MFGR#2228') and 
(part.p_brand >= 'MFGR#2221'))
+----------------------------PhysicalOlapScan[part]
 ------------------PhysicalDistribute[DistributionSpecReplicated]
 --------------------PhysicalProject
 ----------------------PhysicalOlapScan[dates]


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to