This is an automated email from the ASF dual-hosted git repository.

englefly pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 489db8125ba [improve](nereids) if a value occurs many times in a 
column, take it as hot value (#55827)
489db8125ba is described below

commit 489db8125ba8e181a1de82cde9aa728b9e59028b
Author: minghong <[email protected]>
AuthorDate: Wed Sep 17 08:37:53 2025 +0800

    [improve](nereids) if a value occurs many times in a column, take it as hot 
value (#55827)
    
    ### What problem does this PR solve?
    
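    In the previous PR #55472, whether a value qualified as a hot value for a 
column depended only on whether its occurrence frequency was sufficiently 
higher than the average frequency of all values (i.e., 1/ndv). However, that 
criterion alone is not conducive to skew join, because values in columns with 
a uniform distribution never exceed the average. Therefore, this PR also 
includes values from columns with a uniform distribution but a small ndv in 
the set of hot values: a value is now also treated as hot when its occurrence 
ratio is at least hot_value_threshold, while the former relative criterion is 
kept under the new name skew_value_threshold.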
---
 .../doris/nereids/stats/StatsCalculator.java       |  3 +-
 .../java/org/apache/doris/qe/SessionVariable.java  | 35 +++++++++++++++++-----
 .../doris/statistics/util/StatisticsUtil.java      |  3 +-
 .../doris/nereids/stats/StatsCalculatorTest.java   |  2 +-
 4 files changed, 32 insertions(+), 11 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
index f129ac4e9df..f0ca1f1e6ba 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java
@@ -1526,7 +1526,8 @@ public class StatsCalculator extends 
DefaultPlanVisitor<Statistics, Void> {
             Map<Literal, Float> resultHotValues = new LinkedHashMap<>();
             for (Literal hot : unionHotValues.keySet()) {
                 float ratio = (float) (unionHotValues.get(hot) / 
unionRowCount);
-                if (ratio * colStatsBuilder.getNdv() >= 
SessionVariable.getHotValueThreshold()) {
+                if (ratio * colStatsBuilder.getNdv() >= 
SessionVariable.getSkewValueThreshold()
+                        || ratio >= SessionVariable.getHotValueThreshold()) {
                     resultHotValues.put(hot, ratio);
                 }
             }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java 
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index 889eb4d91c8..f9cccbbfd85 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -794,24 +794,43 @@ public class SessionVariable implements Serializable, 
Writable {
         }
     }
 
-    public static final String HOT_VALUE_THRESHOLD = "hot_value_threshold";
+    public static final String SKEW_VALUE_THRESHOLD = "skew_value_threshold";
 
-    @VariableMgr.VarAttr(name = HOT_VALUE_THRESHOLD, needForward = true,
-                description = {"当列中某个特定值的出现次数大于等于(rowCount/ndv)× 
hotValueThreshold 时,该值即被视为热点值",
+
+    @VariableMgr.VarAttr(name = SKEW_VALUE_THRESHOLD, needForward = true,
+                description = {"当列中某个特定值的出现次数大于等于(rowCount/ndv)× 
skewValueThreshold 时,该值即被视为热点值",
                         "When the occurrence of a value in a column is greater 
than "
-                                + "hotValueThreshold tmies of average 
occurences "
-                                + "(occurrences >= hotValueThreshold * 
rowCount / ndv), "
+                                + "skewValueThreshold tmies of average 
occurences "
+                                + "(occurrences >= skewValueThreshold * 
rowCount / ndv), "
                                 + "the value is regarded as hot value"})
-    private double hotValueThreshold = 10;
+    private double skewValueThreshold = 10;
+
+    public void setSkewValueThreshold(int threshold) {
+        this.skewValueThreshold = threshold;
+    }
+
+    public static double getSkewValueThreshold() {
+        if (ConnectContext.get() != null) {
+            return 
ConnectContext.get().getSessionVariable().skewValueThreshold;
+        } else {
+            return 
Double.parseDouble(VariableMgr.getDefaultValue(SKEW_VALUE_THRESHOLD));
+        }
+    }
+
+    public static final String HOT_VALUE_THRESHOLD = "hot_value_threshold";
+    @VariableMgr.VarAttr(name = HOT_VALUE_THRESHOLD, needForward = true,
+            description = {"hot value 在列中出现的最小比例",
+                    "The minimum ratio of occurrences of a hot value in a 
column"})
+    private double hotValueThreshold = 0.10d;
 
-    public void setHotValueThreshold(int threshold) {
+    public void setHotValueThreshold(double threshold) {
         this.hotValueThreshold = threshold;
     }
 
     public static double getHotValueThreshold() {
         if (ConnectContext.get() != null) {
             if (ConnectContext.get().getState().isInternal()) {
-                return 0.0;
+                return 0.1;
             } else {
                 return 
ConnectContext.get().getSessionVariable().hotValueThreshold;
             }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
index 4f6cba39ab9..199dfd26987 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
@@ -1278,7 +1278,8 @@ public class StatisticsUtil {
             for (String oneRow : stringValues.split(" ;")) {
                 String[] oneRowSplit = oneRow.split(" :");
                 float value = Float.parseFloat(oneRowSplit[1]);
-                if (value >= avgOccurrences * 
SessionVariable.getHotValueThreshold()) {
+                if (value >= avgOccurrences * 
SessionVariable.getSkewValueThreshold()
+                        || value >= SessionVariable.getHotValueThreshold()) {
                     
org.apache.doris.nereids.trees.expressions.literal.StringLiteral stringLiteral =
                             new 
org.apache.doris.nereids.trees.expressions.literal.StringLiteral(
                                     oneRowSplit[0].replaceAll("\\\\:", ":")
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/StatsCalculatorTest.java
 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/StatsCalculatorTest.java
index 0590078de22..c5d2f2fa737 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/StatsCalculatorTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/StatsCalculatorTest.java
@@ -518,7 +518,7 @@ public class StatsCalculatorTest {
         StatsCalculator calculator = new StatsCalculator(null);
         Statistics outputStats = calculator.computeUnion(unionAll, 
ImmutableList.of(child0Stats, child1Stats));
         ColumnStatistic iaStatsOut = outputStats.findColumnStatistics(ia);
-        Assertions.assertEquals(1, iaStatsOut.getHotValues().size());
+        Assertions.assertEquals(3, iaStatsOut.getHotValues().size());
         Assertions.assertTrue(containsHotValue(iaStatsOut, "1"));
     }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to