englefly commented on code in PR #40654: URL: https://github.com/apache/doris/pull/40654#discussion_r1753609665
########## fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java: ########## @@ -177,14 +181,27 @@ private static double computeSelectivityForBuildSideWhenColStatsUnknown(Statisti if (cond instanceof EqualTo) { EqualTo equal = (EqualTo) cond; if (equal.left() instanceof Slot && equal.right() instanceof Slot) { - ColumnStatistic buildColStats = buildStats.findColumnStatistics(equal.left()); - if (buildColStats == null) { - buildColStats = buildStats.findColumnStatistics(equal.right()); + Slot buildSideSlot = null; + if (buildStats.findColumnStatistics(equal.left()) != null) { + buildSideSlot = (Slot) equal.left(); + } else if (buildStats.findColumnStatistics(equal.right()) != null) { + buildSideSlot = (Slot) equal.right(); } - if (buildColStats != null) { - double buildSel = Math.min(buildStats.getRowCount() / buildColStats.count, 1.0); - buildSel = Math.max(buildSel, UNKNOWN_COL_STATS_FILTER_SEL_LOWER_BOUND); - sel = Math.min(sel, buildSel); + // the processing is mainly for olap table since external table rarely has column level statistics Review Comment: 1. if this funciton does not work for external table, put this check at the begining of this function. 2. It seems not good if the sel for external table is always 1 ########## fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java: ########## @@ -56,9 +56,24 @@ public ColumnStatisticBuilder(ColumnStatistic columnStatistic) { this.updatedTime = columnStatistic.updatedTime; } - public ColumnStatisticBuilder setCount(double count) { + // ATTENTION: DON'T USE FOLLOWING TWO DURING STATS DERIVING EXCEPT FOR INITIALIZATION + public ColumnStatisticBuilder(double count) { this.count = count; - return this; + } + + public ColumnStatisticBuilder(ColumnStatistic columnStatistic, double count) { Review Comment: this constructor can be removed. the parameter columnStatistic is always UNKNOWN. It can be replaced by ColumnStatisticBuilder(count) ########## fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java: ########## @@ -403,8 +403,7 @@ private Statistics computeOlapScan(OlapScan olapScan) { for (Slot slot : ((Relation) olapScan).getOutput()) { if (derivedStats.findColumnStatistics(slot) == null) { derivedStats.addColumnStats(slot, - new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN) - .setCount(derivedRowCount).build()); + new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN, derivedRowCount).build()); Review Comment: new ColumnStatisticBuilder(derivedRowCount).build() ########## fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java: ########## @@ -177,14 +181,27 @@ private static double computeSelectivityForBuildSideWhenColStatsUnknown(Statisti if (cond instanceof EqualTo) { EqualTo equal = (EqualTo) cond; if (equal.left() instanceof Slot && equal.right() instanceof Slot) { - ColumnStatistic buildColStats = buildStats.findColumnStatistics(equal.left()); - if (buildColStats == null) { - buildColStats = buildStats.findColumnStatistics(equal.right()); + Slot buildSideSlot = null; + if (buildStats.findColumnStatistics(equal.left()) != null) { + buildSideSlot = (Slot) equal.left(); + } else if (buildStats.findColumnStatistics(equal.right()) != null) { + buildSideSlot = (Slot) equal.right(); } - if (buildColStats != null) { - double buildSel = Math.min(buildStats.getRowCount() / buildColStats.count, 1.0); - buildSel = Math.max(buildSel, UNKNOWN_COL_STATS_FILTER_SEL_LOWER_BOUND); - sel = Math.min(sel, buildSel); + // the processing is mainly for olap table since external table rarely has column level statistics + if (buildSideSlot instanceof SlotReference + && ((SlotReference) buildSideSlot).getTable().isPresent() + && (((SlotReference) buildSideSlot).getTable().get() instanceof OlapTable)) { + OlapTable olapTable = (OlapTable) ((SlotReference) buildSideSlot).getTable().get(); + TableStatsMeta tableStatsMeta = Env.getCurrentEnv().getAnalysisManager() + .findTableStatsStatus(olapTable.getId()); + if (tableStatsMeta != null) { + double tableRowCount = tableStatsMeta.getRowCount(olapTable.getBaseIndexId()); Review Comment: 1. the base table row count is usually much more than that of rollup. 2. tableStatsMeta.getRowCount(olapTable.getBaseIndexId()) may return -1, if the table is not analyzed. In fact, BE reported rowCount has higher priority than that in TableMeta. 3. the assumption: "this slot is a base table slot” is easiy broken by alias. for example " select .. from (select A as x from T) T1 ########## fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java: ########## @@ -1411,7 +1404,7 @@ private Statistics computeWindow(Window windowOperator) { private ColumnStatistic unionColumn(ColumnStatistic leftStats, double leftRowCount, ColumnStatistic rightStats, double rightRowCount, DataType dataType) { if (leftStats.isUnKnown() || rightStats.isUnKnown()) { - return new ColumnStatisticBuilder(leftStats).setCount(leftRowCount + rightRowCount).build(); + return new ColumnStatisticBuilder(leftStats).build(); Review Comment: return new ColumnStatisticBuilder(leftStats.count + rightStats.count).build(); ########## fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java: ########## @@ -323,6 +323,8 @@ private Statistics estimateEqualTo(ComparisonPredicate cp, ColumnStatistic stats selectivity = DEFAULT_INEQUALITY_COEFFICIENT; } else { double ndv = statsForLeft.ndv; + double numNulls = statsForLeft.numNulls; Review Comment: numNulls = statsForLeft.numNulls * statsForRight.ndv Usually, this func is used to evaluate join condition: T1.A=T2.B the context.statistics is in charge of T1 cross join T2, and hence, the number of tuples, in which A=null or B=null, is A.numNulls * B.numNulls ########## fe/fe-core/src/main/java/org/apache/doris/statistics/PartitionColumnStatistic.java: ########## @@ -38,14 +38,14 @@ public class PartitionColumnStatistic { private static final Logger LOG = LogManager.getLogger(PartitionColumnStatistic.class); - public static PartitionColumnStatistic UNKNOWN = new PartitionColumnStatisticBuilder().setAvgSizeByte(1) - .setNdv(new Hll128()).setNumNulls(1).setCount(1).setMaxValue(Double.POSITIVE_INFINITY) + public static PartitionColumnStatistic UNKNOWN = new PartitionColumnStatisticBuilder(1).setAvgSizeByte(1) + .setNdv(new Hll128()).setNumNulls(1).setMaxValue(Double.POSITIVE_INFINITY) .setMinValue(Double.NEGATIVE_INFINITY) .setIsUnknown(true).setUpdatedTime("") .build(); - public static PartitionColumnStatistic ZERO = new PartitionColumnStatisticBuilder().setAvgSizeByte(0) - .setNdv(new Hll128()).setNumNulls(0).setCount(0).setMaxValue(Double.NaN).setMinValue(Double.NaN) + public static PartitionColumnStatistic ZERO = new PartitionColumnStatisticBuilder(0).setAvgSizeByte(0) Review Comment: not used any more. Please remove it ########## fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java: ########## @@ -197,7 +198,6 @@ public ColumnStatistic updateByLimit(long limit, double rowCount) { } double newNdv = Math.ceil(Math.min(ndv, limit)); return new ColumnStatisticBuilder() - .setCount(Math.ceil(limit)) Review Comment: the function "updateByLimit" can be removed. ########## fe/fe-core/src/main/java/org/apache/doris/statistics/PartitionColumnStatisticBuilder.java: ########## @@ -50,9 +50,23 @@ public PartitionColumnStatisticBuilder(PartitionColumnStatistic statistic) { this.updatedTime = statistic.updatedTime; } - public PartitionColumnStatisticBuilder setCount(double count) { + // ATTENTION: DON'T USE FOLLOWING TWO DURING STATS DERIVING EXCEPT FOR INITIALIZATION + public PartitionColumnStatisticBuilder(double count) { this.count = count; - return this; + } + + public PartitionColumnStatisticBuilder(PartitionColumnStatistic partitionColumnStatistic, double count) { Review Comment: partitionColumnStatistic is always unkown, remove it -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org