feiniaofeiafei commented on code in PR #64618:
URL: https://github.com/apache/doris/pull/64618#discussion_r3457456200
##########
fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/InferSetOperatorDistinct.java:
##########
@@ -77,4 +96,82 @@ private boolean rejectNLJ(Plan plan) {
}
return true;
}
+
+ private boolean shouldGenerateAggregateByNdv(Plan plan, List<? extends
NamedExpression> groupKeys) {
+ Statistics stats = plan.getStats();
+ if (stats == null) {
+ stats = plan.accept(derive, new StatsDerive.DeriveContext());
+ }
+ if (stats.getRowCount() <= 0) {
+ return false;
+ }
+
+ List<ColumnStatistic> lower = new ArrayList<>();
+ List<ColumnStatistic> medium = new ArrayList<>();
+ List<ColumnStatistic> high = new ArrayList<>();
+
+ List<ColumnStatistic>[] cards = new List[] { lower, medium, high };
+
+ for (NamedExpression key : groupKeys) {
+ ColumnStatistic colStats =
ExpressionEstimation.INSTANCE.estimate(key, stats);
+ if (colStats.isUnKnown) {
+ return false;
+ }
+ if (stats.getRowCount() * 0.9 <= colStats.ndv) {
+ return false;
+ }
+ cards[groupByCardinality(colStats,
stats.getRowCount())].add(colStats);
+ }
+
+ double lowerCartesian = 1.0;
+ for (ColumnStatistic colStats : lower) {
+ lowerCartesian = lowerCartesian * colStats.ndv;
+ }
+
+ // Same NDV heuristic as EagerAggRewriter#checkStats, but kept local
because set-op
+ // local distinct and eager aggregation have different optimization
boundaries.
+ double lowerUpper = Math.max(stats.getRowCount() / 20, 1);
+ lowerUpper = Math.pow(lowerUpper, Math.max(lower.size() / 2, 1));
+
+ if (high.isEmpty() && (lower.size() + medium.size()) <= 2) {
+ return true;
+ }
+
+ if (high.isEmpty() && medium.isEmpty()) {
+ if (lower.size() == 1 && lowerCartesian * 20 <=
stats.getRowCount()) {
+ return true;
+ } else if (lower.size() == 2 && lowerCartesian * 7 <=
stats.getRowCount()) {
+ return true;
+ } else if (lower.size() <= 3 && lowerCartesian * 20 <=
stats.getRowCount()
+ && lowerCartesian < lowerUpper) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ if (high.size() >= 2 || medium.size() > 2 || (high.size() == 1 &&
!medium.isEmpty())) {
+ return false;
+ }
+
+ double lowerCartesianLowerBound = stats.getRowCount() /
LOWER_AGGREGATE_EFFECT_COEFFICIENT;
+ if (high.size() + medium.size() == 1 && lower.size() <= 2
+ && lowerCartesian <= lowerCartesianLowerBound) {
+ return true;
+ }
+
+ return false;
+ }
+
+ private int groupByCardinality(ColumnStatistic colStats, double rowCount) {
+ if (rowCount == 0 || colStats.ndv *
MEDIUM_AGGREGATE_EFFECT_COEFFICIENT > rowCount) {
+ return 2;
+ } else if (colStats.ndv * MEDIUM_AGGREGATE_EFFECT_COEFFICIENT <=
rowCount
+ && colStats.ndv * LOW_AGGREGATE_EFFECT_COEFFICIENT > rowCount)
{
+ return 1;
+ } else if (colStats.ndv * LOW_AGGREGATE_EFFECT_COEFFICIENT <=
rowCount) {
+ return 0;
+ }
Review Comment:
No need to fix this.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]