This is an automated email from the ASF dual-hosted git repository.

panxiaolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new bcd641877f [Enhancement](scan) disable build key range and filters 
when push down agg work  (#14248)
bcd641877f is described below

commit bcd641877fc85ebc334c48f3912dcaa7f296bc45
Author: Pxl <pxl...@qq.com>
AuthorDate: Mon Nov 21 12:47:57 2022 +0800

    [Enhancement](scan) disable build key range and filters when push down agg 
work  (#14248)
    
    disable build key range and filters when push down agg work
---
 be/src/exec/olap_common.h                          | 183 ++++++++++++---------
 be/src/vec/exec/scan/new_olap_scan_node.cpp        | 110 ++++++++-----
 be/test/exec/olap_common_test.cpp                  |  14 +-
 .../test_aggregate_all_functions.out               |  16 +-
 .../test_aggregate_all_functions.groovy            |   8 +-
 run-be-ut.sh                                       |   2 +-
 6 files changed, 203 insertions(+), 130 deletions(-)

diff --git a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h
index 1823f8cbae..605ab9dda0 100644
--- a/be/src/exec/olap_common.h
+++ b/be/src/exec/olap_common.h
@@ -18,6 +18,7 @@
 #pragma once
 
 #include <boost/lexical_cast.hpp>
+#include <cstdint>
 #include <map>
 #include <sstream>
 #include <string>
@@ -25,6 +26,7 @@
 #include <variant>
 
 #include "exec/olap_utils.h"
+#include "olap/olap_common.h"
 #include "olap/tuple.h"
 #include "runtime/primitive_type.h"
 #include "runtime/type_limit.h"
@@ -112,7 +114,13 @@ public:
 
     bool convert_to_avg_range_value(std::vector<OlapTuple>& begin_scan_keys,
                                     std::vector<OlapTuple>& end_scan_keys, 
bool& begin_include,
-                                    bool& end_include, bool* eos, int32_t 
max_scan_key_num);
+                                    bool& end_include, int32_t 
max_scan_key_num);
+
+    bool convert_to_close_range(std::vector<OlapTuple>& begin_scan_keys,
+                                std::vector<OlapTuple>& end_scan_keys, bool& 
begin_include,
+                                bool& end_include);
+
+    constexpr bool is_reject_split_type() const { return 
_is_reject_split_type; }
 
     bool has_intersection(ColumnValueRange<primitive_type>& range);
 
@@ -317,6 +325,16 @@ private:
     bool _contain_null;
     int _precision;
     int _scale;
+
+    static constexpr bool _is_reject_split_type = primitive_type == 
PrimitiveType::TYPE_LARGEINT ||
+                                                  primitive_type == 
PrimitiveType::TYPE_DECIMALV2 ||
+                                                  primitive_type == 
PrimitiveType::TYPE_HLL ||
+                                                  primitive_type == 
PrimitiveType::TYPE_VARCHAR ||
+                                                  primitive_type == 
PrimitiveType::TYPE_CHAR ||
+                                                  primitive_type == 
PrimitiveType::TYPE_STRING ||
+                                                  primitive_type == 
PrimitiveType::TYPE_BOOLEAN ||
+                                                  primitive_type == 
PrimitiveType::TYPE_DATETIME ||
+                                                  primitive_type == 
PrimitiveType::TYPE_DATETIMEV2;
 };
 
 class OlapScanKeys {
@@ -517,112 +535,123 @@ size_t 
ColumnValueRange<primitive_type>::get_convertible_fixed_value_size() cons
     return _high_value - _low_value;
 }
 
+// The return value indicates whether eos is reached (the range is empty and does not contain null).
 template <PrimitiveType primitive_type>
-bool ColumnValueRange<primitive_type>::convert_to_avg_range_value(
+bool ColumnValueRange<primitive_type>::convert_to_close_range(
         std::vector<OlapTuple>& begin_scan_keys, std::vector<OlapTuple>& 
end_scan_keys,
-        bool& begin_include, bool& end_include, bool* eos, int32_t 
max_scan_key_num) {
-    constexpr bool reject_type = primitive_type == 
PrimitiveType::TYPE_LARGEINT ||
-                                 primitive_type == 
PrimitiveType::TYPE_DECIMALV2 ||
-                                 primitive_type == PrimitiveType::TYPE_HLL ||
-                                 primitive_type == PrimitiveType::TYPE_VARCHAR 
||
-                                 primitive_type == PrimitiveType::TYPE_CHAR ||
-                                 primitive_type == PrimitiveType::TYPE_STRING 
||
-                                 primitive_type == PrimitiveType::TYPE_BOOLEAN 
||
-                                 primitive_type == 
PrimitiveType::TYPE_DATETIME ||
-                                 primitive_type == 
PrimitiveType::TYPE_DATETIMEV2;
-    begin_include = is_begin_include();
-    end_include = is_end_include();
-    bool is_empty_range = false;
-    if constexpr (reject_type) {
-        begin_scan_keys.emplace_back();
-        begin_scan_keys.back().add_value(
-                cast_to_string<primitive_type, CppType>(get_range_min_value(), 
scale()),
-                contain_null());
-        end_scan_keys.emplace_back();
-        end_scan_keys.back().add_value(
-                cast_to_string<primitive_type, CppType>(get_range_max_value(), 
scale()));
-        return true;
-    } else if (is_low_value_mininum() && is_high_value_maximum()) {
-        // Do not split the range of whole range. TODO: figure out why the code
-        // execute here
-        begin_scan_keys.emplace_back();
-        begin_scan_keys.back().add_value(
-                cast_to_string<primitive_type, CppType>(get_range_min_value(), 
scale()),
-                contain_null());
-        end_scan_keys.emplace_back();
-        end_scan_keys.back().add_value(
-                cast_to_string<primitive_type, CppType>(get_range_max_value(), 
scale()));
-        return true;
-    } else {
-        CppType min_value = get_range_min_value();
-        CppType max_value = get_range_max_value();
+        bool& begin_include, bool& end_include) {
+    if constexpr (!_is_reject_split_type) {
+        begin_include = true;
+        end_include = true;
 
-        if (contain_null()) {
-            begin_scan_keys.emplace_back();
-            begin_scan_keys.back().add_null();
-            end_scan_keys.emplace_back();
-            end_scan_keys.back().add_null();
-        }
+        bool is_empty = false;
 
         if (!is_begin_include()) {
-            if (min_value == TYPE_MAX) {
-                is_empty_range = true;
+            if (_low_value == TYPE_MIN) {
+                is_empty = true;
             } else {
-                begin_include = true;
-                ++min_value;
+                ++_low_value;
             }
         }
         if (!is_end_include()) {
-            if (max_value == TYPE_MIN) {
-                is_empty_range = true;
+            if (_high_value == TYPE_MAX) {
+                is_empty = true;
             } else {
-                end_include = true;
-                --max_value;
+                --_high_value;
             }
         }
 
-        if (begin_include && end_include && min_value > max_value) {
-            is_empty_range = true;
+        if (_high_value < _low_value) {
+            is_empty = true;
         }
-        if (is_empty_range) {
-            if (!contain_null()) {
-                begin_scan_keys.clear();
-                end_scan_keys.clear();
-                *eos = true;
-            }
-            return false;
+
+        if (is_empty && !contain_null()) {
+            begin_scan_keys.clear();
+            end_scan_keys.clear();
+            return true;
         }
+    }
+    return false;
+}
 
-        int128_t range_size = is_fixed_value_convertible() ? 
(int128_t)max_value - min_value : 0;
-        size_t step_size = range_size / max_scan_key_num;
+// The return value indicates whether the split result is range or fixed value.
+template <PrimitiveType primitive_type>
+bool ColumnValueRange<primitive_type>::convert_to_avg_range_value(
+        std::vector<OlapTuple>& begin_scan_keys, std::vector<OlapTuple>& 
end_scan_keys,
+        bool& begin_include, bool& end_include, int32_t max_scan_key_num) {
+    if constexpr (!_is_reject_split_type) {
+        auto no_split = [&]() -> bool {
+            begin_scan_keys.emplace_back();
+            begin_scan_keys.back().add_value(
+                    cast_to_string<primitive_type, 
CppType>(get_range_min_value(), scale()),
+                    contain_null());
+            end_scan_keys.emplace_back();
+            end_scan_keys.back().add_value(
+                    cast_to_string<primitive_type, 
CppType>(get_range_max_value(), scale()));
+            return true;
+        };
 
-        auto current = min_value;
+        CppType min_value = get_range_min_value();
+        CppType max_value = get_range_max_value();
         if constexpr (primitive_type == PrimitiveType::TYPE_DATE) {
-            current.set_type(TimeType::TIME_DATE);
+            min_value.set_type(TimeType::TIME_DATE);
+            max_value.set_type(TimeType::TIME_DATE);
         }
 
-        while (current <= max_value) {
+        if (contain_null()) {
+            begin_scan_keys.emplace_back();
+            begin_scan_keys.back().add_null();
+            end_scan_keys.emplace_back();
+            end_scan_keys.back().add_null();
+        }
+
+        if (min_value > max_value || max_scan_key_num == 1) {
+            return no_split();
+        }
+
+        auto cast = [](const CppType& value) {
+            if constexpr (primitive_type == PrimitiveType::TYPE_DATE ||
+                          primitive_type == PrimitiveType::TYPE_DATEV2) {
+                return value;
+            } else {
+                return (int128_t)value;
+            }
+        };
+
+        // When CppType is a date, we cannot convert it to an integer and 
calculate the distance.
+        // In other cases, we convert the element to int128 to avoid overflow.
+        size_t step_size = (cast(max_value) - min_value) / max_scan_key_num;
+
+        constexpr size_t MAX_STEP_SIZE = 1 << 20;
+        // When the step size is too large, a split range is likely to 
contain no data.
+        if (step_size > MAX_STEP_SIZE) {
+            return no_split();
+        }
+
+        while (true) {
             begin_scan_keys.emplace_back();
             begin_scan_keys.back().add_value(
-                    cast_to_string<primitive_type, CppType>(current, scale()));
+                    cast_to_string<primitive_type, CppType>(min_value, 
scale()));
 
-            if ((int128_t)max_value - current < step_size) {
-                current = max_value;
+            if (cast(max_value) - min_value < step_size) {
+                min_value = max_value;
             } else {
-                current += step_size;
+                min_value += step_size;
             }
 
             end_scan_keys.emplace_back();
             end_scan_keys.back().add_value(
-                    cast_to_string<primitive_type, CppType>(current, scale()));
+                    cast_to_string<primitive_type, CppType>(min_value, 
scale()));
 
-            if (current == max_value) {
+            if (min_value == max_value) {
                 break;
             }
-            ++current;
+            ++min_value;
         }
+
         return step_size != 0;
     }
+    return false;
 }
 
 template <PrimitiveType primitive_type>
@@ -945,9 +974,13 @@ Status 
OlapScanKeys::extend_scan_key(ColumnValueRange<primitive_type>& range,
             }
         }
     } else {
-        if (_begin_scan_keys.empty() && range.is_fixed_value_convertible() && 
_is_convertible) {
+        if (_begin_scan_keys.empty() && range.is_fixed_value_convertible() && 
_is_convertible &&
+            !range.is_reject_split_type()) {
+            *eos |= range.convert_to_close_range(_begin_scan_keys, 
_end_scan_keys, _begin_include,
+                                                 _end_include);
+
             if (range.convert_to_avg_range_value(_begin_scan_keys, 
_end_scan_keys, _begin_include,
-                                                 _end_include, eos, 
max_scan_key_num)) {
+                                                 _end_include, 
max_scan_key_num)) {
                 _has_range_value = true;
             }
             return Status::OK();
diff --git a/be/src/vec/exec/scan/new_olap_scan_node.cpp 
b/be/src/vec/exec/scan/new_olap_scan_node.cpp
index 26a609864f..407021b891 100644
--- a/be/src/vec/exec/scan/new_olap_scan_node.cpp
+++ b/be/src/vec/exec/scan/new_olap_scan_node.cpp
@@ -17,6 +17,7 @@
 
 #include "vec/exec/scan/new_olap_scan_node.h"
 
+#include "common/status.h"
 #include "olap/storage_engine.h"
 #include "olap/tablet.h"
 #include "util/to_string.h"
@@ -135,6 +136,18 @@ static std::string olap_filters_to_string(const 
std::vector<doris::TCondition>&
     return filters_string;
 }
 
+inline std::string push_down_agg_to_string(const TPushAggOp::type& op) {
+    if (op == TPushAggOp::MINMAX) {
+        return "MINMAX";
+    } else if (op == TPushAggOp::COUNT) {
+        return "COUNT";
+    } else if (op == TPushAggOp::MIX) {
+        return "MIX";
+    } else {
+        return "NONE";
+    }
+}
+
 static std::string tablets_id_to_string(
         const std::vector<std::unique_ptr<TPaloScanRange>>& scan_ranges) {
     if (scan_ranges.empty()) {
@@ -160,57 +173,64 @@ Status NewOlapScanNode::_process_conjuncts() {
 }
 
 Status NewOlapScanNode::_build_key_ranges_and_filters() {
-    const std::vector<std::string>& column_names = 
_olap_scan_node.key_column_name;
-    const std::vector<TPrimitiveType::type>& column_types = 
_olap_scan_node.key_column_type;
-    DCHECK(column_types.size() == column_names.size());
-
-    // 1. construct scan key except last olap engine short key
-    _scan_keys.set_is_convertible(limit() == -1);
-
-    // we use `exact_range` to identify a key range is an exact range or not 
when we convert
-    // it to `_scan_keys`. If `exact_range` is true, we can just discard it 
from `_olap_filters`.
-    bool exact_range = true;
-    bool eos = false;
-    for (int column_index = 0;
-         column_index < column_names.size() && !_scan_keys.has_range_value() 
&& !eos;
-         ++column_index) {
-        auto iter = _colname_to_value_range.find(column_names[column_index]);
-        if (_colname_to_value_range.end() == iter) {
-            break;
-        }
+    if (!_olap_scan_node.__isset.push_down_agg_type_opt ||
+        _olap_scan_node.push_down_agg_type_opt == TPushAggOp::NONE) {
+        const std::vector<std::string>& column_names = 
_olap_scan_node.key_column_name;
+        const std::vector<TPrimitiveType::type>& column_types = 
_olap_scan_node.key_column_type;
+        DCHECK(column_types.size() == column_names.size());
+
+        // 1. construct scan key except last olap engine short key
+        _scan_keys.set_is_convertible(limit() == -1);
+
+        // we use `exact_range` to identify a key range is an exact range or 
not when we convert
+        // it to `_scan_keys`. If `exact_range` is true, we can just discard 
it from `_olap_filters`.
+        bool exact_range = true;
+        bool eos = false;
+        for (int column_index = 0;
+             column_index < column_names.size() && 
!_scan_keys.has_range_value() && !eos;
+             ++column_index) {
+            auto iter = 
_colname_to_value_range.find(column_names[column_index]);
+            if (_colname_to_value_range.end() == iter) {
+                break;
+            }
 
-        RETURN_IF_ERROR(std::visit(
-                [&](auto&& range) {
-                    // make a copy or range and pass to extend_scan_key, keep 
the range unchanged
-                    // because extend_scan_key method may change the first 
parameter.
-                    // but the original range may be converted to olap 
filters, if it's not a exact_range.
-                    auto temp_range = range;
-                    if (range.get_fixed_value_size() <= 
_max_pushdown_conditions_per_column) {
-                        RETURN_IF_ERROR(_scan_keys.extend_scan_key(temp_range, 
_max_scan_key_num,
-                                                                   
&exact_range, &eos));
-                        if (exact_range) {
-                            _colname_to_value_range.erase(iter->first);
+            RETURN_IF_ERROR(std::visit(
+                    [&](auto&& range) {
+                        // make a copy of range and pass to extend_scan_key, 
keep the range unchanged
+                        // because extend_scan_key method may change the first 
parameter.
+                        // but the original range may be converted to olap 
filters, if it's not an exact_range.
+                        auto temp_range = range;
+                        if (range.get_fixed_value_size() <= 
_max_pushdown_conditions_per_column) {
+                            RETURN_IF_ERROR(_scan_keys.extend_scan_key(
+                                    temp_range, _max_scan_key_num, 
&exact_range, &eos));
+                            if (exact_range) {
+                                _colname_to_value_range.erase(iter->first);
+                            }
                         }
-                    }
-                    return Status::OK();
-                },
-                iter->second));
-    }
-    _eos |= eos;
+                        return Status::OK();
+                    },
+                    iter->second));
+        }
+        _eos |= eos;
 
-    for (auto& iter : _colname_to_value_range) {
-        std::vector<TCondition> filters;
-        std::visit([&](auto&& range) { range.to_olap_filter(filters); }, 
iter.second);
+        for (auto& iter : _colname_to_value_range) {
+            std::vector<TCondition> filters;
+            std::visit([&](auto&& range) { range.to_olap_filter(filters); }, 
iter.second);
 
-        for (const auto& filter : filters) {
-            _olap_filters.push_back(filter);
+            for (const auto& filter : filters) {
+                _olap_filters.push_back(filter);
+            }
         }
-    }
 
-    // Append value ranges in "_not_in_value_ranges"
-    for (auto& range : _not_in_value_ranges) {
-        std::visit([&](auto&& the_range) { 
the_range.to_in_condition(_olap_filters, false); },
-                   range);
+        // Append value ranges in "_not_in_value_ranges"
+        for (auto& range : _not_in_value_ranges) {
+            std::visit([&](auto&& the_range) { 
the_range.to_in_condition(_olap_filters, false); },
+                       range);
+        }
+    } else {
+        _runtime_profile->add_info_string(
+                "PushDownAggregate",
+                
push_down_agg_to_string(_olap_scan_node.push_down_agg_type_opt));
     }
 
     if (_state->enable_profile()) {
diff --git a/be/test/exec/olap_common_test.cpp 
b/be/test/exec/olap_common_test.cpp
index 7ed520e84c..adf152bf0f 100644
--- a/be/test/exec/olap_common_test.cpp
+++ b/be/test/exec/olap_common_test.cpp
@@ -630,9 +630,9 @@ TEST_F(OlapScanKeysTest, EachtypeTest) {
         EXPECT_EQ(exact_range, true);
         scan_keys.get_key_range(&key_range);
         // contain null, [-128, 127]
-        EXPECT_EQ(key_range.size(), 1);
-        EXPECT_EQ(OlapScanKeys::to_print_key(key_range[0]->begin_scan_range), 
"null(-128)");
-        EXPECT_EQ(OlapScanKeys::to_print_key(key_range[0]->end_scan_range), 
"127");
+        EXPECT_EQ(key_range.size(), 257);
+        EXPECT_EQ(OlapScanKeys::to_print_key(key_range[1]->begin_scan_range), 
"-128");
+        EXPECT_EQ(OlapScanKeys::to_print_key(key_range[256]->end_scan_range), 
"127");
 
         EXPECT_TRUE(range.add_range(FILTER_LESS, 50).ok());
         scan_keys.clear();
@@ -655,9 +655,9 @@ TEST_F(OlapScanKeysTest, EachtypeTest) {
         EXPECT_TRUE(scan_keys.extend_scan_key(range, max_scan_key, 
&exact_range, &eos).ok());
         EXPECT_EQ(exact_range, true);
         scan_keys.get_key_range(&key_range);
-        EXPECT_EQ(key_range.size(), 1);
-        EXPECT_EQ(OlapScanKeys::to_print_key(key_range[0]->begin_scan_range), 
"null(-32768)");
-        EXPECT_EQ(OlapScanKeys::to_print_key(key_range[0]->end_scan_range), 
"32767");
+        EXPECT_EQ(key_range.size(), 49);
+        EXPECT_EQ(OlapScanKeys::to_print_key(key_range[1]->begin_scan_range), 
"-32768");
+        
EXPECT_EQ(OlapScanKeys::to_print_key(key_range[max_scan_key]->end_scan_range), 
"32767");
 
         EXPECT_TRUE(range.add_range(FILTER_LARGER, 0).ok());
         scan_keys.clear();
@@ -678,7 +678,7 @@ TEST_F(OlapScanKeysTest, EachtypeTest) {
         scan_keys.get_key_range(&key_range);
 
         EXPECT_EQ(key_range.size(), max_scan_key);
-        EXPECT_EQ(OlapScanKeys::to_print_key(key_range[0]->begin_scan_range), 
"1");
+        EXPECT_EQ(OlapScanKeys::to_print_key(key_range[0]->begin_scan_range), 
"2");
         EXPECT_EQ(OlapScanKeys::to_print_key(key_range[max_scan_key - 
1]->end_scan_range), "32765");
     }
 }
diff --git 
a/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions.out
 
b/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions.out
index 41a9dda202..a66a5bc843 100644
--- 
a/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions.out
+++ 
b/regression-test/data/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions.out
@@ -223,5 +223,19 @@ beijing chengdu shanghai
 2
 
 -- !select44 --
-6      3975    2003    33035710        25819.948       78965.368       
4449.5830001831055
+6      3975    2003    33035710        25819.948000000 78965.368       
4449.5830001831055
+
+-- !select45 --
+1      10
+2      8
+2      441
+3      10
+5      29
+6      101
+
+-- !select46 --
+5      29
+
+-- !select47 --
+6
 
diff --git 
a/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions.groovy
 
b/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions.groovy
index 29efba378f..bfdaf70d67 100644
--- 
a/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions.groovy
+++ 
b/regression-test/suites/query_p0/sql_functions/aggregate_functions/test_aggregate_all_functions.groovy
@@ -469,5 +469,11 @@ suite("test_aggregate_all_functions") {
        
     sql "DROP TABLE IF EXISTS ${tableName_10}"
 
-    qt_select44 """ select sum(distinct k1), sum(distinct k2), sum(distinct 
k3), sum(distinct cast(k4 as largeint)), sum(distinct k5), sum(distinct k8), 
sum(distinct k9) from test_query_db.test  """
+    qt_select44 """select sum(distinct k1), sum(distinct k2), sum(distinct 
k3), sum(distinct cast(k4 as largeint)), sum(distinct k5), sum(distinct k8), 
sum(distinct k9) from test_query_db.test  """
+
+    qt_select45 """select * from ${tableName_12} order by id,level"""
+
+    qt_select46 """select * from ${tableName_12} where id>=5 and id <=5 and 
level >10  order by id,level;"""
+
+    qt_select47 """select count(*) from ${tableName_12}"""
 }
diff --git a/run-be-ut.sh b/run-be-ut.sh
index fdbd59fd01..f66d65a64e 100755
--- a/run-be-ut.sh
+++ b/run-be-ut.sh
@@ -72,7 +72,7 @@ fi
 
 eval set -- "${OPTS}"
 
-PARALLEL="$(($(nproc) / 2 + 1))"
+PARALLEL="$(($(nproc) / 5 + 1))"
 
 CLEAN=0
 RUN=0


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to