This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new dabd27edd26 [opt](inverted index) performance optimization for need_read_data in compound  #35346 #36292 (#36404)
dabd27edd26 is described below

commit dabd27edd26f7bb857b03dad9e0521b27fd82267
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Thu Jun 20 08:43:16 2024 +0800

    [opt](inverted index) performance optimization for need_read_data in compound  #35346 #36292 (#36404)
    
    pick from master
    https://github.com/apache/doris/pull/35346
    https://github.com/apache/doris/pull/36292
---
 be/src/olap/rowset/segment_v2/segment_iterator.cpp | 54 ++++++++++---
 .../test_need_read_data_fault_injection.out        | 22 +++++
 .../data/inverted_index_p0/test_need_read_data.out | 15 ++++
 .../test_need_read_data_fault_injection.groovy     | 94 ++++++++++++++++++++++
 .../inverted_index_p0/test_need_read_data.groovy   | 82 +++++++++++++++++++
 5 files changed, 254 insertions(+), 13 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index c31ac3c659a..f93d6264058 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -359,6 +359,34 @@ Status SegmentIterator::_init_impl(const StorageReadOptions& opts) {
             _storage_name_and_type[i] = std::make_pair(field_name, storage_type);
         }
     }
+
+    // find columns that definitely require reading data, such as functions that are not pushed down.
+    {
+        std::set<std::string> push_down_preds;
+        for (auto* pred : _col_predicates) {
+            if (!_check_apply_by_inverted_index(pred)) {
+                continue;
+            }
+            push_down_preds.insert(_gen_predicate_result_sign(pred));
+        }
+        for (auto* pred : _col_preds_except_leafnode_of_andnode) {
+            if (!_check_apply_by_inverted_index(pred)) {
+                continue;
+            }
+            push_down_preds.insert(_gen_predicate_result_sign(pred));
+        }
+        for (auto& preds_in_remaining_vconjuct : _column_pred_in_remaining_vconjunct) {
+            const auto& column_name = preds_in_remaining_vconjuct.first;
+            for (auto& pred_info : preds_in_remaining_vconjuct.second) {
+                auto column_sign = _gen_predicate_result_sign(&pred_info);
+                if (!push_down_preds.contains(column_sign)) {
+                    auto cid = _opts.tablet_schema->field_index(column_name);
+                    _need_read_data_indices[cid] = true;
+                }
+            }
+        }
+    }
+
     return Status::OK();
 }
 
@@ -891,6 +919,7 @@ Status SegmentIterator::_apply_inverted_index_except_leafnode_of_andnode(
 
 Status SegmentIterator::_apply_index_except_leafnode_of_andnode() {
     for (auto* pred : _col_preds_except_leafnode_of_andnode) {
+        auto column_id = pred->column_id();
         auto pred_type = pred->type();
         bool is_support = pred_type == PredicateType::EQ || pred_type == PredicateType::NE ||
                           pred_type == PredicateType::LT || pred_type == PredicateType::LE ||
@@ -899,6 +928,7 @@ Status SegmentIterator::_apply_index_except_leafnode_of_andnode() {
                           pred_type == PredicateType::IN_LIST ||
                           pred_type == PredicateType::NOT_IN_LIST;
         if (!is_support) {
+            _need_read_data_indices[column_id] = true;
             continue;
         }
 
@@ -908,16 +938,17 @@ Status SegmentIterator::_apply_index_except_leafnode_of_andnode() {
         if (can_apply_by_inverted_index) {
             res = _apply_inverted_index_except_leafnode_of_andnode(pred, &bitmap);
         } else {
+            _need_read_data_indices[column_id] = true;
             continue;
         }
 
-        bool need_remaining_after_evaluate = _column_has_fulltext_index(pred->column_id()) &&
+        bool need_remaining_after_evaluate = _column_has_fulltext_index(column_id) &&
                                              PredicateTypeTraits::is_equal_or_list(pred_type);
         if (!res.ok()) {
             if (_downgrade_without_index(res, need_remaining_after_evaluate)) {
                 // downgrade without index query
-                _not_apply_index_pred.insert(pred->column_id());
-                _need_read_data_indices[pred->column_id()] = true;
+                _not_apply_index_pred.insert(column_id);
+                _need_read_data_indices[column_id] = true;
                 continue;
             }
             LOG(WARNING) << "failed to evaluate index"
@@ -928,17 +959,10 @@ Status SegmentIterator::_apply_index_except_leafnode_of_andnode() {
 
         std::string pred_result_sign = _gen_predicate_result_sign(pred);
         _rowid_result_for_index.emplace(pred_result_sign, std::make_pair(true, std::move(bitmap)));
-    }
 
-    for (auto* pred : _col_preds_except_leafnode_of_andnode) {
-        auto column_name = _schema->column(pred->column_id())->name();
-        if (!_remaining_conjunct_roots.empty() &&
-            _check_column_pred_all_push_down(column_name, true,
-                                             pred->type() == PredicateType::MATCH) &&
-            !pred->predicate_params()->marked_by_runtime_filter) {
-            // if column's need_read_data already set true, we can not set it to false now.
-            if (_need_read_data_indices.find(pred->column_id()) == _need_read_data_indices.end()) {
-                _need_read_data_indices[pred->column_id()] = false;
+        if (!pred->predicate_params()->marked_by_runtime_filter) {
+            if (!_need_read_data_indices.contains(column_id)) {
+                _need_read_data_indices[column_id] = false;
             }
         }
     }
@@ -1928,6 +1952,10 @@ Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32
             continue;
         }
 
+        DBUG_EXECUTE_IF("segment_iterator._read_columns_by_index", {
+            return Status::Error<ErrorCode::INTERNAL_ERROR>("{} does not need to read data");
+        })
+
         if (is_continuous) {
             size_t rows_read = nrows_read;
             _opts.stats->block_first_read_seek_num += 1;
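
For readers skimming the hunks above: the first hunk records a "result sign"
for every predicate the inverted index can fully answer, then marks any column
that still carries a remaining conjunct outside that set (for example, a
function that was not pushed down) as needing its data pages; the later hunks
set the same flag whenever an index lookup is unsupported or fails. A minimal
standalone sketch of that shape, using simplified stand-in types rather than
Doris's real predicate and schema classes:

    #include <cstdint>
    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    // Simplified stand-ins for Doris's predicate/schema types; illustrative only.
    struct Pred {
        int32_t column_id;        // column the predicate touches
        std::string result_sign;  // unique key for this predicate's index result
        bool index_applicable;    // can the inverted index answer it alone?
    };

    // Columns with a remaining conjunct whose sign was never produced by an
    // index-applicable predicate must still read their data pages.
    std::map<int32_t, bool> mark_need_read_data(
            const std::vector<Pred>& pushable_preds,
            const std::map<int32_t, std::vector<Pred>>& remaining_conjuncts) {
        std::set<std::string> push_down_signs;
        for (const auto& p : pushable_preds) {
            if (p.index_applicable) {
                push_down_signs.insert(p.result_sign);
            }
        }
        std::map<int32_t, bool> need_read_data;
        for (const auto& [cid, preds] : remaining_conjuncts) {
            for (const auto& p : preds) {
                if (!push_down_signs.contains(p.result_sign)) {
                    need_read_data[cid] = true;  // e.g. a function not pushed down
                }
            }
        }
        return need_read_data;
    }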
diff --git a/regression-test/data/fault_injection_p0/test_need_read_data_fault_injection.out b/regression-test/data/fault_injection_p0/test_need_read_data_fault_injection.out
new file mode 100644
index 00000000000..37885e404d3
--- /dev/null
+++ b/regression-test/data/fault_injection_p0/test_need_read_data_fault_injection.out
@@ -0,0 +1,22 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql --
+863
+
+-- !sql --
+210
+
+-- !sql --
+0
+
+-- !sql --
+819
+
+-- !sql --
+199
+
+-- !sql --
+713
+
+-- !sql --
+18
+
diff --git a/regression-test/data/inverted_index_p0/test_need_read_data.out b/regression-test/data/inverted_index_p0/test_need_read_data.out
new file mode 100644
index 00000000000..d4ea5870e3e
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_need_read_data.out
@@ -0,0 +1,15 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql --
+8      \N
+4      -10
+13     -4
+2      1
+3      2
+3      3
+5      4
+5      5
+1      6
+1      7
+4      8
+1      9
+
diff --git a/regression-test/suites/fault_injection_p0/test_need_read_data_fault_injection.groovy b/regression-test/suites/fault_injection_p0/test_need_read_data_fault_injection.groovy
new file mode 100644
index 00000000000..d7a92f8e7e9
--- /dev/null
+++ b/regression-test/suites/fault_injection_p0/test_need_read_data_fault_injection.groovy
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_need_read_data_fault_injection", "nonConcurrent") {
+    // define a sql table
+    def indexTbName = "test_need_read_data_fault_injection"
+
+    sql "DROP TABLE IF EXISTS ${indexTbName}"
+    sql """
+      CREATE TABLE ${indexTbName} (
+        `@timestamp` int(11) NULL COMMENT "",
+        `clientip` varchar(20) NULL COMMENT "",
+        `request` text NULL COMMENT "",
+        `status` int(11) NULL COMMENT "",
+        `size` int(11) NULL COMMENT "",
+        INDEX clientip_idx (`clientip`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '',
+        INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`@timestamp`)
+      COMMENT "OLAP"
+      DISTRIBUTED BY RANDOM BUCKETS 1
+      PROPERTIES (
+        "replication_allocation" = "tag.location.default: 1",
+        "disable_auto_compaction" = "true"
+      );
+    """
+
+    def load_httplogs_data = {table_name, label, read_flag, format_flag, file_name, ignore_failure=false,
+                        expected_succ_rows = -1, load_to_single_tablet = 'true' ->
+
+        // load the json data
+        streamLoad {
+            table "${table_name}"
+
+            // set http request header params
+            set 'label', label + "_" + UUID.randomUUID().toString()
+            set 'read_json_by_line', read_flag
+            set 'format', format_flag
+            file file_name // import json file
+            time 10000 // limit inflight 10s
+            if (expected_succ_rows >= 0) {
+                set 'max_filter_ratio', '1'
+            }
+
+            // if a check callback is declared, the default check condition is ignored,
+            // so you must check all conditions yourself
+            check { result, exception, startTime, endTime ->
+                    if (ignore_failure && expected_succ_rows < 0) { return }
+                    if (exception != null) {
+                        throw exception
+                    }
+                    log.info("Stream load result: ${result}".toString())
+                    def json = parseJson(result)
+            }
+        }
+    }
+
+    try {
+      load_httplogs_data.call(indexTbName, 'test_need_read_data_fault_injection', 'true', 'json', 'documents-1000.json')
+
+      sql "sync"
+
+      try {
+        GetDebugPoint().enableDebugPointForAllBEs("segment_iterator._read_columns_by_index")
+
+        qt_sql """ select count() from ${indexTbName} where (request 
match_phrase 'hm' or request match_phrase 'jpg' or request match_phrase 'gif'); 
"""
+        qt_sql """ select count() from ${indexTbName} where (request 
match_phrase 'hm' or request match_phrase 'jpg' and request match_phrase 
'gif'); """
+        qt_sql """ select count() from ${indexTbName} where (request 
match_phrase 'hm' and request match_phrase 'jpg' and request match_phrase 
'gif'); """
+        qt_sql """ select count() from ${indexTbName} where (request 
match_phrase 'hm' and request match_phrase 'jpg' or request match_phrase 
'gif'); """
+
+        qt_sql """ select count() from ${indexTbName} where (clientip match 
'1' or request match 'jpg' or clientip match '2'); """
+        qt_sql """ select count() from ${indexTbName} where (clientip match 
'3' or request match 'gif' or clientip match '4'); """
+        qt_sql """ select count() from ${indexTbName} where (clientip match 
'images' or clientip match '5' or clientip match 'english'); """
+
+      } finally {
+        GetDebugPoint().disableDebugPointForAllBEs("segment_iterator._read_columns_by_index")
+      }
+    } finally {
+    }
+}
\ No newline at end of file
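
For context on the suite above: GetDebugPoint().enableDebugPointForAllBEs(...)
activates the matching DBUG_EXECUTE_IF site added in segment_iterator.cpp, so
if any of these compound queries fell back to reading column data, the
injected Status::Error would surface as a query failure instead of a silent
extra read. A minimal sketch of that pattern; DebugPointRegistry and
SKETCH_EXECUTE_IF below are illustrative stand-ins, not the actual Doris
debug-point API:

    #include <iostream>
    #include <string>
    #include <unordered_set>

    // Hypothetical stand-in for Doris's debug-point registry (the real one
    // lives in be/src/util/debug_points.h); names here are illustrative only.
    class DebugPointRegistry {
    public:
        void enable(const std::string& name) { _enabled.insert(name); }
        void disable(const std::string& name) { _enabled.erase(name); }
        bool is_enabled(const std::string& name) const { return _enabled.count(name) > 0; }
    private:
        std::unordered_set<std::string> _enabled;
    };

    // Sketch of the DBUG_EXECUTE_IF idea: the block runs only while a test
    // has enabled the named point, turning a branch that should be
    // unreachable (here: a column read that need_read_data marked as
    // unnecessary) into a visible failure.
    #define SKETCH_EXECUTE_IF(registry, name, action) \
        do {                                          \
            if ((registry).is_enabled(name)) {        \
                action;                               \
            }                                         \
        } while (0)

    int main() {
        DebugPointRegistry registry;
        registry.enable("segment_iterator._read_columns_by_index");
        SKETCH_EXECUTE_IF(registry, "segment_iterator._read_columns_by_index",
                          std::cerr << "unexpected data read\n");
        registry.disable("segment_iterator._read_columns_by_index");
        return 0;
    }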
diff --git a/regression-test/suites/inverted_index_p0/test_need_read_data.groovy b/regression-test/suites/inverted_index_p0/test_need_read_data.groovy
new file mode 100644
index 00000000000..86993d81e03
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_need_read_data.groovy
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_need_read_data", "p0"){
+    def indexTbName1 = "test_need_read_data"
+
+    sql "DROP TABLE IF EXISTS ${indexTbName1}"
+
+    sql """
+      create table ${indexTbName1} (
+      col_int_undef_signed_not_null int  not null  ,
+      col_bigint_undef_signed_not_null_index_inverted bigint  not null  ,
+      col_bigint_undef_signed_not_null bigint  not null  ,
+      col_int_undef_signed int  null  ,
+      col_int_undef_signed_index_inverted int  null  ,
+      col_int_undef_signed_not_null_index_inverted int  not null  ,
+      col_bigint_undef_signed bigint  null  ,
+      col_bigint_undef_signed_index_inverted bigint  null  ,
+      col_date_undef_signed date  null  ,
+      col_date_undef_signed_index_inverted date  null  ,
+      col_date_undef_signed_not_null date  not null  ,
+      col_date_undef_signed_not_null_index_inverted date  not null  ,
+      col_varchar_10__undef_signed varchar(10)  null  ,
+      col_varchar_10__undef_signed_index_inverted varchar(10)  null  ,
+      col_varchar_10__undef_signed_not_null varchar(10)  not null  ,
+      col_varchar_10__undef_signed_not_null_index_inverted varchar(10)  not null  ,
+      col_varchar_1024__undef_signed varchar(1024)  null  ,
+      col_varchar_1024__undef_signed_index_inverted varchar(1024)  null  ,
+      col_varchar_1024__undef_signed_not_null varchar(1024)  not null  ,
+      col_varchar_1024__undef_signed_not_null_index_inverted varchar(1024)  not null  ,
+      pk int,
+      INDEX col_int_undef_signed_index_inverted_idx (`col_int_undef_signed_index_inverted`) USING INVERTED,
+      INDEX col_int_undef_signed_not_null_index_inverted_idx (`col_int_undef_signed_not_null_index_inverted`) USING INVERTED,
+      INDEX col_bigint_undef_signed_index_inverted_idx (`col_bigint_undef_signed_index_inverted`) USING INVERTED,
+      INDEX col_bigint_undef_signed_not_null_index_inverted_idx (`col_bigint_undef_signed_not_null_index_inverted`) USING INVERTED,
+      INDEX col_date_undef_signed_index_inverted_idx (`col_date_undef_signed_index_inverted`) USING INVERTED,
+      INDEX col_date_undef_signed_not_null_index_inverted_idx (`col_date_undef_signed_not_null_index_inverted`) USING INVERTED,
+      INDEX col_varchar_10__undef_signed_index_inverted_idx (`col_varchar_10__undef_signed_index_inverted`) USING INVERTED,
+      INDEX col_varchar_10__undef_signed_not_null_index_inverted_idx (`col_varchar_10__undef_signed_not_null_index_inverted`) USING INVERTED,
+      INDEX col_varchar_1024__undef_signed_index_inverted_idx (`col_varchar_1024__undef_signed_index_inverted`) USING INVERTED,
+      INDEX col_varchar_1024__undef_signed_not_null_index_inverted_idx (`col_varchar_1024__undef_signed_not_null_index_inverted`) USING INVERTED
+      ) engine=olap
+      UNIQUE KEY(col_int_undef_signed_not_null, col_bigint_undef_signed_not_null_index_inverted, col_bigint_undef_signed_not_null)
+      PARTITION BY RANGE(col_int_undef_signed_not_null) (
+                      PARTITION p0 VALUES LESS THAN ('4'),
+                      PARTITION p1 VALUES LESS THAN ('6'),
+                      PARTITION p2 VALUES LESS THAN ('7'),
+                      PARTITION p3 VALUES LESS THAN ('8'),
+                      PARTITION p4 VALUES LESS THAN ('10'),
+                      PARTITION p5 VALUES LESS THAN ('1147483647'),
+                      PARTITION p100 VALUES LESS THAN ('2147483647')
+                  )
+      distributed by hash(col_bigint_undef_signed_not_null)
+      properties("enable_unique_key_merge_on_write" = "true", 
"replication_num" = "1");
+    """
+
+    try {
+        sql """ insert into 
${indexTbName1}(pk,col_int_undef_signed,col_int_undef_signed_index_inverted,col_int_undef_signed_not_null,col_int_undef_signed_not_null_index_inverted,col_bigint_undef_signed,col_bigint_undef_signed_index_inverted,col_bigint_undef_signed_not_null,col_bigint_undef_signed_not_null_index_inverted,col_date_undef_signed,col_date_undef_signed_index_inverted,col_date_undef_signed_not_null,col_date_undef_signed_not_null_index_inverted,col_varchar_10__undef_signed,col_
 [...]
+
+        sql "sync"
+
+        qt_sql """ SELECT COUNT( *) AS field1, col_int_undef_signed AS field2 
FROM ${indexTbName1} WHERE( col_date_undef_signed_not_null_index_inverted == 
'2024-01-01' OR day( col_date_undef_signed_not_null_index_inverted ) !=0 ) 
GROUP BY field2 ORDER BY field2; """
+
+    } finally {
+        //try_sql("DROP TABLE IF EXISTS ${testTable}")
+    }
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org
