This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 0c8fd93da3c [fix](inverted index) fix multi match result error (#38931)
0c8fd93da3c is described below

commit 0c8fd93da3cd3e8d077a99b8a1517fdb1bb16f50
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Wed Aug 7 21:49:04 2024 +0800

    [fix](inverted index) fix multi match result error (#38931)
    
    1. multi_match result merging logic is incorrect
---
 be/src/olap/rowset/segment_v2/segment_iterator.cpp |  14 +--
 be/src/olap/rowset/segment_v2/segment_iterator.h   |   2 +-
 be/src/vec/exprs/vectorized_fn_call.cpp            |   5 +-
 be/src/vec/exprs/vectorized_fn_call.h              |   3 +-
 be/src/vec/exprs/vexpr.h                           |   3 +-
 be/src/vec/functions/function.h                    |  10 +-
 be/src/vec/functions/function_multi_match.cpp      |  33 +++---
 be/src/vec/functions/function_multi_match.h        |   4 +-
 .../inverted_index_p0/test_index_multi_match.out   |  25 +++++
 .../test_index_multi_match.groovy                  | 124 +++++++++++++++++++++
 10 files changed, 187 insertions(+), 36 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp 
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 0473ff128fc..db27bc45405 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -1476,13 +1476,12 @@ Status 
SegmentIterator::_init_inverted_index_iterators() {
 }
 
 Status SegmentIterator::_init_inverted_index_iterators(ColumnId cid) {
+    std::lock_guard lock(_idx_init_lock);
     if (_inverted_index_iterators[cid] == nullptr) {
-        return _init_single_inverted_index_iterator.call([&] {
-            return _segment->new_inverted_index_iterator(
-                    _opts.tablet_schema->column(cid),
-                    
_segment->_tablet_schema->get_inverted_index(_opts.tablet_schema->column(cid)),
-                    _opts, &_inverted_index_iterators[cid]);
-        });
+        return _segment->new_inverted_index_iterator(
+                _opts.tablet_schema->column(cid),
+                
_segment->_tablet_schema->get_inverted_index(_opts.tablet_schema->column(cid)),
+                _opts, &_inverted_index_iterators[cid]);
     }
     return Status::OK();
 }
@@ -3066,9 +3065,8 @@ Status SegmentIterator::execute_func_expr(const 
vectorized::VExprSPtr& expr,
     params._unique_id = _schema->unique_id(slot_expr->column_id());
     params._column_name = 
_opts.tablet_schema->column(params._column_id).name();
     params._segment_iterator = this;
-    params.result = result;
 
-    return expr->eval_inverted_index(expr_ctx.get(), params);
+    return expr->eval_inverted_index(expr_ctx.get(), params, result);
 }
 
 } // namespace segment_v2
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h 
b/be/src/olap/rowset/segment_v2/segment_iterator.h
index f163376d95f..8056036bc98 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.h
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.h
@@ -532,7 +532,7 @@ private:
     std::unordered_map<int, std::unordered_map<std::string, bool>>
             _column_predicate_inverted_index_status;
 
-    DorisCallOnce<Status> _init_single_inverted_index_iterator;
+    std::mutex _idx_init_lock;
 };
 
 } // namespace segment_v2
diff --git a/be/src/vec/exprs/vectorized_fn_call.cpp 
b/be/src/vec/exprs/vectorized_fn_call.cpp
index 7be01469db7..591d72d6f26 100644
--- a/be/src/vec/exprs/vectorized_fn_call.cpp
+++ b/be/src/vec/exprs/vectorized_fn_call.cpp
@@ -263,8 +263,9 @@ bool VectorizedFnCall::can_fast_execute() const {
 }
 
 Status VectorizedFnCall::eval_inverted_index(VExprContext* context,
-                                             segment_v2::FuncExprParams& 
params) {
-    return 
_function->eval_inverted_index(context->fn_context(_fn_context_index), params);
+                                             segment_v2::FuncExprParams& 
params,
+                                             
std::shared_ptr<roaring::Roaring>& result) {
+    return 
_function->eval_inverted_index(context->fn_context(_fn_context_index), params, 
result);
 }
 
 bool VectorizedFnCall::equals(const VExpr& other) {
diff --git a/be/src/vec/exprs/vectorized_fn_call.h 
b/be/src/vec/exprs/vectorized_fn_call.h
index 0fa41b88522..02d843b1795 100644
--- a/be/src/vec/exprs/vectorized_fn_call.h
+++ b/be/src/vec/exprs/vectorized_fn_call.h
@@ -75,7 +75,8 @@ public:
 
     bool can_push_down_to_index() const override;
     bool can_fast_execute() const override;
-    Status eval_inverted_index(VExprContext* context, 
segment_v2::FuncExprParams& params) override;
+    Status eval_inverted_index(VExprContext* context, 
segment_v2::FuncExprParams& params,
+                               std::shared_ptr<roaring::Roaring>& result) 
override;
     bool equals(const VExpr& other) override;
 
 protected:
diff --git a/be/src/vec/exprs/vexpr.h b/be/src/vec/exprs/vexpr.h
index ee2a221866a..4b13f635f78 100644
--- a/be/src/vec/exprs/vexpr.h
+++ b/be/src/vec/exprs/vexpr.h
@@ -241,7 +241,8 @@ public:
 
     virtual bool can_push_down_to_index() const { return false; }
     virtual bool can_fast_execute() const { return false; }
-    virtual Status eval_inverted_index(VExprContext* context, 
segment_v2::FuncExprParams& params) {
+    virtual Status eval_inverted_index(VExprContext* context, 
segment_v2::FuncExprParams& params,
+                                       std::shared_ptr<roaring::Roaring>& 
result) {
         return Status::NotSupported("Not supported 
execute_with_inverted_index");
     }
     virtual bool equals(const VExpr& other);
diff --git a/be/src/vec/functions/function.h b/be/src/vec/functions/function.h
index f4243875d64..149b1f6e9d5 100644
--- a/be/src/vec/functions/function.h
+++ b/be/src/vec/functions/function.h
@@ -235,8 +235,8 @@ public:
     }
 
     virtual bool can_push_down_to_index() const { return false; }
-    virtual Status eval_inverted_index(FunctionContext* context,
-                                       segment_v2::FuncExprParams& params) {
+    virtual Status eval_inverted_index(FunctionContext* context, 
segment_v2::FuncExprParams& params,
+                                       std::shared_ptr<roaring::Roaring>& 
result) {
         return Status::NotSupported("eval_inverted_index is not supported in 
function: ",
                                     get_name());
     }
@@ -543,9 +543,9 @@ public:
     }
 
     bool can_push_down_to_index() const override { return 
function->can_push_down_to_index(); }
-    Status eval_inverted_index(FunctionContext* context,
-                               segment_v2::FuncExprParams& params) override {
-        return function->eval_inverted_index(context, params);
+    Status eval_inverted_index(FunctionContext* context, 
segment_v2::FuncExprParams& params,
+                               std::shared_ptr<roaring::Roaring>& result) 
override {
+        return function->eval_inverted_index(context, params, result);
     }
 
 private:
diff --git a/be/src/vec/functions/function_multi_match.cpp 
b/be/src/vec/functions/function_multi_match.cpp
index d4ff5c10dca..ba7fa887f19 100644
--- a/be/src/vec/functions/function_multi_match.cpp
+++ b/be/src/vec/functions/function_multi_match.cpp
@@ -75,7 +75,11 @@ Status FunctionMultiMatch::open(FunctionContext* context,
                         field_names_str.end());
                 std::vector<std::string> field_names;
                 boost::split(field_names, field_names_str, 
boost::algorithm::is_any_of(","));
-                state->fields.insert(field_names.begin(), field_names.end());
+                for (const auto& field_name : field_names) {
+                    if (!field_name.empty()) {
+                        state->fields.insert(field_name);
+                    }
+                }
             } break;
             case 2:
                 state->type = const_data.to_string();
@@ -93,7 +97,8 @@ Status FunctionMultiMatch::open(FunctionContext* context,
 }
 
 Status FunctionMultiMatch::eval_inverted_index(FunctionContext* context,
-                                               segment_v2::FuncExprParams& 
params) {
+                                               segment_v2::FuncExprParams& 
params,
+                                               
std::shared_ptr<roaring::Roaring>& result) {
     auto* match_param = reinterpret_cast<MatchParam*>(
             context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
     if (match_param == nullptr) {
@@ -106,7 +111,6 @@ Status 
FunctionMultiMatch::eval_inverted_index(FunctionContext* context,
     const auto& tablet_schema = opts.tablet_schema;
 
     std::vector<ColumnId> columns_ids;
-
     for (const auto& column_name : match_param->fields) {
         auto cid = tablet_schema->field_index(column_name);
         if (cid < 0) {
@@ -148,14 +152,14 @@ Status 
FunctionMultiMatch::eval_inverted_index(FunctionContext* context,
     auto* cache = InvertedIndexQueryCache::instance();
     InvertedIndexQueryCacheHandle cache_handler;
     if (cache->lookup(cache_key, &cache_handler)) {
-        params.result = cache_handler.get_bitmap();
+        result = cache_handler.get_bitmap();
         return Status::OK();
     }
 
     // search
-    bool first = true;
     for (const auto& column_name : match_param->fields) {
         auto cid = tablet_schema->field_index(column_name);
+        const auto& column = *DORIS_TRY(tablet_schema->column(column_name));
 
         auto& index_iterator = 
segment_iterator->inverted_index_iterators()[cid];
         if (!index_iterator) {
@@ -163,19 +167,16 @@ Status 
FunctionMultiMatch::eval_inverted_index(FunctionContext* context,
         }
         const auto& index_reader = index_iterator->reader();
 
-        auto result = std::make_shared<roaring::Roaring>();
-        RETURN_IF_ERROR(index_reader->query(opts.stats, opts.runtime_state, 
column_name,
-                                            match_param->query.data(), 
query_type, result));
-        if (first) {
-            (*params.result).swap(*result);
-            first = false;
-        } else {
-            (*params.result) |= (*result);
-        }
+        auto single_result = std::make_shared<roaring::Roaring>();
+        StringRef query_value(match_param->query.data());
+        RETURN_IF_ERROR(index_reader->query(opts.stats, opts.runtime_state,
+                                            
std::to_string(column.unique_id()), &query_value,
+                                            query_type, single_result));
+        (*result) |= (*single_result);
     }
 
-    params.result->runOptimize();
-    cache->insert(cache_key, params.result, &cache_handler);
+    result->runOptimize();
+    cache->insert(cache_key, result, &cache_handler);
 
     return Status::OK();
 }
diff --git a/be/src/vec/functions/function_multi_match.h 
b/be/src/vec/functions/function_multi_match.h
index 55e13778ab7..b7d2bd3c30e 100644
--- a/be/src/vec/functions/function_multi_match.h
+++ b/be/src/vec/functions/function_multi_match.h
@@ -63,8 +63,8 @@ public:
 
     bool can_push_down_to_index() const override { return true; }
 
-    Status eval_inverted_index(FunctionContext* context,
-                               segment_v2::FuncExprParams& params) override;
+    Status eval_inverted_index(FunctionContext* context, 
segment_v2::FuncExprParams& params,
+                               std::shared_ptr<roaring::Roaring>& result) 
override;
 };
 
 } // namespace doris::vectorized
diff --git a/regression-test/data/inverted_index_p0/test_index_multi_match.out 
b/regression-test/data/inverted_index_p0/test_index_multi_match.out
new file mode 100644
index 00000000000..0a2ed2730b4
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_index_multi_match.out
@@ -0,0 +1,25 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !sql --
+178
+
+-- !sql --
+180
+
+-- !sql --
+859
+
+-- !sql --
+44
+
+-- !sql --
+178
+
+-- !sql --
+180
+
+-- !sql --
+859
+
+-- !sql --
+44
+
diff --git 
a/regression-test/suites/inverted_index_p0/test_index_multi_match.groovy 
b/regression-test/suites/inverted_index_p0/test_index_multi_match.groovy
new file mode 100644
index 00000000000..f08dd984a67
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_index_multi_match.groovy
@@ -0,0 +1,124 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_index_multi_match", "p0"){
+    def indexTbName1 = "test_index_multi_match_1"
+    def indexTbName2 = "test_index_multi_match_2"
+
+    sql "DROP TABLE IF EXISTS ${indexTbName1}"
+    sql "DROP TABLE IF EXISTS ${indexTbName2}"
+
+    sql """
+      CREATE TABLE ${indexTbName1} (
+        `@timestamp` int(11) NULL COMMENT "",
+        `clientip` text NULL COMMENT "",
+        `request` text NULL COMMENT "",
+        `status` text NULL COMMENT "",
+        `size` text NULL COMMENT "",
+        INDEX clientip_idx (`clientip`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT '',
+        INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT '',
+        INDEX status_idx (`status`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT '',
+        INDEX size_idx (`size`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`@timestamp`)
+      COMMENT "OLAP"
+      DISTRIBUTED BY RANDOM BUCKETS 1
+      PROPERTIES (
+        "replication_allocation" = "tag.location.default: 1",
+        "disable_auto_compaction" = "true"
+      );
+    """
+
+    sql """
+      CREATE TABLE ${indexTbName2} (
+        `@timestamp` int(11) NULL COMMENT "",
+        `clientip` text NULL COMMENT "",
+        `request` text NULL COMMENT "",
+        `status` text NULL COMMENT "",
+        `size` text NULL COMMENT "",
+        INDEX clientip_idx (`clientip`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT '',
+        INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT '',
+        INDEX status_idx (`status`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT '',
+        INDEX size_idx (`size`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`@timestamp`)
+      COMMENT "OLAP"
+      DISTRIBUTED BY RANDOM BUCKETS 1
+      PROPERTIES (
+        "replication_allocation" = "tag.location.default: 1",
+        "disable_auto_compaction" = "true"
+      );
+    """
+
+    def load_httplogs_data = {table_name, label, read_flag, format_flag, 
file_name, ignore_failure=false,
+                        expected_succ_rows = -1, load_to_single_tablet = 
'true' ->
+        
+        // load the json data
+        streamLoad {
+            table "${table_name}"
+            
+            // set http request header params
+            set 'label', label + "_" + UUID.randomUUID().toString()
+            set 'read_json_by_line', read_flag
+            set 'format', format_flag
+            file file_name // import json file
+            time 10000 // limit inflight 10s
+            if (expected_succ_rows >= 0) {
+                set 'max_filter_ratio', '1'
+            }
+
+            // if declared a check callback, the default check condition will 
ignore.
+            // So you must check all condition
+            check { result, exception, startTime, endTime ->
+                       if (ignore_failure && expected_succ_rows < 0) { return }
+                    if (exception != null) {
+                        throw exception
+                    }
+                    log.info("Stream load result: ${result}".toString())
+                    def json = parseJson(result)
+                    assertEquals("success", json.Status.toLowerCase())
+                    if (expected_succ_rows >= 0) {
+                        assertEquals(json.NumberLoadedRows, expected_succ_rows)
+                    } else {
+                        assertEquals(json.NumberTotalRows, 
json.NumberLoadedRows + json.NumberUnselectedRows)
+                        assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes 
> 0)
+                }
+            }
+        }
+    }
+
+    try {
+        load_httplogs_data.call(indexTbName1, 'test_index_multi_match_1', 
'true', 'json', 'documents-1000.json')
+        load_httplogs_data.call(indexTbName2, 'test_index_multi_match_2', 
'true', 'json', 'documents-1000.json')
+
+        sql "sync"
+
+        qt_sql """ select count() from ${indexTbName1} where (clientip 
match_phrase_prefix '2'); """
+        qt_sql """ select count() from ${indexTbName1} where (clientip 
match_phrase_prefix '2' or request match_phrase_prefix '2'); """
+        qt_sql """ select count() from ${indexTbName1} where (clientip 
match_phrase_prefix '2' or request match_phrase_prefix '2' or status 
match_phrase_prefix '2' or size match_phrase_prefix '2'); """
+        qt_sql """ select count() from ${indexTbName1} where (clientip 
match_phrase_prefix 'a' or request match_phrase_prefix 'a' or status 
match_phrase_prefix 'a' or size match_phrase_prefix 'a'); """
+
+        qt_sql """ select count() from ${indexTbName2} where 
multi_match(clientip, '', 'phrase_prefix', '2'); """
+        qt_sql """ select count() from ${indexTbName2} where 
multi_match(clientip, 'request', 'phrase_prefix', '2'); """
+        qt_sql """ select count() from ${indexTbName2} where 
multi_match(clientip, 'request, status, size', 'phrase_prefix', '2'); """
+        qt_sql """ select count() from ${indexTbName2} where 
multi_match(clientip, 'request, status, size', 'phrase_prefix', 'a'); """
+
+    } finally {
+        //try_sql("DROP TABLE IF EXISTS ${testTable}")
+    }
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to