(doris) 05/07: [feature](invert index) match_phrase_edge feature added (#31142)

yiguolei Thu, 29 Feb 2024 03:53:23 -0800

This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


commit 92e3b31f506ae229d8f37207edfbb57539e3a63a
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Thu Feb 29 19:47:26 2024 +0800

    [feature](invert index) match_phrase_edge feature added (#31142)
---
 be/src/exec/olap_common.h                          |   2 +
 be/src/exec/olap_utils.h                           |  14 ++-
 be/src/olap/match_predicate.cpp                    |   6 +-
 .../inverted_index/query/phrase_edge_query.cpp     | 128 +++++++++++++++++++++
 .../inverted_index/query/phrase_edge_query.h       |  52 +++++++++
 .../inverted_index/query/query_factory.h           |  15 +--
 .../rowset/segment_v2/inverted_index_query_type.h  |   7 +-
 .../rowset/segment_v2/inverted_index_reader.cpp    |   5 +-
 be/src/vec/functions/match.cpp                     |   1 +
 be/src/vec/functions/match.h                       |  17 +++
 .../antlr4/org/apache/doris/nereids/DorisLexer.g4  |   1 +
 .../antlr4/org/apache/doris/nereids/DorisParser.g4 |   2 +-
 fe/fe-core/src/main/cup/sql_parser.cup             |   5 +-
 .../org/apache/doris/analysis/MatchPredicate.java  |  11 ++
 .../doris/nereids/parser/LogicalPlanBuilder.java   |   7 ++
 .../doris/nereids/trees/expressions/Match.java     |   2 +
 .../nereids/trees/expressions/MatchPhraseEdge.java |  49 ++++++++
 .../expressions/visitor/ExpressionVisitor.java     |   5 +
 fe/fe-core/src/main/jflex/sql_scanner.flex         |   1 +
 gensrc/thrift/Opcodes.thrift                       |   1 +
 .../test_index_match_phrase_edge.out               |  31 +++++
 .../test_index_match_phrase_edge.groovy            |  69 +++++++++++
 22 files changed, 413 insertions(+), 18 deletions(-)

diff --git a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h
index 0ea771deb88..634f9128553 100644
--- a/be/src/exec/olap_common.h
+++ b/be/src/exec/olap_common.h
@@ -314,6 +314,8 @@ public:
                 condition.__set_condition_op("match_phrase_prefix");
             } else if (value.first == MatchType::MATCH_REGEXP) {
                 condition.__set_condition_op("match_regexp");
+            } else if (value.first == MatchType::MATCH_PHRASE_EDGE) {
+                condition.__set_condition_op("match_phrase_edge");
             } else if (value.first == MatchType::MATCH_ELEMENT_EQ) {
                 condition.__set_condition_op("match_element_eq");
             } else if (value.first == MatchType::MATCH_ELEMENT_LT) {
diff --git a/be/src/exec/olap_utils.h b/be/src/exec/olap_utils.h
index 106ded98c7d..4024337c462 100644
--- a/be/src/exec/olap_utils.h
+++ b/be/src/exec/olap_utils.h
@@ -171,6 +171,7 @@ enum class MatchType {
     MATCH_ELEMENT_GE = 7,
     MATCH_PHRASE_PREFIX = 8,
     MATCH_REGEXP = 9,
+    MATCH_PHRASE_EDGE = 10,
 };
 
 inline MatchType to_match_type(TExprOpcode::type type) {
@@ -190,6 +191,9 @@ inline MatchType to_match_type(TExprOpcode::type type) {
     case TExprOpcode::type::MATCH_REGEXP:
         return MatchType::MATCH_REGEXP;
         break;
+    case TExprOpcode::type::MATCH_PHRASE_EDGE:
+        return MatchType::MATCH_PHRASE_EDGE;
+        break;
     case TExprOpcode::type::MATCH_ELEMENT_EQ:
         return MatchType::MATCH_ELEMENT_EQ;
         break;
@@ -223,6 +227,8 @@ inline MatchType to_match_type(const std::string& 
condition_op) {
         return MatchType::MATCH_PHRASE_PREFIX;
     } else if (condition_op.compare("match_regexp") == 0) {
         return MatchType::MATCH_REGEXP;
+    } else if (condition_op.compare("match_phrase_edge") == 0) {
+        return MatchType::MATCH_PHRASE_EDGE;
     } else if (condition_op.compare("match_element_eq") == 0) {
         return MatchType::MATCH_ELEMENT_EQ;
     } else if (condition_op.compare("match_element_lt") == 0) {
@@ -242,6 +248,7 @@ inline bool is_match_condition(const std::string& op) {
         0 == strcasecmp(op.c_str(), "match_phrase") ||
         0 == strcasecmp(op.c_str(), "match_phrase_prefix") ||
         0 == strcasecmp(op.c_str(), "match_regexp") ||
+        0 == strcasecmp(op.c_str(), "match_phrase_edge") ||
         0 == strcasecmp(op.c_str(), "match_element_eq") ||
         0 == strcasecmp(op.c_str(), "match_element_lt") ||
         0 == strcasecmp(op.c_str(), "match_element_gt") ||
@@ -255,9 +262,10 @@ inline bool is_match_condition(const std::string& op) {
 inline bool is_match_operator(const TExprOpcode::type& op_type) {
     return TExprOpcode::MATCH_ANY == op_type || TExprOpcode::MATCH_ALL == 
op_type ||
            TExprOpcode::MATCH_PHRASE == op_type || 
TExprOpcode::MATCH_PHRASE_PREFIX == op_type ||
-           TExprOpcode::MATCH_REGEXP == op_type || 
TExprOpcode::MATCH_ELEMENT_EQ == op_type ||
-           TExprOpcode::MATCH_ELEMENT_LT == op_type || 
TExprOpcode::MATCH_ELEMENT_GT == op_type ||
-           TExprOpcode::MATCH_ELEMENT_LE == op_type || 
TExprOpcode::MATCH_ELEMENT_GE == op_type;
+           TExprOpcode::MATCH_REGEXP == op_type || 
TExprOpcode::MATCH_PHRASE_EDGE == op_type ||
+           TExprOpcode::MATCH_ELEMENT_EQ == op_type || 
TExprOpcode::MATCH_ELEMENT_LT == op_type ||
+           TExprOpcode::MATCH_ELEMENT_GT == op_type || 
TExprOpcode::MATCH_ELEMENT_LE == op_type ||
+           TExprOpcode::MATCH_ELEMENT_GE == op_type;
 }
 
 } // namespace doris
diff --git a/be/src/olap/match_predicate.cpp b/be/src/olap/match_predicate.cpp
index 36f167d0d04..13fd982b0b5 100644
--- a/be/src/olap/match_predicate.cpp
+++ b/be/src/olap/match_predicate.cpp
@@ -113,6 +113,9 @@ InvertedIndexQueryType 
MatchPredicate::_to_inverted_index_query_type(MatchType m
     case MatchType::MATCH_REGEXP:
         ret = InvertedIndexQueryType::MATCH_REGEXP_QUERY;
         break;
+    case MatchType::MATCH_PHRASE_EDGE:
+        ret = InvertedIndexQueryType::MATCH_PHRASE_EDGE_QUERY;
+        break;
     case MatchType::MATCH_ELEMENT_EQ:
         ret = InvertedIndexQueryType::EQUAL_QUERY;
         break;
@@ -135,7 +138,8 @@ InvertedIndexQueryType 
MatchPredicate::_to_inverted_index_query_type(MatchType m
 }
 
 bool MatchPredicate::_skip_evaluate(InvertedIndexIterator* iterator) const {
-    if ((_match_type == MatchType::MATCH_PHRASE || _match_type == 
MatchType::MATCH_PHRASE_PREFIX) &&
+    if ((_match_type == MatchType::MATCH_PHRASE || _match_type == 
MatchType::MATCH_PHRASE_PREFIX ||
+         _match_type == MatchType::MATCH_PHRASE_EDGE) &&
         iterator->get_inverted_index_reader_type() == 
InvertedIndexReaderType::FULLTEXT &&
         
get_parser_phrase_support_string_from_properties(iterator->get_index_properties())
 ==
                 INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) {
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp
new file mode 100644
index 00000000000..ac185259059
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "phrase_edge_query.h"
+
+#include <fstream>
+#include <functional>
+#include <string>
+#include <string_view>
+
+#include "CLucene/config/repl_wchar.h"
+#include "CLucene/util/stringUtil.h"
+#include "common/logging.h"
+
+namespace doris::segment_v2 {
+
+// Creates a query bound to `searcher` with an empty MultiPhraseQuery; clauses are supplied
+// later through add(). `query_options` is accepted but not used by this constructor.
+PhraseEdgeQuery::PhraseEdgeQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher,
+                                 const TQueryOptions& query_options)
+        : _searcher(searcher),
+          _query(std::make_unique<CL_NS(search)::MultiPhraseQuery>()) {
+}
+
+// Turns the phrase tokens in `terms` into clauses of the underlying MultiPhraseQuery.
+//
+// * One token: every enumerated index term that contains the token as a substring is
+//   collected and passed to handle_terms().
+// * Several tokens: the first token is expanded to the enumerated terms that END with it,
+//   the last token to the enumerated terms that START with it, and the tokens in between
+//   are added unchanged via add_default_term().
+//
+// Terms collected in the callbacks are retained with _CL_POINTER and released again by
+// handle_terms(). Throws CL_ERR_IllegalArgument when `terms` is empty.
+void PhraseEdgeQuery::add(const std::wstring& field_name, const std::vector<std::string>& terms) {
+    if (terms.empty()) {
+        _CLTHROWA(CL_ERR_IllegalArgument, "PhraseEdgeQuery::add: terms empty");
+    }
+
+    if (terms.size() == 1) {
+        std::vector<CL_NS(index)::Term*> containing;
+        std::wstring needle = StringUtil::string_to_wstring(terms.front());
+
+        find_words([&containing, &needle](Term* candidate) {
+            std::wstring_view text(candidate->text(), candidate->textLength());
+            if (text.find(needle) == std::wstring::npos) {
+                return;
+            }
+            containing.push_back(_CL_POINTER(candidate));
+        });
+
+        handle_terms(field_name, needle, containing);
+        return;
+    }
+
+    std::wstring head = StringUtil::string_to_wstring(terms.front());
+    std::wstring tail = StringUtil::string_to_wstring(terms.back());
+
+    std::vector<CL_NS(index)::Term*> head_matches;
+    std::vector<CL_NS(index)::Term*> tail_matches;
+
+    // One pass over the enumerated terms serves both edges; a single term may qualify for
+    // the head list and the tail list at the same time.
+    find_words([&head, &head_matches, &tail, &tail_matches](Term* candidate) {
+        std::wstring_view text(candidate->text(), candidate->textLength());
+
+        if (text.ends_with(head)) {
+            head_matches.push_back(_CL_POINTER(candidate));
+        }
+
+        if (text.starts_with(tail)) {
+            tail_matches.push_back(_CL_POINTER(candidate));
+        }
+    });
+
+    // Clauses are appended in phrase order: expanded head, literal middle, expanded tail.
+    handle_terms(field_name, head, head_matches);
+    for (size_t i = 1; i + 1 < terms.size(); i++) {
+        std::wstring middle = StringUtil::string_to_wstring(terms[i]);
+        add_default_term(field_name, middle);
+    }
+    handle_terms(field_name, tail, tail_matches);
+}
+
+// Executes the assembled query and adds every document id reported by the searcher's
+// callback to `roaring`. The score passed to the callback is ignored.
+void PhraseEdgeQuery::search(roaring::Roaring& roaring) {
+    auto collect_docid = [&roaring](const int32_t docid, const float_t /*score*/) {
+        roaring.add(docid);
+    };
+    _searcher->_search(_query.get(), collect_docid);
+}
+
+// Adds `ws_term` of field `field_name` to the query as a single, unexpanded term.
+void PhraseEdgeQuery::add_default_term(const std::wstring& field_name,
+                                       const std::wstring& ws_term) {
+    Term* literal = _CLNEW Term(field_name.c_str(), ws_term.c_str());
+    _query->add(literal);
+    // The local handle is released once the term has been handed to the query.
+    _CLLDECDELETE(literal);
+}
+
+// Adds one clause to the query for a phrase token.
+//
+// If `checked_terms` is empty (no enumerated term matched), `ws_term` itself is added so the
+// clause is still present. Otherwise all entries of `checked_terms` are added together and
+// then released here; callers in this file obtained them through _CL_POINTER.
+//
+// `checked_terms` is emptied after its entries are released so the caller's vector never
+// holds pointers that may already have been deleted.
+void PhraseEdgeQuery::handle_terms(const std::wstring& field_name, const std::wstring& ws_term,
+                                   std::vector<CL_NS(index)::Term*>& checked_terms) {
+    if (checked_terms.empty()) {
+        add_default_term(field_name, ws_term);
+        return;
+    }
+
+    _query->add(checked_terms);
+    for (const auto& t : checked_terms) {
+        _CLLDECDELETE(t);
+    }
+    checked_terms.clear();
+}
+
+// Calls `cb` once for each term produced by the index reader's term enumerator.
+//
+// The term passed to `cb` is released right after the callback returns, so a callback that
+// wants to keep it must retain it itself (the callers in this file use _CL_POINTER).
+// The enumerator is closed and deleted on the way out; _CLFINALLY is expected to run the
+// cleanup block on the exceptional path as well.
+void PhraseEdgeQuery::find_words(const std::function<void(Term*)>& cb) {
+    Term* term = nullptr;
+    TermEnum* enumerator = nullptr;
+    try {
+        enumerator = _searcher->getReader()->terms();
+        while (enumerator->next()) {
+            term = enumerator->term();
+            cb(term);
+            _CLDECDELETE(term);
+        }
+    }
+    _CLFINALLY({
+        _CLDECDELETE(term);
+        // terms() can fail before `enumerator` is assigned; only close what was opened so
+        // the cleanup path never dereferences a null pointer.
+        if (enumerator != nullptr) {
+            enumerator->close();
+            _CLDELETE(enumerator);
+        }
+    })
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h 
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h
new file mode 100644
index 00000000000..27612be1592
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+// clang-format off
+#include "olap/rowset/segment_v2/inverted_index/query/query.h"
+#include "CLucene/search/MultiPhraseQuery.h"
+// clang-format on
+
+CL_NS_USE(search)
+
+namespace doris::segment_v2 {
+
+// Query implementation created by QueryFactory for MATCH_PHRASE_EDGE_QUERY.
+// It wraps a CLucene MultiPhraseQuery: add() fills it from the phrase tokens and search()
+// runs it and collects document ids.
+class PhraseEdgeQuery : public Query {
+public:
+    PhraseEdgeQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher,
+                    const TQueryOptions& query_options);
+    ~PhraseEdgeQuery() override = default;
+
+    // Builds the query clauses for `terms` on `field_name`.
+    void add(const std::wstring& field_name, const std::vector<std::string>& terms) override;
+    // Runs the query and adds the reported document ids to `roaring`.
+    void search(roaring::Roaring& roaring) override;
+
+private:
+    // Adds `ws_term` to the query as-is.
+    void add_default_term(const std::wstring& field_name, const std::wstring& ws_term);
+    // Adds `checked_terms` to the query, or `ws_term` when `checked_terms` is empty.
+    void handle_terms(const std::wstring& field_name, const std::wstring& ws_term,
+                      std::vector<CL_NS(index)::Term*>& checked_terms);
+    // Invokes `cb` for each term of the reader's term enumerator. The Term type is spelled
+    // with its namespace, like handle_terms(), because this header only pulls in
+    // CL_NS_USE(search).
+    void find_words(const std::function<void(CL_NS(index)::Term*)>& cb);
+
+    std::shared_ptr<lucene::search::IndexSearcher> _searcher;
+
+    std::unique_ptr<CL_NS(search)::MultiPhraseQuery> _query;
+};
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/query_factory.h 
b/be/src/olap/rowset/segment_v2/inverted_index/query/query_factory.h
index a24a1379396..09d96211f99 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/query_factory.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/query_factory.h
@@ -17,17 +17,16 @@
 
 #pragma once
 
+#include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h"
+#include "olap/rowset/segment_v2/inverted_index/query/disjunction_query.h"
+#include "olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h"
+#include "olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h"
+#include "olap/rowset/segment_v2/inverted_index/query/phrase_query.h"
+#include "olap/rowset/segment_v2/inverted_index/query/regexp_query.h"
 #include "olap/rowset/segment_v2/inverted_index_query_type.h"
 
 namespace doris::segment_v2 {
 
-class Query;
-class DisjunctionQuery;
-class ConjunctionQuery;
-class PhraseQuery;
-class PhrasePrefixQuery;
-class RegexpQuery;
-
 class QueryFactory {
 public:
     template <typename... Args>
@@ -44,6 +43,8 @@ public:
             return 
std::make_unique<PhrasePrefixQuery>(std::forward<Args>(args)...);
         case InvertedIndexQueryType::MATCH_REGEXP_QUERY:
             return std::make_unique<RegexpQuery>(std::forward<Args>(args)...);
+        case InvertedIndexQueryType::MATCH_PHRASE_EDGE_QUERY:
+            return 
std::make_unique<PhraseEdgeQuery>(std::forward<Args>(args)...);
         default:
             return nullptr;
         }
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_query_type.h 
b/be/src/olap/rowset/segment_v2/inverted_index_query_type.h
index 495c03b8637..f1a47ebdd0f 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_query_type.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_query_type.h
@@ -78,6 +78,7 @@ enum class InvertedIndexQueryType {
     MATCH_PHRASE_QUERY = 7,
     MATCH_PHRASE_PREFIX_QUERY = 8,
     MATCH_REGEXP_QUERY = 9,
+    MATCH_PHRASE_EDGE_QUERY = 10,
 };
 
 inline bool is_range_query(InvertedIndexQueryType query_type) {
@@ -92,7 +93,8 @@ inline bool is_match_query(InvertedIndexQueryType query_type) 
{
             query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
             query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
             query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
-            query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY);
+            query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY ||
+            query_type == InvertedIndexQueryType::MATCH_PHRASE_EDGE_QUERY);
 }
 
 inline std::string query_type_to_string(InvertedIndexQueryType query_type) {
@@ -130,6 +132,9 @@ inline std::string 
query_type_to_string(InvertedIndexQueryType query_type) {
     case InvertedIndexQueryType::MATCH_REGEXP_QUERY: {
         return "MREGEXP";
     }
+    case InvertedIndexQueryType::MATCH_PHRASE_EDGE_QUERY: {
+        return "MPHRASEEDGE";
+    }
     default:
         return "";
     }
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 919670d2182..b8475cbf509 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -58,11 +58,7 @@
 #include "olap/key_coder.h"
 #include "olap/olap_common.h"
 #include 
"olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
-#include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h"
-#include "olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h"
-#include "olap/rowset/segment_v2/inverted_index/query/phrase_query.h"
 #include "olap/rowset/segment_v2/inverted_index/query/query_factory.h"
-#include "olap/rowset/segment_v2/inverted_index/query/regexp_query.h"
 #include "olap/rowset/segment_v2/inverted_index_cache.h"
 #include "olap/rowset/segment_v2/inverted_index_compound_directory.h"
 #include "olap/rowset/segment_v2/inverted_index_searcher.h"
@@ -309,6 +305,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
         roaring::Roaring query_match_bitmap;
         if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
             query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
+            query_type == InvertedIndexQueryType::MATCH_PHRASE_EDGE_QUERY ||
             query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
             query_type == InvertedIndexQueryType::EQUAL_QUERY ||
             query_type == InvertedIndexQueryType::MATCH_ANY_QUERY) {
diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp
index 5002ef3f715..3f2564227ba 100644
--- a/be/src/vec/functions/match.cpp
+++ b/be/src/vec/functions/match.cpp
@@ -397,6 +397,7 @@ void register_function_match(SimpleFunctionFactory& 
factory) {
     factory.register_function<FunctionMatchPhrase>();
     factory.register_function<FunctionMatchPhrasePrefix>();
     factory.register_function<FunctionMatchRegexp>();
+    factory.register_function<FunctionMatchPhraseEdge>();
     factory.register_function<FunctionMatchElementEQ>();
     factory.register_function<FunctionMatchElementLT>();
     factory.register_function<FunctionMatchElementGT>();
diff --git a/be/src/vec/functions/match.h b/be/src/vec/functions/match.h
index ebd6a48ba23..d3b7c912edc 100644
--- a/be/src/vec/functions/match.h
+++ b/be/src/vec/functions/match.h
@@ -159,6 +159,23 @@ public:
     }
 };
 
+// Function object registered under the name "match_phrase_edge".
+// execute_match() is not implemented for it: every call returns an
+// INVERTED_INDEX_NOT_SUPPORTED error and leaves `result` untouched.
+class FunctionMatchPhraseEdge : public FunctionMatchBase {
+public:
+    static constexpr auto name = "match_phrase_edge";
+
+    static FunctionPtr create() {
+        return std::make_shared<FunctionMatchPhraseEdge>();
+    }
+
+    String get_name() const override {
+        return name;
+    }
+
+    // None of the arguments are inspected; the call always fails.
+    Status execute_match(const std::string& /*column_name*/,
+                         const std::string& /*match_query_str*/, size_t /*input_rows_count*/,
+                         const ColumnString* /*string_col*/,
+                         InvertedIndexCtx* /*inverted_index_ctx*/,
+                         const ColumnArray::Offsets64* /*array_offsets*/,
+                         ColumnUInt8::Container& /*result*/) const override {
+        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+                "FunctionMatchPhraseEdge not support execute_match");
+    }
+};
+
 class FunctionMatchElementEQ : public FunctionMatchBase {
 public:
     static constexpr auto name = "match_element_eq";
diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 
b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4
index 9d51f8dfd27..28806fd1d76 100644
--- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4
+++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4
@@ -355,6 +355,7 @@ MATCH_ELEMENT_LT: 'ELEMENT_LT';
 MATCH_PHRASE: 'MATCH_PHRASE';
 MATCH_PHRASE_PREFIX: 'MATCH_PHRASE_PREFIX';
 MATCH_REGEXP: 'MATCH_REGEXP';
+MATCH_PHRASE_EDGE: 'MATCH_PHRASE_EDGE';
 MATERIALIZED: 'MATERIALIZED';
 MAX: 'MAX';
 MAXVALUE: 'MAXVALUE';
diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 
b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4
index 241bb55f041..f271fbe5216 100644
--- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4
+++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4
@@ -615,7 +615,7 @@ rowConstructorItem
 predicate
     : NOT? kind=BETWEEN lower=valueExpression AND upper=valueExpression
     | NOT? kind=(LIKE | REGEXP | RLIKE) pattern=valueExpression
-    | NOT? kind=(MATCH | MATCH_ANY | MATCH_ALL | MATCH_PHRASE | 
MATCH_PHRASE_PREFIX | MATCH_REGEXP) pattern=valueExpression
+    | NOT? kind=(MATCH | MATCH_ANY | MATCH_ALL | MATCH_PHRASE | 
MATCH_PHRASE_PREFIX | MATCH_REGEXP | MATCH_PHRASE_EDGE) pattern=valueExpression
     | NOT? kind=IN LEFT_PAREN query RIGHT_PAREN
     | NOT? kind=IN LEFT_PAREN expression (COMMA expression)* RIGHT_PAREN
     | IS NOT? kind=NULL
diff --git a/fe/fe-core/src/main/cup/sql_parser.cup 
b/fe/fe-core/src/main/cup/sql_parser.cup
index ab5c0ec2b32..0ebb58c3756 100644
--- a/fe/fe-core/src/main/cup/sql_parser.cup
+++ b/fe/fe-core/src/main/cup/sql_parser.cup
@@ -484,6 +484,7 @@ terminal String
     KW_MATCH_PHRASE,
     KW_MATCH_PHRASE_PREFIX,
     KW_MATCH_REGEXP,
+    KW_MATCH_PHRASE_EDGE,
     KW_MATCH_ELEMENT_EQ,
     KW_MATCH_ELEMENT_LT,
     KW_MATCH_ELEMENT_GT,
@@ -992,7 +993,7 @@ precedence left KW_AND;
 precedence left KW_NOT, NOT;
 precedence left KW_BETWEEN, KW_IN, KW_IS, KW_EXISTS;
 precedence left KW_LIKE, KW_REGEXP;
-precedence left KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE, 
KW_MATCH_PHRASE_PREFIX, KW_MATCH_REGEXP, KW_MATCH, KW_MATCH_ELEMENT_EQ, 
KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, KW_MATCH_ELEMENT_LE, 
KW_MATCH_ELEMENT_GE;
+precedence left KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE, 
KW_MATCH_PHRASE_PREFIX, KW_MATCH_REGEXP, KW_MATCH_PHRASE_EDGE, KW_MATCH, 
KW_MATCH_ELEMENT_EQ, KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, 
KW_MATCH_ELEMENT_LE, KW_MATCH_ELEMENT_GE;
 precedence left EQUAL, LESSTHAN, GREATERTHAN;
 precedence left ADD, SUBTRACT;
 precedence left AT, STAR, DIVIDE, MOD, KW_DIV;
@@ -7241,6 +7242,8 @@ match_predicate ::=
   {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_PHRASE_PREFIX, 
e1, e2); :}
   | expr:e1 KW_MATCH_REGEXP expr:e2
   {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_REGEXP, e1, 
e2); :}
+  | expr:e1 KW_MATCH_PHRASE_EDGE expr:e2
+  {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_PHRASE_EDGE, 
e1, e2); :}
   | expr:e1 KW_MATCH_ELEMENT_EQ expr:e2
   {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ELEMENT_EQ, e1, 
e2); :}
   | expr:e1 KW_MATCH_ELEMENT_LT expr:e2
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java 
b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java
index e284d86e2bb..f0d961d6761 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java
@@ -52,6 +52,7 @@ public class MatchPredicate extends Predicate {
         MATCH_PHRASE("MATCH_PHRASE", "match_phrase", TExprOpcode.MATCH_PHRASE),
         MATCH_PHRASE_PREFIX("MATCH_PHRASE_PREFIX", "match_phrase_prefix", 
TExprOpcode.MATCH_PHRASE_PREFIX),
         MATCH_REGEXP("MATCH_REGEXP", "match_regexp", TExprOpcode.MATCH_REGEXP),
+        MATCH_PHRASE_EDGE("MATCH_PHRASE_EDGE", "match_phrase_edge", 
TExprOpcode.MATCH_PHRASE_EDGE),
         MATCH_ELEMENT_EQ("MATCH_ELEMENT_EQ", "match_element_eq", 
TExprOpcode.MATCH_ELEMENT_EQ),
         MATCH_ELEMENT_LT("MATCH_ELEMENT_LT", "match_element_lt", 
TExprOpcode.MATCH_ELEMENT_LT),
         MATCH_ELEMENT_GT("MATCH_ELEMENT_GT", "match_element_gt", 
TExprOpcode.MATCH_ELEMENT_GT),
@@ -169,6 +170,16 @@ public class MatchPredicate extends Predicate {
                     symbolNotUsed,
                     Lists.<Type>newArrayList(new ArrayType(t), t),
                     Type.BOOLEAN));
+            
functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                    Operator.MATCH_PHRASE_EDGE.getName(),
+                    symbolNotUsed,
+                    Lists.<Type>newArrayList(t, t),
+                    Type.BOOLEAN));
+            
functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
+                    Operator.MATCH_PHRASE_EDGE.getName(),
+                    symbolNotUsed,
+                    Lists.<Type>newArrayList(new ArrayType(t), t),
+                    Type.BOOLEAN));
         }
     }
 
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
index bc5926f263d..03ddefff8cf 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java
@@ -239,6 +239,7 @@ import org.apache.doris.nereids.trees.expressions.ListQuery;
 import org.apache.doris.nereids.trees.expressions.MatchAll;
 import org.apache.doris.nereids.trees.expressions.MatchAny;
 import org.apache.doris.nereids.trees.expressions.MatchPhrase;
+import org.apache.doris.nereids.trees.expressions.MatchPhraseEdge;
 import org.apache.doris.nereids.trees.expressions.MatchPhrasePrefix;
 import org.apache.doris.nereids.trees.expressions.MatchRegexp;
 import org.apache.doris.nereids.trees.expressions.Mod;
@@ -3151,6 +3152,12 @@ public class LogicalPlanBuilder extends 
DorisParserBaseVisitor<Object> {
                         getExpression(ctx.pattern)
                     );
                     break;
+                case DorisParser.MATCH_PHRASE_EDGE:
+                    outExpression = new MatchPhraseEdge(
+                        valueExpression,
+                        getExpression(ctx.pattern)
+                    );
+                    break;
                 default:
                     throw new ParseException("Unsupported predicate type: " + 
ctx.kind.getText(), ctx);
             }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java
index 976e46830ef..d9dcde287d3 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java
@@ -53,6 +53,8 @@ public abstract class Match extends BinaryOperator implements 
PropagateNullable
                 return Operator.MATCH_PHRASE_PREFIX;
             case "MATCH_REGEXP":
                 return Operator.MATCH_REGEXP;
+            case "MATCH_PHRASE_EDGE":
+                return Operator.MATCH_PHRASE_EDGE;
             default:
                 throw new AnalysisException("UnSupported type for match: " + 
symbol);
         }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchPhraseEdge.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchPhraseEdge.java
new file mode 100644
index 00000000000..188f3317a0f
--- /dev/null
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchPhraseEdge.java
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions;
+
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * Expression node for the {@code MATCH_PHRASE_EDGE} predicate,
+ * for example {@code a MATCH_PHRASE_EDGE 'llo w'}.
+ */
+public class MatchPhraseEdge extends Match {
+    /**
+     * Creates the predicate from its two operands.
+     *
+     * @param left expression on the left-hand side of the operator
+     * @param right pattern expression on the right-hand side of the operator
+     */
+    public MatchPhraseEdge(Expression left, Expression right) {
+        this(ImmutableList.of(left, right));
+    }
+
+    private MatchPhraseEdge(List<Expression> children) {
+        super(children, "MATCH_PHRASE_EDGE");
+    }
+
+    /**
+     * Returns a new node of the same kind over {@code children}, which must hold exactly
+     * two expressions.
+     */
+    @Override
+    public MatchPhraseEdge withChildren(List<Expression> children) {
+        Preconditions.checkArgument(children.size() == 2);
+        return new MatchPhraseEdge(children);
+    }
+
+    @Override
+    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+        return visitor.visitMatchPhraseEdge(this, context);
+    }
+}
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java
index 0ffa1d98fdf..feea6cfe7d5 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java
@@ -59,6 +59,7 @@ import org.apache.doris.nereids.trees.expressions.Match;
 import org.apache.doris.nereids.trees.expressions.MatchAll;
 import org.apache.doris.nereids.trees.expressions.MatchAny;
 import org.apache.doris.nereids.trees.expressions.MatchPhrase;
+import org.apache.doris.nereids.trees.expressions.MatchPhraseEdge;
 import org.apache.doris.nereids.trees.expressions.MatchPhrasePrefix;
 import org.apache.doris.nereids.trees.expressions.MatchRegexp;
 import org.apache.doris.nereids.trees.expressions.Mod;
@@ -500,6 +501,10 @@ public abstract class ExpressionVisitor<R, C>
         return visitMatch(matchRegexp, context);
     }
 
+    public R visitMatchPhraseEdge(MatchPhraseEdge matchPhraseEdge, C context) {
+        return visitMatch(matchPhraseEdge, context); // default: handled like any other Match node
+    }
+
     public R visitAny(Any any, C context) {
         return visit(any, context);
     }
diff --git a/fe/fe-core/src/main/jflex/sql_scanner.flex 
b/fe/fe-core/src/main/jflex/sql_scanner.flex
index bc744373ad4..175744532c0 100644
--- a/fe/fe-core/src/main/jflex/sql_scanner.flex
+++ b/fe/fe-core/src/main/jflex/sql_scanner.flex
@@ -323,6 +323,7 @@ import org.apache.doris.qe.SqlModeHelper;
         keywordMap.put("match_phrase", new 
Integer(SqlParserSymbols.KW_MATCH_PHRASE));
         keywordMap.put("match_phrase_prefix", new 
Integer(SqlParserSymbols.KW_MATCH_PHRASE_PREFIX));
         keywordMap.put("match_regexp", new 
Integer(SqlParserSymbols.KW_MATCH_REGEXP));
+        keywordMap.put("match_phrase_edge", new 
Integer(SqlParserSymbols.KW_MATCH_PHRASE_EDGE));
         keywordMap.put("element_eq", new 
Integer(SqlParserSymbols.KW_MATCH_ELEMENT_EQ));
         keywordMap.put("element_lt", new 
Integer(SqlParserSymbols.KW_MATCH_ELEMENT_LT));
         keywordMap.put("element_gt", new 
Integer(SqlParserSymbols.KW_MATCH_ELEMENT_GT));
diff --git a/gensrc/thrift/Opcodes.thrift b/gensrc/thrift/Opcodes.thrift
index 72a1d80e036..9c0211cf50f 100644
--- a/gensrc/thrift/Opcodes.thrift
+++ b/gensrc/thrift/Opcodes.thrift
@@ -95,4 +95,5 @@ enum TExprOpcode {
     MATCH_ELEMENT_GE,
     MATCH_PHRASE_PREFIX,
     MATCH_REGEXP,
+    MATCH_PHRASE_EDGE,
 }
diff --git 
a/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out 
b/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out
new file mode 100644
index 00000000000..e650f9b39b2
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out
@@ -0,0 +1,31 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !sql --
+1      index.html      首先我 index html 想说的是这里有 index html 条评论看了之后很让人无语
+
+-- !sql --
+2      nav_inet.html   尤其看看 nav inet html 原价应当 nav inet html 是一本精美的书
+
+-- !sql --
+3      splash_inet.html        封面 splash inet html 红色 splash inet html 书封非常精致
+
+-- !sql --
+5      nav_bg_top.gif  该书研究了英语 nav bg top gif 各种语法 nav bg top gif 结构下的歧义问题
+
+-- !sql --
+8      nav_venue_off.gif       本书既适合 nav venue off gif 家长 nav venue off gif 
和孩子一起学习使用
+
+-- !sql --
+1      index.html      首先我 index html 想说的是这里有 index html 条评论看了之后很让人无语
+
+-- !sql --
+2      nav_inet.html   尤其看看 nav inet html 原价应当 nav inet html 是一本精美的书
+
+-- !sql --
+5      nav_bg_top.gif  该书研究了英语 nav bg top gif 各种语法 nav bg top gif 结构下的歧义问题
+
+-- !sql --
+8      nav_venue_off.gif       本书既适合 nav venue off gif 家长 nav venue off gif 
和孩子一起学习使用
+
+-- !sql --
+10     nav_tickets_off.gif     习惯于生活中很多 nav tickets off gif 虚假 nav tickets off 
gif 美化的人来说
+
diff --git 
a/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy 
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy
new file mode 100644
index 00000000000..e05f6bb1ec9
--- /dev/null
+++ 
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy
@@ -0,0 +1,69 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_index_match_phrase_edge", "p0"){
+    def indexTbName1 = "test_index_match_phrase_edge"
+
+    sql "DROP TABLE IF EXISTS ${indexTbName1}"
+
+    sql """
+      CREATE TABLE ${indexTbName1} (
+      `a` int(11) NULL COMMENT "",
+      `b` text NULL COMMENT "",
+      `c` text NULL COMMENT "",
+      INDEX b_idx (`b`) USING INVERTED PROPERTIES("parser" = "english", 
"support_phrase" = "true") COMMENT '',
+      INDEX c_idx (`c`) USING INVERTED PROPERTIES("parser" = "unicode", 
"support_phrase" = "true") COMMENT '',
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`a`)
+      COMMENT "OLAP"
+      DISTRIBUTED BY RANDOM BUCKETS 1
+      PROPERTIES (
+      "replication_allocation" = "tag.location.default: 1"
+      );
+    """
+
+    sql """ INSERT INTO ${indexTbName1} VALUES (1, "index.html", "首先我 index 
html 想说的是这里有 index html 条评论看了之后很让人无语"); """
+    sql """ INSERT INTO ${indexTbName1} VALUES (2, "nav_inet.html", "尤其看看 nav 
inet html 原价应当 nav inet html 是一本精美的书"); """
+    sql """ INSERT INTO ${indexTbName1} VALUES (3, "splash_inet.html", "封面 
splash inet html 红色 splash inet html 书封非常精致"); """
+    sql """ INSERT INTO ${indexTbName1} VALUES (4, "nav_top_inet.html", 
"个人觉得定义 nav top inet html 和 nav top inet html 写法特别有帮助"); """
+    sql """ INSERT INTO ${indexTbName1} VALUES (5, "nav_bg_top.gif", "该书研究了英语 
nav bg top gif 各种语法 nav bg top gif 结构下的歧义问题"); """
+    sql """ INSERT INTO ${indexTbName1} VALUES (6, "nav_news_off.gif", "作品当然是 
nav news off gif 喜欢的 nav news off gif 否则也不会买原版"); """
+    sql """ INSERT INTO ${indexTbName1} VALUES (7, "nav_comp_off.gif", 
"对于理解英语的 nav comp off gif 节奏和 nav comp off gif 韵律很有好处"); """
+    sql """ INSERT INTO ${indexTbName1} VALUES (8, "nav_venue_off.gif", "本书既适合 
nav venue off gif 家长 nav venue off gif 和孩子一起学习使用"); """
+    sql """ INSERT INTO ${indexTbName1} VALUES (9, "hm_bg.jpg", "前几日 hm bg jpg 
在别处 hm bg jpg 购得"); """
+    sql """ INSERT INTO ${indexTbName1} VALUES (10, "nav_tickets_off.gif", 
"习惯于生活中很多 nav tickets off gif 虚假 nav tickets off gif 美化的人来说"); """
+
+    try {
+        sql "sync"
+
+        qt_sql """ select * from ${indexTbName1} where b match_phrase_edge 
'x.h'; """
+        qt_sql """ select * from ${indexTbName1} where b match_phrase_edge 
'v_i'; """
+        qt_sql """ select * from ${indexTbName1} where b match_phrase_edge 
'sh_inet.h'; """
+        qt_sql """ select * from ${indexTbName1} where b match_phrase_edge 
'v_bg_t'; """
+        qt_sql """ select * from ${indexTbName1} where b match_phrase_edge 
'v_venue_of'; """
+
+        qt_sql """ select * from ${indexTbName1} where c match_phrase_edge 'ml 
想说的是这里有 in'; """
+        qt_sql """ select * from ${indexTbName1} where c match_phrase_edge 'ml 
原价应当 na'; """
+        qt_sql """ select * from ${indexTbName1} where c match_phrase_edge 'op 
gif 各种语法 nav b'; """
+        qt_sql """ select * from ${indexTbName1} where c match_phrase_edge 'ue 
off gif 家长 na'; """
+        qt_sql """ select * from ${indexTbName1} where c match_phrase_edge 'if 
虚假 na'; """
+
+    } finally {
+        //try_sql("DROP TABLE IF EXISTS ${testTable}")
+    }
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

(doris) 05/07: [feature](invert index) match_phrase_edge feature added (#31142)

Reply via email to