This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 92e3b31f506ae229d8f37207edfbb57539e3a63a Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com> AuthorDate: Thu Feb 29 19:47:26 2024 +0800 [feature](invert index) match_phrase_edge feature added (#31142) --- be/src/exec/olap_common.h | 2 + be/src/exec/olap_utils.h | 14 ++- be/src/olap/match_predicate.cpp | 6 +- .../inverted_index/query/phrase_edge_query.cpp | 128 +++++++++++++++++++++ .../inverted_index/query/phrase_edge_query.h | 52 +++++++++ .../inverted_index/query/query_factory.h | 15 +-- .../rowset/segment_v2/inverted_index_query_type.h | 7 +- .../rowset/segment_v2/inverted_index_reader.cpp | 5 +- be/src/vec/functions/match.cpp | 1 + be/src/vec/functions/match.h | 17 +++ .../antlr4/org/apache/doris/nereids/DorisLexer.g4 | 1 + .../antlr4/org/apache/doris/nereids/DorisParser.g4 | 2 +- fe/fe-core/src/main/cup/sql_parser.cup | 5 +- .../org/apache/doris/analysis/MatchPredicate.java | 11 ++ .../doris/nereids/parser/LogicalPlanBuilder.java | 7 ++ .../doris/nereids/trees/expressions/Match.java | 2 + .../nereids/trees/expressions/MatchPhraseEdge.java | 49 ++++++++ .../expressions/visitor/ExpressionVisitor.java | 5 + fe/fe-core/src/main/jflex/sql_scanner.flex | 1 + gensrc/thrift/Opcodes.thrift | 1 + .../test_index_match_phrase_edge.out | 31 +++++ .../test_index_match_phrase_edge.groovy | 69 +++++++++++ 22 files changed, 413 insertions(+), 18 deletions(-) diff --git a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h index 0ea771deb88..634f9128553 100644 --- a/be/src/exec/olap_common.h +++ b/be/src/exec/olap_common.h @@ -314,6 +314,8 @@ public: condition.__set_condition_op("match_phrase_prefix"); } else if (value.first == MatchType::MATCH_REGEXP) { condition.__set_condition_op("match_regexp"); + } else if (value.first == MatchType::MATCH_PHRASE_EDGE) { + condition.__set_condition_op("match_phrase_edge"); } else if (value.first == MatchType::MATCH_ELEMENT_EQ) { condition.__set_condition_op("match_element_eq"); } else if (value.first == MatchType::MATCH_ELEMENT_LT) { diff --git a/be/src/exec/olap_utils.h b/be/src/exec/olap_utils.h index 106ded98c7d..4024337c462 100644 --- a/be/src/exec/olap_utils.h +++ b/be/src/exec/olap_utils.h @@ -171,6 +171,7 @@ enum class MatchType { MATCH_ELEMENT_GE = 7, MATCH_PHRASE_PREFIX = 8, MATCH_REGEXP = 9, + MATCH_PHRASE_EDGE = 10, }; inline MatchType to_match_type(TExprOpcode::type type) { @@ -190,6 +191,9 @@ inline MatchType to_match_type(TExprOpcode::type type) { case TExprOpcode::type::MATCH_REGEXP: return MatchType::MATCH_REGEXP; break; + case TExprOpcode::type::MATCH_PHRASE_EDGE: + return MatchType::MATCH_PHRASE_EDGE; + break; case TExprOpcode::type::MATCH_ELEMENT_EQ: return MatchType::MATCH_ELEMENT_EQ; break; @@ -223,6 +227,8 @@ inline MatchType to_match_type(const std::string& condition_op) { return MatchType::MATCH_PHRASE_PREFIX; } else if (condition_op.compare("match_regexp") == 0) { return MatchType::MATCH_REGEXP; + } else if (condition_op.compare("match_phrase_edge") == 0) { + return MatchType::MATCH_PHRASE_EDGE; } else if (condition_op.compare("match_element_eq") == 0) { return MatchType::MATCH_ELEMENT_EQ; } else if (condition_op.compare("match_element_lt") == 0) { @@ -242,6 +248,7 @@ inline bool is_match_condition(const std::string& op) { 0 == strcasecmp(op.c_str(), "match_phrase") || 0 == strcasecmp(op.c_str(), "match_phrase_prefix") || 0 == strcasecmp(op.c_str(), "match_regexp") || + 0 == strcasecmp(op.c_str(), "match_phrase_edge") || 0 == strcasecmp(op.c_str(), "match_element_eq") || 0 == strcasecmp(op.c_str(), "match_element_lt") || 0 == strcasecmp(op.c_str(), "match_element_gt") || @@ -255,9 +262,10 @@ inline bool is_match_condition(const std::string& op) { inline bool is_match_operator(const TExprOpcode::type& op_type) { return TExprOpcode::MATCH_ANY == op_type || TExprOpcode::MATCH_ALL == op_type || TExprOpcode::MATCH_PHRASE == op_type || TExprOpcode::MATCH_PHRASE_PREFIX == op_type || - TExprOpcode::MATCH_REGEXP == op_type || TExprOpcode::MATCH_ELEMENT_EQ == op_type || - TExprOpcode::MATCH_ELEMENT_LT == op_type || TExprOpcode::MATCH_ELEMENT_GT == op_type || - TExprOpcode::MATCH_ELEMENT_LE == op_type || TExprOpcode::MATCH_ELEMENT_GE == op_type; + TExprOpcode::MATCH_REGEXP == op_type || TExprOpcode::MATCH_PHRASE_EDGE == op_type || + TExprOpcode::MATCH_ELEMENT_EQ == op_type || TExprOpcode::MATCH_ELEMENT_LT == op_type || + TExprOpcode::MATCH_ELEMENT_GT == op_type || TExprOpcode::MATCH_ELEMENT_LE == op_type || + TExprOpcode::MATCH_ELEMENT_GE == op_type; } } // namespace doris diff --git a/be/src/olap/match_predicate.cpp b/be/src/olap/match_predicate.cpp index 36f167d0d04..13fd982b0b5 100644 --- a/be/src/olap/match_predicate.cpp +++ b/be/src/olap/match_predicate.cpp @@ -113,6 +113,9 @@ InvertedIndexQueryType MatchPredicate::_to_inverted_index_query_type(MatchType m case MatchType::MATCH_REGEXP: ret = InvertedIndexQueryType::MATCH_REGEXP_QUERY; break; + case MatchType::MATCH_PHRASE_EDGE: + ret = InvertedIndexQueryType::MATCH_PHRASE_EDGE_QUERY; + break; case MatchType::MATCH_ELEMENT_EQ: ret = InvertedIndexQueryType::EQUAL_QUERY; break; @@ -135,7 +138,8 @@ InvertedIndexQueryType MatchPredicate::_to_inverted_index_query_type(MatchType m } bool MatchPredicate::_skip_evaluate(InvertedIndexIterator* iterator) const { - if ((_match_type == MatchType::MATCH_PHRASE || _match_type == MatchType::MATCH_PHRASE_PREFIX) && + if ((_match_type == MatchType::MATCH_PHRASE || _match_type == MatchType::MATCH_PHRASE_PREFIX || + _match_type == MatchType::MATCH_PHRASE_EDGE) && iterator->get_inverted_index_reader_type() == InvertedIndexReaderType::FULLTEXT && get_parser_phrase_support_string_from_properties(iterator->get_index_properties()) == INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp new file mode 100644 index 00000000000..ac185259059 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp @@ -0,0 +1,128 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "phrase_edge_query.h" + +#include <fstream> +#include <functional> +#include <string> +#include <string_view> + +#include "CLucene/config/repl_wchar.h" +#include "CLucene/util/stringUtil.h" +#include "common/logging.h" + +namespace doris::segment_v2 { + +PhraseEdgeQuery::PhraseEdgeQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher, + const TQueryOptions& query_options) + : _searcher(searcher), _query(std::make_unique<CL_NS(search)::MultiPhraseQuery>()) {} + +void PhraseEdgeQuery::add(const std::wstring& field_name, const std::vector<std::string>& terms) { + if (terms.empty()) { + _CLTHROWA(CL_ERR_IllegalArgument, "PhraseEdgeQuery::add: terms empty"); + } + + if (terms.size() == 1) { + std::vector<CL_NS(index)::Term*> checked_terms; + std::wstring sub_term = StringUtil::string_to_wstring(terms[0]); + + find_words([&checked_terms, &sub_term](Term* term) { + std::wstring_view ws_term(term->text(), term->textLength()); + + if (ws_term.find(sub_term) != std::wstring::npos) { + checked_terms.push_back(_CL_POINTER(term)); + } + }); + + handle_terms(field_name, sub_term, checked_terms); + } else { + std::wstring suffix_term = StringUtil::string_to_wstring(terms[0]); + std::wstring prefix_term = StringUtil::string_to_wstring(terms.back()); + + std::vector<CL_NS(index)::Term*> suffix_terms; + std::vector<CL_NS(index)::Term*> prefix_terms; + + find_words([&suffix_term, &suffix_terms, &prefix_term, &prefix_terms](Term* term) { + std::wstring_view ws_term(term->text(), term->textLength()); + + if (ws_term.ends_with(suffix_term)) { + suffix_terms.push_back(_CL_POINTER(term)); + } + + if (ws_term.starts_with(prefix_term)) { + prefix_terms.push_back(_CL_POINTER(term)); + } + }); + + for (size_t i = 0; i < terms.size(); i++) { + if (i == 0) { + handle_terms(field_name, suffix_term, suffix_terms); + } else if (i == terms.size() - 1) { + handle_terms(field_name, prefix_term, prefix_terms); + } else { + std::wstring ws_term = StringUtil::string_to_wstring(terms[i]); + add_default_term(field_name, ws_term); + } + } + } +} + +void PhraseEdgeQuery::search(roaring::Roaring& roaring) { + _searcher->_search(_query.get(), [&roaring](const int32_t docid, const float_t /*score*/) { + roaring.add(docid); + }); +} + +void PhraseEdgeQuery::add_default_term(const std::wstring& field_name, + const std::wstring& ws_term) { + Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str()); + _query->add(t); + _CLLDECDELETE(t); +} + +void PhraseEdgeQuery::handle_terms(const std::wstring& field_name, const std::wstring& ws_term, + std::vector<CL_NS(index)::Term*>& checked_terms) { + if (checked_terms.empty()) { + add_default_term(field_name, ws_term); + } else { + _query->add(checked_terms); + for (const auto& t : checked_terms) { + _CLLDECDELETE(t); + } + } +}; + +void PhraseEdgeQuery::find_words(const std::function<void(Term*)>& cb) { + Term* term = nullptr; + TermEnum* enumerator = nullptr; + try { + enumerator = _searcher->getReader()->terms(); + while (enumerator->next()) { + term = enumerator->term(); + cb(term); + _CLDECDELETE(term); + } + } + _CLFINALLY({ + _CLDECDELETE(term); + enumerator->close(); + _CLDELETE(enumerator); + }) +} + +} // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h new file mode 100644 index 00000000000..27612be1592 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <memory> + +// clang-format off +#include "olap/rowset/segment_v2/inverted_index/query/query.h" +#include "CLucene/search/MultiPhraseQuery.h" +// clang-format on + +CL_NS_USE(search) + +namespace doris::segment_v2 { + +class PhraseEdgeQuery : public Query { +public: + PhraseEdgeQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher, + const TQueryOptions& query_options); + ~PhraseEdgeQuery() override = default; + + void add(const std::wstring& field_name, const std::vector<std::string>& terms) override; + void search(roaring::Roaring& roaring) override; + +private: + void add_default_term(const std::wstring& field_name, const std::wstring& ws_term); + void handle_terms(const std::wstring& field_name, const std::wstring& ws_term, + std::vector<CL_NS(index)::Term*>& checked_terms); + void find_words(const std::function<void(Term*)>& cb); + +private: + std::shared_ptr<lucene::search::IndexSearcher> _searcher; + + std::unique_ptr<CL_NS(search)::MultiPhraseQuery> _query; +}; + +} // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/query_factory.h b/be/src/olap/rowset/segment_v2/inverted_index/query/query_factory.h index a24a1379396..09d96211f99 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/query_factory.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/query_factory.h @@ -17,17 +17,16 @@ #pragma once +#include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h" +#include "olap/rowset/segment_v2/inverted_index/query/disjunction_query.h" +#include "olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h" +#include "olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h" +#include "olap/rowset/segment_v2/inverted_index/query/phrase_query.h" +#include "olap/rowset/segment_v2/inverted_index/query/regexp_query.h" #include "olap/rowset/segment_v2/inverted_index_query_type.h" namespace doris::segment_v2 { -class Query; -class DisjunctionQuery; -class ConjunctionQuery; -class PhraseQuery; -class PhrasePrefixQuery; -class RegexpQuery; - class QueryFactory { public: template <typename... Args> @@ -44,6 +43,8 @@ public: return std::make_unique<PhrasePrefixQuery>(std::forward<Args>(args)...); case InvertedIndexQueryType::MATCH_REGEXP_QUERY: return std::make_unique<RegexpQuery>(std::forward<Args>(args)...); + case InvertedIndexQueryType::MATCH_PHRASE_EDGE_QUERY: + return std::make_unique<PhraseEdgeQuery>(std::forward<Args>(args)...); default: return nullptr; } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_query_type.h b/be/src/olap/rowset/segment_v2/inverted_index_query_type.h index 495c03b8637..f1a47ebdd0f 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_query_type.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_query_type.h @@ -78,6 +78,7 @@ enum class InvertedIndexQueryType { MATCH_PHRASE_QUERY = 7, MATCH_PHRASE_PREFIX_QUERY = 8, MATCH_REGEXP_QUERY = 9, + MATCH_PHRASE_EDGE_QUERY = 10, }; inline bool is_range_query(InvertedIndexQueryType query_type) { @@ -92,7 +93,8 @@ inline bool is_match_query(InvertedIndexQueryType query_type) { query_type == InvertedIndexQueryType::MATCH_ALL_QUERY || query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY || query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY || - query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY); + query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY || + query_type == InvertedIndexQueryType::MATCH_PHRASE_EDGE_QUERY); } inline std::string query_type_to_string(InvertedIndexQueryType query_type) { @@ -130,6 +132,9 @@ inline std::string query_type_to_string(InvertedIndexQueryType query_type) { case InvertedIndexQueryType::MATCH_REGEXP_QUERY: { return "MREGEXP"; } + case InvertedIndexQueryType::MATCH_PHRASE_EDGE_QUERY: { + return "MPHRASEEDGE"; + } default: return ""; } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 919670d2182..b8475cbf509 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -58,11 +58,7 @@ #include "olap/key_coder.h" #include "olap/olap_common.h" #include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h" -#include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h" -#include "olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h" -#include "olap/rowset/segment_v2/inverted_index/query/phrase_query.h" #include "olap/rowset/segment_v2/inverted_index/query/query_factory.h" -#include "olap/rowset/segment_v2/inverted_index/query/regexp_query.h" #include "olap/rowset/segment_v2/inverted_index_cache.h" #include "olap/rowset/segment_v2/inverted_index_compound_directory.h" #include "olap/rowset/segment_v2/inverted_index_searcher.h" @@ -309,6 +305,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run roaring::Roaring query_match_bitmap; if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY || query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY || + query_type == InvertedIndexQueryType::MATCH_PHRASE_EDGE_QUERY || query_type == InvertedIndexQueryType::MATCH_ALL_QUERY || query_type == InvertedIndexQueryType::EQUAL_QUERY || query_type == InvertedIndexQueryType::MATCH_ANY_QUERY) { diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp index 5002ef3f715..3f2564227ba 100644 --- a/be/src/vec/functions/match.cpp +++ b/be/src/vec/functions/match.cpp @@ -397,6 +397,7 @@ void register_function_match(SimpleFunctionFactory& factory) { factory.register_function<FunctionMatchPhrase>(); factory.register_function<FunctionMatchPhrasePrefix>(); factory.register_function<FunctionMatchRegexp>(); + factory.register_function<FunctionMatchPhraseEdge>(); factory.register_function<FunctionMatchElementEQ>(); factory.register_function<FunctionMatchElementLT>(); factory.register_function<FunctionMatchElementGT>(); diff --git a/be/src/vec/functions/match.h b/be/src/vec/functions/match.h index ebd6a48ba23..d3b7c912edc 100644 --- a/be/src/vec/functions/match.h +++ b/be/src/vec/functions/match.h @@ -159,6 +159,23 @@ public: } }; +class FunctionMatchPhraseEdge : public FunctionMatchBase { +public: + static constexpr auto name = "match_phrase_edge"; + static FunctionPtr create() { return std::make_shared<FunctionMatchPhraseEdge>(); } + + String get_name() const override { return name; } + + Status execute_match(const std::string& column_name, const std::string& match_query_str, + size_t input_rows_count, const ColumnString* string_col, + InvertedIndexCtx* inverted_index_ctx, + const ColumnArray::Offsets64* array_offsets, + ColumnUInt8::Container& result) const override { + return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>( + "FunctionMatchPhraseEdge not support execute_match"); + } +}; + class FunctionMatchElementEQ : public FunctionMatchBase { public: static constexpr auto name = "match_element_eq"; diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 index 9d51f8dfd27..28806fd1d76 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 @@ -355,6 +355,7 @@ MATCH_ELEMENT_LT: 'ELEMENT_LT'; MATCH_PHRASE: 'MATCH_PHRASE'; MATCH_PHRASE_PREFIX: 'MATCH_PHRASE_PREFIX'; MATCH_REGEXP: 'MATCH_REGEXP'; +MATCH_PHRASE_EDGE: 'MATCH_PHRASE_EDGE'; MATERIALIZED: 'MATERIALIZED'; MAX: 'MAX'; MAXVALUE: 'MAXVALUE'; diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 index 241bb55f041..f271fbe5216 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 @@ -615,7 +615,7 @@ rowConstructorItem predicate : NOT? kind=BETWEEN lower=valueExpression AND upper=valueExpression | NOT? kind=(LIKE | REGEXP | RLIKE) pattern=valueExpression - | NOT? kind=(MATCH | MATCH_ANY | MATCH_ALL | MATCH_PHRASE | MATCH_PHRASE_PREFIX | MATCH_REGEXP) pattern=valueExpression + | NOT? kind=(MATCH | MATCH_ANY | MATCH_ALL | MATCH_PHRASE | MATCH_PHRASE_PREFIX | MATCH_REGEXP | MATCH_PHRASE_EDGE) pattern=valueExpression | NOT? kind=IN LEFT_PAREN query RIGHT_PAREN | NOT? kind=IN LEFT_PAREN expression (COMMA expression)* RIGHT_PAREN | IS NOT? kind=NULL diff --git a/fe/fe-core/src/main/cup/sql_parser.cup b/fe/fe-core/src/main/cup/sql_parser.cup index ab5c0ec2b32..0ebb58c3756 100644 --- a/fe/fe-core/src/main/cup/sql_parser.cup +++ b/fe/fe-core/src/main/cup/sql_parser.cup @@ -484,6 +484,7 @@ terminal String KW_MATCH_PHRASE, KW_MATCH_PHRASE_PREFIX, KW_MATCH_REGEXP, + KW_MATCH_PHRASE_EDGE, KW_MATCH_ELEMENT_EQ, KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, @@ -992,7 +993,7 @@ precedence left KW_AND; precedence left KW_NOT, NOT; precedence left KW_BETWEEN, KW_IN, KW_IS, KW_EXISTS; precedence left KW_LIKE, KW_REGEXP; -precedence left KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE, KW_MATCH_PHRASE_PREFIX, KW_MATCH_REGEXP, KW_MATCH, KW_MATCH_ELEMENT_EQ, KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, KW_MATCH_ELEMENT_LE, KW_MATCH_ELEMENT_GE; +precedence left KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE, KW_MATCH_PHRASE_PREFIX, KW_MATCH_REGEXP, KW_MATCH_PHRASE_EDGE, KW_MATCH, KW_MATCH_ELEMENT_EQ, KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, KW_MATCH_ELEMENT_LE, KW_MATCH_ELEMENT_GE; precedence left EQUAL, LESSTHAN, GREATERTHAN; precedence left ADD, SUBTRACT; precedence left AT, STAR, DIVIDE, MOD, KW_DIV; @@ -7241,6 +7242,8 @@ match_predicate ::= {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_PHRASE_PREFIX, e1, e2); :} | expr:e1 KW_MATCH_REGEXP expr:e2 {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_REGEXP, e1, e2); :} + | expr:e1 KW_MATCH_PHRASE_EDGE expr:e2 + {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_PHRASE_EDGE, e1, e2); :} | expr:e1 KW_MATCH_ELEMENT_EQ expr:e2 {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ELEMENT_EQ, e1, e2); :} | expr:e1 KW_MATCH_ELEMENT_LT expr:e2 diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java index e284d86e2bb..f0d961d6761 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java @@ -52,6 +52,7 @@ public class MatchPredicate extends Predicate { MATCH_PHRASE("MATCH_PHRASE", "match_phrase", TExprOpcode.MATCH_PHRASE), MATCH_PHRASE_PREFIX("MATCH_PHRASE_PREFIX", "match_phrase_prefix", TExprOpcode.MATCH_PHRASE_PREFIX), MATCH_REGEXP("MATCH_REGEXP", "match_regexp", TExprOpcode.MATCH_REGEXP), + MATCH_PHRASE_EDGE("MATCH_PHRASE_EDGE", "match_phrase_edge", TExprOpcode.MATCH_PHRASE_EDGE), MATCH_ELEMENT_EQ("MATCH_ELEMENT_EQ", "match_element_eq", TExprOpcode.MATCH_ELEMENT_EQ), MATCH_ELEMENT_LT("MATCH_ELEMENT_LT", "match_element_lt", TExprOpcode.MATCH_ELEMENT_LT), MATCH_ELEMENT_GT("MATCH_ELEMENT_GT", "match_element_gt", TExprOpcode.MATCH_ELEMENT_GT), @@ -169,6 +170,16 @@ public class MatchPredicate extends Predicate { symbolNotUsed, Lists.<Type>newArrayList(new ArrayType(t), t), Type.BOOLEAN)); + functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator( + Operator.MATCH_PHRASE_EDGE.getName(), + symbolNotUsed, + Lists.<Type>newArrayList(t, t), + Type.BOOLEAN)); + functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator( + Operator.MATCH_PHRASE_EDGE.getName(), + symbolNotUsed, + Lists.<Type>newArrayList(new ArrayType(t), t), + Type.BOOLEAN)); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index bc5926f263d..03ddefff8cf 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -239,6 +239,7 @@ import org.apache.doris.nereids.trees.expressions.ListQuery; import org.apache.doris.nereids.trees.expressions.MatchAll; import org.apache.doris.nereids.trees.expressions.MatchAny; import org.apache.doris.nereids.trees.expressions.MatchPhrase; +import org.apache.doris.nereids.trees.expressions.MatchPhraseEdge; import org.apache.doris.nereids.trees.expressions.MatchPhrasePrefix; import org.apache.doris.nereids.trees.expressions.MatchRegexp; import org.apache.doris.nereids.trees.expressions.Mod; @@ -3151,6 +3152,12 @@ public class LogicalPlanBuilder extends DorisParserBaseVisitor<Object> { getExpression(ctx.pattern) ); break; + case DorisParser.MATCH_PHRASE_EDGE: + outExpression = new MatchPhraseEdge( + valueExpression, + getExpression(ctx.pattern) + ); + break; default: throw new ParseException("Unsupported predicate type: " + ctx.kind.getText(), ctx); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java index 976e46830ef..d9dcde287d3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java @@ -53,6 +53,8 @@ public abstract class Match extends BinaryOperator implements PropagateNullable return Operator.MATCH_PHRASE_PREFIX; case "MATCH_REGEXP": return Operator.MATCH_REGEXP; + case "MATCH_PHRASE_EDGE": + return Operator.MATCH_PHRASE_EDGE; default: throw new AnalysisException("UnSupported type for match: " + symbol); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchPhraseEdge.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchPhraseEdge.java new file mode 100644 index 00000000000..188f3317a0f --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchPhraseEdge.java @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions; + +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * like expression: a MATCH_PHRASE_EDGE 'llo w'. + */ +public class MatchPhraseEdge extends Match { + public MatchPhraseEdge(Expression left, Expression right) { + super(ImmutableList.of(left, right), "MATCH_PHRASE_EDGE"); + } + + private MatchPhraseEdge(List<Expression> children) { + super(children, "MATCH_PHRASE_EDGE"); + } + + @Override + public MatchPhraseEdge withChildren(List<Expression> children) { + Preconditions.checkArgument(children.size() == 2); + return new MatchPhraseEdge(children); + } + + @Override + public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) { + return visitor.visitMatchPhraseEdge(this, context); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java index 0ffa1d98fdf..feea6cfe7d5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java @@ -59,6 +59,7 @@ import org.apache.doris.nereids.trees.expressions.Match; import org.apache.doris.nereids.trees.expressions.MatchAll; import org.apache.doris.nereids.trees.expressions.MatchAny; import org.apache.doris.nereids.trees.expressions.MatchPhrase; +import org.apache.doris.nereids.trees.expressions.MatchPhraseEdge; import org.apache.doris.nereids.trees.expressions.MatchPhrasePrefix; import org.apache.doris.nereids.trees.expressions.MatchRegexp; import org.apache.doris.nereids.trees.expressions.Mod; @@ -500,6 +501,10 @@ public abstract class ExpressionVisitor<R, C> return visitMatch(matchRegexp, context); } + public R visitMatchPhraseEdge(MatchPhraseEdge matchPhraseEdge, C context) { + return visitMatch(matchPhraseEdge, context); + } + public R visitAny(Any any, C context) { return visit(any, context); } diff --git a/fe/fe-core/src/main/jflex/sql_scanner.flex b/fe/fe-core/src/main/jflex/sql_scanner.flex index bc744373ad4..175744532c0 100644 --- a/fe/fe-core/src/main/jflex/sql_scanner.flex +++ b/fe/fe-core/src/main/jflex/sql_scanner.flex @@ -323,6 +323,7 @@ import org.apache.doris.qe.SqlModeHelper; keywordMap.put("match_phrase", new Integer(SqlParserSymbols.KW_MATCH_PHRASE)); keywordMap.put("match_phrase_prefix", new Integer(SqlParserSymbols.KW_MATCH_PHRASE_PREFIX)); keywordMap.put("match_regexp", new Integer(SqlParserSymbols.KW_MATCH_REGEXP)); + keywordMap.put("match_phrase_edge", new Integer(SqlParserSymbols.KW_MATCH_PHRASE_EDGE)); keywordMap.put("element_eq", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_EQ)); keywordMap.put("element_lt", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_LT)); keywordMap.put("element_gt", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_GT)); diff --git a/gensrc/thrift/Opcodes.thrift b/gensrc/thrift/Opcodes.thrift index 72a1d80e036..9c0211cf50f 100644 --- a/gensrc/thrift/Opcodes.thrift +++ b/gensrc/thrift/Opcodes.thrift @@ -95,4 +95,5 @@ enum TExprOpcode { MATCH_ELEMENT_GE, MATCH_PHRASE_PREFIX, MATCH_REGEXP, + MATCH_PHRASE_EDGE, } diff --git a/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out b/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out new file mode 100644 index 00000000000..e650f9b39b2 --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out @@ -0,0 +1,31 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +1 index.html 首先我 index html 想说的是这里有 index html 条评论看了之后很让人无语 + +-- !sql -- +2 nav_inet.html 尤其看看 nav inet html 原价应当 nav inet html 是一本精美的书 + +-- !sql -- +3 splash_inet.html 封面 splash inet html 红色 splash inet html 书封非常精致 + +-- !sql -- +5 nav_bg_top.gif 该书研究了英语 nav bg top gif 各种语法 nav bg top gif 结构下的歧义问题 + +-- !sql -- +8 nav_venue_off.gif 本书既适合 nav venue off gif 家长 nav venue off gif 和孩子一起学习使用 + +-- !sql -- +1 index.html 首先我 index html 想说的是这里有 index html 条评论看了之后很让人无语 + +-- !sql -- +2 nav_inet.html 尤其看看 nav inet html 原价应当 nav inet html 是一本精美的书 + +-- !sql -- +5 nav_bg_top.gif 该书研究了英语 nav bg top gif 各种语法 nav bg top gif 结构下的歧义问题 + +-- !sql -- +8 nav_venue_off.gif 本书既适合 nav venue off gif 家长 nav venue off gif 和孩子一起学习使用 + +-- !sql -- +10 nav_tickets_off.gif 习惯于生活中很多 nav tickets off gif 虚假 nav tickets off gif 美化的人来说 + diff --git a/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy b/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy new file mode 100644 index 00000000000..e05f6bb1ec9 --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +suite("test_index_match_phrase_edge", "p0"){ + def indexTbName1 = "test_index_match_phrase_edge" + + sql "DROP TABLE IF EXISTS ${indexTbName1}" + + sql """ + CREATE TABLE ${indexTbName1} ( + `a` int(11) NULL COMMENT "", + `b` text NULL COMMENT "", + `c` text NULL COMMENT "", + INDEX b_idx (`b`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '', + INDEX c_idx (`c`) USING INVERTED PROPERTIES("parser" = "unicode", "support_phrase" = "true") COMMENT '', + ) ENGINE=OLAP + DUPLICATE KEY(`a`) + COMMENT "OLAP" + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ INSERT INTO ${indexTbName1} VALUES (1, "index.html", "首先我 index html 想说的是这里有 index html 条评论看了之后很让人无语"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (2, "nav_inet.html", "尤其看看 nav inet html 原价应当 nav inet html 是一本精美的书"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (3, "splash_inet.html", "封面 splash inet html 红色 splash inet html 书封非常精致"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (4, "nav_top_inet.html", "个人觉得定义 nav top inet html 和 nav top inet html 写法特别有帮助"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (5, "nav_bg_top.gif", "该书研究了英语 nav bg top gif 各种语法 nav bg top gif 结构下的歧义问题"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (6, "nav_news_off.gif", "作品当然是 nav news off gif 喜欢的 nav news off gif 否则也不会买原版"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (7, "nav_comp_off.gif", "对于理解英语的 nav comp off gif 节奏和 nav comp off gif 韵律很有好处"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (8, "nav_venue_off.gif", "本书既适合 nav venue off gif 家长 nav venue off gif 和孩子一起学习使用"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (9, "hm_bg.jpg", "前几日 hm bg jpg 在别处 hm bg jpg 购得"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (10, "nav_tickets_off.gif", "习惯于生活中很多 nav tickets off gif 虚假 nav tickets off gif 美化的人来说"); """ + + try { + sql "sync" + + qt_sql """ select * from ${indexTbName1} where b match_phrase_edge 'x.h'; """ + qt_sql """ select * from ${indexTbName1} where b match_phrase_edge 'v_i'; """ + qt_sql """ select * from ${indexTbName1} where b match_phrase_edge 'sh_inet.h'; """ + qt_sql """ select * from ${indexTbName1} where b match_phrase_edge 'v_bg_t'; """ + qt_sql """ select * from ${indexTbName1} where b match_phrase_edge 'v_venue_of'; """ + + qt_sql """ select * from ${indexTbName1} where c match_phrase_edge 'ml 想说的是这里有 in'; """ + qt_sql """ select * from ${indexTbName1} where c match_phrase_edge 'ml 原价应当 na'; """ + qt_sql """ select * from ${indexTbName1} where c match_phrase_edge 'op gif 各种语法 nav b'; """ + qt_sql """ select * from ${indexTbName1} where c match_phrase_edge 'ue off gif 家长 na'; """ + qt_sql """ select * from ${indexTbName1} where c match_phrase_edge 'if 虚假 na'; """ + + } finally { + //try_sql("DROP TABLE IF EXISTS ${testTable}") + } +} \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org