Tanya-W commented on code in PR #15823:
URL: https://github.com/apache/doris/pull/15823#discussion_r1068095484


##########
be/src/olap/rowset/segment_v2/inverted_index_reader.cpp:
##########
@@ -17,30 +17,288 @@
 
 #include "olap/rowset/segment_v2/inverted_index_reader.h"
 
-#include "common/status.h"
+#include <CLucene/search/BooleanQuery.h>
+#include <CLucene/search/PhraseQuery.h>
+
+#include "common/config.h"
+#include "gutil/strings/strip.h"
+#include "io/fs/file_system.h"
+#include "olap/key_coder.h"
+#include "olap/rowset/segment_v2/inverted_index_compound_directory.h"
+#include "olap/rowset/segment_v2/inverted_index_compound_reader.h"
+#include "olap/rowset/segment_v2/inverted_index_desc.h"
+#include "olap/tablet_schema.h"
+#include "olap/utils.h"
+#include "runtime/string_value.h"
+#include "util/time.h"
 
 namespace doris {
 namespace segment_v2 {
 
+bool InvertedIndexReader::_is_match_query(InvertedIndexQueryType query_type) {
+    return (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
+            query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
+            query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY);
+}
+
 bool InvertedIndexReader::indexExists(io::Path& index_file_path) {
     bool exists = false;
     RETURN_IF_ERROR(_fs->exists(index_file_path, &exists));
     return exists;
 }
 
+std::vector<std::string> FullTextIndexReader::get_analyse_result(
+        const std::wstring& field_name, const std::wstring& value,
+        InvertedIndexQueryType query_type, InvertedIndexParserType 
analyser_type) {
+    std::vector<std::string> analyse_result;
+    std::shared_ptr<lucene::analysis::Analyzer> analyzer;
+    if (analyser_type == InvertedIndexParserType::PARSER_STANDARD) {
+        analyzer = 
std::make_shared<lucene::analysis::standard::StandardAnalyzer>();
+    } else {
+        // default
+        analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<TCHAR>>();
+    }
+
+    std::unique_ptr<lucene::util::StringReader> reader(
+            new lucene::util::StringReader(value.c_str()));
+    std::unique_ptr<lucene::analysis::TokenStream> token_stream(
+            analyzer->tokenStream(field_name.c_str(), reader.get()));
+
+    lucene::analysis::Token token;
+
+    while (token_stream->next(&token)) {
+        std::string tk =
+                lucene::util::Misc::toString(token.termBuffer<TCHAR>(), 
token.termLength<TCHAR>());
+        analyse_result.emplace_back(tk);
+    }
+
+    if (token_stream != nullptr) {
+        token_stream->close();
+    }
+
+    if (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
+        query_type == InvertedIndexQueryType::MATCH_ALL_QUERY) {
+        std::set<std::string> unrepeated_result(analyse_result.begin(), 
analyse_result.end());
+        analyse_result.assign(unrepeated_result.begin(), 
unrepeated_result.end());
+    }
+
+    return analyse_result;
+}
+
+Status FullTextIndexReader::new_iterator(const TabletIndex* index_meta,
+                                         InvertedIndexIterator** iterator) {
+    *iterator = new InvertedIndexIterator(index_meta, this);
+    return Status::OK();
+}
+
+Status FullTextIndexReader::query(const std::string& column_name, const void* 
query_value,
+                                  InvertedIndexQueryType query_type,
+                                  InvertedIndexParserType analyser_type,
+                                  roaring::Roaring* bit_map) {
+    std::string search_str = reinterpret_cast<const 
StringValue*>(query_value)->to_string();
+    LOG(INFO) << column_name

Review Comment:
   Done. This `LOG(INFO)` statement has been changed to a debug-level log.



##########
be/src/olap/rowset/segment_v2/inverted_index_reader.cpp:
##########
@@ -17,30 +17,288 @@
 
 #include "olap/rowset/segment_v2/inverted_index_reader.h"
 
-#include "common/status.h"
+#include <CLucene/search/BooleanQuery.h>
+#include <CLucene/search/PhraseQuery.h>
+
+#include "common/config.h"
+#include "gutil/strings/strip.h"
+#include "io/fs/file_system.h"
+#include "olap/key_coder.h"
+#include "olap/rowset/segment_v2/inverted_index_compound_directory.h"
+#include "olap/rowset/segment_v2/inverted_index_compound_reader.h"
+#include "olap/rowset/segment_v2/inverted_index_desc.h"
+#include "olap/tablet_schema.h"
+#include "olap/utils.h"
+#include "runtime/string_value.h"
+#include "util/time.h"
 
 namespace doris {
 namespace segment_v2 {
 
+bool InvertedIndexReader::_is_match_query(InvertedIndexQueryType query_type) {
+    return (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
+            query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
+            query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY);
+}
+
 bool InvertedIndexReader::indexExists(io::Path& index_file_path) {
     bool exists = false;
     RETURN_IF_ERROR(_fs->exists(index_file_path, &exists));
     return exists;
 }
 
+std::vector<std::string> FullTextIndexReader::get_analyse_result(
+        const std::wstring& field_name, const std::wstring& value,
+        InvertedIndexQueryType query_type, InvertedIndexParserType 
analyser_type) {
+    std::vector<std::string> analyse_result;
+    std::shared_ptr<lucene::analysis::Analyzer> analyzer;
+    if (analyser_type == InvertedIndexParserType::PARSER_STANDARD) {
+        analyzer = 
std::make_shared<lucene::analysis::standard::StandardAnalyzer>();
+    } else {
+        // default
+        analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<TCHAR>>();
+    }
+
+    std::unique_ptr<lucene::util::StringReader> reader(
+            new lucene::util::StringReader(value.c_str()));
+    std::unique_ptr<lucene::analysis::TokenStream> token_stream(
+            analyzer->tokenStream(field_name.c_str(), reader.get()));
+
+    lucene::analysis::Token token;
+
+    while (token_stream->next(&token)) {
+        std::string tk =
+                lucene::util::Misc::toString(token.termBuffer<TCHAR>(), 
token.termLength<TCHAR>());
+        analyse_result.emplace_back(tk);
+    }
+
+    if (token_stream != nullptr) {
+        token_stream->close();
+    }
+
+    if (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
+        query_type == InvertedIndexQueryType::MATCH_ALL_QUERY) {
+        std::set<std::string> unrepeated_result(analyse_result.begin(), 
analyse_result.end());
+        analyse_result.assign(unrepeated_result.begin(), 
unrepeated_result.end());
+    }
+
+    return analyse_result;
+}
+
+Status FullTextIndexReader::new_iterator(const TabletIndex* index_meta,
+                                         InvertedIndexIterator** iterator) {
+    *iterator = new InvertedIndexIterator(index_meta, this);
+    return Status::OK();
+}
+
+Status FullTextIndexReader::query(const std::string& column_name, const void* 
query_value,
+                                  InvertedIndexQueryType query_type,
+                                  InvertedIndexParserType analyser_type,
+                                  roaring::Roaring* bit_map) {
+    std::string search_str = reinterpret_cast<const 
StringValue*>(query_value)->to_string();
+    LOG(INFO) << column_name
+              << " begin to load the fulltext index from clucene, query_str=" 
<< search_str;
+    std::unique_ptr<lucene::search::Query> query;
+    std::wstring field_ws = std::wstring(column_name.begin(), 
column_name.end());
+    std::wstring search_str_ws = std::wstring(search_str.begin(), 
search_str.end());
+    try {
+        std::vector<std::string> analyse_result =
+                get_analyse_result(field_ws, search_str_ws, query_type, 
analyser_type);
+
+        if (analyse_result.empty()) {
+            LOG(WARNING) << "invalid input query_str: " << search_str
+                         << ", please check your query sql";
+            return 
Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>();
+        }
+
+        switch (query_type) {
+        case InvertedIndexQueryType::MATCH_ANY_QUERY: {
+            query.reset(_CLNEW lucene::search::BooleanQuery());
+            for (auto token : analyse_result) {
+                std::wstring token_ws = std::wstring(token.begin(), 
token.end());
+                lucene::index::Term* term =
+                        _CLNEW lucene::index::Term(field_ws.c_str(), 
token_ws.c_str());
+                static_cast<lucene::search::BooleanQuery*>(query.get())
+                        ->add(_CLNEW lucene::search::TermQuery(term), true,
+                              lucene::search::BooleanClause::SHOULD);
+                _CLDECDELETE(term);
+            }
+            break;
+        }
+        case InvertedIndexQueryType::MATCH_ALL_QUERY: {
+            query.reset(_CLNEW lucene::search::BooleanQuery());
+            for (auto token : analyse_result) {
+                std::wstring token_ws = std::wstring(token.begin(), 
token.end());
+                lucene::index::Term* term =
+                        _CLNEW lucene::index::Term(field_ws.c_str(), 
token_ws.c_str());
+                static_cast<lucene::search::BooleanQuery*>(query.get())
+                        ->add(_CLNEW lucene::search::TermQuery(term), true,
+                              lucene::search::BooleanClause::MUST);
+                _CLDECDELETE(term);
+            }
+            break;
+        }
+        case InvertedIndexQueryType::MATCH_PHRASE_QUERY: {
+            LOG(WARNING) << "match phrase of fulltext is not supported";
+            return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>();
+        }
+        default:
+            LOG(ERROR) << "fulltext query do not support query type other than 
match, column: "
+                       << column_name;
+            return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>();
+        }
+
+    } catch (const CLuceneError& e) {
+        LOG(WARNING) << "CLuceneError occured, error msg: " << e.what();
+        return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>();
+    }
+
+    io::Path path(_path);
+    auto index_dir = path.parent_path();
+    auto index_file_name = 
InvertedIndexDescriptor::get_index_file_name(path.filename(), _index_id);
+
+    // check index file existence
+    auto index_file_path = index_dir / index_file_name;
+    if (!indexExists(index_file_path)) {
+        LOG(WARNING) << "inverted index path: " << index_file_path.string() << 
" not exist.";
+        return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>();
+    }
+
+    std::shared_ptr<lucene::search::IndexSearcher> index_searcher = nullptr;
+    {
+        DorisCompoundReader* directory = new DorisCompoundReader(
+                DorisCompoundDirectory::getDirectory(_fs, index_dir.c_str()),
+                index_file_name.c_str());
+        index_searcher = 
std::make_shared<lucene::search::IndexSearcher>(directory, true);
+        _CLDECDELETE(directory)
+    }
+
+    roaring::Roaring result;
+    try {
+        index_searcher->_search(query.get(),
+                                [&result](const int32_t docid, const float_t 
/*score*/) {
+                                    // docid equal to rowid in segment
+                                    result.add(docid);
+                                });
+    } catch (const CLuceneError& e) {
+        LOG(WARNING) << "CLuceneError occured, error msg: " << e.what();
+        return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>();
+    }
+    bit_map->swap(result);
+    return Status::OK();
+}
+
+InvertedIndexReaderType FullTextIndexReader::type() {
+    return InvertedIndexReaderType::FULLTEXT;
+}
+
+Status StringTypeInvertedIndexReader::new_iterator(const TabletIndex* 
index_meta,
+                                                   InvertedIndexIterator** 
iterator) {
+    *iterator = new InvertedIndexIterator(index_meta, this);
+    return Status::OK();
+}
+
+Status StringTypeInvertedIndexReader::query(const std::string& column_name, 
const void* query_value,
+                                            InvertedIndexQueryType query_type,
+                                            InvertedIndexParserType 
analyser_type,
+                                            roaring::Roaring* bit_map) {
+    const StringValue* search_query = reinterpret_cast<const 
StringValue*>(query_value);
+    auto act_len = strnlen(search_query->ptr, search_query->len);
+    std::string search_str(search_query->ptr, act_len);
+    LOG(INFO) << "begin to query the inverted index from clucene"

Review Comment:
   Done. This `LOG(INFO)` statement has been changed to a debug-level log.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to