xiaokang commented on code in PR #23871:
URL: https://github.com/apache/doris/pull/23871#discussion_r1317041352


##########
be/src/olap/rowset/segment_v2/inverted_index_reader.cpp:
##########
@@ -264,83 +251,35 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, const std::string
             }
         }
 
-        std::unique_ptr<lucene::search::Query> query;
-        std::wstring field_ws = std::wstring(column_name.begin(), 
column_name.end());
-
-        auto index_search = [&](bool& null_bitmap_already_read,
-                                std::shared_ptr<roaring::Roaring>& 
term_match_bitmap,
-                                InvertedIndexQueryCache* cache,
-                                InvertedIndexQueryCache::CacheKey& cache_key,
-                                InvertedIndexQueryCacheHandle& cache_handle) {
-            // check index file existence
-            if (!indexExists(index_file_path)) {
-                return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
-                        "inverted index path: {} not exist.", 
index_file_path.string());
-            }
-
-            InvertedIndexCacheHandle inverted_index_cache_handle;
-            InvertedIndexSearcherCache::instance()->get_index_searcher(
-                    _fs, index_dir.c_str(), index_file_name, 
&inverted_index_cache_handle, stats);
-            auto index_searcher = 
inverted_index_cache_handle.get_index_searcher();
-
-            // try to reuse index_searcher's directory to read null_bitmap to 
cache
-            // to avoid open directory additionally for null_bitmap
-            if (!null_bitmap_already_read) {
-                InvertedIndexQueryCacheHandle null_bitmap_cache_handle;
-                read_null_bitmap(&null_bitmap_cache_handle,
-                                 index_searcher->getReader()->directory());
-                null_bitmap_already_read = true;
-            }
+        // check index file existence
+        if (!indexExists(index_file_path)) {
+            return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
+                    "inverted index path: {} not exist.", 
index_file_path.string());
+        }
 
-            try {
-                if (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
-                    query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
-                    query_type == InvertedIndexQueryType::EQUAL_QUERY) {
-                    
SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
-                    index_searcher->_search(query.get(), 
[&term_match_bitmap](DocRange* docRange) {
-                        if (docRange->type_ == DocRangeType::kMany) {
-                            
term_match_bitmap->addMany(docRange->doc_many_size_,
-                                                       
docRange->doc_many.data());
-                        } else {
-                            
term_match_bitmap->addRange(docRange->doc_range.first,
-                                                        
docRange->doc_range.second);
-                        }
-                    });
-                } else {
-                    
SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
-                    index_searcher->_search(
-                            query.get(),
-                            [&term_match_bitmap](const int32_t docid, const 
float_t /*score*/) {
-                                // docid equal to rowid in segment
-                                term_match_bitmap->add(docid);
-                            });
-                }
-            } catch (const CLuceneError& e) {
-                return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
-                        "CLuceneError occured: {}", e.what());
-            }
+        InvertedIndexCacheHandle inverted_index_cache_handle;
+        InvertedIndexSearcherCache::instance()->get_index_searcher(
+                _fs, index_dir.c_str(), index_file_name, 
&inverted_index_cache_handle, stats);
+        auto index_searcher = inverted_index_cache_handle.get_index_searcher();
 
-            {
-                // add to cache
-                term_match_bitmap->runOptimize();
-                cache->insert(cache_key, term_match_bitmap, &cache_handle);
-            }
-            return Status::OK();
-        };
+        std::unique_ptr<lucene::search::Query> query;
+        std::wstring field_ws = std::wstring(column_name.begin(), 
column_name.end());
 
         roaring::Roaring query_match_bitmap;
         bool null_bitmap_already_read = false;
-        if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
+        if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
+            query_type == InvertedIndexQueryType::MATCH_ALL_QUERY) {
             std::wstring str_tokens;
             for (auto& token : analyse_result) {
                 str_tokens += token;
+                str_tokens += L"_";

Review Comment:
   'a b_c' and 'a_b c' will be the same key. This kind of problem can be 
avoided by using whitespace instead.



##########
be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp:
##########
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "conjunction_query.h"
+
+namespace doris {
+
+ConjunctionQuery::ConjunctionQuery(IndexReader* reader)
+        : _reader(reader), _indexVersion(reader->getIndexVersion()) {}
+
+ConjunctionQuery::~ConjunctionQuery() {
+    for (auto& term : _terms) {
+        if (term) {
+            _CLDELETE(term);
+        }
+    }
+    for (auto& termDoc : _termDocs) {
+        if (termDoc) {
+            _CLDELETE(termDoc);
+        }
+    }
+}
+
+void ConjunctionQuery::add(const std::wstring& fieldName, const 
std::vector<std::wstring>& wterms) {
+    if (wterms.size() < 1) {
+        _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size() 
< 1");
+    }
+
+    std::vector<TermIterator> iterators;
+    for (auto& wterm : wterms) {
+        Term* t = _CLNEW Term(fieldName.c_str(), wterm.c_str());
+        _terms.push_back(t);
+        TermDocs* termDoc = _reader->termDocs(t);
+        _termDocs.push_back(termDoc);
+        iterators.emplace_back(termDoc);
+    }
+
+    std::sort(iterators.begin(), iterators.end(), [](const TermIterator& a, 
const TermIterator& b) {
+        return a.docFreq() < b.docFreq();
+    });
+
+    if (iterators.size() == 1) {
+        _lead1 = iterators[0];
+    } else {
+        _lead1 = iterators[0];
+        _lead2 = iterators[1];
+        for (int32_t i = 2; i < _terms.size(); i++) {
+            _others.push_back(iterators[i]);
+        }
+    }
+
+    if (iterators.size() >= 2) {
+        int32_t little = iterators[0].docFreq();
+        int32_t big = iterators[iterators.size() - 1].docFreq();
+        if (little == 0) {
+            _useSkip = true;
+        } else if ((big / little) > _conjunction_ratio) {
+            _useSkip = true;
+        }
+    }
+}
+
+void ConjunctionQuery::search(roaring::Roaring& roaring) {
+    if (_lead1.isEmpty()) return;
+
+    if (_indexVersion == IndexVersion::kV0) {
+        search_by_bitmap(roaring);
+        return;
+    }
+
+    if (!_useSkip) {
+        search_by_bitmap(roaring);
+        return;
+    }
+
+    search_by_skiplist(roaring);
+}
+
+void ConjunctionQuery::search_by_bitmap(roaring::Roaring& roaring) {
+    auto func = [&roaring](const TermIterator& termDocs) {
+        roaring::Roaring result;
+        DocRange docRange;
+        while (termDocs.readRange(&docRange)) {
+            if (docRange.type_ == DocRangeType::kMany) {
+                result.addMany(docRange.doc_many_size_, 
docRange.doc_many->data());
+            } else {
+                result.addRange(docRange.doc_range.first, 
docRange.doc_range.second);
+            }
+        }
+        roaring &= result;
+    };
+
+    {
+        DocRange docRange;
+        while (_lead1.readRange(&docRange)) {
+            if (docRange.type_ == DocRangeType::kMany) {
+                roaring.addMany(docRange.doc_many_size_, 
docRange.doc_many->data());
+            } else {

Review Comment:
   Does this continue apply to the others for loop or the outer for loop?



##########
fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java:
##########
@@ -394,6 +394,8 @@ public class SessionVariable implements Serializable, 
Writable {
     public static final String ENABLE_MEMTABLE_ON_SINK_NODE =
             "enable_memtable_on_sink_node";
 
+    public static final String CONJUNCTION_RATIO = "conjunction_ratio";

Review Comment:
   The name is not easy to understand. One suggestion: 
inverted_index_conjunction_opt_threshold.
   BTW, please add documentation for this session variable.



##########
gensrc/thrift/PaloInternalService.thrift:
##########
@@ -240,6 +240,8 @@ struct TQueryOptions {
 
   // A tag used to distinguish fe start epoch.
   82: optional i64 fe_process_uuid = 0;
+
+  83: optional i32 conjunction_ratio = 1000;

Review Comment:
   name



##########
be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h:
##########
@@ -0,0 +1,70 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "CLucene/index/IndexReader.h"
+#include "CLucene/index/IndexVersion.h"
+#include "CLucene/index/Term.h"
+#include "CLucene/search/query/TermIterator.h"
+#include "roaring/roaring.hh"
+
+CL_NS_USE(index)
+
+namespace doris {
+
+class ConjunctionQuery {
+public:
+    ConjunctionQuery(IndexReader* reader);
+    ~ConjunctionQuery();
+
+    void set_conjunction_ratio(int32_t conjunction_ratio) {
+        _conjunction_ratio = conjunction_ratio;
+    }
+
+    void add(const std::wstring& fieldName, const std::vector<std::wstring>& 
wterms);
+    void search(roaring::Roaring& roaring);
+
+private:
+    void search_by_bitmap(roaring::Roaring& roaring);
+
+    void search_by_skiplist(roaring::Roaring& roaring) {
+        int32_t doc = 0;
+        while ((doc = next_doc()) != INT32_MAX) {
+            roaring.add(doc);
+        }
+    }
+
+    int32_t next_doc() { return do_next(_lead1.nextDoc()); }

Review Comment:
   next_doc is only used once and is simple. Is it necessary to create a new 
function for it?



##########
be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp:
##########
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "conjunction_query.h"
+
+namespace doris {
+
+ConjunctionQuery::ConjunctionQuery(IndexReader* reader)
+        : _reader(reader), _indexVersion(reader->getIndexVersion()) {}
+
+ConjunctionQuery::~ConjunctionQuery() {
+    for (auto& term : _terms) {
+        if (term) {
+            _CLDELETE(term);
+        }
+    }
+    for (auto& termDoc : _termDocs) {
+        if (termDoc) {
+            _CLDELETE(termDoc);
+        }
+    }
+}
+
+void ConjunctionQuery::add(const std::wstring& fieldName, const 
std::vector<std::wstring>& wterms) {
+    if (wterms.size() < 1) {
+        _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size() 
< 1");
+    }
+
+    std::vector<TermIterator> iterators;
+    for (auto& wterm : wterms) {
+        Term* t = _CLNEW Term(fieldName.c_str(), wterm.c_str());
+        _terms.push_back(t);
+        TermDocs* termDoc = _reader->termDocs(t);
+        _termDocs.push_back(termDoc);
+        iterators.emplace_back(termDoc);
+    }
+
+    std::sort(iterators.begin(), iterators.end(), [](const TermIterator& a, 
const TermIterator& b) {
+        return a.docFreq() < b.docFreq();
+    });
+
+    if (iterators.size() == 1) {
+        _lead1 = iterators[0];
+    } else {
+        _lead1 = iterators[0];
+        _lead2 = iterators[1];
+        for (int32_t i = 2; i < _terms.size(); i++) {
+            _others.push_back(iterators[i]);
+        }
+    }
+
+    if (iterators.size() >= 2) {
+        int32_t little = iterators[0].docFreq();
+        int32_t big = iterators[iterators.size() - 1].docFreq();
+        if (little == 0) {
+            _useSkip = true;
+        } else if ((big / little) > _conjunction_ratio) {
+            _useSkip = true;
+        }
+    }
+}
+
+void ConjunctionQuery::search(roaring::Roaring& roaring) {
+    if (_lead1.isEmpty()) return;
+
+    if (_indexVersion == IndexVersion::kV0) {
+        search_by_bitmap(roaring);
+        return;
+    }
+
+    if (!_useSkip) {
+        search_by_bitmap(roaring);
+        return;
+    }
+
+    search_by_skiplist(roaring);
+}
+
+void ConjunctionQuery::search_by_bitmap(roaring::Roaring& roaring) {
+    auto func = [&roaring](const TermIterator& termDocs) {
+        roaring::Roaring result;
+        DocRange docRange;
+        while (termDocs.readRange(&docRange)) {
+            if (docRange.type_ == DocRangeType::kMany) {
+                result.addMany(docRange.doc_many_size_, 
docRange.doc_many->data());
+            } else {
+                result.addRange(docRange.doc_range.first, 
docRange.doc_range.second);
+            }
+        }
+        roaring &= result;
+    };
+
+    {
+        DocRange docRange;
+        while (_lead1.readRange(&docRange)) {
+            if (docRange.type_ == DocRangeType::kMany) {
+                roaring.addMany(docRange.doc_many_size_, 
docRange.doc_many->data());
+            } else {
+                roaring.addRange(docRange.doc_range.first, 
docRange.doc_range.second);
+            }
+        }
+    }
+
+    if (!_lead2.isEmpty()) {
+        func(_lead2);

Review Comment:
   Using goto is not recommended.



##########
be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h:
##########
@@ -0,0 +1,70 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "CLucene/index/IndexReader.h"
+#include "CLucene/index/IndexVersion.h"
+#include "CLucene/index/Term.h"
+#include "CLucene/search/query/TermIterator.h"
+#include "roaring/roaring.hh"
+
+CL_NS_USE(index)
+
+namespace doris {
+
+class ConjunctionQuery {
+public:
+    ConjunctionQuery(IndexReader* reader);
+    ~ConjunctionQuery();
+
+    void set_conjunction_ratio(int32_t conjunction_ratio) {
+        _conjunction_ratio = conjunction_ratio;
+    }
+
+    void add(const std::wstring& fieldName, const std::vector<std::wstring>& 
wterms);
+    void search(roaring::Roaring& roaring);
+
+private:
+    void search_by_bitmap(roaring::Roaring& roaring);
+
+    void search_by_skiplist(roaring::Roaring& roaring) {
+        int32_t doc = 0;
+        while ((doc = next_doc()) != INT32_MAX) {
+            roaring.add(doc);
+        }
+    }
+
+    int32_t next_doc() { return do_next(_lead1.nextDoc()); }
+
+    int32_t do_next(int32_t doc);
+
+private:
+    IndexReader* _reader = nullptr;
+    IndexVersion _indexVersion = IndexVersion::kV0;
+    int32_t _conjunction_ratio = 1000;
+    bool _useSkip = false;

Review Comment:
   _use_skiplist



##########
be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h:
##########
@@ -0,0 +1,70 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "CLucene/index/IndexReader.h"
+#include "CLucene/index/IndexVersion.h"
+#include "CLucene/index/Term.h"
+#include "CLucene/search/query/TermIterator.h"
+#include "roaring/roaring.hh"
+
+CL_NS_USE(index)
+
+namespace doris {
+
+class ConjunctionQuery {
+public:
+    ConjunctionQuery(IndexReader* reader);
+    ~ConjunctionQuery();
+
+    void set_conjunction_ratio(int32_t conjunction_ratio) {
+        _conjunction_ratio = conjunction_ratio;
+    }
+
+    void add(const std::wstring& fieldName, const std::vector<std::wstring>& 
wterms);
+    void search(roaring::Roaring& roaring);
+
+private:
+    void search_by_bitmap(roaring::Roaring& roaring);
+
+    void search_by_skiplist(roaring::Roaring& roaring) {
+        int32_t doc = 0;
+        while ((doc = next_doc()) != INT32_MAX) {
+            roaring.add(doc);
+        }
+    }
+
+    int32_t next_doc() { return do_next(_lead1.nextDoc()); }
+
+    int32_t do_next(int32_t doc);
+
+private:
+    IndexReader* _reader = nullptr;
+    IndexVersion _indexVersion = IndexVersion::kV0;
+    int32_t _conjunction_ratio = 1000;
+    bool _useSkip = false;
+
+    TermIterator _lead1;
+    TermIterator _lead2;
+    std::vector<TermIterator> _others;
+
+    std::vector<Term*> _terms;
+    std::vector<TermDocs*> _termDocs;

Review Comment:
   _term_docs



##########
be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp:
##########
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "conjunction_query.h"
+
+namespace doris {
+
+ConjunctionQuery::ConjunctionQuery(IndexReader* reader)
+        : _reader(reader), _indexVersion(reader->getIndexVersion()) {}
+
+ConjunctionQuery::~ConjunctionQuery() {
+    for (auto& term : _terms) {
+        if (term) {
+            _CLDELETE(term);
+        }
+    }
+    for (auto& termDoc : _termDocs) {
+        if (termDoc) {
+            _CLDELETE(termDoc);
+        }
+    }
+}
+
+void ConjunctionQuery::add(const std::wstring& fieldName, const 
std::vector<std::wstring>& wterms) {
+    if (wterms.size() < 1) {
+        _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size() 
< 1");
+    }
+
+    std::vector<TermIterator> iterators;
+    for (auto& wterm : wterms) {
+        Term* t = _CLNEW Term(fieldName.c_str(), wterm.c_str());
+        _terms.push_back(t);

Review Comment:
   can be merged to set _use_skiplist and just select 
search_by_bitmap/search_by_skiplist by _use_skiplist



##########
be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h:
##########
@@ -0,0 +1,70 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "CLucene/index/IndexReader.h"
+#include "CLucene/index/IndexVersion.h"
+#include "CLucene/index/Term.h"
+#include "CLucene/search/query/TermIterator.h"
+#include "roaring/roaring.hh"
+
+CL_NS_USE(index)
+
+namespace doris {
+
+class ConjunctionQuery {
+public:
+    ConjunctionQuery(IndexReader* reader);
+    ~ConjunctionQuery();
+
+    void set_conjunction_ratio(int32_t conjunction_ratio) {
+        _conjunction_ratio = conjunction_ratio;
+    }
+
+    void add(const std::wstring& fieldName, const std::vector<std::wstring>& 
wterms);
+    void search(roaring::Roaring& roaring);
+
+private:
+    void search_by_bitmap(roaring::Roaring& roaring);
+
+    void search_by_skiplist(roaring::Roaring& roaring) {

Review Comment:
   It's confusing to put the search_by_skiplist and next_doc implementations in the .h file.



##########
be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp:
##########
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "conjunction_query.h"
+
+namespace doris {
+
+ConjunctionQuery::ConjunctionQuery(IndexReader* reader)
+        : _reader(reader), _indexVersion(reader->getIndexVersion()) {}
+
+ConjunctionQuery::~ConjunctionQuery() {
+    for (auto& term : _terms) {
+        if (term) {
+            _CLDELETE(term);
+        }
+    }
+    for (auto& termDoc : _termDocs) {
+        if (termDoc) {
+            _CLDELETE(termDoc);
+        }
+    }
+}
+

Review Comment:
   combine to 'little == 0 || (big / little) > _conjunction_ratio'



##########
be/src/olap/rowset/segment_v2/inverted_index_reader.h:
##########
@@ -130,6 +133,20 @@ class FullTextIndexReader : public InvertedIndexReader {
     }
 
     InvertedIndexReaderType type() override;
+
+private:
+    Status index_search(OlapReaderStatistics* stats, InvertedIndexQueryType 
query_type,
+                        const IndexSearcherPtr& index_searcher, bool& 
null_bitmap_already_read,
+                        const std::unique_ptr<lucene::search::Query>& query,
+                        const std::shared_ptr<roaring::Roaring>& 
term_match_bitmap);
+
+    Status match_all_search(OlapReaderStatistics* stats, RuntimeState* 
runtime_state,

Review Comment:
   The function names index_search and match_all_search are not symmetrical. 
Suggestion: normal_index_search, match_all_index_search



##########
be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp:
##########
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "conjunction_query.h"
+
+namespace doris {
+
+ConjunctionQuery::ConjunctionQuery(IndexReader* reader)
+        : _reader(reader), _indexVersion(reader->getIndexVersion()) {}
+
+ConjunctionQuery::~ConjunctionQuery() {
+    for (auto& term : _terms) {
+        if (term) {
+            _CLDELETE(term);
+        }
+    }
+    for (auto& termDoc : _termDocs) {
+        if (termDoc) {
+            _CLDELETE(termDoc);
+        }
+    }
+}
+
+void ConjunctionQuery::add(const std::wstring& fieldName, const 
std::vector<std::wstring>& wterms) {
+    if (wterms.size() < 1) {
+        _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size() 
< 1");
+    }
+
+    std::vector<TermIterator> iterators;
+    for (auto& wterm : wterms) {
+        Term* t = _CLNEW Term(fieldName.c_str(), wterm.c_str());
+        _terms.push_back(t);
+        TermDocs* termDoc = _reader->termDocs(t);
+        _termDocs.push_back(termDoc);
+        iterators.emplace_back(termDoc);
+    }
+
+    std::sort(iterators.begin(), iterators.end(), [](const TermIterator& a, 
const TermIterator& b) {
+        return a.docFreq() < b.docFreq();
+    });
+
+    if (iterators.size() == 1) {
+        _lead1 = iterators[0];
+    } else {
+        _lead1 = iterators[0];
+        _lead2 = iterators[1];
+        for (int32_t i = 2; i < _terms.size(); i++) {
+            _others.push_back(iterators[i]);
+        }
+    }
+
+    if (iterators.size() >= 2) {
+        int32_t little = iterators[0].docFreq();
+        int32_t big = iterators[iterators.size() - 1].docFreq();
+        if (little == 0) {
+            _useSkip = true;
+        } else if ((big / little) > _conjunction_ratio) {
+            _useSkip = true;
+        }
+    }
+}
+
+void ConjunctionQuery::search(roaring::Roaring& roaring) {
+    if (_lead1.isEmpty()) return;
+
+    if (_indexVersion == IndexVersion::kV0) {
+        search_by_bitmap(roaring);
+        return;
+    }
+
+    if (!_useSkip) {
+        search_by_bitmap(roaring);
+        return;
+    }
+
+    search_by_skiplist(roaring);
+}
+
+void ConjunctionQuery::search_by_bitmap(roaring::Roaring& roaring) {
+    auto func = [&roaring](const TermIterator& termDocs) {
+        roaring::Roaring result;
+        DocRange docRange;
+        while (termDocs.readRange(&docRange)) {
+            if (docRange.type_ == DocRangeType::kMany) {
+                result.addMany(docRange.doc_many_size_, 
docRange.doc_many->data());

Review Comment:
   while (true)



##########
be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp:
##########
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "conjunction_query.h"
+
+namespace doris {
+
+ConjunctionQuery::ConjunctionQuery(IndexReader* reader)
+        : _reader(reader), _indexVersion(reader->getIndexVersion()) {}
+
+ConjunctionQuery::~ConjunctionQuery() {
+    for (auto& term : _terms) {
+        if (term) {
+            _CLDELETE(term);
+        }
+    }
+    for (auto& termDoc : _termDocs) {
+        if (termDoc) {
+            _CLDELETE(termDoc);
+        }
+    }
+}
+
+void ConjunctionQuery::add(const std::wstring& fieldName, const 
std::vector<std::wstring>& wterms) {
+    if (wterms.size() < 1) {
+        _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size() 
< 1");
+    }
+
+    std::vector<TermIterator> iterators;
+    for (auto& wterm : wterms) {
+        Term* t = _CLNEW Term(fieldName.c_str(), wterm.c_str());
+        _terms.push_back(t);
+        TermDocs* termDoc = _reader->termDocs(t);
+        _termDocs.push_back(termDoc);
+        iterators.emplace_back(termDoc);
+    }
+
+    std::sort(iterators.begin(), iterators.end(), [](const TermIterator& a, 
const TermIterator& b) {
+        return a.docFreq() < b.docFreq();
+    });
+
+    if (iterators.size() == 1) {
+        _lead1 = iterators[0];
+    } else {
+        _lead1 = iterators[0];
+        _lead2 = iterators[1];
+        for (int32_t i = 2; i < _terms.size(); i++) {
+            _others.push_back(iterators[i]);
+        }
+    }
+
+    if (iterators.size() >= 2) {
+        int32_t little = iterators[0].docFreq();
+        int32_t big = iterators[iterators.size() - 1].docFreq();
+        if (little == 0) {
+            _useSkip = true;
+        } else if ((big / little) > _conjunction_ratio) {
+            _useSkip = true;
+        }
+    }
+}
+
+void ConjunctionQuery::search(roaring::Roaring& roaring) {
+    if (_lead1.isEmpty()) return;
+
+    if (_indexVersion == IndexVersion::kV0) {
+        search_by_bitmap(roaring);
+        return;
+    }
+
+    if (!_useSkip) {
+        search_by_bitmap(roaring);
+        return;
+    }
+
+    search_by_skiplist(roaring);
+}
+
+void ConjunctionQuery::search_by_bitmap(roaring::Roaring& roaring) {
+    auto func = [&roaring](const TermIterator& termDocs) {
+        roaring::Roaring result;
+        DocRange docRange;
+        while (termDocs.readRange(&docRange)) {

Review Comment:
   Add a comment explaining this algorithm.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to