This is an automated email from the ASF dual-hosted git repository.
yangsiyu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 0eaac8c104f [fix](inverted index) implement phrase-level BM25 scoring
with phrase frequency (#60331)
0eaac8c104f is described below
commit 0eaac8c104f41700767201fc3fbca33a34600f8e
Author: zzzxl <[email protected]>
AuthorDate: Mon Mar 2 16:34:28 2026 +0800
[fix](inverted index) implement phrase-level BM25 scoring with phrase
frequency (#60331)
---
.../inverted_index/query/phrase_query.cpp | 52 +-
.../segment_v2/inverted_index/query/phrase_query.h | 3 +-
.../query/phrase_query/exact_phrase_matcher.cpp | 9 +
.../query/phrase_query/exact_phrase_matcher.h | 1 +
.../phrase_query/ordered_sloppy_phrase_matcher.cpp | 13 +
.../phrase_query/ordered_sloppy_phrase_matcher.h | 2 +
.../query/phrase_query/sloppy_phrase_matcher.cpp | 13 +
.../query/phrase_query/sloppy_phrase_matcher.h | 2 +
.../inverted_index/util/docid_set_iterator.h | 7 +
.../segment_v2/inverted_index/util/mock_iterator.h | 2 +
.../inverted_index/util/union_term_iterator.h | 6 +-
.../query/phrase_query/phrase_freq_test.cpp | 559 +++++++++++++++++++++
12 files changed, 642 insertions(+), 27 deletions(-)
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
index 9494a3571d2..0e9cbd1fb8c 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
@@ -24,7 +24,6 @@
#include "CLucene/index/Terms.h"
#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"
#include "olap/rowset/segment_v2/inverted_index/query/query.h"
-#include "olap/rowset/segment_v2/inverted_index/query/query_helper.h"
#include "olap/rowset/segment_v2/inverted_index/util/term_position_iterator.h"
namespace doris::segment_v2 {
@@ -138,29 +137,16 @@ void
PhraseQuery::init_ordered_sloppy_phrase_matcher(const InvertedIndexQueryInf
}
void PhraseQuery::init_similarities(const std::wstring& field_name, bool
is_similarity) {
- // TODO: Current implementation - computes BM25 scores separately for each
term
- // Note: This approach is suitable for TermQuery but does not conform to
BM25 specification for PhraseQuery
- // BM25 phrase query specification requires:
- // idf component = sum of idf values for all terms
- // tf component = phrase frequency (number of times entire phrase
appears in document)
- // doc_length = total document length
- //
- // Future optimization direction:
- // 1. Shift to unified phrase scoring: calculate sum of idf for all
terms as combined idf
- // 2. Use phrase frequency instead of individual term frequencies
- // 3. Maintain document length normalization
- // 4. Refactor to create a single Similarity object handling the entire
phrase
if (is_similarity) {
- _similarities.resize(_iterators.size());
- for (size_t i = 0; i < _iterators.size(); i++) {
- const auto& iter = _iterators[i];
+ std::vector<std::wstring> all_terms;
+ for (const auto& iter : _iterators) {
if (std::holds_alternative<TermPositionsIterPtr>(iter)) {
const auto& term_iter = std::get<TermPositionsIterPtr>(iter);
- auto similarity = std::make_unique<BM25Similarity>();
- similarity->for_one_term(_context, field_name,
term_iter->term());
- _similarities[i] = std::move(similarity);
+ all_terms.push_back(term_iter->term());
}
}
+ _phrase_similarity = std::make_unique<BM25Similarity>();
+ _phrase_similarity->for_terms(_context, field_name, all_terms);
}
}
@@ -176,13 +162,21 @@ void PhraseQuery::search(roaring::Roaring& roaring) {
void PhraseQuery::search_by_skiplist(roaring::Roaring& roaring) {
int32_t doc = 0;
while ((doc = do_next(visit_node(*_lead1, NextDoc {}))) != INT32_MAX) {
- if (!matches(doc)) {
- continue;
- }
- roaring.add(doc);
+ if (_phrase_similarity) {
+ float phrase_freq = count_phrase_freq(doc);
+ if (phrase_freq <= 0.0F) {
+ continue;
+ }
+ roaring.add(doc);
+ int32_t norm = visit_node(*_lead1, Norm {});
+ float score = _phrase_similarity->score(phrase_freq,
static_cast<int64_t>(norm));
- if (!_similarities.empty()) {
- QueryHelper::collect(_context, _similarities, _iterators, doc);
+ _context->collection_similarity->collect(doc, score);
+ } else {
+ if (!matches(doc)) {
+ continue;
+ }
+ roaring.add(doc);
}
}
}
@@ -230,6 +224,14 @@ bool PhraseQuery::matches(int32_t doc) {
});
}
+// Adds up the phrase frequency that each matcher in _matchers reports for `doc`.
+float PhraseQuery::count_phrase_freq(int32_t doc) {
+    float accumulated = 0.0F;
+    for (auto& matcher : _matchers) {
+        // Every matcher alternative exposes phrase_freq(doc); dispatch on the held type.
+        auto freq_of = [&doc](auto&& m) -> float { return m.phrase_freq(doc); };
+        accumulated += std::visit(freq_of, matcher);
+    }
+    return accumulated;
+}
+
void PhraseQuery::parser_slop(std::string& query, InvertedIndexQueryInfo&
query_info) {
auto is_digits = [](const std::string_view& str) {
return std::all_of(str.begin(), str.end(), [](unsigned char c) {
return std::isdigit(c); });
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
index 2251fce7740..aaf21c8b199 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
@@ -51,6 +51,7 @@ private:
int32_t do_next(int32_t doc);
bool matches(int32_t doc);
+ float count_phrase_freq(int32_t doc);
void init_exact_phrase_matcher(const InvertedIndexQueryInfo& query_info,
bool is_similarity);
void init_sloppy_phrase_matcher(const InvertedIndexQueryInfo& query_info,
bool is_similarity);
@@ -78,7 +79,7 @@ private:
std::vector<Matcher> _matchers;
- std::vector<SimilarityPtr> _similarities;
+ SimilarityPtr _phrase_similarity;
};
} // namespace doris::segment_v2
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/exact_phrase_matcher.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/exact_phrase_matcher.cpp
index e88f2d93652..db4ec1708dd 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/exact_phrase_matcher.cpp
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/exact_phrase_matcher.cpp
@@ -89,4 +89,13 @@ bool
ExactPhraseMatcher::advance_position(PostingsAndPosition& posting, int32_t
return true;
}
+// Returns how many matches next_match() yields for `doc` after reset(doc).
+float ExactPhraseMatcher::phrase_freq(int32_t doc) {
+    reset(doc);
+    float occurrences = 0.0F;
+    // Each successful next_match() contributes exactly one occurrence.
+    for (bool found = next_match(); found; found = next_match()) {
+        occurrences += 1.0F;
+    }
+    return occurrences;
+}
+
} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/exact_phrase_matcher.h
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/exact_phrase_matcher.h
index c6e6c631ab4..b1e5c16b59e 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/exact_phrase_matcher.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/exact_phrase_matcher.h
@@ -27,6 +27,7 @@ public:
void reset(int32_t doc);
bool next_match();
+ float phrase_freq(int32_t doc);
private:
bool advance_position(PostingsAndPosition& posting, int32_t target);
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/ordered_sloppy_phrase_matcher.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/ordered_sloppy_phrase_matcher.cpp
index 64cbd855cc9..97a03fc8af7 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/ordered_sloppy_phrase_matcher.cpp
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/ordered_sloppy_phrase_matcher.cpp
@@ -81,4 +81,17 @@ bool
OrderedSloppyPhraseMatcher::advance_position(PostingsAndPosition& posting,
return true;
}
+// Weight derived from the current _match_width: 1 / (1 + _match_width).
+float OrderedSloppyPhraseMatcher::sloppy_weight() const {
+    const auto width = static_cast<float>(_match_width);
+    return 1.0F / (1.0F + width);
+}
+
+// Sums sloppy_weight() over every match next_match() yields for `doc`.
+float OrderedSloppyPhraseMatcher::phrase_freq(int32_t doc) {
+    reset(doc);
+    float weighted_total = 0.0F;
+    // The weight is read right after each successful next_match().
+    for (bool found = next_match(); found; found = next_match()) {
+        weighted_total += sloppy_weight();
+    }
+    return weighted_total;
+}
+
} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/ordered_sloppy_phrase_matcher.h
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/ordered_sloppy_phrase_matcher.h
index 7ac8b748890..7573d5cae59 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/ordered_sloppy_phrase_matcher.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/ordered_sloppy_phrase_matcher.h
@@ -27,6 +27,8 @@ public:
void reset(int32_t doc);
bool next_match();
+ float sloppy_weight() const;
+ float phrase_freq(int32_t doc);
private:
bool stretch_to_order(PostingsAndPosition* prev_posting);
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/sloppy_phrase_matcher.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/sloppy_phrase_matcher.cpp
index 342e04b1490..e98ad8b9f90 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/sloppy_phrase_matcher.cpp
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/sloppy_phrase_matcher.cpp
@@ -315,5 +315,18 @@ bool SloppyPhraseMatcher::init_complex() {
return true;
}
+// Weight derived from the current _match_length: 1 / (1 + _match_length).
+float SloppyPhraseMatcher::sloppy_weight() const {
+    const auto length = static_cast<float>(_match_length);
+    return 1.0F / (1.0F + length);
+}
+
+// Sums sloppy_weight() over every match next_match() yields for `doc`.
+float SloppyPhraseMatcher::phrase_freq(int32_t doc) {
+    reset(doc);
+    float weighted_total = 0.0F;
+    // The weight is read right after each successful next_match().
+    for (bool found = next_match(); found; found = next_match()) {
+        weighted_total += sloppy_weight();
+    }
+    return weighted_total;
+}
+
#include "common/compile_check_end.h"
} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/sloppy_phrase_matcher.h
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/sloppy_phrase_matcher.h
index d008389d6fc..7e2c360692a 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/sloppy_phrase_matcher.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query/sloppy_phrase_matcher.h
@@ -32,6 +32,8 @@ public:
void reset(int32_t doc);
bool next_match();
bool advance_rpts(PhrasePositions* pp);
+ float sloppy_weight() const;
+ float phrase_freq(int32_t doc);
private:
bool advance_pp(PhrasePositions* pp);
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/util/docid_set_iterator.h
b/be/src/olap/rowset/segment_v2/inverted_index/util/docid_set_iterator.h
index 7ebda6cbc82..26fbd129d73 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/util/docid_set_iterator.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/util/docid_set_iterator.h
@@ -79,5 +79,12 @@ struct NextPosition {
}
};
+// Visitor functor: forwards to norm() on whichever iterator pointer it is handed.
+struct Norm {
+    template <typename IterPtr>
+    int32_t operator()(const IterPtr& it) const {
+        return it->norm();
+    }
+};
+
#include "common/compile_check_end.h"
} // namespace doris::segment_v2
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/util/mock_iterator.h
b/be/src/olap/rowset/segment_v2/inverted_index/util/mock_iterator.h
index 5e3540b7803..aacfb29ffc1 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/util/mock_iterator.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/util/mock_iterator.h
@@ -89,6 +89,8 @@ public:
return current_doc->second[pos_idx++];
}
+ int32_t norm() const MOCK_DEFINE(override) { return 1; } // Test stub: always reports a norm of 1.
+
bool read_range(DocRange* doc_range) const MOCK_DEFINE(override) {
if (!doc_range) {
return false;
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/util/union_term_iterator.h
b/be/src/olap/rowset/segment_v2/inverted_index/util/union_term_iterator.h
index 0ed9ec2f539..73ecc87268d 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/util/union_term_iterator.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/util/union_term_iterator.h
@@ -129,7 +129,6 @@ public:
top = _docs_queue->update_top();
} while (top->doc_id() == doc);
return top->doc_id();
- return 0;
}
int32_t advance(int32_t target) const {
@@ -143,6 +142,11 @@ public:
int32_t doc_freq() const { return _cost; }
+ int32_t norm() const { // Always throws NOT_IMPLEMENTED_ERROR; this iterator does not support scoring.
+ throw Exception(ErrorCode::NOT_IMPLEMENTED_ERROR,
+ "UnionTermIterator does not support scoring");
+ }
+
private:
int32_t _cost = 0;
int32_t pos_queue_doc = -2;
diff --git
a/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query/phrase_freq_test.cpp
b/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query/phrase_freq_test.cpp
new file mode 100644
index 00000000000..265cc0fef83
--- /dev/null
+++
b/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query/phrase_freq_test.cpp
@@ -0,0 +1,559 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "olap/rowset/segment_v2/inverted_index/query/phrase_query/exact_phrase_matcher.h"
+#include "olap/rowset/segment_v2/inverted_index/query/phrase_query/ordered_sloppy_phrase_matcher.h"
+#include "olap/rowset/segment_v2/inverted_index/query/phrase_query/sloppy_phrase_matcher.h"
+#include "olap/rowset/segment_v2/inverted_index/util/docid_set_iterator.h"
+#include "olap/rowset/segment_v2/inverted_index/util/mock_iterator.h"
+#include "olap/rowset/segment_v2/inverted_index/util/union_term_iterator.h"
+
+namespace doris::segment_v2 {
+
+using namespace inverted_index;
+
+// Shared fixture for the phrase-frequency tests.
+class PhraseFreqTest : public ::testing::Test {
+protected:
+    // Builds a MockIterator loaded with `postings` (the tests key it by doc id and
+    // list term positions per doc) and returns it as a DISI.
+    DISI create_mock_disi(std::map<int32_t, std::vector<int32_t>> postings) {
+        auto iterator = std::make_shared<MockIterator>();
+        iterator->set_postings(postings);
+        return iterator;
+    }
+};
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_SingleMatch) {
+    // Doc 1 holds the two terms at adjacent positions 0 and 1: one occurrence.
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {1}}});
+
+    std::vector<PostingsAndPosition> entries;
+    entries.emplace_back(first, 0);
+    entries.emplace_back(second, 1);
+
+    ExactPhraseMatcher matcher(std::move(entries));
+    const float actual = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(actual, 1.0F);
+}
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_MultipleMatches) {
+    // The phrase occurs at positions (0,1) and (2,3) of doc 1: two occurrences.
+    auto first = create_mock_disi({{1, {0, 2}}});
+    auto second = create_mock_disi({{1, {1, 3}}});
+
+    std::vector<PostingsAndPosition> entries;
+    entries.emplace_back(first, 0);
+    entries.emplace_back(second, 1);
+
+    ExactPhraseMatcher matcher(std::move(entries));
+    const float actual = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(actual, 2.0F);
+}
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_NoMatch) {
+    // Second term sits at position 2 instead of 1, so the exact phrase never occurs.
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {2}}});
+
+    std::vector<PostingsAndPosition> entries;
+    entries.emplace_back(first, 0);
+    entries.emplace_back(second, 1);
+
+    ExactPhraseMatcher matcher(std::move(entries));
+    const float actual = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(actual, 0.0F);
+}
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_ThreeTermPhrase) {
+    // Three terms at consecutive positions 1, 2, 3 form a single occurrence.
+    auto first = create_mock_disi({{1, {1}}});
+    auto second = create_mock_disi({{1, {2}}});
+    auto third = create_mock_disi({{1, {3}}});
+
+    std::vector<PostingsAndPosition> entries;
+    entries.emplace_back(first, 0);
+    entries.emplace_back(second, 1);
+    entries.emplace_back(third, 2);
+
+    ExactPhraseMatcher matcher(std::move(entries));
+    const float actual = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(actual, 1.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_ExactMatch) {
+    // Adjacent terms under slop 2: the single match carries full weight 1.0.
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {1}}});
+
+    std::vector<PostingsAndPosition> entries;
+    entries.emplace_back(first, 0);
+    entries.emplace_back(second, 1);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(entries), 2);
+    const float actual = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(actual, 1.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_WithSlop) {
+    // term1 at 0, term2 at 2: term2 is one position past where an exact match
+    // would put it, so match_width = 1 (within slop 2) and the single match
+    // weighs 1 / (1 + 1) = 0.5.
+    auto disi1 = create_mock_disi({{1, {0}}});
+    auto disi2 = create_mock_disi({{1, {2}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(postings), 2);
+    float freq = matcher.phrase_freq(1);
+
+    // Pin the exact value instead of only checking the (0, 1] range.
+    EXPECT_FLOAT_EQ(freq, 0.5F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_SloppyWeight_AfterMatch) {
+    // sloppy_weight() is read once next_match() has succeeded.
+    // term1 at 0 and term2 at 1 are an exact match, i.e. match_width = 0.
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {1}}});
+
+    std::vector<PostingsAndPosition> entries;
+    entries.emplace_back(first, 0);
+    entries.emplace_back(second, 1);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(entries), 5);
+    matcher.reset(1);
+    ASSERT_TRUE(matcher.next_match());
+
+    // weight = 1 / (1 + 0) = 1.0
+    const float observed_weight = matcher.sloppy_weight();
+    EXPECT_FLOAT_EQ(observed_weight, 1.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_SloppyWeight_WithGap) {
+    // term1 at 0 and term2 at 3 leave a gap of 2 between the terms,
+    // i.e. match_width = 2.
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {3}}});
+
+    std::vector<PostingsAndPosition> entries;
+    entries.emplace_back(first, 0);
+    entries.emplace_back(second, 1);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(entries), 5);
+    matcher.reset(1);
+    ASSERT_TRUE(matcher.next_match());
+
+    // weight = 1 / (1 + 2) = 0.333...
+    const float observed_weight = matcher.sloppy_weight();
+    EXPECT_FLOAT_EQ(observed_weight, 1.0F / 3.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_ExceedsSlop) {
+    // term2 at position 5 is too far from term1 at 0 for slop 2: no match.
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {5}}});
+
+    std::vector<PostingsAndPosition> entries;
+    entries.emplace_back(first, 0);
+    entries.emplace_back(second, 1);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(entries), 2);
+    const float actual = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(actual, 0.0F);
+}
+
+TEST_F(PhraseFreqTest, SloppyPhraseMatcher_ExactMatch) {
+    // "big red" at adjacent positions must yield a positive frequency under slop 2.
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {1}}});
+
+    std::vector<PostingsAndFreq> entries;
+    entries.emplace_back(first, 0, std::vector<std::string> {"big"});
+    entries.emplace_back(second, 1, std::vector<std::string> {"red"});
+
+    SloppyPhraseMatcher matcher(entries, 2);
+    const float actual = matcher.phrase_freq(1);
+
+    EXPECT_GT(actual, 0.0F);
+}
+
+TEST_F(PhraseFreqTest, SloppyPhraseMatcher_SloppyWeight_AfterMatch) {
+    // sloppy_weight() is read once next_match() has succeeded; it must lie in (0, 1].
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {1}}});
+
+    std::vector<PostingsAndFreq> entries;
+    entries.emplace_back(first, 0, std::vector<std::string> {"big"});
+    entries.emplace_back(second, 1, std::vector<std::string> {"red"});
+
+    SloppyPhraseMatcher matcher(entries, 5);
+    matcher.reset(1);
+    ASSERT_TRUE(matcher.next_match());
+
+    const float observed_weight = matcher.sloppy_weight();
+    EXPECT_GT(observed_weight, 0.0F);
+    EXPECT_LE(observed_weight, 1.0F);
+}
+
+TEST_F(PhraseFreqTest, SloppyPhraseMatcher_ReorderedTerms) {
+    // The document has the terms in reverse order ("red big" for phrase "big red").
+    // NOTE(review): assumes the matcher follows Lucene's sloppy-phrase semantics,
+    // where swapping two adjacent terms costs 2 edits and therefore fits within
+    // slop 3 - confirm against SloppyPhraseMatcher::next_match().
+    auto disi1 = create_mock_disi({{1, {1}}});
+    auto disi2 = create_mock_disi({{1, {0}}});
+
+    std::vector<PostingsAndFreq> postings;
+    postings.emplace_back(disi1, 0, std::vector<std::string> {"big"});
+    postings.emplace_back(disi2, 1, std::vector<std::string> {"red"});
+
+    SloppyPhraseMatcher matcher(postings, 3);
+    float freq = matcher.phrase_freq(1);
+
+    // `freq >= 0` can never fail because the frequency is a sum of positive
+    // weights; require an actual match, bounded by one full-weight occurrence.
+    EXPECT_GT(freq, 0.0F);
+    EXPECT_LE(freq, 1.0F);
+}
+
+TEST_F(PhraseFreqTest, NormVisitor_MockIterator) {
+    // Calls norm() on the mock directly; the stub reports 1.
+    auto iterator = std::make_shared<MockIterator>();
+    iterator->set_postings({{1, {0, 1, 2}}});
+
+    const int32_t reported = iterator->norm();
+    EXPECT_EQ(reported, 1);
+}
+
+TEST_F(PhraseFreqTest, NormVisitor_WithDISI) {
+    // Routes the Norm visitor through visit_node on a DISI wrapping the mock.
+    auto node = create_mock_disi({{1, {0}}});
+
+    const int32_t reported = visit_node(node, Norm {});
+    EXPECT_EQ(reported, 1);
+}
+
+TEST_F(PhraseFreqTest, UnionTermIterator_NormThrowsException) {
+    // norm() on a UnionTermIterator must raise Exception.
+    auto left = std::make_shared<MockIterator>();
+    left->set_postings({{1, {0}}});
+    auto right = std::make_shared<MockIterator>();
+    right->set_postings({{1, {1}}});
+
+    std::vector<std::shared_ptr<MockIterator>> members = {left, right};
+    UnionTermIterator<MockIterator> merged(members);
+
+    EXPECT_THROW(merged.norm(), Exception);
+}
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_MultipleDocuments) {
+    // Doc 1: a single occurrence at positions (0,1).
+    {
+        auto lead = create_mock_disi({{1, {0}}});
+        auto follow = create_mock_disi({{1, {1}}});
+        std::vector<PostingsAndPosition> entries;
+        entries.emplace_back(lead, 0);
+        entries.emplace_back(follow, 1);
+        ExactPhraseMatcher matcher(std::move(entries));
+        EXPECT_FLOAT_EQ(matcher.phrase_freq(1), 1.0F);
+    }
+
+    // Doc 3: two occurrences at positions (0,1) and (2,3).
+    {
+        auto lead = create_mock_disi({{3, {0, 2}}});
+        auto follow = create_mock_disi({{3, {1, 3}}});
+        std::vector<PostingsAndPosition> entries;
+        entries.emplace_back(lead, 0);
+        entries.emplace_back(follow, 1);
+        ExactPhraseMatcher matcher(std::move(entries));
+        EXPECT_FLOAT_EQ(matcher.phrase_freq(3), 2.0F);
+    }
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_MultipleMatches) {
+    // Each term appears twice in doc 1; under slop 3 the frequency must be positive.
+    auto first = create_mock_disi({{1, {0, 3}}});
+    auto second = create_mock_disi({{1, {2, 5}}});
+
+    std::vector<PostingsAndPosition> entries;
+    entries.emplace_back(first, 0);
+    entries.emplace_back(second, 1);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(entries), 3);
+    const float actual = matcher.phrase_freq(1);
+
+    EXPECT_GT(actual, 0.0F);
+}
+
+// ==================== Additional Coverage Tests ====================
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_FourTermPhrase) {
+    // Longer phrase: four terms at consecutive positions 0..3 match once.
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {1}}});
+    auto third = create_mock_disi({{1, {2}}});
+    auto fourth = create_mock_disi({{1, {3}}});
+
+    std::vector<PostingsAndPosition> entries;
+    entries.emplace_back(first, 0);
+    entries.emplace_back(second, 1);
+    entries.emplace_back(third, 2);
+    entries.emplace_back(fourth, 3);
+
+    ExactPhraseMatcher matcher(std::move(entries));
+    const float actual = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(actual, 1.0F);
+}
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_PartialMatch) {
+    // The first two terms line up, but the third is at position 5 rather than 2,
+    // so the full phrase never occurs.
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {1}}});
+    auto third = create_mock_disi({{1, {5}}});
+
+    std::vector<PostingsAndPosition> entries;
+    entries.emplace_back(first, 0);
+    entries.emplace_back(second, 1);
+    entries.emplace_back(third, 2);
+
+    ExactPhraseMatcher matcher(std::move(entries));
+    const float actual = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(actual, 0.0F);
+}
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_OverlappingMatches) {
+    // Document text "a b a b": phrase "a b" occurs at positions (0,1) and (2,3).
+    // The two occurrences share no position (they are back-to-back, not truly
+    // overlapping), and each one must be counted exactly once.
+    auto disi1 = create_mock_disi({{1, {0, 2}}});
+    auto disi2 = create_mock_disi({{1, {1, 3}}});
+
+    std::vector<PostingsAndPosition> postings;
+    postings.emplace_back(disi1, 0);
+    postings.emplace_back(disi2, 1);
+
+    ExactPhraseMatcher matcher(std::move(postings));
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(freq, 2.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_ThreeTerms) {
+    // Three in-order terms at positions 0, 2, 4 must still match under slop 3.
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {2}}});
+    auto third = create_mock_disi({{1, {4}}});
+
+    std::vector<PostingsAndPosition> entries;
+    entries.emplace_back(first, 0);
+    entries.emplace_back(second, 1);
+    entries.emplace_back(third, 2);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(entries), 3);
+    const float actual = matcher.phrase_freq(1);
+
+    EXPECT_GT(actual, 0.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_ThreeTerms_ExceedsSlop) {
+    // Terms at positions 0, 5, 10 are spread too widely for slop 2: no match.
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {5}}});
+    auto third = create_mock_disi({{1, {10}}});
+
+    std::vector<PostingsAndPosition> entries;
+    entries.emplace_back(first, 0);
+    entries.emplace_back(second, 1);
+    entries.emplace_back(third, 2);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(entries), 2);
+    const float actual = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(actual, 0.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_PhraseFreqAccumulation) {
+    // Two matches with different weights are summed:
+    //   positions (0, 1): match_width = 0 -> weight 1.0
+    //   positions (3, 5): match_width = 1 -> weight 0.5
+    auto first = create_mock_disi({{1, {0, 3}}});
+    auto second = create_mock_disi({{1, {1, 5}}});
+
+    std::vector<PostingsAndPosition> entries;
+    entries.emplace_back(first, 0);
+    entries.emplace_back(second, 1);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(entries), 5);
+    const float actual = matcher.phrase_freq(1);
+
+    // 1.0 + 0.5 = 1.5
+    EXPECT_FLOAT_EQ(actual, 1.5F);
+}
+
+TEST_F(PhraseFreqTest, SloppyPhraseMatcher_NoMatch) {
+    // Positions 0 and 100 are far beyond slop 2: frequency stays at zero.
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {100}}});
+
+    std::vector<PostingsAndFreq> entries;
+    entries.emplace_back(first, 0, std::vector<std::string> {"hello"});
+    entries.emplace_back(second, 1, std::vector<std::string> {"world"});
+
+    SloppyPhraseMatcher matcher(entries, 2);
+    const float actual = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(actual, 0.0F);
+}
+
+TEST_F(PhraseFreqTest, SloppyPhraseMatcher_ThreeTerms) {
+    // Three terms at consecutive positions must produce a positive frequency.
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {1}}});
+    auto third = create_mock_disi({{1, {2}}});
+
+    std::vector<PostingsAndFreq> entries;
+    entries.emplace_back(first, 0, std::vector<std::string> {"the"});
+    entries.emplace_back(second, 1, std::vector<std::string> {"quick"});
+    entries.emplace_back(third, 2, std::vector<std::string> {"fox"});
+
+    SloppyPhraseMatcher matcher(entries, 3);
+    const float actual = matcher.phrase_freq(1);
+
+    EXPECT_GT(actual, 0.0F);
+}
+
+TEST_F(PhraseFreqTest, SloppyPhraseMatcher_MultipleMatches) {
+    // Document with phrase appearing twice
+    auto disi1 = create_mock_disi({{1, {0, 5}}});
+    auto disi2 = create_mock_disi({{1, {1, 6}}});
+
+    std::vector<PostingsAndFreq> postings;
+    postings.emplace_back(disi1, 0, std::vector<std::string> {"hello"});
+    postings.emplace_back(disi2, 1, std::vector<std::string> {"world"});
+
+    SloppyPhraseMatcher matcher(postings, 2);
+    float freq = matcher.phrase_freq(1);
+
+    EXPECT_GT(freq, 1.0F); // Should have accumulated weight from multiple matches
+}
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_Matches_Consistency) {
+    // matches() and phrase_freq() must agree that doc 1 contains the phrase.
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {1}}});
+
+    {
+        std::vector<PostingsAndPosition> entries;
+        entries.emplace_back(first, 0);
+        entries.emplace_back(second, 1);
+        ExactPhraseMatcher matcher(std::move(entries));
+        EXPECT_TRUE(matcher.matches(1));
+    }
+
+    // Fresh iterators for the second matcher.
+    first = create_mock_disi({{1, {0}}});
+    second = create_mock_disi({{1, {1}}});
+
+    {
+        std::vector<PostingsAndPosition> entries;
+        entries.emplace_back(first, 0);
+        entries.emplace_back(second, 1);
+        ExactPhraseMatcher matcher(std::move(entries));
+        EXPECT_GT(matcher.phrase_freq(1), 0.0F);
+    }
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_Matches_Consistency) {
+    // matches() and phrase_freq() must agree for terms at positions 0 and 2 (slop 3).
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {2}}});
+
+    {
+        std::vector<PostingsAndPosition> entries;
+        entries.emplace_back(first, 0);
+        entries.emplace_back(second, 1);
+        OrderedSloppyPhraseMatcher matcher(std::move(entries), 3);
+        EXPECT_TRUE(matcher.matches(1));
+    }
+
+    // Fresh iterators for the second matcher.
+    first = create_mock_disi({{1, {0}}});
+    second = create_mock_disi({{1, {2}}});
+
+    {
+        std::vector<PostingsAndPosition> entries;
+        entries.emplace_back(first, 0);
+        entries.emplace_back(second, 1);
+        OrderedSloppyPhraseMatcher matcher(std::move(entries), 3);
+        EXPECT_GT(matcher.phrase_freq(1), 0.0F);
+    }
+}
+
+TEST_F(PhraseFreqTest, SloppyPhraseMatcher_Matches_Consistency) {
+    // matches() and phrase_freq() must agree for adjacent terms "a b" (slop 2).
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {1}}});
+
+    {
+        std::vector<PostingsAndFreq> entries;
+        entries.emplace_back(first, 0, std::vector<std::string> {"a"});
+        entries.emplace_back(second, 1, std::vector<std::string> {"b"});
+        SloppyPhraseMatcher matcher(entries, 2);
+        EXPECT_TRUE(matcher.matches(1));
+    }
+
+    // Fresh iterators for the second matcher.
+    first = create_mock_disi({{1, {0}}});
+    second = create_mock_disi({{1, {1}}});
+
+    {
+        std::vector<PostingsAndFreq> entries;
+        entries.emplace_back(first, 0, std::vector<std::string> {"a"});
+        entries.emplace_back(second, 1, std::vector<std::string> {"b"});
+        SloppyPhraseMatcher matcher(entries, 2);
+        EXPECT_GT(matcher.phrase_freq(1), 0.0F);
+    }
+}
+
+TEST_F(PhraseFreqTest, ExactPhraseMatcher_HighFrequencyTerms) {
+    // Five back-to-back occurrences: (0,1), (2,3), (4,5), (6,7), (8,9).
+    auto first = create_mock_disi({{1, {0, 2, 4, 6, 8}}});
+    auto second = create_mock_disi({{1, {1, 3, 5, 7, 9}}});
+
+    std::vector<PostingsAndPosition> entries;
+    entries.emplace_back(first, 0);
+    entries.emplace_back(second, 1);
+
+    ExactPhraseMatcher matcher(std::move(entries));
+    const float actual = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(actual, 5.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_ZeroSlop) {
+    // With slop 0 an adjacent pair is expected to count like an exact match.
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {1}}});
+
+    std::vector<PostingsAndPosition> entries;
+    entries.emplace_back(first, 0);
+    entries.emplace_back(second, 1);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(entries), 0);
+    const float actual = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(actual, 1.0F);
+}
+
+TEST_F(PhraseFreqTest, OrderedSloppyPhraseMatcher_ZeroSlop_NoMatch) {
+    // With slop 0, a one-position gap between the terms must not match.
+    auto first = create_mock_disi({{1, {0}}});
+    auto second = create_mock_disi({{1, {2}}});
+
+    std::vector<PostingsAndPosition> entries;
+    entries.emplace_back(first, 0);
+    entries.emplace_back(second, 1);
+
+    OrderedSloppyPhraseMatcher matcher(std::move(entries), 0);
+    const float actual = matcher.phrase_freq(1);
+
+    EXPECT_FLOAT_EQ(actual, 0.0F);
+}
+
+} // namespace doris::segment_v2
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]