This is an automated email from the ASF dual-hosted git repository.
yangsiyu pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
new ac9475a6a61 [opt](inverted index) implement block max WAND algorithm
with BM25 similarity for query optimization (#374)
ac9475a6a61 is described below
commit ac9475a6a61d37117fb44d0ed9d0bc1b5b57f86f
Author: zzzxl <[email protected]>
AuthorDate: Thu Jan 15 10:48:04 2026 +0800
[opt](inverted index) implement block max WAND algorithm with BM25
similarity for query optimization (#374)
---
src/core/CLucene/index/BlockMaxBM25Similarity.cpp | 78 ++++++++++
src/core/CLucene/index/BlockMaxBM25Similarity.h | 43 ++++++
src/core/CLucene/index/DocRange.h | 32 ++---
src/core/CLucene/index/IndexWriter.cpp | 53 ++++++-
src/core/CLucene/index/IndexWriter.h | 3 +-
src/core/CLucene/index/MultiSegmentReader.cpp | 15 +-
src/core/CLucene/index/SDocumentWriter.cpp | 69 +++++++--
src/core/CLucene/index/SDocumentWriter.h | 13 +-
src/core/CLucene/index/SegmentTermDocs.cpp | 90 +++++++++---
src/core/CLucene/index/SkipListReader.cpp | 15 ++
src/core/CLucene/index/SkipListWriter.cpp | 4 +-
src/core/CLucene/index/Terms.h | 20 ++-
src/core/CLucene/index/_MultiSegmentReader.h | 3 +-
src/core/CLucene/index/_SegmentHeader.h | 10 +-
src/core/CLucene/index/_SkipListReader.h | 10 +-
src/core/CLucene/index/_SkipListWriter.h | 6 +-
src/core/CMakeLists.txt | 1 +
src/test/index/TestReadRange.cpp | 165 ++++++++++++++++++++--
src/test/posting/TestBlockMaxScoreV3.cpp | 4 +-
19 files changed, 535 insertions(+), 99 deletions(-)
diff --git a/src/core/CLucene/index/BlockMaxBM25Similarity.cpp
b/src/core/CLucene/index/BlockMaxBM25Similarity.cpp
new file mode 100644
index 00000000000..4dfbda9b5c1
--- /dev/null
+++ b/src/core/CLucene/index/BlockMaxBM25Similarity.cpp
@@ -0,0 +1,78 @@
+//
+// BlockMaxBM25Similarity - 用于 Block Max Score 计算的 BM25 相似度类实现
+//
+// 与 doris::segment_v2::BM25Similarity 保持一致的实现方式。
+//
+
+#include "BlockMaxBM25Similarity.h"
+
+#include <cstddef>
+#include <limits>
+
+namespace lucene::index {
+
+const int32_t BlockMaxBM25Similarity::MAX_INT32 =
std::numeric_limits<int32_t>::max();
+const uint32_t BlockMaxBM25Similarity::MAX_INT4 =
long_to_int4(static_cast<uint64_t>(MAX_INT32));
+const int32_t BlockMaxBM25Similarity::NUM_FREE_VALUES = 255 -
static_cast<int>(MAX_INT4);
+
+std::vector<float> BlockMaxBM25Similarity::LENGTH_TABLE = []() {
+ std::vector<float> table(256);
+ for (int32_t i = 0; i < 256; i++) {
+ table[i] = byte4_to_int(static_cast<uint8_t>(i));
+ }
+ return table;
+}();
+
+BlockMaxBM25Similarity::BlockMaxBM25Similarity(float avgdl) : _avgdl(avgdl),
_cache(256) {
+ compute_cache();
+}
+
+void BlockMaxBM25Similarity::compute_cache() {
+ for (size_t i = 0; i < 256; i++) {
+ _cache[i] = 1.0F / (_k1 * ((1 - _b) + _b * LENGTH_TABLE[i] / _avgdl));
+ }
+}
+
+int32_t BlockMaxBM25Similarity::number_of_leading_zeros(uint64_t value) {
+ if (value == 0) {
+ return 64;
+ }
+ return __builtin_clzll(value);
+}
+
+uint32_t BlockMaxBM25Similarity::long_to_int4(uint64_t i) {
+ if (i > std::numeric_limits<uint64_t>::max()) {
+ throw;
+ }
+
+ int32_t numBits = 64 - number_of_leading_zeros(i);
+ if (numBits < 4) {
+ return static_cast<uint32_t>(i);
+ } else {
+ int32_t shift = numBits - 4;
+ uint32_t encoded = static_cast<uint32_t>(i >> shift) & 0x07;
+ return encoded | ((shift + 1) << 3);
+ }
+}
+
+uint64_t BlockMaxBM25Similarity::int4_to_long(uint32_t i) {
+ uint64_t bits = i & 0x07;
+ int32_t shift = (i >> 3) - 1;
+
+ if (shift < 0) {
+ return bits;
+ } else {
+ return (bits | 0x08) << shift;
+ }
+}
+
+float BlockMaxBM25Similarity::byte4_to_int(uint8_t b) {
+ if (b < NUM_FREE_VALUES) {
+ return static_cast<float>(b);
+ } else {
+ uint64_t decoded = NUM_FREE_VALUES + int4_to_long(b - NUM_FREE_VALUES);
+ return static_cast<float>(decoded);
+ }
+}
+
+} // namespace lucene::index
diff --git a/src/core/CLucene/index/BlockMaxBM25Similarity.h
b/src/core/CLucene/index/BlockMaxBM25Similarity.h
new file mode 100644
index 00000000000..23f77fe0897
--- /dev/null
+++ b/src/core/CLucene/index/BlockMaxBM25Similarity.h
@@ -0,0 +1,43 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+namespace lucene::index {
+
+class BlockMaxBM25Similarity {
+public:
+ explicit BlockMaxBM25Similarity(float avgdl);
+ ~BlockMaxBM25Similarity() = default;
+
+ float tf_factor(int32_t freq, uint8_t norm_byte) const {
+ float norm_inverse = _cache[norm_byte];
+ return 1.0F - 1.0F / (1.0F + static_cast<float>(freq) * norm_inverse);
+ }
+
+ float avgdl() const { return _avgdl; }
+
+private:
+ void compute_cache();
+
+ static int32_t number_of_leading_zeros(uint64_t value);
+ static uint32_t long_to_int4(uint64_t i);
+ static uint64_t int4_to_long(uint32_t i);
+ static float byte4_to_int(uint8_t b);
+
+ static const int32_t MAX_INT32;
+ static const uint32_t MAX_INT4;
+ static const int32_t NUM_FREE_VALUES;
+
+ static std::vector<float> LENGTH_TABLE;
+
+ float _k1 = 1.2F;
+ float _b = 0.75F;
+ float _avgdl = 0.0F;
+
+ std::vector<float> _cache;
+};
+using BlockMaxBM25SimilarityPtr = std::shared_ptr<BlockMaxBM25Similarity>;
+
+} // namespace lucene::index
\ No newline at end of file
diff --git a/src/core/CLucene/index/DocRange.h
b/src/core/CLucene/index/DocRange.h
index 726144b7fe8..bd3d40a7420 100644
--- a/src/core/CLucene/index/DocRange.h
+++ b/src/core/CLucene/index/DocRange.h
@@ -7,29 +7,27 @@
#include "CLucene/CLConfig.h"
enum class DocRangeType {
- kMany = 0,
- kRange = 1,
+ kMany = 0,
+ kRange = 1,
- kNone
+ kNone
};
class DocRange {
- public:
- DocRange() = default;
- ~DocRange() = default;
+public:
+ DocRange() = default;
+ ~DocRange() = default;
- public:
- DocRangeType type_ = DocRangeType::kNone;
+public:
+ DocRangeType type_ = DocRangeType::kNone;
- uint32_t doc_many_size_ = 0;
- uint32_t freq_many_size_ = 0;
- uint32_t norm_many_size_ = 0;
+ uint32_t doc_many_size_ = 0;
+ uint32_t freq_many_size_ = 0;
+ uint32_t norm_many_size_ = 0;
- std::vector<uint32_t>* doc_many = nullptr;
- std::vector<uint32_t>* freq_many = nullptr;
- std::vector<uint32_t>* norm_many = nullptr;
+ std::vector<uint32_t>* doc_many = nullptr;
+ std::vector<uint32_t>* freq_many = nullptr;
+ std::vector<uint32_t>* norm_many = nullptr;
- std::pair<uint32_t, uint32_t> doc_range;
-
- bool need_positions = false;
+ std::pair<uint32_t, uint32_t> doc_range;
};
\ No newline at end of file
diff --git a/src/core/CLucene/index/IndexWriter.cpp
b/src/core/CLucene/index/IndexWriter.cpp
index 490782f144b..7ad04859e61 100644
--- a/src/core/CLucene/index/IndexWriter.cpp
+++ b/src/core/CLucene/index/IndexWriter.cpp
@@ -1327,6 +1327,9 @@ void
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
}
}
+ // Build destFieldNormsMapValues from srcFieldNormsMapValues using
_trans_vec
+ std::vector<std::map<std::wstring, std::vector<uint8_t>>>
destFieldNormsMapValues(
+ numDestIndexes);
if (hasNorms) {
for (int srcIndex = 0; srcIndex < numIndices; srcIndex++) {
auto reader = readers[srcIndex];
@@ -1351,6 +1354,24 @@ void
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
}
}
}
+
+ // Build destFieldNormsMapValues
+ for (int destIdx = 0; destIdx < numDestIndexes; destIdx++) {
+ for (const auto& [fieldName, _] : srcFieldNormsMapValues[0]) {
+
destFieldNormsMapValues[destIdx][fieldName].resize(dest_index_docs[destIdx], 0);
+ }
+ }
+ for (int srcIdx = 0; srcIdx < numIndices; srcIdx++) {
+ for (const auto& [fieldName, srcNorms] :
srcFieldNormsMapValues[srcIdx]) {
+ for (size_t srcDocId = 0; srcDocId < srcNorms.size();
srcDocId++) {
+ auto [destIdx, destDocId] =
_trans_vec[srcIdx][srcDocId];
+ if (destIdx != UINT32_MAX && destDocId != UINT32_MAX) {
+
destFieldNormsMapValues[destIdx][fieldName][destDocId] =
+ srcNorms[srcDocId];
+ }
+ }
+ }
+ }
}
/// write fields and create files writers
@@ -1406,7 +1427,7 @@ void
IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
}
/// merge terms
- mergeTerms(hasProx, indexVersion);
+ mergeTerms(hasProx, hasNorms, indexVersion, destFieldNormsMapValues);
/// merge norms if have
if (hasNorms){
@@ -1643,7 +1664,8 @@ protected:
};
-void IndexWriter::mergeTerms(bool hasProx, IndexVersion indexVersion) {
+void IndexWriter::mergeTerms(bool hasProx, bool hasNorms, IndexVersion
indexVersion,
+ const std::vector<std::map<std::wstring, std::vector<uint8_t>>>&
destFieldNormsMapValues) {
auto queue = _CLNEW SegmentMergeQueue(readers.size());
auto numSrcIndexes = readers.size();
//std::vector<TermPositions *> postingsList(numSrcIndexes);
@@ -1794,7 +1816,32 @@ void IndexWriter::mergeTerms(bool hasProx, IndexVersion
indexVersion) {
PforUtil::encodePos(proxOut, posBuffer);
}
- skipWriter->setSkipData(lastDoc, false, -1);
+ int32_t maxBlockFreq = -1;
+ int32_t minBlockNorm = -1;
+
+ if (hasProx && hasNorms) {
+ auto normsIt =
destFieldNormsMapValues[destIdx].find(smallestTerm->field());
+ const std::vector<uint8_t>* destNorms =
+ (normsIt !=
destFieldNormsMapValues[destIdx].end())
+ ? &normsIt->second
+ : nullptr;
+
+ for (size_t i = 0; i < docDeltaBuffer.size(); ++i) {
+ int32_t freq = freqBuffer[i];
+ maxBlockFreq = std::max(maxBlockFreq, freq);
+ if (destNorms) {
+ uint32_t docId = docDeltaBuffer[i];
+ if (docId < destNorms->size()) {
+ uint8_t normByte = (*destNorms)[docId];
+ if (minBlockNorm == -1 || normByte <
minBlockNorm) {
+ minBlockNorm = normByte;
+ }
+ }
+ }
+ }
+ }
+
+ skipWriter->setSkipData(lastDoc, false, -1, maxBlockFreq,
minBlockNorm);
skipWriter->bufferSkip(df);
}
diff --git a/src/core/CLucene/index/IndexWriter.h
b/src/core/CLucene/index/IndexWriter.h
index e8c5fbd0ec3..de1bab67bde 100644
--- a/src/core/CLucene/index/IndexWriter.h
+++ b/src/core/CLucene/index/IndexWriter.h
@@ -328,7 +328,8 @@ public:
// write fields info file
void writeFields(lucene::store::Directory* d, std::string segment);
// merge terms and write files
- void mergeTerms(bool hasProx, IndexVersion indexVersion);
+ void mergeTerms(bool hasProx, bool hasNorms, IndexVersion indexVersion,
+ const std::vector<std::map<std::wstring, std::vector<uint8_t>>>&
destFieldNormsMapValues);
// merge norms and write files
void mergeNorms(
std::vector<uint32_t> dest_index_docs,
diff --git a/src/core/CLucene/index/MultiSegmentReader.cpp
b/src/core/CLucene/index/MultiSegmentReader.cpp
index 73428860f1a..c102808b4a9 100644
--- a/src/core/CLucene/index/MultiSegmentReader.cpp
+++ b/src/core/CLucene/index/MultiSegmentReader.cpp
@@ -270,6 +270,10 @@ const ArrayBase<IndexReader*>*
MultiSegmentReader::getSubReaders() const{
return subReaders;
}
+const int32_t* MultiSegmentReader::getStarts() const{
+ return starts;
+}
+
bool MultiSegmentReader::document(int32_t n, CL_NS(document)::Document& doc,
const FieldSelector* fieldSelector){
ensureOpen();
int32_t i = readerIndex(n); // find segment num
@@ -771,17 +775,6 @@ bool MultiTermDocs::skipTo(const int32_t target) {
}
}
-void MultiTermDocs::skipToBlock(const int32_t target) {
- while (pointer < subReaders->length && target >= starts[pointer]) {
- base = starts[pointer];
- current = termDocs(pointer++);
- }
-
- if (current != NULL) {
- current->skipToBlock(target - base);
- }
-}
-
void MultiTermDocs::close() {
//Func - Closes all MultiTermDocs managed by this instance
//Pre - true
diff --git a/src/core/CLucene/index/SDocumentWriter.cpp
b/src/core/CLucene/index/SDocumentWriter.cpp
index e85d96dbd81..39ea1720d05 100644
--- a/src/core/CLucene/index/SDocumentWriter.cpp
+++ b/src/core/CLucene/index/SDocumentWriter.cpp
@@ -986,8 +986,8 @@ void SDocumentsWriter<T>::writeNorms(const std::string
&segmentName, int32_t tot
v = 0;
} else {
normsOut->writeLong(n->total_term_count_);
- v = n->out.getFilePointer();
- n->out.writeTo(normsOut);
+ v = n->norms_buffer.size();
+ normsOut->writeBytes(n->norms_buffer.data(),
n->norms_buffer.size());
n->reset();
}
if (v < totalNumDoc)
@@ -1032,6 +1032,21 @@ void
SDocumentsWriter<T>::writeSegment(std::vector<std::string> &flushedFiles) {
std::sort(allFields.begin(), allFields.end(),
ThreadState::FieldData::sort);
const int32_t numAllFields = allFields.size();
+ if (hasProx_) {
+ const int32_t numFieldInfos = fieldInfos->size();
+ for (int32_t fieldIdx = 0; fieldIdx < numFieldInfos; fieldIdx++) {
+ if (fieldIdx >= static_cast<int32_t>(norms.length)) {
+ continue;
+ }
+ BufferedNorms* n = norms[fieldIdx];
+ if (n != nullptr && numDocsInRAM > 0) {
+ float avgdl =
+ static_cast<float>(n->total_term_count_) /
static_cast<float>(numDocsInRAM);
+ n->createBM25Similarity(avgdl);
+ }
+ }
+ }
+
skipListWriter = _CLNEW DefaultSkipListWriter(termsOut->skipInterval,
termsOut->maxSkipLevels,
numDocsInRAM, freqOut,
proxOut, indexVersion_);
@@ -1136,8 +1151,8 @@ void
SDocumentsWriter<T>::appendPostings(ArrayBase<typename ThreadState::FieldDa
STermInfosWriter<T> *termsOut,
IndexOutput *freqOut,
IndexOutput *proxOut) {
-
- const int32_t fieldNumber = (*fields)[0]->fieldInfo->number;
+ const FieldInfo* fieldInfo = (*fields)[0]->fieldInfo;
+ const int32_t fieldNumber = fieldInfo->number;
int32_t numFields = fields->length;
// In Doris, field number is always 1 by now.
assert(numFields == 1);
@@ -1161,6 +1176,13 @@ void
SDocumentsWriter<T>::appendPostings(ArrayBase<typename ThreadState::FieldDa
const int32_t skipInterval = termsOut->skipInterval;
auto currentFieldStorePayloads = false;
+ // Block Max WAND: Pre-compute if scoring info is available
+ BufferedNorms* blockMaxNorms = nullptr;
+ if (fieldNumber < static_cast<int32_t>(norms.length)) {
+ blockMaxNorms = norms[fieldNumber];
+ }
+ const bool enableBlockMaxWand = hasProx_ && (blockMaxNorms != nullptr &&
!fieldInfo->omitNorms);
+
ValueArray<FieldMergeState *> termStates(numFields);
while (numFields > 0) {
@@ -1205,12 +1227,39 @@ void
SDocumentsWriter<T>::appendPostings(ArrayBase<typename ThreadState::FieldDa
while (numToMerge > 0) {
if ((++df % skipInterval) == 0) {
+ int32_t maxBlockFreq = -1;
+ int32_t maxBlockNorm = -1;
+
+ if (enableBlockMaxWand) {
+ const auto& nb = blockMaxNorms->norms_buffer;
+ auto* bm25 = blockMaxNorms->bm25_similarity_.get();
+ float maxTFFactor = -1.0F;
+
+ if (bm25 != nullptr) {
+ for (size_t i = 0; i < docDeltaBuffer.size(); ++i) {
+ int32_t freq = freqBuffer[i];
+ int32_t docId = docDeltaBuffer[i];
+
+ if (static_cast<size_t>(docId) < nb.size()) {
+ uint8_t normByte = nb[docId];
+ float tf_factor = bm25->tf_factor(freq,
normByte);
+
+ if (tf_factor > maxTFFactor) {
+ maxTFFactor = tf_factor;
+ maxBlockFreq = freq;
+ maxBlockNorm = normByte;
+ }
+ }
+ }
+ }
+ }
+
PforUtil::pfor_encode(freqOut, docDeltaBuffer, freqBuffer,
hasProx_);
if (hasProx_ && indexVersion_ >= IndexVersion::kV2) {
PforUtil::encodePos(proxOut, posBuffer);
}
- skipListWriter->setSkipData(lastDoc,
currentFieldStorePayloads, lastPayloadLength);
+ skipListWriter->setSkipData(lastDoc,
currentFieldStorePayloads, lastPayloadLength, maxBlockFreq, maxBlockNorm);
skipListWriter->bufferSkip(df);
}
@@ -1425,24 +1474,22 @@ SDocumentsWriter<T>::BufferedNorms::BufferedNorms() {
}
template<typename T>
void SDocumentsWriter<T>::BufferedNorms::add(float_t norm) {
- uint8_t b = search::Similarity::encodeNorm(norm);
- out.writeByte(b);
- upto++;
+ _CLTHROWA(CL_ERR_UnsupportedOperation,
"SDocumentsWriter::BufferedNorms::add(float_t) not supported");
}
template<typename T>
void SDocumentsWriter<T>::BufferedNorms::add(int32_t norm) {
total_term_count_ += norm;
uint8_t b = search::Similarity::encodeNorm(norm);
- out.writeByte(b);
+ norms_buffer.push_back(b);
upto++;
}
template<typename T>
void SDocumentsWriter<T>::BufferedNorms::reset() {
- out.reset();
upto = 0;
total_term_count_ = 0;
+ norms_buffer.clear();
}
template<typename T>
void SDocumentsWriter<T>::BufferedNorms::fill(int32_t docID) {
@@ -1452,7 +1499,7 @@ void SDocumentsWriter<T>::BufferedNorms::fill(int32_t
docID) {
// varying different fields, because we are not
// storing the norms sparsely (see LUCENE-830)
if (upto < docID) {
- fillBytes(&out, defaultNorm, docID - upto);
+ norms_buffer.insert(norms_buffer.end(), docID - upto, defaultNorm);
upto = docID;
}
}
diff --git a/src/core/CLucene/index/SDocumentWriter.h
b/src/core/CLucene/index/SDocumentWriter.h
index 5fc8247b7c4..5d475a19df7 100644
--- a/src/core/CLucene/index/SDocumentWriter.h
+++ b/src/core/CLucene/index/SDocumentWriter.h
@@ -14,6 +14,7 @@
#include "_FieldInfos.h"
#include "_TermInfo.h"
#include "_TermInfosWriter.h"
+#include "BlockMaxBM25Similarity.h"
CL_CLASS_DEF(analysis, Analyzer)
CL_CLASS_DEF(analysis, Token)
@@ -83,16 +84,22 @@ public:
* to a partial segment. */
class BufferedNorms {
public:
- CL_NS(store)::RAMOutputStream out;
- int32_t upto{};
-
BufferedNorms();
+ ~BufferedNorms() = default;
+
void add(float_t norm);
void add(int32_t norm);
void reset();
void fill(int32_t docID);
+ void createBM25Similarity(float avgdl) {
+ bm25_similarity_ =
std::make_unique<lucene::index::BlockMaxBM25Similarity>(avgdl);
+ }
+
+ int32_t upto {};
int64_t total_term_count_ = 0;
+ std::vector<uint8_t> norms_buffer;
+ std::unique_ptr<lucene::index::BlockMaxBM25Similarity>
bm25_similarity_;
};
template<typename T2>
class BlockPool {
diff --git a/src/core/CLucene/index/SegmentTermDocs.cpp
b/src/core/CLucene/index/SegmentTermDocs.cpp
index 2e2fd8a5192..58aa3c38dad 100644
--- a/src/core/CLucene/index/SegmentTermDocs.cpp
+++ b/src/core/CLucene/index/SegmentTermDocs.cpp
@@ -144,7 +144,7 @@ bool SegmentTermDocs::next() {
int32_t SegmentTermDocs::read(int32_t *docs, int32_t *freqs, int32_t length) {
int32_t i = 0;
-
+
if (count == df) {
return i;
}
@@ -197,24 +197,46 @@ bool SegmentTermDocs::readRange(DocRange* docRange) {
}
buffer_.readRange(docRange);
+
count += docRange->doc_many_size_;
- if (docRange->need_positions && hasProx && docRange->doc_many_size_ > 0 &&
df >= skipInterval) {
+ if (docRange->doc_many_size_ > 0) {
+ uint32_t start = (*docRange->doc_many)[0];
+ uint32_t end = (*docRange->doc_many)[docRange->doc_many_size_ - 1];
+ if ((end - start) == docRange->doc_many_size_ - 1) {
+ docRange->doc_range.first = start;
+ docRange->doc_range.second = start + docRange->doc_many_size_;
+ docRange->type_ = DocRangeType::kRange;
+ }
+ }
+
+ return true;
+}
+
+bool SegmentTermDocs::readBlock(DocRange* docRange) {
+ if (count >= df) {
+ return false;
+ }
+
+ buffer_.readRange(docRange);
+ count += docRange->doc_many_size_;
+
+ if (docRange->doc_many_size_ > 0 && df >= skipInterval) {
if (skipListReader == nullptr) {
- skipListReader = _CLNEW DefaultSkipListReader(freqStream->clone(),
maxSkipLevels, skipInterval, indexVersion_);
+ skipListReader = _CLNEW DefaultSkipListReader(freqStream->clone(),
maxSkipLevels,
+ skipInterval,
indexVersion_);
skipListReader->setIoContext(io_ctx_);
}
if (!haveSkipped) {
- skipListReader->init(skipPointer, freqBasePointer,
proxBasePointer, df, hasProx, currentFieldStoresPayloads);
+ skipListReader->init(skipPointer, freqBasePointer,
proxBasePointer, df, hasProx,
+ currentFieldStoresPayloads);
haveSkipped = true;
}
-
+
uint32_t firstDoc = (*docRange->doc_many)[0];
- int32_t skippedCount = skipListReader->skipTo(firstDoc);
+ skipListReader->skipTo(firstDoc);
- if (skipListReader->getDoc() >= 0) {
- skipProx(skipListReader->getProxPointer(),
skipListReader->getPayloadLength());
- }
+ skipProx(skipListReader->getProxPointer(),
skipListReader->getPayloadLength());
}
if (docRange->doc_many_size_ > 0) {
@@ -263,30 +285,32 @@ bool SegmentTermDocs::skipTo(const int32_t target) {
return true;
}
-void SegmentTermDocs::skipToBlock(const int32_t target) {
+bool SegmentTermDocs::skipToBlock(const int32_t target) {
if (df >= skipInterval) {
if (skipListReader == NULL) {
- skipListReader = _CLNEW DefaultSkipListReader(freqStream->clone(),
maxSkipLevels, skipInterval, indexVersion_);
+ skipListReader = _CLNEW DefaultSkipListReader(freqStream->clone(),
maxSkipLevels,
+ skipInterval,
indexVersion_);
skipListReader->setIoContext(io_ctx_);
}
if (!haveSkipped) {
- skipListReader->init(skipPointer, freqBasePointer,
proxBasePointer, df, hasProx, currentFieldStoresPayloads);
+ skipListReader->init(skipPointer, freqBasePointer,
proxBasePointer, df, hasProx,
+ currentFieldStoresPayloads);
haveSkipped = true;
}
- int32_t newCount = skipListReader->skipTo(target);
- if (newCount > count) {
- freqStream->seek(skipListReader->getFreqPointer());
- skipProx(skipListReader->getProxPointer(),
skipListReader->getPayloadLength());
-
- _doc = skipListReader->getDoc();
- count = newCount;
- // Note: We do NOT call buffer_.refill() here.
- // The caller will use readRange() to read the next block.
+ int32_t currentLastDoc = skipListReader->getLastDocInBlock();
+ if (target <= currentLastDoc && haveSkipped) {
+ return false;
}
+ int32_t newCount = skipListReader->skipTo(target);
+ freqStream->seek(skipListReader->getFreqPointer());
+ skipProx(skipListReader->getProxPointer(),
skipListReader->getPayloadLength());
+ _doc = skipListReader->getDoc();
+ count = newCount;
+ return true;
}
- // If df < skipInterval, nothing to skip. Caller will use readRange()
sequentially.
+ return false;
}
void TermDocsBuffer::refill() {
@@ -409,4 +433,26 @@ void TermDocsBuffer::refillNorm(int32_t size) {
}
}
}
+
+int32_t SegmentTermDocs::getMaxBlockFreq() {
+ if (skipListReader != nullptr) {
+ return skipListReader->getMaxBlockFreq();
+ }
+ return -1;
+}
+
+int32_t SegmentTermDocs::getMaxBlockNorm() {
+ if (skipListReader != nullptr) {
+ return skipListReader->getMaxBlockNorm();
+ }
+ return -1;
+}
+
+int32_t SegmentTermDocs::getLastDocInBlock() {
+ if (skipListReader != nullptr) {
+ return skipListReader->getLastDocInBlock();
+ }
+ return -1;
+}
+
CL_NS_END
diff --git a/src/core/CLucene/index/SkipListReader.cpp
b/src/core/CLucene/index/SkipListReader.cpp
index 426e8d12317..5da80263edc 100644
--- a/src/core/CLucene/index/SkipListReader.cpp
+++ b/src/core/CLucene/index/SkipListReader.cpp
@@ -63,6 +63,13 @@ int32_t MultiLevelSkipListReader::getDoc() const {
return lastDoc;
}
+int32_t MultiLevelSkipListReader::getLastDocInBlock() const {
+ if (skipDoc[0] == 0 && haveSkipped && numberOfSkipLevels > 0) {
+ const_cast<MultiLevelSkipListReader*>(this)->loadNextSkip(0);;
+ }
+ return skipDoc[0];
+}
+
int32_t MultiLevelSkipListReader::skipTo(const int32_t target) {
if (!haveSkipped) {
// first time, load skip levels
@@ -111,6 +118,7 @@ bool MultiLevelSkipListReader::loadNextSkip(const int32_t
level) {
// this skip list is exhausted
skipDoc[level] = LUCENE_INT32_MAX_SHOULDBE;
if (numberOfSkipLevels > level) numberOfSkipLevels = level;
+ onSkipExhausted(level);
return false;
}
@@ -364,6 +372,13 @@ int32_t DefaultSkipListReader::readSkipData(const int32_t
level, CL_NS(store)::I
return delta;
}
+void DefaultSkipListReader::onSkipExhausted(const int32_t level) {
+ if (level == 0) {
+ maxBlockFreq = -1;
+ maxBlockNorm = -1;
+ }
+}
+
int32_t DefaultSkipListReader::getMaxBlockFreq() const {
return maxBlockFreq;
}
diff --git a/src/core/CLucene/index/SkipListWriter.cpp
b/src/core/CLucene/index/SkipListWriter.cpp
index 72b3be27980..63aed4e3443 100644
--- a/src/core/CLucene/index/SkipListWriter.cpp
+++ b/src/core/CLucene/index/SkipListWriter.cpp
@@ -118,8 +118,8 @@ void DefaultSkipListWriter::resetSkip() {
if (hasProx) {
Arrays<int64_t>::fill(lastSkipProxPointer, numberOfSkipLevels,
proxOutput->getFilePointer());
if (indexVersion == IndexVersion::kV4) {
- maxBlockFreq = 0;
- maxBlockNorm = 0;
+ maxBlockFreq = -1;
+ maxBlockNorm = -1;
}
}
Arrays<int64_t>::fill(lastSkipFreqPointer, numberOfSkipLevels,
freqOutput->getFilePointer());
diff --git a/src/core/CLucene/index/Terms.h b/src/core/CLucene/index/Terms.h
index aefe508e5d0..dbde2d0c4d0 100644
--- a/src/core/CLucene/index/Terms.h
+++ b/src/core/CLucene/index/Terms.h
@@ -64,6 +64,9 @@ public:
virtual int32_t read(int32_t* docs, int32_t* freqs, int32_t length)=0;
virtual int32_t read(int32_t* docs, int32_t* freqs, int32_t* norms,
int32_t length)=0;
virtual bool readRange(DocRange* docRange) = 0;
+ virtual bool readBlock(DocRange* docRange) {
+ _CLTHROWA(CL_ERR_UnsupportedOperation, "readBlock is not
supported for multi-segment readers");
+ }
// Skips entries to the first beyond the current whose document number
is
// greater than or equal to <i>target</i>. <p>Returns true iff there is
such
@@ -79,10 +82,9 @@ public:
// Some implementations are considerably more efficient than that.
virtual bool skipTo(const int32_t target)=0;
- // Skip to the block containing the target document using skip list.
- // This is an optimization that positions the stream for subsequent
readRange calls.
- // Unlike skipTo, this does not scan to find the exact document.
- virtual void skipToBlock(const int32_t target) {}
+ virtual bool skipToBlock(const int32_t target) {
+ _CLTHROWA(CL_ERR_UnsupportedOperation, "skipToBlock is not
supported for multi-segment readers");
+ }
// Frees associated resources.
virtual void close() = 0;
@@ -103,6 +105,16 @@ public:
virtual int32_t docNorm() {
return 0;
}
+
+ virtual int32_t getMaxBlockFreq() {
+ _CLTHROWA(CL_ERR_UnsupportedOperation, "getMaxBlockFreq is not
supported for multi-segment readers");
+ }
+ virtual int32_t getMaxBlockNorm() {
+ _CLTHROWA(CL_ERR_UnsupportedOperation, "getMaxBlockNorm is not
supported for multi-segment readers");
+ }
+ virtual int32_t getLastDocInBlock() {
+ _CLTHROWA(CL_ERR_UnsupportedOperation, "getLastDocInBlock is
not supported for multi-segment readers");
+ }
};
diff --git a/src/core/CLucene/index/_MultiSegmentReader.h
b/src/core/CLucene/index/_MultiSegmentReader.h
index f33ed5f4a07..02f718a66ce 100644
--- a/src/core/CLucene/index/_MultiSegmentReader.h
+++ b/src/core/CLucene/index/_MultiSegmentReader.h
@@ -120,6 +120,7 @@ public:
int32_t getTermInfosIndexDivisor();
const CL_NS(util)::ArrayBase<IndexReader*>* getSubReaders() const;
+ const int32_t* getStarts() const;
friend class MultiReader;
friend class SegmentReader;
@@ -176,8 +177,6 @@ public:
/* A Possible future optimization could skip entire segments */
bool skipTo(const int32_t target);
- /** Skip to the block containing target using skip list. */
- void skipToBlock(const int32_t target) override;
void close();
diff --git a/src/core/CLucene/index/_SegmentHeader.h
b/src/core/CLucene/index/_SegmentHeader.h
index 8686eb23a15..80ab2331265 100644
--- a/src/core/CLucene/index/_SegmentHeader.h
+++ b/src/core/CLucene/index/_SegmentHeader.h
@@ -236,18 +236,22 @@ public:
virtual int32_t read(int32_t* docs, int32_t* freqs,int32_t* norms, int32_t
length);
bool readRange(DocRange* docRange) override;
+ bool readBlock(DocRange* docRange) override;
/** Optimized implementation. */
virtual bool skipTo(const int32_t target);
- /** Skip to the block containing target using skip list. */
- void skipToBlock(const int32_t target) override;
+ bool skipToBlock(const int32_t target) override;
virtual TermPositions* __asTermPositions();
void setLoadStats(bool load_stats) override;
void setIoContext(const void* io_ctx) override;
+ int32_t getMaxBlockFreq() override;
+ int32_t getMaxBlockNorm() override;
+ int32_t getLastDocInBlock() override;
+
int32_t docFreq() override;
int32_t docNorm() override;
@@ -351,7 +355,7 @@ private:
int32_t freq() const{ return SegmentTermDocs::freq(); }
int32_t norm() const{ return SegmentTermDocs::norm(); }
bool skipTo(const int32_t target){ return SegmentTermDocs::skipTo(target); }
- void skipToBlock(const int32_t target) {
SegmentTermDocs::skipToBlock(target); }
+ bool skipToBlock(const int32_t target) { return
SegmentTermDocs::skipToBlock(target); }
private:
IndexVersion indexVersion_ = IndexVersion::kV0;
diff --git a/src/core/CLucene/index/_SkipListReader.h
b/src/core/CLucene/index/_SkipListReader.h
index f7562e2f9f2..723aa7c2207 100644
--- a/src/core/CLucene/index/_SkipListReader.h
+++ b/src/core/CLucene/index/_SkipListReader.h
@@ -65,6 +65,8 @@ public:
* has skipped. */
int32_t getDoc() const;
+ int32_t getLastDocInBlock() const;
+
/** Skips entries to the first beyond the current whose document number
is
* greater than or equal to <i>target</i>. Returns the current doc
count.
*/
@@ -79,6 +81,8 @@ protected:
/** Seeks the skip entry on the given level */
virtual void seekChild(const int32_t level);
+ virtual void onSkipExhausted(const int32_t level) {}
+
void close();
/** initializes the reader */
@@ -158,8 +162,8 @@ private:
int64_t lastProxPointer{0};
int32_t lastPayloadLength;
- int32_t maxBlockFreq = 0;
- int32_t maxBlockNorm = 0;
+ int32_t maxBlockFreq = -1;
+ int32_t maxBlockNorm = -1;
public:
DefaultSkipListReader(CL_NS(store)::IndexInput* _skipStream, const
int32_t maxSkipLevels, const int32_t _skipInterval, IndexVersion indexVersion);
@@ -191,6 +195,8 @@ protected:
void setLastSkipData(const int32_t level);
+ void onSkipExhausted(const int32_t level) override;
+
int32_t readSkipData(const int32_t level, CL_NS(store)::IndexInput*
_skipStream);
};
diff --git a/src/core/CLucene/index/_SkipListWriter.h
b/src/core/CLucene/index/_SkipListWriter.h
index c32351bd47c..92d1c2f3cd6 100644
--- a/src/core/CLucene/index/_SkipListWriter.h
+++ b/src/core/CLucene/index/_SkipListWriter.h
@@ -110,8 +110,8 @@ private:
bool hasProx = false;
IndexVersion indexVersion = IndexVersion::kV0;
- int32_t maxBlockFreq = 0;
- int32_t maxBlockNorm = 0;
+ int32_t maxBlockFreq = -1;
+ int32_t maxBlockNorm = -1;
protected:
@@ -128,7 +128,7 @@ public:
/**
* Sets the values for the current skip data.
*/
- void setSkipData(int32_t doc, bool storePayloads, int32_t payloadLength,
int32_t maxBlockFreq = 0, int32_t maxBlockNorm = 0);
+ void setSkipData(int32_t doc, bool storePayloads, int32_t payloadLength,
int32_t maxBlockFreq = -1, int32_t maxBlockNorm = -1);
void resetSkip();
};
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 0a19a2d278f..fd7f7079802 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -103,6 +103,7 @@ SET(clucene_core_Files
./CLucene/index/DocumentsWriter.cpp
./CLucene/index/SDocumentWriter.cpp
./CLucene/index/SDocumentWriter.h
+ ./CLucene/index/BlockMaxBM25Similarity.cpp
./CLucene/index/DocumentsWriterThreadState.cpp
./CLucene/index/SegmentTermVector.cpp
./CLucene/index/TermVectorReader.cpp
diff --git a/src/test/index/TestReadRange.cpp b/src/test/index/TestReadRange.cpp
index 0542dbb065f..ea2bf8edd7a 100644
--- a/src/test/index/TestReadRange.cpp
+++ b/src/test/index/TestReadRange.cpp
@@ -128,11 +128,11 @@ static TermDocsResult readWithNext(TermDocs* termDocs) {
return result;
}
-// Read using readRange() method
-static TermDocsResult readWithRange(TermDocs* termDocs) {
+// Read using readBlock() method
+static TermDocsResult readWithBlock(TermDocs* termDocs) {
TermDocsResult result;
DocRange docRange;
- while (termDocs->readRange(&docRange)) {
+ while (termDocs->readBlock(&docRange)) {
for (uint32_t i = 0; i < docRange.doc_many_size_; ++i) {
result.docs.push_back((*docRange.doc_many)[i]);
if (docRange.freq_many && i < docRange.freq_many_size_) {
@@ -167,13 +167,12 @@ static TermPositionsResult
readPositionsWithNext(TermPositions* termPos) {
return result;
}
-// Read positions using readRange() and nextDeltaPosition() method
-static TermPositionsResult readPositionsWithRange(TermPositions* termPos) {
+// Read positions using readBlock() and nextDeltaPosition() method
+static TermPositionsResult readPositionsWithBlock(TermPositions* termPos) {
TermPositionsResult result;
DocRange docRange;
- docRange.need_positions = true;
- while (termPos->readRange(&docRange)) {
+ while (termPos->readBlock(&docRange)) {
for (uint32_t i = 0; i < docRange.doc_many_size_; ++i) {
result.docs.push_back((*docRange.doc_many)[i]);
int32_t freq = 1;
@@ -296,7 +295,7 @@ void TestReadRangeBasic(CuTest* tc) {
// Read with readRange()
TermDocs* termDocs2 = reader->termDocs();
termDocs2->seek(term);
- TermDocsResult result2 = readWithRange(termDocs2);
+ TermDocsResult result2 = readWithBlock(termDocs2);
termDocs2->close();
_CLDELETE(termDocs2);
@@ -366,7 +365,7 @@ void TestReadRangePositions(CuTest* tc) {
// Read with readRange()/nextDeltaPosition()
TermPositions* termPos2 = reader->termPositions();
termPos2->seek(term);
- TermPositionsResult result2 = readPositionsWithRange(termPos2);
+ TermPositionsResult result2 = readPositionsWithBlock(termPos2);
termPos2->close();
_CLDELETE(termPos2);
@@ -642,7 +641,7 @@ void TestReadRangeLargeDataset(CuTest* tc) {
// Read with readRange()
TermDocs* termDocs2 = reader->termDocs();
termDocs2->seek(term);
- TermDocsResult result2 = readWithRange(termDocs2);
+ TermDocsResult result2 = readWithBlock(termDocs2);
termDocs2->close();
_CLDELETE(termDocs2);
@@ -712,7 +711,7 @@ void TestReadRangeVersions(CuTest* tc) {
// Read with readRange()
TermDocs* termDocs2 = reader->termDocs();
termDocs2->seek(term);
- TermDocsResult result2 = readWithRange(termDocs2);
+ TermDocsResult result2 = readWithBlock(termDocs2);
termDocs2->close();
_CLDELETE(termDocs2);
@@ -768,7 +767,7 @@ void TestReadRangeEdgeCases(CuTest* tc) {
TermDocs* termDocs2 = reader->termDocs();
termDocs2->seek(term);
- TermDocsResult result2 = readWithRange(termDocs2);
+ TermDocsResult result2 = readWithBlock(termDocs2);
termDocs2->close();
_CLDELETE(termDocs2);
@@ -831,7 +830,7 @@ void TestReadRangeEdgeCases(CuTest* tc) {
TermDocs* termDocs2 = reader->termDocs();
termDocs2->seek(term);
- TermDocsResult result2 = readWithRange(termDocs2);
+ TermDocsResult result2 = readWithBlock(termDocs2);
termDocs2->close();
_CLDELETE(termDocs2);
@@ -848,6 +847,145 @@ void TestReadRangeEdgeCases(CuTest* tc) {
std::cout << "\nTestReadRangeEdgeCases success" << std::endl;
}
+//=============================================================================
+// Test: Block-based interfaces (readBlock, skipToBlock, getMaxBlockFreq,
+// getMaxBlockNorm, getLastDocInBlock) working together
+//=============================================================================
+void TestBlockBasedInterfaces(CuTest* tc) {
+ std::srand(getDaySeed());
+ std::mt19937 rng(getDaySeed());
+
+ std::string fieldName = "content";
+ std::vector<std::string> datas;
+
+ // Create index with a common term appearing frequently
+ std::string commonWord = "target";
+ for (int32_t i = 0; i < DEFAULT_DOC_COUNT; ++i) {
+ std::string text = generateRandomText(rng, 1, 5);
+ if (i % 2 == 0) {
+ text += " " + commonWord;
+ // Add multiple occurrences for higher freq
+ if (i % 4 == 0) {
+ text += " " + commonWord + " " + commonWord;
+ }
+ }
+ datas.push_back(text);
+ }
+
+ RAMDirectory dir;
+ writeTestIndex(fieldName, &dir, IndexVersion::kV2, datas);
+
+ auto* reader = IndexReader::open(&dir);
+ std::exception_ptr eptr;
+
+ try {
+ std::wstring ws = StringUtil::string_to_wstring(commonWord);
+ std::wstring fieldNameW = StringUtil::string_to_wstring(fieldName);
+ Term* term = _CLNEW Term(fieldNameW.c_str(), ws.c_str());
+
+ // Collect all docs using next() as baseline
+ TermDocs* baselineDocs = reader->termDocs();
+ baselineDocs->seek(term);
+ std::vector<int32_t> baselineDocIds;
+ std::vector<int32_t> baselineFreqs;
+ while (baselineDocs->next()) {
+ baselineDocIds.push_back(baselineDocs->doc());
+ baselineFreqs.push_back(baselineDocs->freq());
+ }
+ baselineDocs->close();
+ _CLDELETE(baselineDocs);
+
+ // Test: skipToBlock + readBlock +
getMaxBlockFreq/getMaxBlockNorm/getLastDocInBlock
+ std::vector<int32_t> skipTargets = {0, 500, 2000, 5000, 8000};
+
+ for (int32_t target : skipTargets) {
+ TermDocs* termDocs = reader->termDocs();
+ termDocs->seek(term);
+
+ std::vector<int32_t> collectedDocs;
+ std::vector<int32_t> collectedFreqs;
+
+ // Skip to target block
+ bool skipped = termDocs->skipToBlock(target);
+
+ // Verify getLastDocInBlock returns valid value after skip
+ int32_t lastDocInBlock = termDocs->getLastDocInBlock();
+ if (skipped && lastDocInBlock >= 0) {
+ assertTrue(lastDocInBlock >= 0);
+ }
+
+ // Read blocks and verify maxBlockFreq/maxBlockNorm
+ DocRange docRange;
+ while (termDocs->readBlock(&docRange)) {
+ int32_t maxFreq = termDocs->getMaxBlockFreq();
+ int32_t maxNorm = termDocs->getMaxBlockNorm();
+ lastDocInBlock = termDocs->getLastDocInBlock();
+
+ // Verify maxBlockFreq is >= max freq in current block
+ int32_t actualMaxFreq = 0;
+ for (uint32_t i = 0; i < docRange.doc_many_size_; ++i) {
+ int32_t doc = (*docRange.doc_many)[i];
+ int32_t freq = (docRange.freq_many && i <
docRange.freq_many_size_)
+ ? (*docRange.freq_many)[i]
+ : 1;
+ if (doc >= target) {
+ collectedDocs.push_back(doc);
+ collectedFreqs.push_back(freq);
+ }
+ if (freq > actualMaxFreq) {
+ actualMaxFreq = freq;
+ }
+ }
+
+ // maxBlockFreq should be >= actual max freq in block (if
valid)
+ if (maxFreq > 0) {
+ assertTrue(maxFreq >= actualMaxFreq);
+ }
+
+ // lastDocInBlock should be >= last doc in current block
+ if (lastDocInBlock >= 0 && docRange.doc_many_size_ > 0) {
+ int32_t lastDocInRange =
(*docRange.doc_many)[docRange.doc_many_size_ - 1];
+ assertTrue(lastDocInBlock >= lastDocInRange);
+ }
+ }
+
+ // Verify collected docs match baseline (docs >= target)
+ std::vector<int32_t> expectedDocs;
+ for (size_t i = 0; i < baselineDocIds.size(); ++i) {
+ if (baselineDocIds[i] >= target) {
+ expectedDocs.push_back(baselineDocIds[i]);
+ }
+ }
+
+ // All expected docs should be in collected docs
+ for (int32_t expectedDoc : expectedDocs) {
+ bool found = std::find(collectedDocs.begin(),
collectedDocs.end(), expectedDoc) !=
+ collectedDocs.end();
+ if (!found) {
+ std::cerr << "Missing doc " << expectedDoc << " after
skipToBlock(" << target
+ << ")" << std::endl;
+ }
+ assertTrue(found);
+ }
+
+ termDocs->close();
+ _CLDELETE(termDocs);
+ }
+
+ _CLDECDELETE(term);
+
+ } catch (...) {
+ eptr = std::current_exception();
+ }
+
+ FINALLY(eptr, {
+ reader->close();
+ _CLLDELETE(reader);
+ })
+
+ std::cout << "\nTestBlockBasedInterfaces success" << std::endl;
+}
+
//=============================================================================
// Suite registration
//=============================================================================
@@ -861,6 +999,7 @@ CuSuite* testReadRange() {
SUITE_ADD_TEST(suite, TestReadRangeLargeDataset);
SUITE_ADD_TEST(suite, TestReadRangeVersions);
SUITE_ADD_TEST(suite, TestReadRangeEdgeCases);
+ SUITE_ADD_TEST(suite, TestBlockBasedInterfaces);
return suite;
}
diff --git a/src/test/posting/TestBlockMaxScoreV3.cpp
b/src/test/posting/TestBlockMaxScoreV3.cpp
index efa67b66a98..6204dc72879 100644
--- a/src/test/posting/TestBlockMaxScoreV3.cpp
+++ b/src/test/posting/TestBlockMaxScoreV3.cpp
@@ -116,8 +116,8 @@ void testDefaultSkipListWriterMaxBlockParamsV1(CuTest* tc) {
int32_t actualMaxBlockFreq = reader->getMaxBlockFreq();
int32_t actualMaxBlockNorm = reader->getMaxBlockNorm();
- CuAssertTrue(tc, actualMaxBlockFreq == 0);
- CuAssertTrue(tc, actualMaxBlockNorm == 0);
+ CuAssertTrue(tc, actualMaxBlockFreq == -1);
+ CuAssertTrue(tc, actualMaxBlockNorm == -1);
}
_CLDELETE(reader);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]