This is an automated email from the ASF dual-hosted git repository. panxiaolei pushed a commit to branch dev_0308_3 in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
commit 4950660466283dafab7f15a1f8019037c7aa0d6f Author: BiteTheDDDDt <pxl...@qq.com> AuthorDate: Fri Mar 8 17:23:15 2024 +0800 fix some implicit conversion --- .github/workflows/build.yml | 2 +- .github/workflows/clucene-ut.yml | 118 +++ .../CLucene/analysis/LanguageBasedAnalyzer.cpp | 14 +- .../CLucene/analysis/jieba/ChineseTokenizer.cpp | 8 +- .../CLucene/analysis/jieba/ChineseTokenizer.h | 25 +- src/core/CLucene/analysis/AnalysisHeader.h | 8 +- src/core/CLucene/index/IndexWriter.cpp | 95 ++- src/core/CLucene/index/IndexWriter.h | 16 + src/core/CLucene/index/MultiSegmentReader.cpp | 4 + src/core/CLucene/index/SegmentInfos.cpp | 3 + src/core/CLucene/index/SegmentReader.cpp | 7 +- src/core/CLucene/index/SegmentTermDocs.cpp | 4 +- src/core/CLucene/index/SegmentTermEnum.cpp | 801 ++++++++++----------- src/core/CLucene/index/TermInfosReader.cpp | 737 ++++++++++--------- src/core/CLucene/index/_SegmentHeader.h | 4 +- src/core/CLucene/index/_SegmentTermEnum.h | 3 +- src/core/CLucene/search/MultiPhraseQuery.cpp | 4 +- src/core/CLucene/search/query/TermIterator.h | 29 +- .../CLucene/search/query/TermPositionIterator.h | 23 + src/core/CLucene/store/IndexOutput.cpp | 13 +- src/core/CLucene/util/CLStreams.h | 2 +- src/core/CLucene/util/PriorityQueue.h | 2 +- src/core/CLucene/util/bkd/bkd_docid_iterator.h | 8 +- src/core/CLucene/util/stringUtil.h | 17 +- src/core/CMakeLists.txt | 2 +- src/test/CMakeLists.txt | 2 + src/test/index/TestIndexCompaction.cpp | 344 +++++++++ src/test/query/TestMultiPhraseQuery.cpp | 163 +++++ src/test/test.h | 2 + src/test/tests.cpp | 2 + 30 files changed, 1623 insertions(+), 839 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 64386c999a..36ea60e962 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -82,7 +82,7 @@ jobs: cd /tmp curl -L https://sourceforge.net/projects/libpng/files/zlib/1.2.11/zlib-1.2.11.tar.gz | tar -zxf - - curl -L https://boostorg.jfrog.io/artifactory/main/release/1.81.0/source/boost_1_81_0.tar.gz -o - | tar -zxf - + curl -L https://archives.boost.io/release/1.81.0/source/boost_1_81_0.tar.gz -o - | tar -zxf - if [[ "${{ matrix.config.name }}" == 'macOS' ]]; then pushd "$(brew --repo)" diff --git a/.github/workflows/clucene-ut.yml b/.github/workflows/clucene-ut.yml new file mode 100644 index 0000000000..302934cbce --- /dev/null +++ b/.github/workflows/clucene-ut.yml @@ -0,0 +1,118 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Clucene UT + +on: + pull_request: + branches: + - clucene + - clucene-2.0 + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number }} + cancel-in-progress: true + +jobs: + build_linux: + name: Build (Linux) + runs-on: ubuntu-22.04 + steps: + - name: "Checkout ${{ github.event.pull_request.number }} ${{ github.event.pull_request.head.sha }}" + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + + - name: "Prepare" + run: | + sudo apt update + sudo DEBIAN_FRONTEND=noninteractive apt install --yes \ + 'build-essential' \ + 'automake' \ + 'autoconf' \ + 'libtool-bin' \ + 'pkg-config' \ + 'cmake' \ + 'ninja-build' \ + 'ccache' \ + 'python-is-python3' \ + 'bison' \ + 'byacc' \ + 'flex' \ + 'binutils-dev' \ + 'libiberty-dev' \ + 'curl' \ + 'git' \ + 'zip' \ + 'unzip' \ + 'autopoint' \ + 'openjdk-11-jdk' \ + 'openjdk-11-jdk-headless' \ + 'maven' + - name: "Run" + run: | + set -x + mkdir build && cd build + cmake ../ + make cl_test + cd bin/ + ./cl_test + # - name: "Setup tmate session to debug" + # if: ${{ failure() }} + # uses: mxschmitt/action-tmate@v3 + # timeout-minutes: 30 + + build_macos: + name: Build (MacOS) + runs-on: macos-12 + steps: + - name: "Checkout ${{ github.event.pull_request.number }} ${{ github.event.pull_request.head.sha }}" + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + + - name: "Prepare" + run: | + brew install \ + 'automake' \ + 'autoconf' \ + 'libtool' \ + 'pkg-config' \ + 'texinfo' \ + 'coreutils' \ + 'gnu-getopt' \ + 'python@3' \ + 'cmake' \ + 'ninja' \ + 'ccache' \ + 'bison' \ + 'byacc' \ + 'gettext' \ + 'wget' \ + 'pcre' \ + 'openjdk@11' \ + 'maven' \ + 'node' \ + 'llvm@16' + - name: "Run" + run: | + set -x + mkdir build && cd build + cmake ../ + make cl_test + cd bin/ + ./cl_test diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp index 2a32ff04fa..6adfcf1e34 100644 --- a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp +++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp @@ -4,6 +4,7 @@ * Distributable under the terms of either the Apache License (Version 2.0) or * the GNU Lesser General Public License, as specified in the COPYING file. ------------------------------------------------------------------------------*/ +#include <fstream> #include "CLucene/_ApiHeader.h" #include "CLucene/analysis/Analyzers.h" @@ -64,7 +65,18 @@ void LanguageBasedAnalyzer::setMode(AnalyzerMode m) { void LanguageBasedAnalyzer::initDict(const std::string &dictPath) { if (_tcscmp(lang, _T("chinese")) == 0) { - CL_NS2(analysis, jieba)::ChineseTokenizer::init(dictPath); + ChineseDict chineseDict; + chineseDict.dictPath_ = dictPath; + + for (const auto& file : chineseDict.files_) { + std::string path = dictPath + "/" + file; + std::ifstream in(path); + if (!in.good()) { + _CLTHROWA(CL_ERR_IO, std::string("chinese tokenizer dict file not found: " + path).c_str()); + } + } + + CL_NS2(analysis, jieba)::ChineseTokenizer::init(&chineseDict); } } diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp index 9a7f5eddfd..ef46315ff5 100644 --- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp +++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp @@ -17,11 +17,11 @@ ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode m, Tokenizer::lowercase = lowercase; } -void ChineseTokenizer::init(const std::string &dictPath) { - JiebaSingleton::getInstance(dictPath); +void ChineseTokenizer::init(const ChineseDict* chineseDict) { + JiebaSingleton::getInstance(chineseDict); } -CL_NS(analysis)::Token *ChineseTokenizer::next(lucene::analysis::Token *token) { +CL_NS(analysis)::Token* ChineseTokenizer::next(lucene::analysis::Token* token) { if (bufferIndex >= dataLen) { return nullptr; } @@ -29,7 +29,7 @@ CL_NS(analysis)::Token *ChineseTokenizer::next(lucene::analysis::Token *token) { std::string_view& token_text = tokens_text[bufferIndex++]; size_t size = std::min(token_text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN)); if (Tokenizer::lowercase) { - if (!token_text.empty() && token_text[0] < 0x80) { + if (!token_text.empty() && static_cast<uint8_t>(token_text[0]) < 0x80) { std::transform(token_text.begin(), token_text.end(), const_cast<char*>(token_text.data()), [](char c) { return to_lower(c); }); diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h index 9fe33f5805..09760b7b1c 100644 --- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h +++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h @@ -14,14 +14,25 @@ CL_NS_DEF2(analysis,jieba) CL_NS_USE(analysis) +struct ChineseDict { + std::string dictPath_; + std::vector<std::string> files_ = { + "jieba.dict.utf8", + "hmm_model.utf8", + "user.dict.utf8", + "idf.utf8", + "stop_words.utf8" + }; +}; + class JiebaSingleton { public: - static cppjieba::Jieba& getInstance(const std::string& dictPath = "") { - static cppjieba::Jieba instance(dictPath + "/" + "jieba.dict.utf8", - dictPath + "/" + "hmm_model.utf8", - dictPath + "/" + "user.dict.utf8", - dictPath + "/" + "idf.utf8", - dictPath + "/" + "stop_words.utf8"); + static cppjieba::Jieba& getInstance(const ChineseDict* dict = nullptr) { + static cppjieba::Jieba instance(dict->dictPath_ + "/" + dict->files_[0], + dict->dictPath_ + "/" + dict->files_[1], + dict->dictPath_ + "/" + dict->files_[2], + dict->dictPath_ + "/" + dict->files_[3], + dict->dictPath_ + "/" + dict->files_[4]); return instance; } @@ -46,7 +57,7 @@ public: // Constructor explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode); explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode, bool lowercase); - static void init(const std::string& dictPath=""); + static void init(const ChineseDict* chineseDict); // Destructor ~ChineseTokenizer() override = default; diff --git a/src/core/CLucene/analysis/AnalysisHeader.h b/src/core/CLucene/analysis/AnalysisHeader.h index 578d8e0061..a98e26e4ab 100644 --- a/src/core/CLucene/analysis/AnalysisHeader.h +++ b/src/core/CLucene/analysis/AnalysisHeader.h @@ -219,15 +219,15 @@ public: template <> inline size_t Token::termLength<char>(){ if ( _termTextLen == -1 ) //it was invalidated by growBuffer - _termTextLen = strlen((char*)_buffer); - return _termTextLen; + _termTextLen = (int32_t)strlen((char*)_buffer); + return (size_t)_termTextLen; }; template <> inline size_t Token::termLength<TCHAR>(){ if ( _termTextLen == -1 ) //it was invalidated by growBuffer - _termTextLen = wcslen((TCHAR*)_buffer); - return _termTextLen; + _termTextLen = (int32_t)wcslen((TCHAR*)_buffer); + return (size_t)_termTextLen; }; class CLUCENE_EXPORT TokenStream { diff --git a/src/core/CLucene/index/IndexWriter.cpp b/src/core/CLucene/index/IndexWriter.cpp index 71cf4f2cac..e30abf5107 100644 --- a/src/core/CLucene/index/IndexWriter.cpp +++ b/src/core/CLucene/index/IndexWriter.cpp @@ -40,6 +40,13 @@ #include <memory> #include <assert.h> #include <iostream> +#include <roaring/roaring.hh> + +#define FINALLY_CLOSE_OUTPUT(x) \ + try { \ + if (x != nullptr) x->close(); \ + } catch (...) { \ + } CL_NS_USE(store) CL_NS_USE(util) @@ -50,6 +57,7 @@ CL_NS_DEF(index) int64_t IndexWriter::WRITE_LOCK_TIMEOUT = 1000; const char *IndexWriter::WRITE_LOCK_NAME = "write.lock"; +const char *IndexWriter::NULL_BITMAP_FILE_NAME = "null_bitmap"; std::ostream *IndexWriter::defaultInfoStream = NULL; const int32_t IndexWriter::MERGE_READ_BUFFER_SIZE = 4096; @@ -1255,18 +1263,43 @@ void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d int numIndices = src_dirs.size(); //Set of IndexReaders - if (infoStream != NULL) { + if (infoStream != nullptr) { message(string("src index dir size: ") + Misc::toString(numIndices)); } + + // first level vector index is src_index_id + // second level vector index is src_doc_id + std::vector<std::vector<uint32_t>> srcNullBitmapValues(numIndices); + IndexInput* null_bitmap_in = nullptr; for (int32_t i = 0; i < numIndices; i++) { // One index dir may have more than one segment, so we change the code to open all segments by using IndexReader::open // To keep the number of readers consistent with the number of src dirs. // Using IndexWriter::segmentInfos will be incorrect when there are more than one segment in one index dir IndexReader* reader = lucene::index::IndexReader::open(src_dirs[i], MERGE_READ_BUFFER_SIZE, false); readers.push_back(reader); - if (infoStream != NULL) { + if (infoStream != nullptr) { message(src_dirs[i]->toString()); } + + // read null_bitmap and store values in srcBitmapValues + try { + if (src_dirs[i]->fileExists(NULL_BITMAP_FILE_NAME)) { + // get null_bitmap index input + null_bitmap_in = src_dirs[i]->openInput(NULL_BITMAP_FILE_NAME); + size_t null_bitmap_size = null_bitmap_in->length(); + std::string buf; + buf.resize(null_bitmap_size); + null_bitmap_in->readBytes(reinterpret_cast<uint8_t*>(const_cast<char*>(buf.data())), null_bitmap_size); + auto null_bitmap = roaring::Roaring::read(buf.data(), false); + null_bitmap.runOptimize(); + for (unsigned int v : null_bitmap) { + srcNullBitmapValues[i].emplace_back(v); + } + FINALLY_CLOSE_OUTPUT(null_bitmap_in); + } + } catch (CLuceneError &e) { + FINALLY_CLOSE_OUTPUT(null_bitmap_in); + } } assert(readers.size() == numIndices); @@ -1302,6 +1335,7 @@ void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d docStoreSegment.clear(); std::vector<lucene::index::IndexWriter *> destIndexWriterList; + std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList; try { /// merge fields mergeFields(hasProx); @@ -1345,10 +1379,17 @@ void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d skipInterval = termInfosWriter->skipInterval; maxSkipLevels = termInfosWriter->maxSkipLevels; skipListWriterList.push_back(_CLNEW DefaultSkipListWriter(skipInterval, maxSkipLevels, (int) dest_index_docs[j], freqOutputList[j], proxOutputList[j])); + + // create null_bitmap index output + auto* null_bitmap_out = dest_dir->createOutput(NULL_BITMAP_FILE_NAME); + nullBitmapIndexOutputList.push_back(null_bitmap_out); } /// merge terms mergeTerms(hasProx); + + /// merge null_bitmap + mergeNullBitmap(srcNullBitmapValues, nullBitmapIndexOutputList); } catch (CLuceneError &e) { throw e; } @@ -1387,6 +1428,13 @@ void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d _CLDELETE(r); } } readers.clear();); + for (auto* null_bitmap_out + : nullBitmapIndexOutputList) { + if (null_bitmap_out != nullptr) { + null_bitmap_out->close(); + _CLDELETE(null_bitmap_out); + } + } nullBitmapIndexOutputList.clear(); // update segment infos of dest index_writer in memory // close dest index writer @@ -1818,6 +1866,49 @@ void IndexWriter::mergeTerms(bool hasProx) { } } +void IndexWriter::mergeNullBitmap(std::vector<std::vector<uint32_t>> srcNullBitmapValues, std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList) { + // first level vector index is dest_index_id + // second level vector index is dest_doc_id + std::vector<std::vector<uint32_t>> destNullBitmapValues(numDestIndexes); + + // iterate srcNullBitmapValues to construct destNullBitmapValues + for (size_t i = 0; i < srcNullBitmapValues.size(); ++i) { + std::vector<uint32_t> &indexSrcBitmapValues = srcNullBitmapValues[i]; + if (indexSrcBitmapValues.empty()) { + // empty indicates there is no null_bitmap file in this index + continue; + } + for (const auto& srcDocId : indexSrcBitmapValues) { + auto destIdx = _trans_vec[i][srcDocId].first; + auto destDocId = _trans_vec[i][srcDocId].second; + // <UINT32_MAX, UINT32_MAX> indicates current row not exist in Doris dest segment. + // So we ignore this doc here. + if (destIdx == UINT32_MAX || destDocId == UINT32_MAX) { + continue; + } + destNullBitmapValues[destIdx].emplace_back(destDocId); + } + } + + // construct null_bitmap and write null_bitmap to dest index + for (size_t i = 0; i < destNullBitmapValues.size(); ++i) { + roaring::Roaring null_bitmap; + for (const auto& v : destNullBitmapValues[i]) { + null_bitmap.add(v); + } + // write null_bitmap file + auto* nullBitmapIndexOutput = nullBitmapIndexOutputList[i]; + null_bitmap.runOptimize(); + size_t size = null_bitmap.getSizeInBytes(false); + if (size > 0) { + std::string buf; + buf.resize(size); + null_bitmap.write(reinterpret_cast<char*>(buf.data()), false); + nullBitmapIndexOutput->writeBytes(reinterpret_cast<uint8_t*>(buf.data()), size); + } + } +} + void IndexWriter::addIndexesNoOptimize(CL_NS(util)::ArrayBase<CL_NS(store)::Directory *> &dirs) { ensureOpen(); diff --git a/src/core/CLucene/index/IndexWriter.h b/src/core/CLucene/index/IndexWriter.h index 719ce0e5dc..7cfb67d2ca 100644 --- a/src/core/CLucene/index/IndexWriter.h +++ b/src/core/CLucene/index/IndexWriter.h @@ -325,10 +325,21 @@ public: void writeFields(lucene::store::Directory* d, std::string segment); // merge terms and write files void mergeTerms(bool hasProx); + // merge null_bitmap + void mergeNullBitmap(std::vector<std::vector<uint32_t>> srcBitmapValues, std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList); // Compare current index with the other void compareIndexes(lucene::store::Directory* other); + // only for tests + void setNumDestIndexes(int32_t num_dest_indexes) { + numDestIndexes = num_dest_indexes; + } + // only for tests + void setTransVec(std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec) { + _trans_vec = std::move(trans_vec); + } + // Release the write lock, if needed. SegmentInfos* segmentInfos; @@ -414,6 +425,11 @@ public: */ static const char* WRITE_LOCK_NAME; //"write.lock"; + /** + * Name of the null bitmap in the index. + */ + static const char* NULL_BITMAP_FILE_NAME; //"null_bitmap"; + /** * @deprecated * @see LogMergePolicy#DEFAULT_MERGE_FACTOR diff --git a/src/core/CLucene/index/MultiSegmentReader.cpp b/src/core/CLucene/index/MultiSegmentReader.cpp index ad37807e1a..b4be5f0129 100644 --- a/src/core/CLucene/index/MultiSegmentReader.cpp +++ b/src/core/CLucene/index/MultiSegmentReader.cpp @@ -561,6 +561,10 @@ int32_t MultiTermDocs::docFreq() { int32_t MultiTermDocs::doc() const { CND_PRECONDITION(current!=NULL,"current==NULL, check that next() was called"); + // if not found term, current will return INT_MAX, we could not add base, otherwise it will overflow. + if (current->doc() == LUCENE_INT32_MAX_SHOULDBE) { + return LUCENE_INT32_MAX_SHOULDBE; + } return base + current->doc(); } int32_t MultiTermDocs::freq() const { diff --git a/src/core/CLucene/index/SegmentInfos.cpp b/src/core/CLucene/index/SegmentInfos.cpp index 035321295e..60a3695474 100644 --- a/src/core/CLucene/index/SegmentInfos.cpp +++ b/src/core/CLucene/index/SegmentInfos.cpp @@ -826,6 +826,9 @@ string SegmentInfo::segString(Directory* dir) { // Try not to leave a truncated segments_N file in // the index: directory->deleteFile(segmentFileName.c_str()); + if (output != nullptr) { + _CLDELETE(output); + } } ) ) diff --git a/src/core/CLucene/index/SegmentReader.cpp b/src/core/CLucene/index/SegmentReader.cpp index f7741a9f80..721263664f 100644 --- a/src/core/CLucene/index/SegmentReader.cpp +++ b/src/core/CLucene/index/SegmentReader.cpp @@ -257,7 +257,12 @@ SegmentReader *SegmentReader::get(Directory *dir, SegmentInfo *si, instance->init(dir, sis, closeDir); // TODO: make this configurable... bool fieldsReaderExist = false; - instance->initialize(si, readBufferSize == -1 ? BufferedIndexInput::BUFFER_SIZE : readBufferSize, doOpenStores, fieldsReaderExist); + try { + instance->initialize(si, readBufferSize == -1 ? BufferedIndexInput::BUFFER_SIZE : readBufferSize, doOpenStores, fieldsReaderExist); + } catch (CLuceneError& e) { + _CLDELETE(instance) + throw e; + } return instance; } diff --git a/src/core/CLucene/index/SegmentTermDocs.cpp b/src/core/CLucene/index/SegmentTermDocs.cpp index 9108f1dfd5..e346dc0ca2 100644 --- a/src/core/CLucene/index/SegmentTermDocs.cpp +++ b/src/core/CLucene/index/SegmentTermDocs.cpp @@ -19,7 +19,7 @@ CL_NS_DEF(index) SegmentTermDocs::SegmentTermDocs(const SegmentReader *_parent) : parent(_parent), freqStream(_parent->freqStream->clone()), - count(0), df(0), deletedDocs(_parent->deletedDocs), _doc(0), _freq(0), skipInterval(_parent->tis->getSkipInterval()), + count(0), df(0), deletedDocs(_parent->deletedDocs), _doc(-1), _freq(0), skipInterval(_parent->tis->getSkipInterval()), maxSkipLevels(_parent->tis->getMaxSkipLevels()), skipListReader(NULL), freqBasePointer(0), proxBasePointer(0), skipPointer(0), haveSkipped(false), pointer(0), pointerMax(0), indexVersion_(_parent->_fieldInfos->getIndexVersion()), hasProx(_parent->_fieldInfos->hasProx()), buffer_(freqStream, hasProx, indexVersion_) { @@ -73,7 +73,7 @@ void SegmentTermDocs::seek(const TermInfo *ti, Term *term) { df = 0; } else {// punt case df = ti->docFreq; - _doc = 0; + _doc = -1; freqBasePointer = ti->freqPointer; proxBasePointer = ti->proxPointer; skipPointer = freqBasePointer + ti->skipOffset; diff --git a/src/core/CLucene/index/SegmentTermEnum.cpp b/src/core/CLucene/index/SegmentTermEnum.cpp index 574d939643..8179c7b780 100644 --- a/src/core/CLucene/index/SegmentTermEnum.cpp +++ b/src/core/CLucene/index/SegmentTermEnum.cpp @@ -5,424 +5,419 @@ * the GNU Lesser General Public License, as specified in the COPYING file. ------------------------------------------------------------------------------*/ #include "CLucene/_ApiHeader.h" -#include "_SegmentHeader.h" -#include "_SegmentTermEnum.h" - +#include "Term.h" #include "Terms.h" #include "_FieldInfos.h" -#include "Term.h" +#include "_SegmentHeader.h" +#include "_SegmentTermEnum.h" #include "_TermInfo.h" #include "_TermInfosWriter.h" CL_NS_USE(store) CL_NS_DEF(index) - SegmentTermEnum::SegmentTermEnum(IndexInput* i, FieldInfos* fis, const bool isi, int32_t in_format): - fieldInfos(fis){ - //Func - Constructor - //Pre - i holds a reference to an instance of IndexInput - // fis holds a reference to an instance of FieldInfos - // isi - //Post - An instance of SegmentTermEnum has been created - input = i; - position = -1; - //Instantiate a Term with empty field, empty text and which is interned (see term.h what interned means) - _term = _CLNEW Term; - isIndex = isi; - termInfo = _CLNEW TermInfo(); - indexPointer = 0; - buffer = NULL; - bufferLength = 0; - prev = NULL; - formatM1SkipInterval = 0; - maxSkipLevels = 1; - - //Set isClone to false as the instance is not clone of another instance - isClone = false; - - int32_t firstInt = in_format == -4 ? in_format : input->readInt(); +SegmentTermEnum::SegmentTermEnum(IndexInput* i, FieldInfos* fis, const bool isi) + : fieldInfos(fis) { + //Func - Constructor + //Pre - i holds a reference to an instance of IndexInput + // fis holds a reference to an instance of FieldInfos + // isi + //Post - An instance of SegmentTermEnum has been created + input = i; + position = -1; + //Instantiate a Term with empty field, empty text and which is interned (see term.h what interned means) + _term = _CLNEW Term; + isIndex = isi; + termInfo = _CLNEW TermInfo(); + indexPointer = 0; + buffer = NULL; + bufferLength = 0; + prev = NULL; + formatM1SkipInterval = 0; + maxSkipLevels = 1; + + //Set isClone to false as the instance is not clone of another instance + isClone = false; +} + +void SegmentTermEnum::init(int32_t in_format) { + int32_t firstInt = in_format == -4 ? in_format : input->readInt(); if (firstInt >= 0) { - // original-format file, without explicit format version number - format = 0; - size = firstInt; + // original-format file, without explicit format version number + format = 0; + size = firstInt; - // back-compatible settings - indexInterval = 128; - skipInterval = LUCENE_INT32_MAX_SHOULDBE; // switch off skipTo optimization + // back-compatible settings + indexInterval = 128; + skipInterval = LUCENE_INT32_MAX_SHOULDBE; // switch off skipTo optimization - } else { - // we have a format version number - format = firstInt; + } else { + // we have a format version number + format = firstInt; - // check that it is a format we can understand - if (format < TermInfosWriter::FORMAT){ + // check that it is a format we can understand + if (format < TermInfosWriter::FORMAT) { TCHAR err[30]; - _sntprintf(err,30,_T("Unknown format version: %d"), format); - _CLTHROWT(CL_ERR_CorruptIndex,err); - } - - if (format == -4) { - if (isIndex) { - size = input->readLong(); - if (size < 0) { - auto pos = input->getFilePointer(); - input->seek(input->length() - 16); - size = input->readLong(); - tisSize = input->readLong(); - input->seek(pos); - } - - indexInterval = input->readInt(); - skipInterval = input->readInt(); - maxSkipLevels = input->readInt(); - } - } else { - size = input->readLong(); // read the size - if (size < 0) { // read the size at file footer, if size < 0 - auto pos = input->getFilePointer(); - input->seek(input->length() - 8); - size = input->readLong(); - input->seek(pos); - } - - if(format == -1){ - if (!isIndex) { - indexInterval = input->readInt(); - formatM1SkipInterval = input->readInt(); - } - // switch off skipTo optimization for file format prior to 1.4rc2 in order to avoid a bug in - // skipTo implementation of these versions - skipInterval = LUCENE_INT32_MAX_SHOULDBE; - }else{ - indexInterval = input->readInt(); - skipInterval = input->readInt(); - if ( format == -3 ) { - // this new format introduces multi-level skipping - maxSkipLevels = input->readInt(); - } - } - } - } - } - - SegmentTermEnum::SegmentTermEnum(const SegmentTermEnum& clone): - fieldInfos(clone.fieldInfos) - { - //Func - Constructor - // The instance is created by cloning all properties of clone - //Pre - clone holds a valid reference to SegmentTermEnum - //Post - An instance of SegmentTermEnum with the same properties as clone - - input = clone.input->clone(); - //Copy the postion from the clone - position = clone.position; - - if ( clone._term != NULL ){ - _term = _CLNEW Term; - _term->set(clone._term,clone._term->text()); - }else - _term = NULL; - isIndex = clone.isIndex; - termInfo = _CLNEW TermInfo(clone.termInfo); - indexPointer = clone.indexPointer; - buffer = clone.buffer==NULL?NULL:(TCHAR*)malloc(sizeof(TCHAR) * (clone.bufferLength+1)); - bufferLength = clone.bufferLength; - prev = clone.prev==NULL?NULL:_CLNEW Term(clone.prev->field(),clone.prev->text(),false); - size = clone.size; - tisSize = clone.tisSize; - - format = clone.format; - indexInterval= clone.indexInterval; - skipInterval = clone.skipInterval; - formatM1SkipInterval = clone.formatM1SkipInterval; - maxSkipLevels = clone.maxSkipLevels; - - //Set isClone to true as this instance is a clone of another instance - isClone = true; - - //Copy the contents of buffer of clone to the buffer of this instance - if ( clone.buffer != NULL ) - memcpy(buffer,clone.buffer,bufferLength * sizeof(TCHAR)); - } - - SegmentTermEnum::~SegmentTermEnum(){ - //Func - Destructor - //Pre - true - //Post - The instance has been destroyed. If this instance was a clone - // then the inputstream is closed and deleted too. - - //todo: revisit this... close() should clean up most of everything. - - //Finalize prev - _CLDECDELETE(prev ); - //Finalize term - _CLDECDELETE( _term ); - - - //Delete the buffer if necessary - if ( buffer != NULL ) free(buffer); - //Delete termInfo if necessary - _CLDELETE(termInfo); - - //Check if this instance is a clone - if ( isClone ){ - //Close the inputstream - input->close(); - //delete the inputstream - _CLDELETE(input); - } - } - - void SegmentTermEnum::initByTii(SegmentTermEnum* tii) { - if (format == -4) { - size = tii->tisSize; - indexInterval = tii->indexInterval; - skipInterval = tii->skipInterval; - maxSkipLevels = tii->maxSkipLevels; - size_t header = sizeof(format) + - sizeof(size) + - sizeof(indexInterval) + - sizeof(skipInterval) + - sizeof(maxSkipLevels); - input->seek(header); - } - } - - const char* SegmentTermEnum::getObjectName() const{ return getClassName(); } - const char* SegmentTermEnum::getClassName(){ return "SegmentTermEnum"; } - - bool SegmentTermEnum::next(){ - //Func - Moves the current of the set to the next in the set - //Pre - true - //Post - If the end has been reached NULL is returned otherwise the term has - // become the next Term in the enumeration - - //Increase position by and and check if the end has been reached - if (position++ >= size-1) { - //delete term - _CLDECDELETE(_term); - return false; - } - - //delete the previous enumerated term - Term* tmp=NULL; - if ( prev != NULL ){ - if ( _LUCENE_ATOMIC_INT_GET(prev->__cl_refcount) > 1 ){ - _CLDECDELETE(prev); //todo: tune other places try and delete its term - }else - tmp = prev; //we are going to re-use this term - } - //prev becomes the current enumerated term - prev = _term; - //term becomes the next term read from inputStream input - _term = readTerm(tmp); - - //Read docFreq, the number of documents which contain the term. - termInfo->docFreq = input->readVInt(); - //Read freqPointer, a pointer into the TermFreqs file (.frq) - termInfo->freqPointer += input->readVLong(); - - //Read proxPointer, a pointer into the TermPosition file (.prx). - termInfo->proxPointer += input->readVLong(); - - if(format == -1){ - // just read skipOffset in order to increment file pointer; - // value is never used since skipTo is switched off - if (!isIndex) { + _sntprintf(err, 30, _T("Unknown format version: %d"), format); + _CLTHROWT(CL_ERR_CorruptIndex, err); + } + + if (format == -4) { + if (isIndex) { + size = input->readLong(); + if (size < 0) { + auto pos = input->getFilePointer(); + input->seek(input->length() - 16); + size = input->readLong(); + tisSize = input->readLong(); + input->seek(pos); + } + + indexInterval = input->readInt(); + skipInterval = input->readInt(); + maxSkipLevels = input->readInt(); + } + } else { + size = input->readLong(); // read the size + if (size < 0) { // read the size at file footer, if size < 0 + auto pos = input->getFilePointer(); + input->seek(input->length() - 8); + size = input->readLong(); + input->seek(pos); + } + + if (format == -1) { + if (!isIndex) { + indexInterval = input->readInt(); + formatM1SkipInterval = input->readInt(); + } + // switch off skipTo optimization for file format prior to 1.4rc2 in order to avoid a bug in + // skipTo implementation of these versions + skipInterval = LUCENE_INT32_MAX_SHOULDBE; + } else { + indexInterval = input->readInt(); + skipInterval = input->readInt(); + if (format == -3) { + // this new format introduces multi-level skipping + maxSkipLevels = input->readInt(); + } + } + } + } +} + +SegmentTermEnum::SegmentTermEnum(const SegmentTermEnum& clone) : fieldInfos(clone.fieldInfos) { + //Func - Constructor + // The instance is created by cloning all properties of clone + //Pre - clone holds a valid reference to SegmentTermEnum + //Post - An instance of SegmentTermEnum with the same properties as clone + + input = clone.input->clone(); + //Copy the postion from the clone + position = clone.position; + + if (clone._term != NULL) { + _term = _CLNEW Term; + _term->set(clone._term, clone._term->text()); + } else + _term = NULL; + isIndex = clone.isIndex; + termInfo = _CLNEW TermInfo(clone.termInfo); + indexPointer = clone.indexPointer; + buffer = clone.buffer == NULL ? NULL : (TCHAR*)malloc(sizeof(TCHAR) * (clone.bufferLength + 1)); + bufferLength = clone.bufferLength; + prev = clone.prev == NULL ? NULL : _CLNEW Term(clone.prev->field(), clone.prev->text(), false); + size = clone.size; + tisSize = clone.tisSize; + + format = clone.format; + indexInterval = clone.indexInterval; + skipInterval = clone.skipInterval; + formatM1SkipInterval = clone.formatM1SkipInterval; + maxSkipLevels = clone.maxSkipLevels; + + //Set isClone to true as this instance is a clone of another instance + isClone = true; + + //Copy the contents of buffer of clone to the buffer of this instance + if (clone.buffer != NULL) memcpy(buffer, clone.buffer, bufferLength * sizeof(TCHAR)); +} + +SegmentTermEnum::~SegmentTermEnum() { + //Func - Destructor + //Pre - true + //Post - The instance has been destroyed. If this instance was a clone + // then the inputstream is closed and deleted too. + + //todo: revisit this... close() should clean up most of everything. + + //Finalize prev + _CLDECDELETE(prev); + //Finalize term + _CLDECDELETE(_term); + + //Delete the buffer if necessary + if (buffer != NULL) free(buffer); + //Delete termInfo if necessary + _CLDELETE(termInfo); + + //Check if this instance is a clone + if (isClone) { + //Close the inputstream + input->close(); + //delete the inputstream + _CLDELETE(input); + } +} + +void SegmentTermEnum::initByTii(SegmentTermEnum* tii) { + if (format == -4) { + size = tii->tisSize; + indexInterval = tii->indexInterval; + skipInterval = tii->skipInterval; + maxSkipLevels = tii->maxSkipLevels; + size_t header = sizeof(format) + sizeof(size) + sizeof(indexInterval) + + sizeof(skipInterval) + sizeof(maxSkipLevels); + input->seek(header); + } +} + +const char* SegmentTermEnum::getObjectName() const { + return getClassName(); +} +const char* SegmentTermEnum::getClassName() { + return "SegmentTermEnum"; +} + +bool SegmentTermEnum::next() { + //Func - Moves the current of the set to the next in the set + //Pre - true + //Post - If the end has been reached NULL is returned otherwise the term has + // become the next Term in the enumeration + + //Increase position by and and check if the end has been reached + if (position++ >= size - 1) { + //delete term + _CLDECDELETE(_term); + return false; + } + + //delete the previous enumerated term + Term* tmp = NULL; + if (prev != NULL) { + if (_LUCENE_ATOMIC_INT_GET(prev->__cl_refcount) > 1) { + _CLDECDELETE(prev); //todo: tune other places try and delete its term + } else + tmp = prev; //we are going to re-use this term + } + //prev becomes the current enumerated term + prev = _term; + //term becomes the next term read from inputStream input + _term = readTerm(tmp); + + //Read docFreq, the number of documents which contain the term. + termInfo->docFreq = input->readVInt(); + //Read freqPointer, a pointer into the TermFreqs file (.frq) + termInfo->freqPointer += input->readVLong(); + + //Read proxPointer, a pointer into the TermPosition file (.prx). + termInfo->proxPointer += input->readVLong(); + + if (format == -1) { + // just read skipOffset in order to increment file pointer; + // value is never used since skipTo is switched off + if (!isIndex) { if (termInfo->docFreq > formatM1SkipInterval) { - termInfo->skipOffset = input->readVInt(); + termInfo->skipOffset = input->readVInt(); } - } - }else{ - if (termInfo->docFreq >= skipInterval) - termInfo->skipOffset = input->readVInt(); - } - - //Check if the enumeration is an index - if (isIndex) - //read index pointer - indexPointer += input->readVLong(); - - return true; - } - - Term* SegmentTermEnum::term(bool pointer) { - if ( pointer ) - return _CL_POINTER(_term); - else - return _term; - } - - void SegmentTermEnum::scanTo(const Term *term){ - //Func - Scan for Term without allocating new Terms - //Pre - term != NULL - //Post - The iterator term has been moved to the position where Term is expected to be - // in the enumeration - while ( term->compareTo(this->_term) > 0 && next()) - { - } - } - - void SegmentTermEnum::close() { - //Func - Closes the enumeration to further activity, freeing resources. - //Pre - true - //Post - The inputStream input has been closed - - input->close(); - } - - int32_t SegmentTermEnum::docFreq() const { - //Func - Returns the document frequency of the current term in the set - //Pre - termInfo != NULL - // next() must have been called once - //Post - The document frequency of the current enumerated term has been returned - - return termInfo->docFreq; - } - - void SegmentTermEnum::seek(const int64_t pointer, const int32_t p, Term* t, TermInfo* ti) { - //Func - Repositions term and termInfo within the enumeration - //Pre - pointer >= 0 - // p >= 0 and contains the new position within the enumeration - // t is a valid reference to a Term and is the new current term in the enumeration - // ti is a valid reference to a TermInfo and is corresponding TermInfo form the new - // current Term - //Post - term and terminfo have been repositioned within the enumeration - - //Reset the IndexInput input to pointer - input->seek(pointer); - //Assign the new position - position = p; - - //finalize the current term - if ( _term == NULL || _LUCENE_ATOMIC_INT_GET(_term->__cl_refcount) > 1 ){ - _CLDECDELETE(_term); - //Get a pointer from t and increase the reference counter of t - _term = _CLNEW Term; //cannot use reference, because TermInfosReader uses non ref-counted array - } - _term->set(t,t->text()); - - //finalize prev - _CLDECDELETE(prev); - - //Change the current termInfo so it matches the new current term - termInfo->set(ti); - - //Have the buffer grown if needed - if ( bufferLength <= _term->textLength() ) - growBuffer(_term->textLength(), true ); // copy term text into buffer - else - _tcsncpy(buffer,_term->text(),bufferLength); //just copy the buffer - } - - TermInfo* SegmentTermEnum::getTermInfo()const { - //Func - Returns a clone of the current termInfo - //Pre - termInfo != NULL - // next() must have been called once - //Post - A clone of the current termInfo has been returned - - return _CLNEW TermInfo(*termInfo); //clone - } - - void SegmentTermEnum::getTermInfo(TermInfo* ti)const { - //Func - Retrieves a clone of termInfo through the reference ti - //Pre - ti contains a valid reference to TermInfo - // termInfo != NULL - // next() must have been called once - //Post - ti contains a clone of termInfo - - ti->set(termInfo); - } - - int64_t SegmentTermEnum::freqPointer()const { - //Func - Returns the freqpointer of the current termInfo - //Pre - termInfo != NULL - // next() must have been called once - //Post - The freqpointer of the current termInfo has been returned - - return termInfo->freqPointer; - } - - int64_t SegmentTermEnum::proxPointer()const { - //Func - Returns the proxPointer of the current termInfo - //Pre - termInfo != NULL - // next() must have been called once - //Post - the proxPointer of the current termInfo has been returned - - return termInfo->proxPointer; - } - - SegmentTermEnum* SegmentTermEnum::clone() const { - //Func - Returns a clone of this instance - //Pre - true - //Post - An clone of this instance has been returned - - return _CLNEW SegmentTermEnum(*this); - } - - Term* SegmentTermEnum::readTerm(Term* reuse) { - //Func - Reads the next term in the enumeration - //Pre - true - //Post - The next Term in the enumeration has been read and returned - - //Read the start position from the inputStream input - int32_t start = input->readVInt(); - //Read the length of term in the inputStream input - int32_t length = input->readVInt(); - - //Calculated the total lenght of bytes that buffer must be to contain the current - //chars in buffer and the new ones yet to be read - uint32_t totalLength = start + length; - - if (static_cast<uint32_t>(bufferLength) < totalLength+1) - growBuffer(totalLength, false); //dont copy the buffer over. - - //Read a length number of characters into the buffer from position start in the inputStream input - input->readChars(buffer, start, length); - //Null terminate the string - buffer[totalLength] = 0; - - //Return a new Term - int32_t field = input->readVInt(); - const TCHAR* fieldname = fieldInfos->fieldName(field); - if ( reuse == NULL ) - reuse = _CLNEW Term; - - reuse->set(fieldname, buffer, false); - return reuse; - } - - void SegmentTermEnum::growBuffer(const uint32_t length, bool force_copy) { - //Func - Instantiate a buffer of length length+1 - //Pre - length > 0 - //Post - pre(buffer) has been deleted with its contents. A new buffer - // has been allocated of length length+1 and the text of term has been copied - // to buffer - //todo: we could guess that we will need to re-grow this - //buffer a few times...so start off with a reasonable grow - //value... - if ( bufferLength > length ) - return; - - //Store the new bufferLength - if ( length - bufferLength < 8 ) - bufferLength = length+8; - else - bufferLength = length+1; - - bool copy = buffer==NULL; - - //Instantiate the new buffer + 1 is needed for terminator '\0' - if ( buffer == NULL ) - buffer = (TCHAR*)malloc(sizeof(TCHAR) * (bufferLength+1)); - else - buffer = (TCHAR*)realloc(buffer, sizeof(TCHAR) * (bufferLength+1)); - - if ( copy || force_copy){ - //Copy the text of term into buffer - _tcsncpy(buffer,_term->text(),bufferLength); - } - } + } + } else { + if (termInfo->docFreq >= skipInterval) termInfo->skipOffset = input->readVInt(); + } + + //Check if the enumeration is an index + if (isIndex) + //read index pointer + indexPointer += input->readVLong(); + + return true; +} + +Term* SegmentTermEnum::term(bool pointer) { + if (pointer) + return _CL_POINTER(_term); + else + return _term; +} + +void SegmentTermEnum::scanTo(const Term* term) { + //Func - Scan for Term without allocating new Terms + //Pre - term != NULL + //Post - The iterator term has been moved to the position where Term is expected to be + // in the enumeration + while (term->compareTo(this->_term) > 0 && next()) { + } +} + +void SegmentTermEnum::close() { + //Func - Closes the enumeration to further activity, freeing resources. + //Pre - true + //Post - The inputStream input has been closed + + input->close(); +} + +int32_t SegmentTermEnum::docFreq() const { + //Func - Returns the document frequency of the current term in the set + //Pre - termInfo != NULL + // next() must have been called once + //Post - The document frequency of the current enumerated term has been returned + + return termInfo->docFreq; +} + +void SegmentTermEnum::seek(const int64_t pointer, const int32_t p, Term* t, TermInfo* ti) { + //Func - Repositions term and termInfo within the enumeration + //Pre - pointer >= 0 + // p >= 0 and contains the new position within the enumeration + // t is a valid reference to a Term and is the new current term in the enumeration + // ti is a valid reference to a TermInfo and is corresponding TermInfo form the new + // current Term + //Post - term and terminfo have been repositioned within the enumeration + + //Reset the IndexInput input to pointer + input->seek(pointer); + //Assign the new position + position = p; + + //finalize the current term + if (_term == NULL || _LUCENE_ATOMIC_INT_GET(_term->__cl_refcount) > 1) { + _CLDECDELETE(_term); + //Get a pointer from t and increase the reference counter of t + _term = _CLNEW + Term; //cannot use reference, because TermInfosReader uses non ref-counted array + } + _term->set(t, t->text()); + + //finalize prev + _CLDECDELETE(prev); + + //Change the current termInfo so it matches the new current term + termInfo->set(ti); + + //Have the buffer grown if needed + if (bufferLength <= _term->textLength()) + growBuffer(_term->textLength(), true); // copy term text into buffer + else + _tcsncpy(buffer, _term->text(), bufferLength); //just copy the buffer +} + +TermInfo* SegmentTermEnum::getTermInfo() const { + //Func - Returns a clone of the current termInfo + //Pre - termInfo != NULL + // next() must have been called once + //Post - A clone of the current termInfo has been returned + + return _CLNEW TermInfo(*termInfo); //clone +} + +void SegmentTermEnum::getTermInfo(TermInfo* ti) const { + //Func - Retrieves a clone of termInfo through the reference ti + //Pre - ti contains a valid reference to TermInfo + // termInfo != NULL + // next() must have been called once + //Post - ti contains a clone of termInfo + + ti->set(termInfo); +} + +int64_t SegmentTermEnum::freqPointer() const { + //Func - Returns the freqpointer of the current termInfo + //Pre - termInfo != NULL + // next() must have been called once + //Post - The freqpointer of the current termInfo has been returned + + return termInfo->freqPointer; +} + +int64_t SegmentTermEnum::proxPointer() const { + //Func - Returns the proxPointer of the current termInfo + //Pre - termInfo != NULL + // next() must have been called once + //Post - the proxPointer of the current termInfo has been returned + + return termInfo->proxPointer; +} + +SegmentTermEnum* SegmentTermEnum::clone() const { + //Func - Returns a clone of this instance + //Pre - true + //Post - An clone of this instance has been returned + + return _CLNEW SegmentTermEnum(*this); +} + +Term* SegmentTermEnum::readTerm(Term* reuse) { + //Func - Reads the next term in the enumeration + //Pre - true + //Post - The next Term in the enumeration has been read and returned + + //Read the start position from the inputStream input + int32_t start = input->readVInt(); + //Read the length of term in the inputStream input + int32_t length = input->readVInt(); + + //Calculated the total lenght of bytes that buffer must be to contain the current + //chars in buffer and the new ones yet to be read + uint32_t totalLength = start + length; + + if (static_cast<uint32_t>(bufferLength) < totalLength + 1) + growBuffer(totalLength, false); //dont copy the buffer over. + + //Read a length number of characters into the buffer from position start in the inputStream input + input->readChars(buffer, start, length); + //Null terminate the string + buffer[totalLength] = 0; + + //Return a new Term + int32_t field = input->readVInt(); + const TCHAR* fieldname = fieldInfos->fieldName(field); + if (reuse == NULL) reuse = _CLNEW Term; + + reuse->set(fieldname, buffer, false); + return reuse; +} + +void SegmentTermEnum::growBuffer(const uint32_t length, bool force_copy) { + //Func - Instantiate a buffer of length length+1 + //Pre - length > 0 + //Post - pre(buffer) has been deleted with its contents. A new buffer + // has been allocated of length length+1 and the text of term has been copied + // to buffer + //todo: we could guess that we will need to re-grow this + //buffer a few times...so start off with a reasonable grow + //value... + if (bufferLength > length) return; + + //Store the new bufferLength + if (length - bufferLength < 8) + bufferLength = length + 8; + else + bufferLength = length + 1; + + bool copy = buffer == NULL; + + //Instantiate the new buffer + 1 is needed for terminator '\0' + if (buffer == NULL) + buffer = (TCHAR*)malloc(sizeof(TCHAR) * (bufferLength + 1)); + else + buffer = (TCHAR*)realloc(buffer, sizeof(TCHAR) * (bufferLength + 1)); + + if (copy || force_copy) { + //Copy the text of term into buffer + _tcsncpy(buffer, _term->text(), bufferLength); + } +} CL_NS_END diff --git a/src/core/CLucene/index/TermInfosReader.cpp b/src/core/CLucene/index/TermInfosReader.cpp index 7996d4d4f8..6cf8b42fe4 100644 --- a/src/core/CLucene/index/TermInfosReader.cpp +++ b/src/core/CLucene/index/TermInfosReader.cpp @@ -24,207 +24,206 @@ CL_NS_USE(store) CL_NS_USE(util) CL_NS_DEF(index) - - TermInfosReader::TermInfosReader(Directory* dir, const char* seg, FieldInfos* fis, const int32_t readBufferSize): - directory (dir),fieldInfos (fis), indexTerms(NULL), indexInfos(NULL), indexPointers(NULL), indexDivisor(1) - { - //Func - Constructor. - // Reads the TermInfos file (.tis) and eventually the Term Info Index file (.tii) - //Pre - dir is a reference to a valid Directory - // Fis contains a valid reference to an FieldInfos instance - // seg != NULL and contains the name of the segment - //Post - An instance has been created and the index named seg has been read. (Remember - // a segment is nothing more then an independently readable index) - - CND_PRECONDITION(seg != NULL, "seg is NULL"); - - //Initialize the name of the segment - segment = seg; - - //Create a filname fo a Term Info File - string tisFile = Misc::segmentname(segment,".tis"); - string tiiFile = Misc::segmentname(segment,".tii"); - bool success = false; +TermInfosReader::TermInfosReader(Directory* dir, const char* seg, FieldInfos* fis, + const int32_t readBufferSize) + : directory(dir), + fieldInfos(fis), + indexTerms(NULL), + indexInfos(NULL), + indexPointers(NULL), + indexDivisor(1) { + //Func - Constructor. + // Reads the TermInfos file (.tis) and eventually the Term Info Index file (.tii) + //Pre - dir is a reference to a valid Directory + // Fis contains a valid reference to an FieldInfos instance + // seg != NULL and contains the name of the segment + //Post - An instance has been created and the index named seg has been read. (Remember + // a segment is nothing more then an independently readable index) + + CND_PRECONDITION(seg != NULL, "seg is NULL"); + + //Initialize the name of the segment + segment = seg; + + //Create a filname fo a Term Info File + string tisFile = Misc::segmentname(segment, ".tis"); + string tiiFile = Misc::segmentname(segment, ".tii"); + bool success = false; origEnum = indexEnum = NULL; _size = indexTermsLength = totalIndexInterval = 0; - indexIsRead = false; - - try { - //Create an SegmentTermEnum for storing all the terms read of the segment - - // tii - auto tiiStream = directory->openInput( tiiFile.c_str(), readBufferSize ); - indexEnum = _CLNEW SegmentTermEnum(tiiStream, fieldInfos, true, -1); - CND_CONDITION(indexEnum != NULL, "No memory could be allocated for index enumerator"); - - // tis - auto tisStream = directory->openInput( tisFile.c_str(), readBufferSize ); - origEnum = _CLNEW SegmentTermEnum(tisStream, fieldInfos, false, indexEnum->getFormat()); - origEnum->initByTii(indexEnum); - CND_CONDITION(origEnum != NULL, "No memory could be allocated for index enumerator"); - _size = origEnum->size; - totalIndexInterval = origEnum->indexInterval; - - //call ensureIndexIsRead to load data to memory right now - ensureIndexIsRead(); - - success = true; - } _CLFINALLY({ - // With lock-less commits, it's entirely possible (and - // fine) to hit a FileNotFound exception above. In - // this case, we want to explicitly close any subset - // of things that were opened so that we don't have to - // wait for a GC to do so. - if (!success) { - close(); - } - }); - - } - - TermInfosReader::~TermInfosReader(){ - //Func - Destructor - //Pre - true - //Post - The instance has been destroyed - - //Close the TermInfosReader to be absolutly sure that enumerator has been closed - //and the arrays indexTerms, indexPointers and indexInfos and their elements - //have been destroyed - close(); - } - int32_t TermInfosReader::getSkipInterval() const { + indexIsRead = false; + + try { + //Create an SegmentTermEnum for storing all the terms read of the segment + + // tii + auto tiiStream = directory->openInput(tiiFile.c_str(), readBufferSize); + indexEnum = _CLNEW SegmentTermEnum(tiiStream, fieldInfos, true); + indexEnum->init(-1); + CND_CONDITION(indexEnum != NULL, "No memory could be allocated for index enumerator"); + + // tis + auto tisStream = directory->openInput(tisFile.c_str(), readBufferSize); + origEnum = _CLNEW SegmentTermEnum(tisStream, fieldInfos, false); + origEnum->init(indexEnum->getFormat()); + origEnum->initByTii(indexEnum); + CND_CONDITION(origEnum != NULL, "No memory could be allocated for index enumerator"); + _size = origEnum->size; + totalIndexInterval = origEnum->indexInterval; + + //call ensureIndexIsRead to load data to memory right now + ensureIndexIsRead(); + + success = true; + } + _CLFINALLY({ + // With lock-less commits, it's entirely possible (and + // fine) to hit a FileNotFound exception above. In + // this case, we want to explicitly close any subset + // of things that were opened so that we don't have to + // wait for a GC to do so. + if (!success) { + close(); + } + }); +} + +TermInfosReader::~TermInfosReader() { + //Func - Destructor + //Pre - true + //Post - The instance has been destroyed + + //Close the TermInfosReader to be absolutly sure that enumerator has been closed + //and the arrays indexTerms, indexPointers and indexInfos and their elements + //have been destroyed + close(); +} +int32_t TermInfosReader::getSkipInterval() const { return origEnum->skipInterval; - } +} - int32_t TermInfosReader::getMaxSkipLevels() const { +int32_t TermInfosReader::getMaxSkipLevels() const { return origEnum->maxSkipLevels; - } - - void TermInfosReader::setIndexDivisor(const int32_t _indexDivisor) { - if (indexDivisor < 1) - _CLTHROWA(CL_ERR_IllegalArgument, "indexDivisor must be > 0"); +} - if (indexTerms != NULL) - _CLTHROWA(CL_ERR_IllegalArgument, "index terms are already loaded"); +void TermInfosReader::setIndexDivisor(const int32_t _indexDivisor) { + if (indexDivisor < 1) _CLTHROWA(CL_ERR_IllegalArgument, "indexDivisor must be > 0"); - this->indexDivisor = _indexDivisor; - totalIndexInterval = origEnum->indexInterval * _indexDivisor; - } + if (indexTerms != NULL) _CLTHROWA(CL_ERR_IllegalArgument, "index terms are already loaded"); - int32_t TermInfosReader::getIndexDivisor() const { return indexDivisor; } - void TermInfosReader::close() { + this->indexDivisor = _indexDivisor; + totalIndexInterval = origEnum->indexInterval * _indexDivisor; +} - //Check if indexTerms and indexInfos exist - if (indexTerms && indexInfos){ - //Iterate through arrays indexTerms and indexPointer to - //destroy their elements +int32_t TermInfosReader::getIndexDivisor() const { + return indexDivisor; +} +void TermInfosReader::close() { + //Check if indexTerms and indexInfos exist + if (indexTerms && indexInfos) { + //Iterate through arrays indexTerms and indexPointer to + //destroy their elements #ifdef _DEBUG - for ( int32_t i=0; i<indexTermsLength;++i ){ + for (int32_t i = 0; i < indexTermsLength; ++i) { indexTerms[i].__cl_refcount--; - } + } #endif - } - //Delete the arrays - if (indexTerms){ - delete [] indexTerms; - indexTerms = NULL; - } - if (indexInfos){ - _CLDELETE_ARRAY(indexInfos); - indexInfos = NULL; - } - - //Delete the arrays - if (indexPointers) { + } + //Delete the arrays + if (indexTerms) { + delete[] indexTerms; + indexTerms = NULL; + } + if (indexInfos) { + _CLDELETE_ARRAY(indexInfos); + indexInfos = NULL; + } + + //Delete the arrays + if (indexPointers) { _CLDELETE_ARRAY(indexPointers); indexPointers = NULL; - } + } - if (origEnum != NULL){ + if (origEnum != NULL) { origEnum->close(); - //Get a pointer to IndexInput used by the enumeration but - //instantiated in the constructor by directory.open( tisFile ) - IndexInput *is = origEnum->input; + //Get a pointer to IndexInput used by the enumeration but + //instantiated in the constructor by directory.open( tisFile ) + IndexInput* is = origEnum->input; //Delete the enumuration enumerator _CLDELETE(origEnum); //Delete the IndexInput _CLDELETE(is); - } + } - if (indexEnum != NULL){ + if (indexEnum != NULL) { indexEnum->close(); - //Get a pointer to IndexInput used by the enumeration but - //instantiated in the constructor by directory.open( tiiFile ) - IndexInput *is = indexEnum->input; + //Get a pointer to IndexInput used by the enumeration but + //instantiated in the constructor by directory.open( tiiFile ) + IndexInput* is = indexEnum->input; //Delete the enumuration enumerator _CLDELETE(indexEnum); - indexEnum = NULL; + indexEnum = NULL; //Delete the IndexInput _CLDELETE(is); - } - enumerators.setNull(); - } - - int64_t TermInfosReader::size() const{ - //Func - Return the size of the enumeration of TermInfos - //Pre - true - //Post - size has been returened + } + enumerators.setNull(); +} - return _size; - } +int64_t TermInfosReader::size() const { + //Func - Return the size of the enumeration of TermInfos + //Pre - true + //Post - size has been returened + return _size; +} - Term* TermInfosReader::get(const int32_t position) { - //Func - Returns the nth term in the set - //Pre - position > = 0 - //Post - The n-th term in the set has been returned +Term* TermInfosReader::get(const int32_t position) { + //Func - Returns the nth term in the set + //Pre - position > = 0 + //Post - The n-th term in the set has been returned - //Check if the size is 0 because then there are no terms - if (_size == 0) - return NULL; + //Check if the size is 0 because then there are no terms + if (_size == 0) return NULL; - SegmentTermEnum* enumerator = getEnum(); + SegmentTermEnum* enumerator = getEnum(); - if ( - enumerator != NULL //an enumeration exists - && enumerator->term(false) != NULL // term is at or past current - && position >= enumerator->position - && position < (enumerator->position + totalIndexInterval) - ) - { - return scanEnum(position); // can avoid seek - } + if (enumerator != NULL //an enumeration exists + && enumerator->term(false) != NULL // term is at or past current + && position >= enumerator->position && + position < (enumerator->position + totalIndexInterval)) { + return scanEnum(position); // can avoid seek + } //random-access: must seek seekEnum(position / totalIndexInterval); - //Get the Term at position + //Get the Term at position return scanEnum(position); - } +} - SegmentTermEnum* TermInfosReader::getEnum(){ +SegmentTermEnum* TermInfosReader::getEnum() { SegmentTermEnum* termEnum = enumerators.get(); - if (termEnum == NULL){ - termEnum = terms(); - enumerators.set(termEnum); + if (termEnum == NULL) { + termEnum = terms(); + enumerators.set(termEnum); } return termEnum; - } +} - TermInfo* TermInfosReader::get(const Term* term){ - //Func - Returns a TermInfo for a term - //Pre - term holds a valid reference to term - //Post - if term can be found its TermInfo has been returned otherwise NULL +TermInfo* TermInfosReader::get(const Term* term) { + //Func - Returns a TermInfo for a term + //Pre - term holds a valid reference to term + //Post - if term can be found its TermInfo has been returned otherwise NULL //If the size of the enumeration is 0 then no Terms have been read - if (_size == 0) - return NULL; + if (_size == 0) return NULL; ensureIndexIsRead(); @@ -233,250 +232,238 @@ CL_NS_DEF(index) // optimize sequential access: first try scanning cached enumerator w/o seeking if ( - //the current term of the enumeration enumerator is not at the end AND - enumerator->term(false) != NULL && - ( - //there exists a previous current called prev and term is positioned after this prev OR - ( enumerator->prev != NULL && term->compareTo(enumerator->prev) > 0) || - //term is positioned at the same position as the current of enumerator or at a higher position - term->compareTo(enumerator->term(false)) >= 0 ) - ) - { - - //Calculate the offset for the position - int32_t _enumOffset = (int32_t)(enumerator->position/totalIndexInterval)+1; - - // but before end of block - if ( - //the length of indexTerms (the number of terms in enumerator) equals - //_enum_offset OR - indexTermsLength == _enumOffset || - //term is positioned in front of term found at _enumOffset in indexTerms - term->compareTo(&indexTerms[_enumOffset]) < 0){ - - //no need to seek, retrieve the TermInfo for term - return scanEnum(term); + //the current term of the enumeration enumerator is not at the end AND + enumerator->term(false) != NULL && + ( + //there exists a previous current called prev and term is positioned after this prev OR + (enumerator->prev != NULL && term->compareTo(enumerator->prev) > 0) || + //term is positioned at the same position as the current of enumerator or at a higher position + term->compareTo(enumerator->term(false)) >= 0)) { + //Calculate the offset for the position + int32_t _enumOffset = (int32_t)(enumerator->position / totalIndexInterval) + 1; + + // but before end of block + if ( + //the length of indexTerms (the number of terms in enumerator) equals + //_enum_offset OR + indexTermsLength == _enumOffset || + //term is positioned in front of term found at _enumOffset in indexTerms + term->compareTo(&indexTerms[_enumOffset]) < 0) { + //no need to seek, retrieve the TermInfo for term + return scanEnum(term); } } //Reposition current term in the enumeration seekEnum(getIndexOffset(term)); - //Return the TermInfo for term + //Return the TermInfo for term return scanEnum(term); - } - +} - int64_t TermInfosReader::getPosition(const Term* term) { - //Func - Returns the position of a Term in the set - //Pre - term holds a valid reference to a Term - // enumerator != NULL - //Post - If term was found then its position is returned otherwise -1 +int64_t TermInfosReader::getPosition(const Term* term) { + //Func - Returns the position of a Term in the set + //Pre - term holds a valid reference to a Term + // enumerator != NULL + //Post - If term was found then its position is returned otherwise -1 - //if the enumeration is empty then return -1 - if (_size == 0) - return -1; + //if the enumeration is empty then return -1 + if (_size == 0) return -1; - ensureIndexIsRead(); - - //Retrieve the indexOffset for term - int32_t indexOffset = getIndexOffset(term); - seekEnum(indexOffset); + ensureIndexIsRead(); - SegmentTermEnum* enumerator = getEnum(); + //Retrieve the indexOffset for term + int32_t indexOffset = getIndexOffset(term); + seekEnum(indexOffset); - while(term->compareTo(enumerator->term(false)) > 0 && enumerator->next()) {} + SegmentTermEnum* enumerator = getEnum(); - if ( term->equals(enumerator->term(false)) ){ - return enumerator->position; - }else - return -1; - } + while (term->compareTo(enumerator->term(false)) > 0 && enumerator->next()) { + } - SegmentTermEnum* TermInfosReader::terms(const Term* term) { - //Func - Returns an enumeration of terms starting at or after the named term. - // If term is null then enumerator is set to the beginning - //Pre - term holds a valid reference to a Term - // enumerator != NULL - //Post - An enumeration of terms starting at or after the named term has been returned + if (term->equals(enumerator->term(false))) { + return enumerator->position; + } else + return -1; +} + +SegmentTermEnum* TermInfosReader::terms(const Term* term) { + //Func - Returns an enumeration of terms starting at or after the named term. + // If term is null then enumerator is set to the beginning + //Pre - term holds a valid reference to a Term + // enumerator != NULL + //Post - An enumeration of terms starting at or after the named term has been returned + + SegmentTermEnum* enumerator = NULL; + if (term != NULL) { + //Seek enumerator to term; delete the new TermInfo that's returned. + TermInfo* ti = get(term); + _CLLDELETE(ti); + enumerator = getEnum(); + } else + enumerator = origEnum; + + //Clone the entire enumeration + SegmentTermEnum* cln = enumerator->clone(); + + //Check if cln points to a valid instance + CND_CONDITION(cln != NULL, "cln is NULL"); + + return cln; +} + +void TermInfosReader::ensureIndexIsRead() { + //Func - Reads the term info index file or .tti file. + // This file contains every IndexInterval-th entry from the .tis file, + // along with its location in the "tis" file. This is designed to be read entirely + // into memory and used to provide random access to the "tis" file. + //Pre - indexTerms = NULL + // indexInfos = NULL + // indexPointers = NULL + //Post - The term info index file has been read into memory - SegmentTermEnum* enumerator = NULL; - if ( term != NULL ){ - //Seek enumerator to term; delete the new TermInfo that's returned. - TermInfo* ti = get(term); - _CLLDELETE(ti); - enumerator = getEnum(); - }else - enumerator = origEnum; + SCOPED_LOCK_MUTEX(THIS_LOCK) - //Clone the entire enumeration - SegmentTermEnum* cln = enumerator->clone(); + if (indexIsRead) return; + + //https://jira.qianxin-inc.cn/browse/XHBUG-2921 + //https://jira.qianxin-inc.cn/browse/XHBUG-3053 + if (indexEnum == NULL) _CLTHROWA(CL_ERR_NullPointer, "indexEnum is NULL"); + + try { + indexTermsLength = (size_t)indexEnum->size; + + //Instantiate an block of Term's,so that each one doesn't have to be new'd + indexTerms = new Term[indexTermsLength]; + CND_CONDITION( + indexTerms != NULL, + "No memory could be allocated for indexTerms"); //Check if is indexTerms is a valid array + + //Instantiate an big block of TermInfo's, so that each one doesn't have to be new'd + indexInfos = _CL_NEWARRAY(TermInfo, indexTermsLength); + CND_CONDITION( + indexInfos != NULL, + "No memory could be allocated for indexInfos"); //Check if is indexInfos is a valid array + + //Instantiate an array indexPointers that contains pointers to the term info index file + indexPointers = _CL_NEWARRAY(int64_t, indexTermsLength); + CND_CONDITION( + indexPointers != NULL, + "No memory could be allocated for indexPointers"); //Check if is indexPointers is a valid array + + //Iterate through the terms of indexEnum + for (int32_t i = 0; indexEnum->next(); ++i) { + indexTerms[i].set(indexEnum->term(false), indexEnum->term(false)->text()); + indexEnum->getTermInfo(&indexInfos[i]); + indexPointers[i] = indexEnum->indexPointer; + + for (int32_t j = 1; j < indexDivisor; j++) + if (!indexEnum->next()) break; + } + indexIsRead = true; + } + _CLFINALLY(indexEnum->close(); + //Close and delete the IndexInput is. The close is done by the destructor. + _CLDELETE(indexEnum->input); _CLDELETE(indexEnum); indexEnum = NULL;); +} + +int32_t TermInfosReader::getIndexOffset(const Term* term) { + //Func - Returns the offset of the greatest index entry which is less than or equal to term. + //Pre - term holds a reference to a valid term + // indexTerms != NULL + //Post - The new offset has been returned + + //Check if is indexTerms is a valid array + CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL"); + + int32_t lo = 0; + int32_t hi = indexTermsLength - 1; + int32_t mid; + int32_t delta; + + while (hi >= lo) { + //Start in the middle betwee hi and lo + mid = (lo + hi) >> 1; + + //Check if is indexTerms[mid] is a valid instance of Term + CND_PRECONDITION(&indexTerms[mid] != NULL, "indexTerms[mid] is NULL"); + CND_PRECONDITION(mid < indexTermsLength, "mid >= indexTermsLength"); + + //Determine if term is before mid or after mid + delta = term->compareTo(&indexTerms[mid]); + if (delta < 0) { + //Calculate the new hi + hi = mid - 1; + } else if (delta > 0) { + //Calculate the new lo + lo = mid + 1; + } else { + //term has been found so return its position + return mid; + } + } + // the new starting offset + return hi; +} + +void TermInfosReader::seekEnum(const int32_t indexOffset) { + //Func - Reposition the current Term and TermInfo to indexOffset + //Pre - indexOffset >= 0 + // indexTerms != NULL + // indexInfos != NULL + // indexPointers != NULL + //Post - The current Term and Terminfo have been repositioned to indexOffset + + CND_PRECONDITION(indexOffset >= 0, "indexOffset contains a negative number"); + CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL"); + CND_PRECONDITION(indexInfos != NULL, "indexInfos is NULL"); + CND_PRECONDITION(indexPointers != NULL, "indexPointers is NULL"); - //Check if cln points to a valid instance - CND_CONDITION(cln != NULL,"cln is NULL"); + SegmentTermEnum* enumerator = getEnum(); + enumerator->seek(indexPointers[indexOffset], (indexOffset * totalIndexInterval) - 1, + &indexTerms[indexOffset], &indexInfos[indexOffset]); +} + +TermInfo* TermInfosReader::scanEnum(const Term* term) { + //Func - Scans the Enumeration of terms for term and returns the corresponding TermInfo instance if found. + // The search is started from the current term. + //Pre - term contains a valid reference to a Term + // enumerator != NULL + //Post - if term has been found the corresponding TermInfo has been returned otherwise NULL + // has been returned - return cln; - } + SegmentTermEnum* enumerator = getEnum(); + enumerator->scanTo(term); + + //Check if the at the position the Term term can be found + if (enumerator->term(false) != NULL && term->equals(enumerator->term(false))) { + //Return the TermInfo instance about term + return enumerator->getTermInfo(); + } else { + //term was not found so no TermInfo can be returned + return NULL; + } +} +Term* TermInfosReader::scanEnum(const int32_t position) { + //Func - Scans the enumeration to the requested position and returns the + // Term located at that position + //Pre - position > = 0 + // enumerator != NULL + //Post - The Term at the requested position has been returned - void TermInfosReader::ensureIndexIsRead() { - //Func - Reads the term info index file or .tti file. - // This file contains every IndexInterval-th entry from the .tis file, - // along with its location in the "tis" file. This is designed to be read entirely - // into memory and used to provide random access to the "tis" file. - //Pre - indexTerms = NULL - // indexInfos = NULL - // indexPointers = NULL - //Post - The term info index file has been read into memory + SegmentTermEnum* enumerator = getEnum(); - SCOPED_LOCK_MUTEX(THIS_LOCK) + //As long the position of the enumeration enumerator is smaller than the requested one + while (enumerator->position < position) { + //Move the current of enumerator to the next + if (!enumerator->next()) { + //If there is no next it means that the requested position was to big + return NULL; + } + } - if (indexIsRead) - return; - - //https://jira.qianxin-inc.cn/browse/XHBUG-2921 - //https://jira.qianxin-inc.cn/browse/XHBUG-3053 - if (indexEnum == NULL) - _CLTHROWA(CL_ERR_NullPointer, "indexEnum is NULL"); - - try { - indexTermsLength = (size_t)indexEnum->size; - - //Instantiate an block of Term's,so that each one doesn't have to be new'd - indexTerms = new Term[indexTermsLength]; - CND_CONDITION(indexTerms != NULL,"No memory could be allocated for indexTerms");//Check if is indexTerms is a valid array - - //Instantiate an big block of TermInfo's, so that each one doesn't have to be new'd - indexInfos = _CL_NEWARRAY(TermInfo,indexTermsLength); - CND_CONDITION(indexInfos != NULL,"No memory could be allocated for indexInfos"); //Check if is indexInfos is a valid array - - //Instantiate an array indexPointers that contains pointers to the term info index file - indexPointers = _CL_NEWARRAY(int64_t,indexTermsLength); - CND_CONDITION(indexPointers != NULL,"No memory could be allocated for indexPointers");//Check if is indexPointers is a valid array - - //Iterate through the terms of indexEnum - for (int32_t i = 0; indexEnum->next(); ++i){ - indexTerms[i].set(indexEnum->term(false),indexEnum->term(false)->text()); - indexEnum->getTermInfo(&indexInfos[i]); - indexPointers[i] = indexEnum->indexPointer; - - for (int32_t j = 1; j < indexDivisor; j++) - if (!indexEnum->next()) - break; - } - indexIsRead = true; - }_CLFINALLY( - indexEnum->close(); - //Close and delete the IndexInput is. The close is done by the destructor. - _CLDELETE( indexEnum->input ); - _CLDELETE( indexEnum ); - indexEnum = NULL; - ); - } - - - int32_t TermInfosReader::getIndexOffset(const Term* term){ - //Func - Returns the offset of the greatest index entry which is less than or equal to term. - //Pre - term holds a reference to a valid term - // indexTerms != NULL - //Post - The new offset has been returned - - //Check if is indexTerms is a valid array - CND_PRECONDITION(indexTerms != NULL,"indexTerms is NULL"); - - int32_t lo = 0; - int32_t hi = indexTermsLength - 1; - int32_t mid; - int32_t delta; - - while (hi >= lo) { - //Start in the middle betwee hi and lo - mid = (lo + hi) >> 1; - - //Check if is indexTerms[mid] is a valid instance of Term - CND_PRECONDITION(&indexTerms[mid] != NULL,"indexTerms[mid] is NULL"); - CND_PRECONDITION(mid < indexTermsLength,"mid >= indexTermsLength"); - - //Determine if term is before mid or after mid - delta = term->compareTo(&indexTerms[mid]); - if (delta < 0){ - //Calculate the new hi - hi = mid - 1; - }else if (delta > 0){ - //Calculate the new lo - lo = mid + 1; - }else{ - //term has been found so return its position - return mid; - } - } - // the new starting offset - return hi; - } - - void TermInfosReader::seekEnum(const int32_t indexOffset) { - //Func - Reposition the current Term and TermInfo to indexOffset - //Pre - indexOffset >= 0 - // indexTerms != NULL - // indexInfos != NULL - // indexPointers != NULL - //Post - The current Term and Terminfo have been repositioned to indexOffset - - CND_PRECONDITION(indexOffset >= 0, "indexOffset contains a negative number"); - CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL"); - CND_PRECONDITION(indexInfos != NULL, "indexInfos is NULL"); - CND_PRECONDITION(indexPointers != NULL, "indexPointers is NULL"); - - SegmentTermEnum* enumerator = getEnum(); - enumerator->seek( - indexPointers[indexOffset], - (indexOffset * totalIndexInterval) - 1, - &indexTerms[indexOffset], - &indexInfos[indexOffset] - ); - } - - - TermInfo* TermInfosReader::scanEnum(const Term* term) { - //Func - Scans the Enumeration of terms for term and returns the corresponding TermInfo instance if found. - // The search is started from the current term. - //Pre - term contains a valid reference to a Term - // enumerator != NULL - //Post - if term has been found the corresponding TermInfo has been returned otherwise NULL - // has been returned - - SegmentTermEnum* enumerator = getEnum(); - enumerator->scanTo(term); - - //Check if the at the position the Term term can be found - if (enumerator->term(false) != NULL && term->equals(enumerator->term(false)) ){ - //Return the TermInfo instance about term - return enumerator->getTermInfo(); - }else{ - //term was not found so no TermInfo can be returned - return NULL; - } - } - - Term* TermInfosReader::scanEnum(const int32_t position) { - //Func - Scans the enumeration to the requested position and returns the - // Term located at that position - //Pre - position > = 0 - // enumerator != NULL - //Post - The Term at the requested position has been returned - - SegmentTermEnum* enumerator = getEnum(); - - //As long the position of the enumeration enumerator is smaller than the requested one - while(enumerator->position < position){ - //Move the current of enumerator to the next - if (!enumerator->next()){ - //If there is no next it means that the requested position was to big - return NULL; - } - } - - //Return the Term a the requested position - return enumerator->term(); - } + //Return the Term a the requested position + return enumerator->term(); +} CL_NS_END diff --git a/src/core/CLucene/index/_SegmentHeader.h b/src/core/CLucene/index/_SegmentHeader.h index bf988a2f27..c1f01e7cec 100644 --- a/src/core/CLucene/index/_SegmentHeader.h +++ b/src/core/CLucene/index/_SegmentHeader.h @@ -93,8 +93,8 @@ protected: int32_t count; int32_t df; CL_NS(util)::BitSet* deletedDocs; - int32_t _doc; - int32_t _freq; + int32_t _doc = -1; + int32_t _freq = 0; int32_t docs[PFOR_BLOCK_SIZE]; // buffered doc numbers int32_t freqs[PFOR_BLOCK_SIZE]; // buffered term freqs int32_t pointer; diff --git a/src/core/CLucene/index/_SegmentTermEnum.h b/src/core/CLucene/index/_SegmentTermEnum.h index b5fa419d4f..3dd2c8c5b8 100644 --- a/src/core/CLucene/index/_SegmentTermEnum.h +++ b/src/core/CLucene/index/_SegmentTermEnum.h @@ -55,7 +55,8 @@ protected: public: ///Constructor - SegmentTermEnum(CL_NS(store)::IndexInput* i, FieldInfos* fis, const bool isi, int32_t in_format = -1); + SegmentTermEnum(CL_NS(store)::IndexInput* i, FieldInfos* fis, const bool isi); + void init(int32_t in_format = -1); ///Destructor ~SegmentTermEnum(); diff --git a/src/core/CLucene/search/MultiPhraseQuery.cpp b/src/core/CLucene/search/MultiPhraseQuery.cpp index 5427370261..107c8b11f4 100644 --- a/src/core/CLucene/search/MultiPhraseQuery.cpp +++ b/src/core/CLucene/search/MultiPhraseQuery.cpp @@ -211,8 +211,8 @@ Query* MultiPhraseQuery::rewrite(IndexReader* /*reader*/) { ArrayBase<Term*>* terms = termArrays->at(0); BooleanQuery* boq = _CLNEW BooleanQuery(true); for ( size_t i=0;i<terms->length;i++ ){ - boq->add(_CLNEW TermQuery((*terms)[i]), BooleanClause::SHOULD); - } + boq->add(_CLNEW TermQuery((*terms)[i]), true, BooleanClause::SHOULD); + } boq->setBoost(getBoost()); return boq; } else { diff --git a/src/core/CLucene/search/query/TermIterator.h b/src/core/CLucene/search/query/TermIterator.h index e0cf23a4fb..3eb22a254d 100644 --- a/src/core/CLucene/search/query/TermIterator.h +++ b/src/core/CLucene/search/query/TermIterator.h @@ -1,51 +1,54 @@ #pragma once -#include "CLucene/search/query/DcoIdSetIterator.h" #include "CLucene/index/Terms.h" #include <limits.h> +#include <cstdint> CL_NS_USE(index) -class TermIterator : public DocIdSetIterator { +class TermIterator { public: TermIterator() = default; - TermIterator(TermDocs* termDocs) : termDocs_(termDocs) { + TermIterator(TermDocs* termDocs) + : termDocs_(termDocs) { } - virtual ~TermIterator() = default; - - bool isEmpty() { + inline bool isEmpty() const { return termDocs_ == nullptr; } - int32_t docID() override { - uint32_t docId = termDocs_->doc(); + inline int32_t docID() const { + int32_t docId = termDocs_->doc(); return docId >= INT_MAX ? INT_MAX : docId; } - int32_t nextDoc() override { + inline int32_t freq() const { + return termDocs_->freq(); + } + + inline int32_t nextDoc() const { if (termDocs_->next()) { return termDocs_->doc(); } return INT_MAX; } - int32_t advance(int32_t target) override { + inline int32_t advance(int32_t target) const { if (termDocs_->skipTo(target)) { return termDocs_->doc(); } return INT_MAX; } - int32_t docFreq() const override { + inline int32_t docFreq() const { return termDocs_->docFreq(); } - bool readRange(DocRange* docRange) const override { + inline bool readRange(DocRange* docRange) const { return termDocs_->readRange(docRange); } -private: +protected: TermDocs* termDocs_ = nullptr; }; \ No newline at end of file diff --git a/src/core/CLucene/search/query/TermPositionIterator.h b/src/core/CLucene/search/query/TermPositionIterator.h new file mode 100644 index 0000000000..d64af4098f --- /dev/null +++ b/src/core/CLucene/search/query/TermPositionIterator.h @@ -0,0 +1,23 @@ +#pragma once + +#include "CLucene/search/query/TermIterator.h" +#include "CLucene/index/Terms.h" + +#include <limits.h> + +CL_NS_USE(index) + +class TermPositionIterator : public TermIterator { +public: + TermPositionIterator() = default; + TermPositionIterator(TermPositions* termPositions) + : TermIterator(termPositions), termPositions_(termPositions) { + } + + inline int32_t nextPosition() const { + return termPositions_->nextPosition(); + } + +private: + TermPositions* termPositions_ = nullptr; +}; \ No newline at end of file diff --git a/src/core/CLucene/store/IndexOutput.cpp b/src/core/CLucene/store/IndexOutput.cpp index 05e7695f92..77c37400d8 100644 --- a/src/core/CLucene/store/IndexOutput.cpp +++ b/src/core/CLucene/store/IndexOutput.cpp @@ -35,12 +35,13 @@ CL_NS_DEF(store) close(); } - void BufferedIndexOutput::close(){ - flush(); - _CLDELETE_ARRAY( buffer ); - - bufferStart = 0; - bufferPosition = 0; + void BufferedIndexOutput::close() { + // flush may throw error here, if we do not delete buffer for all circumstances, + // we may close again in destructor above, that would cause pure virtual function call for flushBuffer + try { + flush(); + } + _CLFINALLY(_CLDELETE_ARRAY(buffer); bufferStart = 0; bufferPosition = 0;) } void BufferedIndexOutput::writeByte(const uint8_t b) { diff --git a/src/core/CLucene/util/CLStreams.h b/src/core/CLucene/util/CLStreams.h index 121f272539..3f60f2d97a 100644 --- a/src/core/CLucene/util/CLStreams.h +++ b/src/core/CLucene/util/CLStreams.h @@ -196,7 +196,7 @@ public: this->init(_value, _length, copyData); } void init(const void *_value, int32_t _length, bool copyData = true) override { - const size_t length = _length; + const size_t length = (size_t)_length; this->pos = 0; if (copyData) { T *tmp = (T *) this->value; diff --git a/src/core/CLucene/util/PriorityQueue.h b/src/core/CLucene/util/PriorityQueue.h index 16b2bbac66..59cb0a8d31 100644 --- a/src/core/CLucene/util/PriorityQueue.h +++ b/src/core/CLucene/util/PriorityQueue.h @@ -39,7 +39,7 @@ class CLUCENE_INLINE_EXPORT PriorityQueue { int32_t j = ((uint32_t)i) >> 1; while (j > 0 && lessThan(node,heap[j])) { heap[i] = heap[j]; // shift parents down - i = j; + i = (size_t)j; j = ((uint32_t)j) >> 1; } heap[i] = node; // install saved node diff --git a/src/core/CLucene/util/bkd/bkd_docid_iterator.h b/src/core/CLucene/util/bkd/bkd_docid_iterator.h index 491d3c4c5a..412228ad97 100644 --- a/src/core/CLucene/util/bkd/bkd_docid_iterator.h +++ b/src/core/CLucene/util/bkd/bkd_docid_iterator.h @@ -12,7 +12,7 @@ class bkd_docid_set{ public: static const int NO_MORE_DOCS = std::numeric_limits<int32_t>::max(); - explicit bkd_docid_set(int32_t size) { + explicit bkd_docid_set(size_t size) { docids.resize(size); } int32_t length() const { @@ -22,7 +22,7 @@ public: if (_idx == _length) { _docid = NO_MORE_DOCS; } else { - _docid = docids[_offset + _idx]; + _docid = docids[size_t(_offset + _idx)]; _idx++; } return _docid; @@ -48,7 +48,7 @@ public: explicit bkd_docid_bitmap_set(int32_t size) {} ~bkd_docid_bitmap_set() = default; void add(std::vector<char>&& r, int pos) { - docids[pos] = r; + docids[size_t(pos)] = r; _offset++; } void add(std::vector<char>&& r) { @@ -66,7 +66,7 @@ public: if (_idx == _length) { _docid = std::vector<char>(0); } else { - _docid = docids[_offset + _idx]; + _docid = docids[size_t(_offset + _idx)]; _idx++; } return _docid; diff --git a/src/core/CLucene/util/stringUtil.h b/src/core/CLucene/util/stringUtil.h index 4a022e3e24..e7d41e1d83 100644 --- a/src/core/CLucene/util/stringUtil.h +++ b/src/core/CLucene/util/stringUtil.h @@ -41,7 +41,7 @@ public: #if defined(__SSE2__) || defined(__aarch64__) const auto bytes_sse = sizeof(__m128i); - const auto src_end_sse = src_end - (src_end - src) % bytes_sse; + const auto src_end_sse = src_end - size_t(src_end - src) % bytes_sse; const auto v_not_case_lower_bound = _mm_set1_epi8(not_case_lower_bound - 1); const auto v_not_case_upper_bound = _mm_set1_epi8(not_case_upper_bound + 1); @@ -243,7 +243,7 @@ public: } } - return n1 - n2; + return int(n1 - n2); } static inline int32_t utf8_byte_count(uint8_t c) { @@ -275,10 +275,11 @@ public: int32_t bytes_in_char = 0; int32_t surplus_bytes = 0; uint32_t codepoint = 0; - for (uint8_t c : str) { + for (auto cc : str) { + char c = (char)cc; if (bytes_in_char == 0) { if ((c & 0x80) == 0) { - codepoint = c; + codepoint = (uint32_t)c; continue; } else if ((c & 0xE0) == 0xC0) { codepoint = c & 0x1F; @@ -313,10 +314,10 @@ public: size_t i = 0; while (i < utf8_str.size()) { wchar_t wc = utf8_str[i]; - int32_t n = utf8_byte_count(utf8_str[i]); + int32_t n = utf8_byte_count((uint8_t)utf8_str[i]); if ((n >= 1 && n <= 4) && - (i + n <= utf8_str.size()) && - validate_utf8(std::string_view(utf8_str.data() + i, n)) == 0) { + (i + (size_t)n <= utf8_str.size()) && + validate_utf8(std::string_view(utf8_str.data() + i, (size_t)n)) == 0) { if (n == 2) { wc = ((utf8_str[i] & 0x1F) << 6) | (utf8_str[i + 1] & 0x3F); } else if (n == 3) { @@ -324,7 +325,7 @@ public: } else if (n == 4) { wc = ((utf8_str[i] & 0x07) << 18) | ((utf8_str[i + 1] & 0x3F) << 12) | ((utf8_str[i + 2] & 0x3F) << 6) | (utf8_str[i + 3] & 0x3F); } - i += n; + i += (size_t)n; } else { i += 1; } diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index e1c13305aa..b9a09bb306 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -199,8 +199,8 @@ SET(clucene_core_Files ./CLucene/search/spans/SpanWeight.cpp ./CLucene/search/spans/SpanWeight.h ./CLucene/search/spans/TermSpans.cpp - ./CLucene/search/query/DcoIdSetIterator.h ./CLucene/search/query/TermIterator.h + ./CLucene/search/query/TermPositionIterator.h ) #if USE_SHARED_OBJECT_FILES then we link directly to the object files (means rebuilding them for the core) diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 20c722f80c..88c7c229dd 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -85,6 +85,7 @@ SET(test_files ./tests.cpp ./search/spans/TestSpanExplanations.h ./search/spans/TestSpanExplanationsOfNonMatches.cpp ./search/spans/TestSpanExplanationsOfNonMatches.h + ./index/TestIndexCompaction.cpp ./index/TestIndexModifier.cpp ./index/TestIndexWriter.cpp ./index/TestIndexModifier.cpp @@ -102,6 +103,7 @@ SET(test_files ./tests.cpp ./util/TestStringBuffer.cpp ./util/English.cpp ./util/TestStrConvert.cpp + ./query/TestMultiPhraseQuery.cpp ${test_HEADERS}) IF (USE_SHARED_OBJECT_FILES) GET_SHARED_FILES(clucene_shared_Files) diff --git a/src/test/index/TestIndexCompaction.cpp b/src/test/index/TestIndexCompaction.cpp new file mode 100644 index 0000000000..1d49c59788 --- /dev/null +++ b/src/test/index/TestIndexCompaction.cpp @@ -0,0 +1,344 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License + +#include <cstdint> +#include <iostream> +#include <utility> +#include <vector> +#include "CLucene/debug/mem.h" +#include "test.h" +#include "CLucene/debug/error.h" +#include "CLucene/index/IndexWriter.h" +#include "CLucene/store/IndexInput.h" +#include "CLucene/store/IndexOutput.h" +#include "roaring/roaring.hh" + +void _setupSourceNullBitmapValues(std::vector<std::vector<uint32_t>> &srcNullBitmapValues) { + srcNullBitmapValues.push_back(std::vector<uint32_t>{1, 2, 3}); + srcNullBitmapValues.push_back(std::vector<uint32_t>{2, 3, 4}); + srcNullBitmapValues.push_back(std::vector<uint32_t>{3, 4, 5}); +} + +void _setupTransVec(std::vector<std::vector<std::pair<uint32_t, uint32_t>>>& trans_vec) { + + trans_vec.resize(3); + for (int i = 0; i < 3; i++) { + trans_vec[i].resize(6); + } + + trans_vec[0][0] = std::pair<uint32_t, uint32_t>{0, 1}; + trans_vec[0][1] = std::pair<uint32_t, uint32_t>{0, 2}; + trans_vec[0][2] = std::pair<uint32_t, uint32_t>{0, 5}; + trans_vec[0][3] = std::pair<uint32_t, uint32_t>{0, 7}; + trans_vec[0][4] = std::pair<uint32_t, uint32_t>{0, 3}; + trans_vec[0][5] = std::pair<uint32_t, uint32_t>{0, 8}; + trans_vec[1][0] = std::pair<uint32_t, uint32_t>{0, 4}; + trans_vec[1][1] = std::pair<uint32_t, uint32_t>{0, 6}; + trans_vec[1][2] = std::pair<uint32_t, uint32_t>{UINT32_MAX, UINT32_MAX}; + trans_vec[1][3] = std::pair<uint32_t, uint32_t>{1, 1}; + trans_vec[1][4] = std::pair<uint32_t, uint32_t>{1, 2}; + trans_vec[1][5] = std::pair<uint32_t, uint32_t>{1, 9}; + trans_vec[2][0] = std::pair<uint32_t, uint32_t>{1, 3}; + trans_vec[2][1] = std::pair<uint32_t, uint32_t>{1, 4}; + trans_vec[2][2] = std::pair<uint32_t, uint32_t>{1, 5}; + trans_vec[2][3] = std::pair<uint32_t, uint32_t>{1, 6}; + trans_vec[2][4] = std::pair<uint32_t, uint32_t>{1, 7}; + trans_vec[2][5] = std::pair<uint32_t, uint32_t>{1, 8}; +} + +uint64_t _getNullBitmapCardinality(RAMDirectory& dir) { + IndexInput* null_bitmap_in = nullptr; + CLuceneError error; + dir.openInput(IndexWriter::NULL_BITMAP_FILE_NAME, null_bitmap_in, error); + if (error.number() != 0) { + return 0; + } + size_t null_bitmap_size = null_bitmap_in->length(); + std::string buf; + buf.resize(null_bitmap_size); + null_bitmap_in->readBytes(reinterpret_cast<uint8_t*>(const_cast<char*>(buf.data())), null_bitmap_size); + auto null_bitmap = roaring::Roaring::read(buf.data(), false); + null_bitmap.runOptimize(); + + // close resources + null_bitmap_in->close(); + _CLLDELETE(null_bitmap_in); + + return null_bitmap.cardinality(); +} + +// src segments -> dest segments +// 3 -> 2 +// docs 18 -> 17 +// 1,2,3,4,5,6 +// 1,2,3,4,5,6 -> 1,2,3,4,5,6,7,8 +// 1,2,3,4,5,6 1,2,3,4,5,6,7,8,9 +// +// null values +// 1,2,3 +// 2,3,4 -> 2,5,7 +// 3,4,5 1,2,6,7,8 +void TestMergeNullBitmapWriteNullBitmap(CuTest *tc) { + lucene::analysis::SimpleAnalyzer<char> analyzer; + RAMDirectory dir; + auto* index_writer = _CLNEW lucene::index::IndexWriter(&dir, &analyzer, true); + std::vector<std::vector<uint32_t>> srcNullBitmapValues; + std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList; + + _setupSourceNullBitmapValues(srcNullBitmapValues); + + // setup _trans_vec + // translation vec + // <<dest_idx_num, dest_docId>> + // the first level vector: index indicates src segment. + // the second level vector: index indicates row id of source segment, + // value indicates row id of destination segment. + // <UINT32_MAX, UINT32_MAX> indicates current row not exist. + std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec; + _setupTransVec(trans_vec); + + RAMDirectory dest_dir1; + RAMDirectory dest_dir2; + auto* dest_output_index1 = dest_dir1.createOutput(IndexWriter::NULL_BITMAP_FILE_NAME); + auto* dest_output_index2 = dest_dir2.createOutput(IndexWriter::NULL_BITMAP_FILE_NAME); + nullBitmapIndexOutputList.push_back(dest_output_index1); + nullBitmapIndexOutputList.push_back(dest_output_index2); + + try { + index_writer->setNumDestIndexes(2); + index_writer->setTransVec(trans_vec); + index_writer->mergeNullBitmap(srcNullBitmapValues, nullBitmapIndexOutputList); + } catch (const std::exception& ex) { + std::cout << "Caught exception: " << ex.what() << std::endl; + } catch (...) { + std::cout << "merge null bitmap failed" << std::endl; + return; + } + dest_output_index1->close(); + dest_output_index2->close(); + _CLLDELETE(dest_output_index1); + _CLLDELETE(dest_output_index2); + nullBitmapIndexOutputList.clear(); + index_writer->close(); + _CLDELETE(index_writer); + + // check cardinality + uint64_t source_cardinality = 0; + for (const auto& vec : srcNullBitmapValues) { + source_cardinality += vec.size(); + } + auto dest_cardinality1 = _getNullBitmapCardinality(dest_dir1); + auto dest_cardinality2 = _getNullBitmapCardinality(dest_dir2); + auto dest_cardinality = dest_cardinality1 + dest_cardinality2; + + // 9 = 8 + 1 + CLUCENE_ASSERT(source_cardinality == (dest_cardinality + 1)); + + // release resources + dest_dir1.close(); + dest_dir2.close(); + dir.close(); +} + +void TestMergeNullBitmapEmptySrc(CuTest *tc) { + lucene::analysis::SimpleAnalyzer<char> analyzer; + RAMDirectory dir; + auto* index_writer = _CLNEW lucene::index::IndexWriter(&dir, &analyzer, true); + // empty source bitmap values + std::vector<std::vector<uint32_t>> srcNullBitmapValues; + std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList; + + std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec; + _setupTransVec(trans_vec); + + RAMDirectory dest_dir1; + RAMDirectory dest_dir2; + auto* dest_output_index1 = dest_dir1.createOutput(IndexWriter::NULL_BITMAP_FILE_NAME); + auto* dest_output_index2 = dest_dir2.createOutput(IndexWriter::NULL_BITMAP_FILE_NAME); + nullBitmapIndexOutputList.push_back(dest_output_index1); + nullBitmapIndexOutputList.push_back(dest_output_index2); + + try { + index_writer->setNumDestIndexes(2); + index_writer->setTransVec(trans_vec); + index_writer->mergeNullBitmap(srcNullBitmapValues, nullBitmapIndexOutputList); + } catch (const std::exception& ex) { + std::cout << "Caught exception: " << ex.what() << std::endl; + } catch (...) { + std::cout << "merge null bitmap failed" << std::endl; + return; + } + dest_output_index1->close(); + dest_output_index2->close(); + _CLLDELETE(dest_output_index1); + _CLLDELETE(dest_output_index2); + nullBitmapIndexOutputList.clear(); + index_writer->close(); + _CLDELETE(index_writer); + + // check cardinality + uint64_t source_cardinality = 0; + for (const auto& vec : srcNullBitmapValues) { + source_cardinality += vec.size(); + } + auto dest_cardinality1 = _getNullBitmapCardinality(dest_dir1); + auto dest_cardinality2 = _getNullBitmapCardinality(dest_dir2); + auto dest_cardinality = dest_cardinality1 + dest_cardinality2; + + // 0 = 0 + CLUCENE_ASSERT(source_cardinality == dest_cardinality); + + // release resources + dest_dir1.close(); + dest_dir2.close(); + dir.close(); +} + +void TestMergeNullBitmapEmptyIndexSrcBitmapValues(CuTest *tc) { + lucene::analysis::SimpleAnalyzer<char> analyzer; + RAMDirectory dir; + auto* index_writer = _CLNEW lucene::index::IndexWriter(&dir, &analyzer, true); + // empty source bitmap values for every index + std::vector<std::vector<uint32_t>> srcNullBitmapValues; + srcNullBitmapValues.push_back(std::vector<uint32_t>()); + srcNullBitmapValues.push_back(std::vector<uint32_t>()); + srcNullBitmapValues.push_back(std::vector<uint32_t>()); + + std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList; + + std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec; + _setupTransVec(trans_vec); + + RAMDirectory dest_dir1; + RAMDirectory dest_dir2; + auto* dest_output_index1 = dest_dir1.createOutput(IndexWriter::NULL_BITMAP_FILE_NAME); + auto* dest_output_index2 = dest_dir2.createOutput(IndexWriter::NULL_BITMAP_FILE_NAME); + nullBitmapIndexOutputList.push_back(dest_output_index1); + nullBitmapIndexOutputList.push_back(dest_output_index2); + + try { + index_writer->setNumDestIndexes(2); + index_writer->setTransVec(trans_vec); + index_writer->mergeNullBitmap(srcNullBitmapValues, nullBitmapIndexOutputList); + } catch (const std::exception& ex) { + std::cout << "Caught exception: " << ex.what() << std::endl; + } catch (...) { + std::cout << "merge null bitmap failed" << std::endl; + return; + } + dest_output_index1->close(); + dest_output_index2->close(); + _CLLDELETE(dest_output_index1); + _CLLDELETE(dest_output_index2); + nullBitmapIndexOutputList.clear(); + index_writer->close(); + _CLDELETE(index_writer); + + // check cardinality + uint64_t source_cardinality = 0; + for (const auto& vec : srcNullBitmapValues) { + source_cardinality += vec.size(); + } + auto dest_cardinality1 = _getNullBitmapCardinality(dest_dir1); + auto dest_cardinality2 = _getNullBitmapCardinality(dest_dir2); + auto dest_cardinality = dest_cardinality1 + dest_cardinality2; + + // 0 = 0 + CLUCENE_ASSERT(source_cardinality == dest_cardinality); + + // release resources + dest_dir1.close(); + dest_dir2.close(); + dir.close(); +} + +void TestMergeNullBitmapIgnoreDoc(CuTest *tc) { + lucene::analysis::SimpleAnalyzer<char> analyzer; + RAMDirectory dir; + auto* index_writer = _CLNEW lucene::index::IndexWriter(&dir, &analyzer, true); + std::vector<std::vector<uint32_t>> srcNullBitmapValues; + _setupSourceNullBitmapValues(srcNullBitmapValues); + + std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList; + + // all docs in src index are ignored + std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec; + trans_vec.resize(srcNullBitmapValues.size()); + for (int i = 0; i < trans_vec.size(); i++) { + trans_vec[i].resize(6); + } + for (int i = 0; i < srcNullBitmapValues.size(); i++) { + for (int j = 0; j < 6; j++) { + trans_vec[i][j] = std::pair<uint32_t, uint32_t>{UINT32_MAX, UINT32_MAX}; + } + } + + RAMDirectory dest_dir1; + RAMDirectory dest_dir2; + auto* dest_output_index1 = dest_dir1.createOutput(IndexWriter::NULL_BITMAP_FILE_NAME); + auto* dest_output_index2 = dest_dir2.createOutput(IndexWriter::NULL_BITMAP_FILE_NAME); + nullBitmapIndexOutputList.push_back(dest_output_index1); + nullBitmapIndexOutputList.push_back(dest_output_index2); + + try { + index_writer->setNumDestIndexes(2); + index_writer->setTransVec(trans_vec); + index_writer->mergeNullBitmap(srcNullBitmapValues, nullBitmapIndexOutputList); + } catch (const std::exception& ex) { + std::cout << "Caught exception: " << ex.what() << std::endl; + } catch (...) { + std::cout << "merge null bitmap failed" << std::endl; + return; + } + dest_output_index1->close(); + dest_output_index2->close(); + _CLLDELETE(dest_output_index1); + _CLLDELETE(dest_output_index2); + nullBitmapIndexOutputList.clear(); + index_writer->close(); + _CLDELETE(index_writer); + + // check cardinality + uint64_t source_cardinality = 0; + for (const auto& vec : srcNullBitmapValues) { + source_cardinality += vec.size(); + } + auto dest_cardinality1 = _getNullBitmapCardinality(dest_dir1); + auto dest_cardinality2 = _getNullBitmapCardinality(dest_dir2); + auto dest_cardinality = dest_cardinality1 + dest_cardinality2; + + // 9 = 0 + 9 + CLUCENE_ASSERT(source_cardinality == dest_cardinality + source_cardinality); + + // release resources + dest_dir1.close(); + dest_dir2.close(); + dir.close(); +} + + + +CuSuite* testIndexCompaction() { + CuSuite* suite = CuSuiteNew(_T("CLucene Index Compaction Test")); + + SUITE_ADD_TEST(suite, TestMergeNullBitmapWriteNullBitmap); + SUITE_ADD_TEST(suite, TestMergeNullBitmapEmptySrc); + SUITE_ADD_TEST(suite, TestMergeNullBitmapEmptyIndexSrcBitmapValues); + SUITE_ADD_TEST(suite, TestMergeNullBitmapIgnoreDoc); + + return suite; +} \ No newline at end of file diff --git a/src/test/query/TestMultiPhraseQuery.cpp b/src/test/query/TestMultiPhraseQuery.cpp new file mode 100644 index 0000000000..ccc4fe7f89 --- /dev/null +++ b/src/test/query/TestMultiPhraseQuery.cpp @@ -0,0 +1,163 @@ +#include <CLucene.h> + +#include <iostream> +#include <memory> +#include <vector> + +#include "CLucene/debug/error.h" +#include "CLucene/index/IndexReader.h" +#include "CLucene/index/Term.h" +#include "CLucene/search/MultiPhraseQuery.h" +#include "CLucene/store/Directory.h" +#include "CLucene/store/FSDirectory.h" +#include "CLucene/store/RAMDirectory.h" +#include "test.h" + +CL_NS_USE(util) +CL_NS_USE(store) +CL_NS_USE(search) +CL_NS_USE(index) + +void testSimple1Add(CuTest* tc) { + RAMDirectory dir; + + SimpleAnalyzer<char> analyzer; + IndexWriter w(&dir, &analyzer, true); + w.setUseCompoundFile(false); + auto field_name = lucene::util::Misc::_charToWide("name"); + std::string value = "value"; + + Document doc; + auto field = _CLNEW Field(field_name, Field::INDEX_TOKENIZED | Field::STORE_NO); + + auto char_string_reader = std::make_unique<lucene::util::SStringReader<char>>(); + char_string_reader->init(value.data(), value.size(), true); + auto stream = analyzer.tokenStream(field->name(), char_string_reader.get()); + field->setValue(stream); + doc.add(*field); + + w.addDocument(&doc); + w.close(); + + IndexSearcher index_searcher(&dir); + { + MultiPhraseQuery query; + + Term* t1 = _CLNEW Term(_T( "name" ), _T( "t1" )); + query.add(t1); + _CLLDECDELETE(t1); + + std::vector<int32_t> result; + index_searcher._search(&query, [&result](const int32_t docid, const float_t /*score*/) { + result.push_back(docid); + }); + CLUCENE_ASSERT(result.size() == 0); + } + + _CLDELETE(stream) + _CLDELETE_ARRAY(field_name) +} + +void testSimple2Add(CuTest* tc) { + RAMDirectory dir; + + SimpleAnalyzer<char> analyzer; + IndexWriter w(&dir, &analyzer, true); + w.setUseCompoundFile(false); + auto field_name = lucene::util::Misc::_charToWide("name"); + std::string value = "value"; + + Document doc; + auto field = _CLNEW Field(field_name, Field::INDEX_TOKENIZED | Field::STORE_NO); + + auto char_string_reader = std::make_unique<lucene::util::SStringReader<char>>(); + char_string_reader->init(value.data(), value.size(), true); + auto stream = analyzer.tokenStream(field->name(), char_string_reader.get()); + field->setValue(stream); + doc.add(*field); + + w.addDocument(&doc); + w.close(); + + IndexSearcher index_searcher(&dir); + { + MultiPhraseQuery query; + + std::vector<Term*> terms; + terms.push_back(_CLNEW Term(_T( "name" ), _T( "t2" ))); + terms.push_back(_CLNEW Term(_T( "name" ), _T( "t3" ))); + terms.push_back(_CLNEW Term(_T( "name" ), _T( "t4" ))); + query.add(terms); + for (int32_t i = 0; i < terms.size(); i++) { + _CLLDECDELETE(terms[i]); + } + + std::vector<int32_t> result; + index_searcher._search(&query, [&result](const int32_t docid, const float_t /*score*/) { + result.push_back(docid); + }); + CLUCENE_ASSERT(result.size() == 0); + } + + _CLDELETE(stream) + _CLDELETE_ARRAY(field_name) +} + +void testMultiAdd(CuTest* tc) { + RAMDirectory dir; + + SimpleAnalyzer<char> analyzer; + IndexWriter w(&dir, &analyzer, true); + w.setUseCompoundFile(false); + auto field_name = lucene::util::Misc::_charToWide("name"); + std::string value = "value"; + + Document doc; + auto field = _CLNEW Field(field_name, Field::INDEX_TOKENIZED | Field::STORE_NO); + + auto char_string_reader = std::make_unique<lucene::util::SStringReader<char>>(); + char_string_reader->init(value.data(), value.size(), true); + auto stream = analyzer.tokenStream(field->name(), char_string_reader.get()); + field->setValue(stream); + doc.add(*field); + + w.addDocument(&doc); + w.close(); + + IndexSearcher index_searcher(&dir); + { + MultiPhraseQuery query; + + Term* t1 = _CLNEW Term(_T( "name" ), _T( "t1" )); + query.add(t1); + _CLLDECDELETE(t1); + + std::vector<Term*> terms; + terms.push_back(_CLNEW Term(_T( "name" ), _T( "t2" ))); + terms.push_back(_CLNEW Term(_T( "name" ), _T( "t3" ))); + terms.push_back(_CLNEW Term(_T( "name" ), _T( "t4" ))); + query.add(terms); + for (int32_t i = 0; i < terms.size(); i++) { + _CLLDECDELETE(terms[i]); + } + + std::vector<int32_t> result; + index_searcher._search(&query, [&result](const int32_t docid, const float_t /*score*/) { + result.push_back(docid); + }); + CLUCENE_ASSERT(result.size() == 0); + } + + _CLDELETE(stream) + _CLDELETE_ARRAY(field_name) +} + +CuSuite* testMultiPhraseQuery(void) { + CuSuite* suite = CuSuiteNew(_T("CLucene MultiPhraseQuery Test")); + + SUITE_ADD_TEST(suite, testSimple1Add); + SUITE_ADD_TEST(suite, testSimple2Add); + SUITE_ADD_TEST(suite, testMultiAdd); + + return suite; +} \ No newline at end of file diff --git a/src/test/test.h b/src/test/test.h index 08c168cb67..cb92953882 100644 --- a/src/test/test.h +++ b/src/test/test.h @@ -82,6 +82,8 @@ CuSuite *testTermVectorsReader(void); CuSuite *teststandard95(void); CuSuite *testStrConvert(void); CuSuite *testSearchRange(void); +CuSuite *testMultiPhraseQuery(void); +CuSuite *testIndexCompaction(void); #ifdef TEST_CONTRIB_LIBS //CuSuite *testGermanAnalyzer(void); diff --git a/src/test/tests.cpp b/src/test/tests.cpp index d703e15973..3a193a3c38 100644 --- a/src/test/tests.cpp +++ b/src/test/tests.cpp @@ -16,6 +16,8 @@ unittest tests[] = { {"MSBRadixSorter",testMSBRadixSorter}, {"strconvert", testStrConvert}, {"searchRange", testSearchRange}, + {"MultiPhraseQuery", testMultiPhraseQuery}, + {"IndexCompaction", testIndexCompaction}, #ifdef TEST_CONTRIB_LIBS {"chinese", testchinese}, #endif --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org