This is an automated email from the ASF dual-hosted git repository. airborne pushed a commit to branch clucene-2.0 in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene-2.0 by this push: new d3a628663ad [improve](pfor) add non-simd implementation for PFOR 256 (#297) d3a628663ad is described below commit d3a628663ad07a38568f07deb04f5bc07fcc5869 Author: airborne12 <jiang...@selectdb.com> AuthorDate: Sun Mar 23 22:17:26 2025 +0800 [improve](pfor) add non-simd implementation for PFOR 256 (#297) * [improve](pfor) add non-simd implementation for PFOR 256 * [improve](pfor) add non-simd implementation for PFOR 256 * [improve](pfor) add non-simd implementation for PFOR 256 * add zigzag scalar function * add unitest for pfor encode/decode * add unitest for pfor encode/decode * add unitest for pfor encode/decode * add unitest for pfor encode/decode * add unitest for pfor encode/decode --- CMakeLists.txt | 27 +- cmake/Toolchain-aarch64.cmake | 5 + src/core/CLucene/index/CodeMode.h | 4 +- src/core/CLucene/index/FieldInfos.cpp | 56 +- src/core/CLucene/index/SDocumentWriter.cpp | 20 +- src/core/CLucene/index/SegmentTermDocs.cpp | 43 +- src/core/CLucene/index/_FieldInfos.h | 9 +- src/core/CLucene/index/_SegmentHeader.h | 6 +- src/core/CLucene/util/PFORUtil.cpp | 190 +++- src/core/CLucene/util/PFORUtil.h | 10 +- src/ext/for/CMakeLists.txt | 16 + src/ext/for/bitpack.h | 17 + src/ext/for/bitunpack.c | 1202 ++++++++++++++++++++ src/ext/for/test_bitd1unpack.cpp | 399 +++++++ src/ext/for/vp4.h | 2 + src/ext/for/vp4d.c | 30 + src/test/CMakeLists.txt | 3 +- .../pfor_p4ndx_compat_gen_by_old_version_arm.dat | Bin 0 -> 1168 bytes ...pfor_p4ndx_compat_gen_by_old_version_x86_64.dat | Bin 0 -> 1164 bytes src/test/store/testPFOR.cpp | 546 +++++++++ src/test/test.h | 2 +- src/test/tests.cpp | 3 +- 22 files changed, 2494 insertions(+), 96 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c3b77861857..44a673a9e4a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,8 +19,6 @@ SET(CLUCENE_VERSION "${CLUCENE_VERSION_MAJOR}.${CLUCENE_VERSION_MINOR}.${CLUCENE #CMake 2.6+ is recommended to an 
improved Boost module CMAKE_MINIMUM_REQUIRED(VERSION 2.4.0 FATAL_ERROR) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") - if(COMMAND cmake_policy) cmake_policy(SET CMP0003 NEW) cmake_policy(SET CMP0043 NEW) @@ -135,9 +133,30 @@ elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "LSAN") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_LSAN}") endif() -if (USE_AVX2) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_AVX2") +if (CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64") + set (ARCH_AMD64 1) +endif () +if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*)") + set (ARCH_AARCH64 1) +endif () +if (ARCH_AARCH64 OR CMAKE_SYSTEM_PROCESSOR MATCHES "arm") + set (ARCH_ARM 1) +endif () +if (ARCH_AMD64) + if (USE_SSE4_2) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") + endif() + message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") + if (USE_AVX2) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -DUSE_AVX2") + endif() endif() + +if (ARCH_ARM) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+crc") +endif() +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") + if (__COMPILER_CLANG) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-c++11-narrowing -g -fno-omit-frame-pointer") else () diff --git a/cmake/Toolchain-aarch64.cmake b/cmake/Toolchain-aarch64.cmake new file mode 100644 index 00000000000..948164d513f --- /dev/null +++ b/cmake/Toolchain-aarch64.cmake @@ -0,0 +1,5 @@ +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR aarch64) + +set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc) +set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++) diff --git a/src/core/CLucene/index/CodeMode.h b/src/core/CLucene/index/CodeMode.h index 3c39e94ecb6..05dd8b82649 100644 --- a/src/core/CLucene/index/CodeMode.h +++ b/src/core/CLucene/index/CodeMode.h @@ -5,7 +5,9 @@ CL_NS_DEF(index) enum class CodeMode { kDefault = 0, kPfor = 1, - kRange = 2 + kRange = 2, + kPfor256 = 3, + kPfor128 = 4 }; CL_NS_END \ No newline at end of file diff --git a/src/core/CLucene/index/FieldInfos.cpp 
b/src/core/CLucene/index/FieldInfos.cpp index 00e0c4275a5..155c14e945f 100644 --- a/src/core/CLucene/index/FieldInfos.cpp +++ b/src/core/CLucene/index/FieldInfos.cpp @@ -21,24 +21,20 @@ CL_NS_USE(document) CL_NS_USE(util) CL_NS_DEF(index) - -FieldInfo::FieldInfo(const TCHAR *_fieldName, - const bool _isIndexed, - const int32_t _fieldNumber, - const bool _storeTermVector, - const bool _storeOffsetWithTermVector, - const bool _storePositionWithTermVector, - const bool _omitNorms, - const bool _hasProx, - const bool _storePayloads) : name(CLStringIntern::intern(_fieldName )), - isIndexed(_isIndexed), - number(_fieldNumber), - storeTermVector(_storeTermVector), - storeOffsetWithTermVector(_storeOffsetWithTermVector), - storePositionWithTermVector(_storePositionWithTermVector), - omitNorms(_omitNorms), hasProx(_hasProx), - storePayloads(_storePayloads) { -} +FieldInfo::FieldInfo(const TCHAR* _fieldName, const bool _isIndexed, const int32_t _fieldNumber, + const bool _storeTermVector, const bool _storeOffsetWithTermVector, + const bool _storePositionWithTermVector, const bool _omitNorms, + const bool _hasProx, const bool _storePayloads, const bool _compatibleRead) + : name(CLStringIntern::intern(_fieldName)), + isIndexed(_isIndexed), + number(_fieldNumber), + storeTermVector(_storeTermVector), + storeOffsetWithTermVector(_storeOffsetWithTermVector), + storePositionWithTermVector(_storePositionWithTermVector), + omitNorms(_omitNorms), + hasProx(_hasProx), + storePayloads(_storePayloads), + compatibleRead(_compatibleRead) {} FieldInfo::~FieldInfo(){ CL_NS(util)::CLStringIntern::unintern(name); @@ -46,7 +42,7 @@ FieldInfo::~FieldInfo(){ FieldInfo* FieldInfo::clone() { return _CLNEW FieldInfo(name, isIndexed, number, storeTermVector, storePositionWithTermVector, - storeOffsetWithTermVector, omitNorms, hasProx, storePayloads); + storeOffsetWithTermVector, omitNorms, hasProx, storePayloads, compatibleRead); } FieldInfos::FieldInfos(): @@ -103,6 +99,17 @@ bool 
FieldInfos::hasProx() { return false; } +bool FieldInfos::compatibleRead() { + int numFields = byNumber.size(); + for (int i = 0; i < numFields; i++) { + FieldInfo* fi = fieldInfo(i); + if (fi->compatibleRead) { + return true; + } + } + return false; +} + IndexVersion FieldInfos::getIndexVersion() { int numFields = byNumber.size(); for (int i = 0; i < numFields; i++) { @@ -137,11 +144,11 @@ void FieldInfos::add(const TCHAR** names, const bool isIndexed, const bool store FieldInfo* FieldInfos::add(const TCHAR* name, const bool isIndexed, const bool storeTermVector, const bool storePositionWithTermVector, const bool storeOffsetWithTermVector, const bool omitNorms, - const bool hasProx, const bool storePayloads) { + const bool hasProx, const bool storePayloads, const bool compatibleRead) { FieldInfo* fi = fieldInfo(name); if (fi == NULL) { return addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, - storeOffsetWithTermVector, omitNorms, hasProx, storePayloads); + storeOffsetWithTermVector, omitNorms, hasProx, storePayloads, compatibleRead); } else { if (fi->isIndexed != isIndexed) { fi->isIndexed = true; // once indexed, always index @@ -164,6 +171,9 @@ FieldInfo* FieldInfos::add(const TCHAR* name, const bool isIndexed, const bool s if (fi->storePayloads != storePayloads) { fi->storePayloads = true; } + if (fi->compatibleRead != compatibleRead) { + fi->compatibleRead = compatibleRead; + } } return fi; } @@ -172,10 +182,10 @@ FieldInfo* FieldInfos::addInternal(const TCHAR* name, const bool isIndexed, const bool storeTermVector, const bool storePositionWithTermVector, const bool storeOffsetWithTermVector, const bool omitNorms, - const bool hasProx, const bool storePayloads) { + const bool hasProx, const bool storePayloads, const bool compatibleRead) { FieldInfo* fi = _CLNEW FieldInfo(name, isIndexed, byNumber.size(), storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, - omitNorms, hasProx, storePayloads); + omitNorms, hasProx, 
storePayloads, compatibleRead); byNumber.push_back(fi); byName.put( fi->name, fi); return fi; diff --git a/src/core/CLucene/index/SDocumentWriter.cpp b/src/core/CLucene/index/SDocumentWriter.cpp index 2b85fe5bbca..8d5df79a447 100644 --- a/src/core/CLucene/index/SDocumentWriter.cpp +++ b/src/core/CLucene/index/SDocumentWriter.cpp @@ -1198,31 +1198,13 @@ void SDocumentsWriter<T>::appendPostings(ArrayBase<typename ThreadState::FieldDa skipListWriter->resetSkip(); - auto encode = [](IndexOutput* out, std::vector<uint32_t>& buffer, bool isDoc) { - std::vector<uint8_t> compress(4 * buffer.size() + PFOR_BLOCK_SIZE); - size_t size = 0; - if (isDoc) { - size = P4ENC(buffer.data(), buffer.size(), compress.data()); - } else { - size = P4NZENC(buffer.data(), buffer.size(), compress.data()); - } - out->writeVInt(size); - out->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()), size); - buffer.resize(0); - }; - // Now termStates has numToMerge FieldMergeStates // which all share the same term. Now we must // interleave the docID streams. 
while (numToMerge > 0) { if ((++df % skipInterval) == 0) { - freqOut->writeByte((char)CodeMode::kPfor); - freqOut->writeVInt(docDeltaBuffer.size()); - encode(freqOut, docDeltaBuffer, true); - if (hasProx_) { - encode(freqOut, freqBuffer, false); - } + pfor_encode(freqOut, docDeltaBuffer, freqBuffer, hasProx_); skipListWriter->setSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength); skipListWriter->bufferSkip(df); diff --git a/src/core/CLucene/index/SegmentTermDocs.cpp b/src/core/CLucene/index/SegmentTermDocs.cpp index e346dc0ca24..ae9e3a4508f 100644 --- a/src/core/CLucene/index/SegmentTermDocs.cpp +++ b/src/core/CLucene/index/SegmentTermDocs.cpp @@ -22,7 +22,7 @@ SegmentTermDocs::SegmentTermDocs(const SegmentReader *_parent) : parent(_parent) count(0), df(0), deletedDocs(_parent->deletedDocs), _doc(-1), _freq(0), skipInterval(_parent->tis->getSkipInterval()), maxSkipLevels(_parent->tis->getMaxSkipLevels()), skipListReader(NULL), freqBasePointer(0), proxBasePointer(0), skipPointer(0), haveSkipped(false), pointer(0), pointerMax(0), indexVersion_(_parent->_fieldInfos->getIndexVersion()), - hasProx(_parent->_fieldInfos->hasProx()), buffer_(freqStream, hasProx, indexVersion_) { + hasProx(_parent->_fieldInfos->hasProx()), buffer_(freqStream, hasProx, indexVersion_, _parent->_fieldInfos->compatibleRead()) { CND_CONDITION(_parent != NULL, "Parent is NULL"); memset(docs,0,PFOR_BLOCK_SIZE*sizeof(int32_t)); memset(freqs,0,PFOR_BLOCK_SIZE*sizeof(int32_t)); @@ -222,13 +222,13 @@ int32_t TermDocsBuffer::refillV0() { uint32_t SerializedSize = freqStream_->readVInt(); std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); freqStream_->readBytes(buf.data(), SerializedSize); - P4DEC(buf.data(), arraySize, docs_.data()); + util::P4DEC(buf.data(), arraySize, docs_.data()); } { uint32_t SerializedSize = freqStream_->readVInt(); std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); freqStream_->readBytes(buf.data(), SerializedSize); - P4NZDEC(buf.data(), arraySize, 
freqs_.data()); + util::P4NZDEC(buf.data(), arraySize, freqs_.data()); } } else if (mode == (char)CodeMode::kDefault) { uint32_t docDelta = 0; @@ -258,7 +258,7 @@ int32_t TermDocsBuffer::refillV0() { uint32_t serializedSize = freqStream_->readVInt(); std::vector<uint8_t> buf(serializedSize + PFOR_BLOCK_SIZE); freqStream_->readBytes(buf.data(), serializedSize); - P4DEC(buf.data(), arraySize, docs_.data()); + util::P4DEC(buf.data(), arraySize, docs_.data()); } } return arraySize; @@ -266,40 +266,7 @@ int32_t TermDocsBuffer::refillV0() { } int32_t TermDocsBuffer::refillV1() { - char mode = freqStream_->readByte(); - uint32_t arraySize = freqStream_->readVInt(); - if (mode == (char)CodeMode::kPfor) { - { - uint32_t SerializedSize = freqStream_->readVInt(); - std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); - freqStream_->readBytes(buf.data(), SerializedSize); - P4DEC(buf.data(), arraySize, docs_.data()); - } - if (hasProx_) { - uint32_t SerializedSize = freqStream_->readVInt(); - std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); - freqStream_->readBytes(buf.data(), SerializedSize); - P4NZDEC(buf.data(), arraySize, freqs_.data()); - } - } else if (mode == (char)CodeMode::kDefault) { - uint32_t docDelta = 0; - for (uint32_t i = 0; i < arraySize; i++) { - uint32_t docCode = freqStream_->readVInt(); - if (hasProx_) { - docDelta += (docCode >> 1); - docs_[i] = docDelta; - if ((docCode & 1) != 0) { - freqs_[i] = 1; - } else { - freqs_[i] = freqStream_->readVInt(); - } - } else { - docDelta += docCode; - docs_[i] = docDelta; - } - } - } - return arraySize; + return pfor_decode(freqStream_, docs_, freqs_, hasProx_, compatibleRead_); } CL_NS_END diff --git a/src/core/CLucene/index/_FieldInfos.h b/src/core/CLucene/index/_FieldInfos.h index ed142c4435c..f80388bb73d 100644 --- a/src/core/CLucene/index/_FieldInfos.h +++ b/src/core/CLucene/index/_FieldInfos.h @@ -38,6 +38,7 @@ class FieldInfo :LUCENE_BASE{ IndexVersion indexVersion_ = IndexVersion::kV1; bool 
storePayloads; // whether this field stores payloads together with term positions + bool compatibleRead; // whether index docid list is read cross platform(eg x86 and arm64) //Func - Constructor // Initialises FieldInfo. @@ -59,7 +60,8 @@ class FieldInfo :LUCENE_BASE{ const bool storePositionWithTermVector, const bool omitNorms, const bool hasProx, - const bool storePayloads); + const bool storePayloads, + const bool compatibleRead); //Func - Destructor //Pre - true @@ -133,6 +135,7 @@ public: void addIndexed(const TCHAR** names, const bool storeTermVectors, const bool storePositionWithTermVector, const bool storeOffsetWithTermVector); bool hasProx(); + bool compatibleRead(); IndexVersion getIndexVersion(); /** @@ -167,13 +170,13 @@ public: FieldInfo* add(const TCHAR* name, const bool isIndexed, const bool storeTermVector = false, const bool storePositionWithTermVector = false, const bool storeOffsetWithTermVector = false, const bool omitNorms = false, - const bool hasProx = false, const bool storePayloads = false); + const bool hasProx = false, const bool storePayloads = false, const bool compatibleRead = false); // was void FieldInfo* addInternal(const TCHAR* name, const bool isIndexed, const bool storeTermVector, const bool storePositionWithTermVector, const bool storeOffsetWithTermVector, const bool omitNorms, - const bool hasProx, const bool storePayloads); + const bool hasProx, const bool storePayloads, const bool compatibleRead = false); int32_t fieldNumber(const TCHAR* fieldName)const; diff --git a/src/core/CLucene/index/_SegmentHeader.h b/src/core/CLucene/index/_SegmentHeader.h index c1f01e7cecb..8087423953b 100644 --- a/src/core/CLucene/index/_SegmentHeader.h +++ b/src/core/CLucene/index/_SegmentHeader.h @@ -32,12 +32,13 @@ class SegmentReader; class TermDocsBuffer { public: - TermDocsBuffer(CL_NS(store)::IndexInput* freqStream, bool hasProx, IndexVersion indexVersion) + TermDocsBuffer(CL_NS(store)::IndexInput* freqStream, bool hasProx, IndexVersion 
indexVersion, bool compatibleRead) : docs_(PFOR_BLOCK_SIZE + 3), freqs_(PFOR_BLOCK_SIZE + 3), freqStream_(freqStream), hasProx_(hasProx), - indexVersion_(indexVersion) { + indexVersion_(indexVersion), + compatibleRead_(compatibleRead) { } ~TermDocsBuffer() { @@ -83,6 +84,7 @@ private: CL_NS(store)::IndexInput* freqStream_ = nullptr; bool hasProx_ = false; + bool compatibleRead_ = false; IndexVersion indexVersion_ = IndexVersion::kV0; }; diff --git a/src/core/CLucene/util/PFORUtil.cpp b/src/core/CLucene/util/PFORUtil.cpp index ae27f521553..d241a305acf 100644 --- a/src/core/CLucene/util/PFORUtil.cpp +++ b/src/core/CLucene/util/PFORUtil.cpp @@ -15,19 +15,20 @@ // specific language governing permissions and limitations // under the License. #include "PFORUtil.h" +#include "CLucene/debug/error.h" +#include "CLucene/index/CodeMode.h" #include "vp4.h" #if (defined(__i386) || defined(__x86_64__)) #include <cpuid.h> #endif -namespace { +CL_NS_DEF(util) using DEC_FUNC = size_t (*)(unsigned char *__restrict, size_t, uint32_t *__restrict); using ENC_FUNC = size_t (*)(uint32_t *__restrict in, size_t n, unsigned char *__restrict out); DEC_FUNC g_p4nd1dec; DEC_FUNC g_p4nzdec; ENC_FUNC g_p4nd1enc; ENC_FUNC g_p4nzenc; -} // anonymous namespace size_t DefaultDEC(unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { size_t bufferSize = 0; @@ -129,3 +130,188 @@ size_t P4ENC(uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { size_t P4NZENC(uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { return g_p4nzenc(in, n, out); } +void pfor_encode(store::IndexOutput* out, std::vector<uint32_t>& docDeltaBuffer, std::vector<uint32_t>& freqBuffer, bool has_prox) { +#ifdef __AVX2__ + out->writeByte((char)index::CodeMode::kPfor256); + out->writeVInt(docDeltaBuffer.size()); + std::vector<uint8_t> compress(4 * docDeltaBuffer.size() + PFOR_BLOCK_SIZE); + size_t size = 0; + size = p4nd1enc256v32(docDeltaBuffer.data(), docDeltaBuffer.size(), 
compress.data()); + out->writeVInt(size); + out->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()), size); + if (has_prox) { + size = p4nzenc256v32(freqBuffer.data(), freqBuffer.size(), compress.data()); + out->writeVInt(size); + out->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()), size); + } +#elif (defined(__SSSE3__) || defined(__ARM_NEON)) + out->writeByte((char)index::CodeMode::kPfor128); + out->writeVInt(docDeltaBuffer.size()); + std::vector<uint8_t> compress(4 * docDeltaBuffer.size() + PFOR_BLOCK_SIZE); + size_t size = 0; + size = p4nd1enc32(docDeltaBuffer.data(), docDeltaBuffer.size(), compress.data()); + out->writeVInt(size); + out->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()), size); + if (has_prox) { + size = p4nzenc32(freqBuffer.data(), freqBuffer.size(), compress.data()); + out->writeVInt(size); + out->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()), size); + } +#else + out->writeByte((char)index::CodeMode::kDefault); + out->writeVInt(docDeltaBuffer.size()); + uint32_t lastDoc = 0; + for (int32_t i = 0; i < docDeltaBuffer.size(); i++) { + uint32_t curDoc = docDeltaBuffer[i]; + if (has_prox) { + uint32_t newDocCode = (curDoc - lastDoc) << 1; + lastDoc = curDoc; + uint32_t freq = freqBuffer[i]; + if (1 == freq) { + out->writeVInt(newDocCode | 1); + } else { + out->writeVInt(newDocCode); + out->writeVInt(freq); + } + } else { + out->writeVInt(curDoc - lastDoc); + lastDoc = curDoc; + } + } +#endif + docDeltaBuffer.resize(0); + freqBuffer.resize(0); +} + +uint32_t pfor_decode(store::IndexInput* in, std::vector<uint32_t>& docs, std::vector<uint32_t>& freqs, bool has_prox, bool compatibleRead) { + char mode = in->readByte(); + uint32_t arraySize = in->readVInt(); + // old version, need to separate read based on compatibleRead + if (mode == (char)index::CodeMode::kPfor) { + { + uint32_t SerializedSize = in->readVInt(); + std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); + in->readBytes(buf.data(), 
SerializedSize); +#if defined(USE_AVX2) && defined(__AVX2__) + // if compatibleRead is true, means we are reading old version arm64 index in x86_64 platform. + if (compatibleRead) { + p4nd1dec32(buf.data(), arraySize, docs.data()); + } else { + p4nd1dec256v32(buf.data(), arraySize, docs.data()); + } +#elif (defined(__ARM_NEON)) + // if compatibleRead is true, means we are reading old version x86_64 index in arm64 platform. + if (compatibleRead) { + p4nd1dec256scalarv32(buf.data(), arraySize, docs.data()); + } else { + p4nd1dec32(buf.data(), arraySize, docs.data()); + } +#elif (defined(__SSSE3__)) + // if compatibleRead is true, means we are reading old version x86_64 index in x86_64 which does not support avx2. + if (compatibleRead) { + p4nd1dec256scalarv32(buf.data(), arraySize, docs.data()); + } else { + DefaultDDEC(buf.data(), arraySize, docs.data()); + } +#else + DefaultDDEC(buf.data(), arraySize, docs.data()); +#endif + } + if (has_prox) { + uint32_t SerializedSize = in->readVInt(); + std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); + in->readBytes(buf.data(), SerializedSize); +#if defined(USE_AVX2) && defined(__AVX2__) + // if compatibleRead is true, means we are reading old version arm64 index in x86_64 platform. + if (compatibleRead) { + p4nzdec32(buf.data(), arraySize, freqs.data()); + } else { + p4nzdec256v32(buf.data(), arraySize, freqs.data()); + } +#elif (defined(__ARM_NEON)) + // if compatibleRead is true, means we are reading old version x86_64 index in arm64 platform. + if (compatibleRead) { + p4nzdec256scalarv32(buf.data(), arraySize, freqs.data()); + } else { + p4nzdec32(buf.data(), arraySize, freqs.data()); + } +#elif (defined(__SSSE3__)) + // if compatibleRead is true, means we are reading old version x86_64 index in x86_64 which does not support avx2. 
+ if (compatibleRead) { + p4nzdec256scalarv32(buf.data(), arraySize, freqs.data()); + } else { + DefaultDEC(buf.data(), arraySize, freqs.data()); + } +#else + DefaultDEC(buf.data(), arraySize, freqs.data()); +#endif + } + } else if (mode == (char)index::CodeMode::kDefault) { + uint32_t docDelta = 0; + for (uint32_t i = 0; i < arraySize; i++) { + uint32_t docCode = in->readVInt(); + if (has_prox) { + docDelta += (docCode >> 1); + docs[i] = docDelta; + if ((docCode & 1) != 0) { + freqs[i] = 1; + } else { + freqs[i] = in->readVInt(); + } + } else { + docDelta += docCode; + docs[i] = docDelta; + } + } + } else if (mode == (char)index::CodeMode::kPfor256) { + // new version, read based on compatibleRead + { + uint32_t SerializedSize = in->readVInt(); + std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); + in->readBytes(buf.data(), SerializedSize); +#if defined(USE_AVX2) && defined(__AVX2__) + p4nd1dec256v32(buf.data(), arraySize, docs.data()); +#else + _CLTHROWA(CL_ERR_CorruptIndex, "PFOR256 is not supported on this platform"); +#endif + } + if (has_prox) { + uint32_t SerializedSize = in->readVInt(); + std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); + in->readBytes(buf.data(), SerializedSize); +#if defined(USE_AVX2) && defined(__AVX2__) + p4nzdec256v32(buf.data(), arraySize, freqs.data()); +#else + _CLTHROWA(CL_ERR_CorruptIndex, "PFOR256 is not supported on this platform"); +#endif + } + } else if (mode == (char)index::CodeMode::kPfor128) { + // new version, read based on compatibleRead + { + uint32_t SerializedSize = in->readVInt(); + std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); + in->readBytes(buf.data(), SerializedSize); +#if defined(USE_AVX2) && defined(__AVX2__) + p4nd1dec32(buf.data(), arraySize, docs.data()); +#elif (defined(__SSSE3__) || defined(__ARM_NEON)) + p4nd1dec32(buf.data(), arraySize, docs.data()); +#else + _CLTHROWA(CL_ERR_CorruptIndex, "PFOR128 is not supported on this platform"); +#endif + } + if (has_prox) { + 
uint32_t SerializedSize = in->readVInt(); + std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); + in->readBytes(buf.data(), SerializedSize); +#if defined(USE_AVX2) && defined(__AVX2__) + p4nzdec32(buf.data(), arraySize, freqs.data()); +#elif (defined(__SSSE3__) || defined(__ARM_NEON)) + p4nzdec32(buf.data(), arraySize, freqs.data()); +#else + _CLTHROWA(CL_ERR_CorruptIndex, "PFOR128 is not supported on this platform"); +#endif + } + } + return arraySize; +} +CL_NS_END diff --git a/src/core/CLucene/util/PFORUtil.h b/src/core/CLucene/util/PFORUtil.h index 29acb7fe7a6..bf44cb1bc23 100644 --- a/src/core/CLucene/util/PFORUtil.h +++ b/src/core/CLucene/util/PFORUtil.h @@ -18,9 +18,17 @@ #include <cstddef> #include <cstdint> +#include "CLucene/SharedHeader.h" +#include "CLucene/CLConfig.h" +#include "CLucene/store/IndexOutput.h" +#include "CLucene/store/IndexInput.h" +#include <vector> +CL_NS_DEF(util) size_t P4DEC(unsigned char *__restrict in, size_t n, uint32_t *__restrict out); size_t P4NZDEC(unsigned char *__restrict in, size_t n, uint32_t *__restrict out); size_t P4ENC(uint32_t *__restrict in, size_t n, unsigned char *__restrict out); size_t P4NZENC(uint32_t *__restrict in, size_t n, unsigned char *__restrict out); - +void pfor_encode(store::IndexOutput* out, std::vector<uint32_t>& docDeltaBuffer, std::vector<uint32_t>& freqBuffer, bool has_prox); +uint32_t pfor_decode(store::IndexInput* in, std::vector<uint32_t>& docs, std::vector<uint32_t>& freqs, bool has_prox, bool compatibleRead); +CL_NS_END diff --git a/src/ext/for/CMakeLists.txt b/src/ext/for/CMakeLists.txt index 3b14781f69e..9c139a6f2a6 100644 --- a/src/ext/for/CMakeLists.txt +++ b/src/ext/for/CMakeLists.txt @@ -84,8 +84,24 @@ foreach(SRC_FILE ${SRC_FILES}) endif() endforeach() +add_executable(test_bitd1unpack test_bitd1unpack.cpp) + +target_link_libraries(test_bitd1unpack ic) + +if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") + target_compile_options(test_bitd1unpack PRIVATE ${AVX2} -DAVX2_ON ${DEBUG}) 
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + target_compile_options(test_bitd1unpack PRIVATE -march=armv8-a ${DEBUG}) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le") + target_compile_options(test_bitd1unpack PRIVATE -mcpu=power9 -mtune=power9 -D__SSSE3__ ${DEBUG}) +endif() + set(LIB_DESTINATION ../) install(TARGETS ic DESTINATION ${LIB_DESTINATION} COMPONENT ext) + +install(TARGETS test_bitd1unpack + DESTINATION ${CMAKE_INSTALL_PREFIX}/bin + COMPONENT tests) diff --git a/src/ext/for/bitpack.h b/src/ext/for/bitpack.h index b0b9e02275a..b8f91ad5690 100644 --- a/src/ext/for/bitpack.h +++ b/src/ext/for/bitpack.h @@ -30,6 +30,7 @@ #include <stdint.h> #endif #include <stddef.h> +#include <stdbool.h> #ifdef __cplusplus extern "C" { @@ -271,6 +272,7 @@ unsigned char *bitunpack128v64( const unsigned char *__restrict in, unsigned n, unsigned char *bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b); unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b); +unsigned char *bitzunpack256scalarv32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b); unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b); unsigned char *bitd1unpack256v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b); unsigned char *bitfunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b); @@ -299,10 +301,25 @@ unsigned char *_bitd1unpack128h32(const unsigned char *__restrict in, unsigned n unsigned char *_bitunpack256w32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb); unsigned char *_bitunpack128v64( const unsigned char *__restrict in, unsigned n, 
uint64_t *__restrict out, unsigned b, uint32_t *__restrict pex, unsigned char *bb); +unsigned char* bitd1unpack256scalarv32(const unsigned char* __restrict in, unsigned n, + unsigned* __restrict out, unsigned start, unsigned b); +unsigned char* _bitd1unpack256scalarv32(const unsigned char* __restrict in, unsigned n, + unsigned* __restrict out, unsigned start, unsigned b, + unsigned* __restrict pex, unsigned char* bb); +unsigned char *bitunpack256scalarv32(const unsigned char *__restrict in, unsigned n, + unsigned *__restrict out, unsigned b); +unsigned char *_bitunpack256scalarv32(const unsigned char *__restrict in, + unsigned n, + unsigned *__restrict out, + unsigned b, + unsigned *__restrict pex, + unsigned char *bb, + bool isZigZag); unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb); unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb); unsigned char *_bitd1unpack256v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb); unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb); +unsigned char *_bitzunpack256scalarv32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb); #ifdef __cplusplus } #endif diff --git a/src/ext/for/bitunpack.c b/src/ext/for/bitunpack.c index 1dd78003ada..26817ac55bc 100644 --- a/src/ext/for/bitunpack.c +++ b/src/ext/for/bitunpack.c @@ -28,6 +28,9 @@ #include "bitutil.h" #include "bitpack.h" #include "vint.h" +#include <string.h> +#include <stdlib.h> +#include <stdbool.h> #define PAD8(_x_) 
(((_x_)+7)/8) @@ -690,6 +693,1205 @@ unsigned char *bitunpack256w32( const unsigned char *__restrict in, unsigned n, BITUNPACK128V32(in, b, out, sv); return (unsigned char *)_in+PAD8(256*b); } +static void applyException_8bits(uint8_t xm8, uint32_t** pPEX, int nb, uint32_t ov[8]) { + uint32_t* ex = *pPEX; + for (int j = 0; j < 8; j++) { + if ((xm8 >> j) & 1) { + ov[j] += (ex[0] << nb); + ex++; + } + } + *pPEX = ex; +} +static inline uint32_t zigzagDecode_scalar(uint32_t x) { + // (x>>1) ^ -((x & 1) ) + return (x >> 1) ^ -(x & 1); +} +static void bitunblk256v32_scalar_template(uint32_t** pIn, uint32_t** pOut, int expansions_count, + const uint8_t* SHIFT_HI, const uint8_t* SHIFT_LO, + const uint8_t* READ_FLAG, uint32_t mask, int nb, + bool isZigZag) { + const uint32_t* oldp = NULL; // pointer to current block data + uint32_t ov[8], tmp[8]; + + for (int k = 0; k < expansions_count; k++) { + if (k == 0) { + // Step 0: Load input block and directly take the lower nb bits + oldp = *pIn; + *pIn += 8; + for (int j = 0; j < 8; j++) { + ov[j] = oldp[j] & mask; + } + } else { + // First right shift the current block data by SHIFT_HI[k] + for (int j = 0; j < 8; j++) { + ov[j] = oldp[j] >> SHIFT_HI[k]; + } + if (READ_FLAG[k]) { + // Need to load a new block: left shift the new block data by SHIFT_LO[k], then merge with ov + const uint32_t* newp = *pIn; + *pIn += 8; + for (int j = 0; j < 8; j++) { + uint32_t part_lo = (newp[j] << SHIFT_LO[k]) & mask; + ov[j] |= part_lo; + } + // Update current block pointer + oldp = newp; + } else { + // No need to load a new block, ensure the result is within mask range + for (int j = 0; j < 8; j++) { + ov[j] &= mask; + } + } + } + // Write out the current 8 results + uint32_t* outp = *pOut; + for (int j = 0; j < 8; j++) { + if (isZigZag) { + outp[j] = zigzagDecode_scalar(ov[j]); + } else { + outp[j] = ov[j]; + } + } + *pOut += 8; + } +} +/** + * Generic template: supports "some expansions don't need to read new blocks". 
+ * + * Parameters: + * - expansions_count: total number of expansions (for 29-bit, it might be 32 times) + * - SHIFT_HI[k], SHIFT_LO[k]: right shift for leftover, left shift for new block in k-th expansion + * - READ_FLAG[k]: whether k-th expansion needs to read a new block (1 means yes, 0 means no) + * - mask: for 29-bit = (1u << 29) - 1 + * - nb: base bits (29) + */ +static void bitunblk256v32_scalarBlock_ex_template(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, int expansions_count, + const uint8_t* SHIFT_HI, const uint8_t* SHIFT_LO, + const uint8_t* READ_FLAG, uint32_t mask, int nb, + bool isZigZag) { + const uint32_t* oldp = NULL; // leftover block (previous batch) + + for (int k = 0; k < expansions_count; k++) { + uint32_t ov[8]; + + if (k == 0) { + // First time: directly read 8×32-bit and apply mask + oldp = *pIn; + *pIn += 8; + for (int j = 0; j < 8; j++) { + ov[j] = oldp[j] & mask; + } + } else { + // Subsequent expansions + uint8_t hi = SHIFT_HI[k]; + uint8_t lo = SHIFT_LO[k]; + + // First shift leftover >> hi + for (int j = 0; j < 8; j++) { + ov[j] = (oldp[j] >> hi); + } + + // If this expansion needs to read a new block, append newp << lo + if (READ_FLAG[k]) { + const uint32_t* newp = *pIn; + *pIn += 8; + for (int j = 0; j < 8; j++) { + uint32_t part_lo = (newp[j] << lo) & mask; + ov[j] |= part_lo; + } + // After reading, newp becomes the leftover for next time + oldp = newp; + } else { + // No need to read new block => just apply mask to leftover >> hi + for (int j = 0; j < 8; j++) { + ov[j] &= mask; + } + // leftover remains unchanged, continue using oldp + } + } + + // Apply exceptions + uint8_t xm8 = **pBB; + (*pBB)++; + applyException_8bits(xm8, pPEX, nb, ov); + + // Write out this batch of 8 results + uint32_t* outp = *pOut; + for (int j = 0; j < 8; j++) { + if (isZigZag) { + outp[j] = zigzagDecode_scalar(ov[j]); + } else { + outp[j] = ov[j]; + } + } + *pOut += 8; + } +} +static void bitunpack256v32_0_scalar(uint32_t** 
pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + uint32_t* op = *pOut; + for (int i = 0; i < 32; i++) { + // Read bitmap if exists, otherwise default to 0 + uint8_t xm8 = (pBB != NULL) ? **pBB : 0; + if (pBB != NULL) { + (*pBB)++; + } + // Initialize output array (all zeros by default) + uint32_t ov[8] = {0}; + if (xm8 != 0 && pPEX != NULL) { + applyException_8bits(xm8, pPEX, 0, ov); + } + + // Directly write 8 values using a loop to avoid repeated memory copy calls + for (int j = 0; j < 8; j++) { + if (isZigZag) { + op[j] = zigzagDecode_scalar(ov[j]); + } else { + op[j] = ov[j]; + } + } + op += 8; + } + *pOut = op; +} + +static void bitunpack256v32_1_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const int nb1 = 1; + const uint32_t mask1 = 1; // 0x1 + const int expansions_count_1 = 32; + static const uint8_t SHIFT_HI_1[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + static const uint8_t SHIFT_LO_1[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + static const uint8_t READ_FLAG_1[32] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + if (pPEX != NULL && pBB != NULL) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_1, SHIFT_HI_1, + SHIFT_LO_1, READ_FLAG_1, mask1, nb1, isZigZag); + } else { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_1, SHIFT_HI_1, SHIFT_LO_1, + READ_FLAG_1, mask1, nb1, isZigZag); + } +} + +static void bitunpack256v32_2_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const int nb2 = 2; + const uint32_t mask2 = (1u << nb2) - 1; // 0x3 + const int expansions_count_2 = 16; + static const uint8_t SHIFT_HI_2[16] = {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}; + static 
const uint8_t SHIFT_LO_2[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + static const uint8_t READ_FLAG_2[16] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + if (pPEX != NULL && pBB != NULL) { + for (int i = 0; i < 2; i++) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_2, + SHIFT_HI_2, SHIFT_LO_2, READ_FLAG_2, mask2, nb2, + isZigZag); + } + } else { + for (int i = 0; i < 2; i++) { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_2, SHIFT_HI_2, SHIFT_LO_2, + READ_FLAG_2, mask2, nb2, isZigZag); + } + } +} + +static void bitunpack256v32_3_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const int nb3 = 3; + const uint32_t mask3 = (1u << nb3) - 1; // 0x7 + const int expansions_count_3 = 32; + static const uint8_t SHIFT_HI_3[32] = {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, + 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, + 2, 5, 8, 11, 14, 17, 20, 23, 26, 29}; + static const uint8_t SHIFT_LO_3[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + static const uint8_t READ_FLAG_3[32] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + if (pPEX != NULL && pBB != NULL) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_3, SHIFT_HI_3, + SHIFT_LO_3, READ_FLAG_3, mask3, nb3, isZigZag); + } else { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_3, SHIFT_HI_3, SHIFT_LO_3, + READ_FLAG_3, mask3, nb3, isZigZag); + } +} + +static void bitunpack256v32_4_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const uint32_t mask4 = (1u << 4) - 1; // 0xF + const int nb = 4; // base bits + const int expansions_count = 8; + static const uint8_t SHIFT_HI_4[8] = {0, 4, 8, 12, 16, 20, 24, 28}; + static const uint8_t SHIFT_LO_4[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + static const uint8_t READ_FLAG_4[8] = 
{1, 0, 0, 0, 0, 0, 0, 0}; + + if (pPEX != NULL && pBB != NULL) { + for (int i = 0; i < 4; i++) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count, SHIFT_HI_4, + SHIFT_LO_4, READ_FLAG_4, mask4, nb, isZigZag); + } + } else { + for (int i = 0; i < 4; i++) { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count, SHIFT_HI_4, SHIFT_LO_4, + READ_FLAG_4, mask4, nb, isZigZag); + } + } +} + +static void bitunpack256v32_5_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const int nb5 = 5; + const uint32_t mask5 = (1u << nb5) - 1; // 0x1F + const int expansions_count_5 = 32; + static const uint8_t SHIFT_HI_5[32] = {0, 5, 10, 15, 20, 25, 30, 3, 8, 13, 18, + 23, 28, 1, 6, 11, 16, 21, 26, 31, 4, 9, + 14, 19, 24, 29, 2, 7, 12, 17, 22, 27}; + static const uint8_t SHIFT_LO_5[32] = {0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 4, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0}; + static const uint8_t READ_FLAG_5[32] = {1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}; + if (pPEX != NULL && pBB != NULL) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_5, SHIFT_HI_5, + SHIFT_LO_5, READ_FLAG_5, mask5, nb5, isZigZag); + } else { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_5, SHIFT_HI_5, SHIFT_LO_5, + READ_FLAG_5, mask5, nb5, isZigZag); + } +} +static void bitunpack256v32_6_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const int nb6 = 6; + const uint32_t mask6 = (1u << nb6) - 1; // 0x3F + const int expansions_count_6 = 16; + static const uint8_t SHIFT_HI_6[16] = {0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26}; + static const uint8_t SHIFT_LO_6[16] = {0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0}; + static const uint8_t READ_FLAG_6[16] = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}; + + if (pPEX != NULL && pBB != NULL) { + for (int i 
= 0; i < 2; i++) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_6, + SHIFT_HI_6, SHIFT_LO_6, READ_FLAG_6, mask6, nb6, + isZigZag); + } + } else { + for (int i = 0; i < 2; i++) { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_6, SHIFT_HI_6, SHIFT_LO_6, + READ_FLAG_6, mask6, nb6, isZigZag); + } + } +} +static void bitunpack256v32_7_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const int nb7 = 7; + const uint32_t mask7 = (1u << nb7) - 1; // 0x7F + const int expansions_count = 32; + static const uint8_t SHIFT_HI_7[32] = {0, 7, 14, 21, 28, 3, 10, 17, 24, 31, 6, + 13, 20, 27, 2, 9, 16, 23, 30, 5, 12, 19, + 26, 1, 8, 15, 22, 29, 4, 11, 18, 25}; + static const uint8_t SHIFT_LO_7[32] = {0, 0, 0, 0, 4, 0, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, + 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 0, 3, 0, 0, 0, 0}; + static const uint8_t READ_FLAG_7[32] = {1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0}; + if (pPEX != NULL && pBB != NULL) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count, SHIFT_HI_7, + SHIFT_LO_7, READ_FLAG_7, mask7, nb7, isZigZag); + } else { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count, SHIFT_HI_7, SHIFT_LO_7, + READ_FLAG_7, mask7, nb7, isZigZag); + } +} +static void bitunpack256v32_8_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const int nb8 = 8; + const uint32_t mask8 = (1u << nb8) - 1; // 0xFF + const int expansions_count_8 = 4; + static const uint8_t SHIFT_HI_8[4] = {0, 8, 16, 24}; + static const uint8_t SHIFT_LO_8[4] = {0, 0, 0, 0}; + static const uint8_t READ_FLAG_8[4] = {1, 0, 0, 0}; + + if (pPEX != NULL && pBB != NULL) { + for (int i = 0; i < 8; i++) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_8, + SHIFT_HI_8, SHIFT_LO_8, READ_FLAG_8, mask8, nb8, + isZigZag); + } + } else { + for (int i = 0; 
i < 8; i++) { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_8, SHIFT_HI_8, SHIFT_LO_8, + READ_FLAG_8, mask8, nb8, isZigZag); + } + } +} + +static void bitunpack256v32_9_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const int nb9 = 9; + const uint32_t mask9 = (1u << nb9) - 1; // 0x1FF + const int expansions_count_9 = 32; + static const uint8_t SHIFT_HI_9[32] = {0, 9, 18, 27, 4, 13, 22, 31, 8, 17, 26, + 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, + 6, 15, 24, 1, 10, 19, 28, 5, 14, 23}; + static const uint8_t SHIFT_LO_9[32] = {0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 6, 0, 0, 0, 2, 0, + 0, 7, 0, 0, 0, 3, 0, 0, 8, 0, 0, 0, 4, 0, 0, 0}; + static const uint8_t READ_FLAG_9[32] = {1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, + 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}; + + if (pPEX != NULL && pBB != NULL) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_9, SHIFT_HI_9, + SHIFT_LO_9, READ_FLAG_9, mask9, nb9, isZigZag); + } else { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_9, SHIFT_HI_9, SHIFT_LO_9, + READ_FLAG_9, mask9, nb9, isZigZag); + } +} +static void bitunpack256v32_10_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const int nb10 = 10; + const uint32_t mask10 = (1u << nb10) - 1; // 0x3FF + const int expansions_count_10 = 16; + static const uint8_t SHIFT_HI_10[16] = {0, 10, 20, 30, 8, 18, 28, 6, + 16, 26, 4, 14, 24, 2, 12, 22}; + static const uint8_t SHIFT_LO_10[16] = {0, 0, 0, 2, 0, 0, 4, 0, 0, 6, 0, 0, 8, 0, 0, 0}; + static const uint8_t READ_FLAG_10[16] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0}; + + if (pPEX != NULL && pBB != NULL) { + for (int i = 0; i < 2; i++) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_10, + SHIFT_HI_10, SHIFT_LO_10, READ_FLAG_10, mask10, + nb10, isZigZag); + } + } else { + for (int i = 0; i < 2; i++) { + 
bitunblk256v32_scalar_template(pIn, pOut, expansions_count_10, SHIFT_HI_10, SHIFT_LO_10, + READ_FLAG_10, mask10, nb10, isZigZag); + } + } +} + +static void bitunpack256v32_11_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const int nb11 = 11; + const uint32_t mask11 = (1u << nb11) - 1; // 0x7FF + const int expansions_count_11 = 32; + static const uint8_t SHIFT_HI_11[32] = {0, 11, 22, 1, 12, 23, 2, 13, 24, 3, 14, + 25, 4, 15, 26, 5, 16, 27, 6, 17, 28, 7, + 18, 29, 8, 19, 30, 9, 20, 31, 10, 21}; + static const uint8_t SHIFT_LO_11[32] = {0, 0, 10, 0, 0, 9, 0, 0, 8, 0, 0, 7, 0, 0, 6, 0, + 0, 5, 0, 0, 4, 0, 0, 3, 0, 0, 2, 0, 0, 1, 0, 0}; + static const uint8_t READ_FLAG_11[32] = {1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, + 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0}; + + if (pPEX != NULL && pBB != NULL) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_11, SHIFT_HI_11, + SHIFT_LO_11, READ_FLAG_11, mask11, nb11, isZigZag); + } else { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_11, SHIFT_HI_11, SHIFT_LO_11, + READ_FLAG_11, mask11, nb11, isZigZag); + } +} + +static void bitunpack256v32_12_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const int nb12 = 12; + const uint32_t mask12 = (1u << nb12) - 1; // 0xFFF + const int expansions_count_12 = 8; + static const uint8_t SHIFT_HI_12[8] = {0, 12, 24, 4, 16, 28, 8, 20}; + static const uint8_t SHIFT_LO_12[8] = {0, 0, 8, 0, 0, 4, 0, 0}; + static const uint8_t READ_FLAG_12[8] = {1, 0, 1, 0, 0, 1, 0, 0}; + + if (pPEX != NULL && pBB != NULL) { + for (int i = 0; i < 4; i++) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_12, + SHIFT_HI_12, SHIFT_LO_12, READ_FLAG_12, mask12, + nb12, isZigZag); + } + } else { + for (int i = 0; i < 4; i++) { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_12, SHIFT_HI_12, SHIFT_LO_12, + 
READ_FLAG_12, mask12, nb12, isZigZag); + } + } +} + +static void bitunpack256v32_13_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const int nb13 = 13; + const uint32_t mask13 = (1u << nb13) - 1; // 0x1FFF + const int expansions_count_13 = 32; + static const uint8_t SHIFT_HI_13[32] = {0, 13, 26, 7, 20, 1, 14, 27, 8, 21, 2, + 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, + 30, 11, 24, 5, 18, 31, 12, 25, 6, 19}; + static const uint8_t SHIFT_LO_13[32] = {0, 0, 6, 0, 12, 0, 0, 5, 0, 11, 0, 0, 4, 0, 10, 0, + 0, 3, 0, 9, 0, 0, 2, 0, 8, 0, 0, 1, 0, 7, 0, 0}; + static const uint8_t READ_FLAG_13[32] = {1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, + 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0}; + if (pPEX != NULL && pBB != NULL) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_13, SHIFT_HI_13, + SHIFT_LO_13, READ_FLAG_13, mask13, nb13, isZigZag); + } else { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_13, SHIFT_HI_13, SHIFT_LO_13, + READ_FLAG_13, mask13, nb13, isZigZag); + } +} + +static void bitunpack256v32_14_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const int nb14 = 14; + const uint32_t mask14 = (1u << nb14) - 1; // 0x3FFF + const int expansions_count_14 = 16; + static const uint8_t SHIFT_HI_14[16] = {0, 14, 28, 10, 24, 6, 20, 2, + 16, 30, 12, 26, 8, 22, 4, 18}; + static const uint8_t SHIFT_LO_14[16] = {0, 0, 4, 0, 8, 0, 12, 0, 0, 2, 0, 6, 0, 10, 0, 0}; + static const uint8_t READ_FLAG_14[16] = {1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0}; + if (pPEX != NULL && pBB != NULL) { + for (int i = 0; i < 2; i++) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_14, + SHIFT_HI_14, SHIFT_LO_14, READ_FLAG_14, mask14, + nb14, isZigZag); + } + } else { + for (int i = 0; i < 2; i++) { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_14, SHIFT_HI_14, SHIFT_LO_14, + READ_FLAG_14, 
mask14, nb14, isZigZag); + } + } +} + +static void bitunpack256v32_15_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const int nb15 = 15; + const uint32_t mask15 = (1u << 15) - 1; // 0x7FFF + + // expansions=32 => unpacks 256 values at once + const int expansions_count_15 = 32; + + static const uint8_t SHIFT_HI_15[32] = {0, 15, 30, 13, 28, 11, 26, 9, 24, 7, 22, + 5, 20, 3, 18, 1, 16, 31, 14, 29, 12, 27, + 10, 25, 8, 23, 6, 21, 4, 19, 2, 17}; + + static const uint8_t SHIFT_LO_15[32] = {0, 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, + 0, 1, 0, 3, 0, 5, 0, 7, 0, 9, 0, 11, 0, 13, 0, 0}; + + static const uint8_t READ_FLAG_15[32] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0}; + if (pPEX != NULL && pBB != NULL) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_15, SHIFT_HI_15, + SHIFT_LO_15, READ_FLAG_15, mask15, nb15, isZigZag); + } else { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_15, SHIFT_HI_15, SHIFT_LO_15, + READ_FLAG_15, mask15, nb15, isZigZag); + } +} + +static void bitunpack256v32_16_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const int nb16 = 16; + const uint32_t mask16 = (1u << 16) - 1; // 0xFFFF + + const int expansions_count = 2; + // Iteration 0: directly read 8×32-bit; Iteration 1: only right shift 16 bits, no new data read + static const uint8_t SHIFT_HI_16[2] = {0, 16}; + static const uint8_t SHIFT_LO_16[2] = {0, 0}; + static const uint8_t READ_FLAG_16[2] = {1, 0}; + + if (pPEX != NULL && pBB != NULL) { + for (int i = 0; i < 16; i++) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count, + SHIFT_HI_16, SHIFT_LO_16, READ_FLAG_16, mask16, + nb16, isZigZag); + } + } else { + for (int i = 0; i < 16; i++) { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count, SHIFT_HI_16, SHIFT_LO_16, + READ_FLAG_16, mask16, 
nb16, isZigZag); + } + } +} + +static void bitunpack256v32_17_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const int nb17 = 17; + const uint32_t mask17 = (1u << 17) - 1; // 0x1FFFF + + // expansions=32 => unpacks 256 values + const int expansions_count_17 = 32; + + static const uint8_t SHIFT_HI_17[32] = {0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, + 27, 12, 29, 14, 31, 16, 1, 18, 3, 20, 5, + 22, 7, 24, 9, 26, 11, 28, 13, 30, 15}; + + static const uint8_t SHIFT_LO_17[32] = {0, 15, 0, 13, 0, 11, 0, 9, 0, 7, 0, 5, 0, 3, 0, 1, + 16, 0, 14, 0, 12, 0, 10, 0, 8, 0, 6, 0, 4, 0, 2, 0}; + + static const uint8_t READ_FLAG_17[32] = {1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0}; + if (pPEX != NULL && pBB != NULL) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_17, SHIFT_HI_17, + SHIFT_LO_17, READ_FLAG_17, mask17, nb17, isZigZag); + } else { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_17, SHIFT_HI_17, SHIFT_LO_17, + READ_FLAG_17, mask17, nb17, isZigZag); + } +} + +static void bitunpack256v32_18_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + // base bits & mask + const int nb18 = 18; + const uint32_t mask18 = (1u << 18) - 1; // 0x3FFFF + + // expansions=16 => 128 values + const int expansions_count_18 = 16; + + static const uint8_t SHIFT_HI_18[16] = {0, 18, 4, 22, 8, 26, 12, 30, + 16, 2, 20, 6, 24, 10, 28, 14}; + static const uint8_t SHIFT_LO_18[16] = {0, 14, 0, 10, 0, 6, 0, 2, 16, 0, 12, 0, 8, 0, 4, 0}; + static const uint8_t READ_FLAG_18[16] = {// #0 =>1, #1 =>1, #2=>0, #3=>1, + // #4 =>0, #5 =>1, #6=>0, #7=>1, + // #8 =>1, #9 =>0, #10=>1, #11=>0, + // #12=>1, #13=>0, #14=>1, #15=>0 + 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0}; + + if (pPEX != NULL && pBB != NULL) { + for (int i = 0; i < 2; i++) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, 
expansions_count_18, + SHIFT_HI_18, SHIFT_LO_18, READ_FLAG_18, mask18, + nb18, isZigZag); + } + } else { + for (int i = 0; i < 2; i++) { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_18, SHIFT_HI_18, SHIFT_LO_18, + READ_FLAG_18, mask18, nb18, isZigZag); + } + } +} + +static void bitunpack256v32_19_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + // base bits & mask + const int nb19 = 19; + const uint32_t mask19 = (1u << 19) - 1; // 0x7FFFF + + // expansions=32 => unpacks 256 values at once + const int expansions_count_19 = 32; + + static const uint8_t SHIFT_HI_19[32] = {0, 19, 6, 25, 12, 31, 18, 5, 24, 11, 30, + 17, 4, 23, 10, 29, 16, 3, 22, 9, 28, 15, + 2, 21, 8, 27, 14, 1, 20, 7, 26, 13}; + static const uint8_t SHIFT_LO_19[32] = {0, 13, 0, 7, 0, 1, 14, 0, 8, 0, 2, 15, 0, 9, 0, 3, + 16, 0, 10, 0, 4, 17, 0, 11, 0, 5, 18, 0, 12, 0, 6, 0}; + static const uint8_t READ_FLAG_19[32] = {1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, + 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0}; + if (pPEX != NULL && pBB != NULL) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_19, SHIFT_HI_19, + SHIFT_LO_19, READ_FLAG_19, mask19, nb19, isZigZag); + } else { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_19, SHIFT_HI_19, SHIFT_LO_19, + READ_FLAG_19, mask19, nb19, isZigZag); + } +} + +static void bitunpack256v32_20_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + // base bits & mask + const int nb20 = 20; + const uint32_t mask20 = (1u << 20) - 1; // 0xFFFFF + + // expansions=8 => process 64 values at once + const int expansions_count_20 = 8; + + // shift tables for k=0..7 + static const uint8_t SHIFT_HI_20[8] = {0, 20, 8, 28, 16, 4, 24, 12}; + static const uint8_t SHIFT_LO_20[8] = {0, 12, 0, 4, 16, 0, 8, 0}; + static const uint8_t READ_FLAG_20[8] = {1, 1, 0, 1, 1, 0, 1, 0}; + + if (pPEX != NULL && pBB != NULL) { + for (int i 
= 0; i < 4; i++) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_20, + SHIFT_HI_20, SHIFT_LO_20, READ_FLAG_20, mask20, + nb20, isZigZag); + } + } else { + for (int i = 0; i < 4; i++) { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_20, SHIFT_HI_20, SHIFT_LO_20, + READ_FLAG_20, mask20, nb20, isZigZag); + } + } +} + +static void bitunpack256v32_21_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + // base bits & mask + const uint32_t mask21 = (1u << 21) - 1; // 0x1FFFFF + const int nb21 = 21; + + // expansions=32 => unpacks 256 values at once + const int expansions_count_21 = 32; + + static const uint8_t SHIFT_HI_21[32] = {0, 21, 10, 31, 20, 9, 30, 19, 8, 29, 18, + 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, + 14, 3, 24, 13, 2, 23, 12, 1, 22, 11}; + static const uint8_t SHIFT_LO_21[32] = {0, 11, 0, 1, 12, 0, 2, 13, 0, 3, 14, + 0, 4, 15, 0, 5, 16, 0, 6, 17, 0, 7, + 18, 0, 8, 19, 0, 9, 20, 0, 10, 0}; + static const uint8_t READ_FLAG_21[32] = { + // Check original expansions #k if there's a "load #X" => 1 if yes, 0 if no + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, + 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0}; + + if (pPEX != NULL && pBB != NULL) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_21, SHIFT_HI_21, + SHIFT_LO_21, READ_FLAG_21, mask21, nb21, isZigZag); + } else { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_21, SHIFT_HI_21, SHIFT_LO_21, + READ_FLAG_21, mask21, nb21, isZigZag); + } +} + +static void bitunpack256v32_22_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + // base bits & mask + const uint32_t mask22 = (1u << 22) - 1; // 0x3FFFFF + const int nb22 = 22; + + // b=22 => one block function with expansions=16 => outputs 128 values + // need to call it twice to get 256 values + const int expansions_count_22 = 16; + + static const uint8_t SHIFT_HI_22[16] = {/* 
0 */ 0, /* 1 */ 22, /* 2 */ 12, /* 3 */ 2, + /* 4 */ 24, /* 5 */ 14, /* 6 */ 4, /* 7 */ 26, + /* 8 */ 16, /* 9 */ 6, /*10 */ 28, /*11 */ 18, + /*12 */ 8, /*13 */ 30, /*14 */ 20, /*15 */ 10}; + + static const uint8_t SHIFT_LO_22[16] = {/* 0 */ 0, /* 1 */ 10, /* 2 */ 20, /* 3 */ 0, + /* 4 */ 8, /* 5 */ 18, /* 6 */ 0, /* 7 */ 6, + /* 8 */ 16, /* 9 */ 0, /*10 */ 4, /*11 */ 14, + /*12 */ 0, /*13 */ 2, /*14 */ 12, /*15 */ 0}; + + static const uint8_t READ_FLAG_22[16] = { + // From original code: expansions #3, #6, #9, #12, #15 don't read, others do + 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0}; + + if (pPEX != NULL && pBB != NULL) { + for (int i = 0; i < 2; i++) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_22, + SHIFT_HI_22, SHIFT_LO_22, READ_FLAG_22, mask22, + nb22, isZigZag); + } + } else { + for (int i = 0; i < 2; i++) { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_22, SHIFT_HI_22, SHIFT_LO_22, + READ_FLAG_22, mask22, nb22, isZigZag); + } + } +} + +static void bitunpack256v32_23_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + // base bits & mask + const int nb23 = 23; + const uint32_t mask23 = (1u << 23) - 1; // 0x7FFFFF + + // expansions_count=32 + const int expansions_count_23 = 32; + + // Predefined SHIFT_HI_23, SHIFT_LO_23, READ_FLAG_23 + static const uint8_t SHIFT_HI_23[32] = {0, 23, 14, 5, 28, 19, 10, 1, 24, 15, 6, + 29, 20, 11, 2, 25, 16, 7, 30, 21, 12, 3, + 26, 17, 8, 31, 22, 13, 4, 27, 18, 9}; + + static const uint8_t SHIFT_LO_23[32] = {0, 9, 18, 0, 4, 13, 22, 0, 8, 17, 0, + 3, 12, 21, 0, 7, 16, 0, 2, 11, 20, 0, + 6, 15, 0, 1, 10, 19, 0, 5, 14, 0}; + + static const uint8_t READ_FLAG_23[32] = {1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, + 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0}; + + if (pPEX != NULL && pBB != NULL) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_23, SHIFT_HI_23, + SHIFT_LO_23, READ_FLAG_23, 
mask23, nb23, isZigZag); + } else { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_23, SHIFT_HI_23, SHIFT_LO_23, + READ_FLAG_23, mask23, nb23, isZigZag); + } +} +// Unpack 256 values packed at 24 bits each (scalar path): 8 template passes of 4 steps. +static void bitunpack256v32_24_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + // base bits & mask + const int nb24 = 24; + const uint32_t mask24 = (1u << 24) - 1; // 0xFFFFFF + + // expansions_count=4 (corresponds to 4 expansions => outputs 32 values) + const int expansions_count_24 = 4; + + // k=0 => leftover>>0, new<<0 + // k=1 => leftover>>24, new<<8 + // k=2 => leftover>>16, new<<16 + // k=3 => leftover>>8, no new block read + static const uint8_t SHIFT_HI_24[4] = {0, 24, 16, 8}; + static const uint8_t SHIFT_LO_24[4] = {0, 8, 16, 0}; + + // Only read new blocks for steps 0,1,2, not for step 3 + static const uint8_t READ_FLAG_24[4] = {1, 1, 1, 0}; + + if (pPEX != NULL && pBB != NULL) { + for (int i = 0; i < 8; i++) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_24, + SHIFT_HI_24, SHIFT_LO_24, READ_FLAG_24, mask24, + nb24, isZigZag); + } + } else { + for (int i = 0; i < 8; i++) { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_24, SHIFT_HI_24, SHIFT_LO_24, + READ_FLAG_24, mask24, nb24, isZigZag); + } + } +} + +// Unpack 256 values packed at 25 bits each (scalar, non-SIMD path). +// The exception-aware template is used when both pPEX and pBB are non-NULL, the plain one otherwise. +// One template pass runs all 32 steps (8 values per step = 256 values), so each branch calls its template once. +static void bitunpack256v32_25_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + // mask & base bits + const uint32_t mask25 = (1u << 25) - 1; // 0x1FFFFFF + const int nb25 = 25; + + // 32 expansions total + const int expansions_count_25 = 32; + + // Extract high and low shift amounts from original implementation + static const uint8_t SHIFT_HI_25[32] = { + /* #0 */ 0, /* #1 */ 25, /* #2 */ 18, /* #3 */ 11, + /* #4 */ 4, /* #5 */ 29, /* #6 */ 22, /* #7 */ 15, + /* #8 */ 8, /* #9 */ 1, /* #10 */ 26, /* #11 */ 19, + /* #12 */ 12, /* #13 */ 5, /* #14 */ 30, /* #15 */ 23, + /* #16 */ 16, /* #17 */ 9, /* #18 */ 2, /* #19 */ 27, + /* #20 */ 20, 
/* #21 */ 13, /* #22 */ 6, /* #23 */ 31, + /* #24 */ 24, /* #25 */ 17, /* #26 */ 10, /* #27 */ 3, + /* #28 */ 28, /* #29 */ 21, /* #30 */ 14, /* #31 */ 7}; + + static const uint8_t SHIFT_LO_25[32] = { + /* #0 */ 0, /* #1 */ 7, /* #2 */ 14, /* #3 */ 21, + /* #4 */ 0, /* #5 */ 3, /* #6 */ 10, /* #7 */ 17, + /* #8 */ 24, /* #9 */ 0, /* #10 */ 6, /* #11 */ 13, + /* #12 */ 20, /* #13 */ 0, /* #14 */ 2, /* #15 */ 9, + /* #16 */ 16, /* #17 */ 23, /* #18 */ 0, /* #19 */ 5, + /* #20 */ 12, /* #21 */ 19, /* #22 */ 0, /* #23 */ 1, + /* #24 */ 8, /* #25 */ 15, /* #26 */ 22, /* #27 */ 0, + /* #28 */ 4, /* #29 */ 11, /* #30 */ 18, /* #31 */ 0}; + + // Mark which steps don't need to read new data + // Based on original code, expansions #4, #9, #13, #18, #22, #27, #31 don't need to read new data + static const uint8_t READ_FLAG_25[32] = { + /* #0 */ 1, /* #1 */ 1, /* #2 */ 1, /* #3 */ 1, + /* #4 */ 0, /* #5 */ 1, /* #6 */ 1, /* #7 */ 1, + /* #8 */ 1, /* #9 */ 0, /* #10 */ 1, /* #11 */ 1, + /* #12 */ 1, /* #13 */ 0, /* #14 */ 1, /* #15 */ 1, + /* #16 */ 1, /* #17 */ 1, /* #18 */ 0, /* #19 */ 1, + /* #20 */ 1, /* #21 */ 1, /* #22 */ 0, /* #23 */ 1, + /* #24 */ 1, /* #25 */ 1, /* #26 */ 1, /* #27 */ 0, + /* #28 */ 1, /* #29 */ 1, /* #30 */ 1, /* #31 */ 0}; + if (pPEX != NULL && pBB != NULL) { + // A single 32-step pass already covers the whole 256-value block; a second pass would decode past it. + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_25, SHIFT_HI_25, + SHIFT_LO_25, READ_FLAG_25, mask25, nb25, isZigZag); + } else { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_25, SHIFT_HI_25, SHIFT_LO_25, + READ_FLAG_25, mask25, nb25, isZigZag); + } +} + +static void bitunpack256v32_26_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + // mask & base bits + const uint32_t mask26 = (1u << 26) - 1; // 0x3FFFFFF + const int nb26 = 26; + + // 16 expansions total + const int expansions_count_26 = 16; + + // Extract high and low shift amounts from original 
implementation + static const uint8_t SHIFT_HI_26[16] = { + /* #0 */ 0, /* #1 */ 26, /* #2 */ 20, /* #3 */ 14, + /* #4 */ 8, /* #5 */ 2, /* #6 */ 28, /* #7 */ 22, + /* #8 */ 16, /* #9 */ 10, /* #10 */ 4, /* #11 */ 30, + /* #12 */ 24, /* #13 */ 18, /* #14 */ 12, /* #15 */ 6}; + + static const uint8_t SHIFT_LO_26[16] = { + /* #0 */ 0, /* #1 */ 6, /* #2 */ 12, /* #3 */ 18, + /* #4 */ 24, /* #5 */ 0, /* #6 */ 4, /* #7 */ 10, + /* #8 */ 16, /* #9 */ 22, /* #10 */ 0, /* #11 */ 2, + /* #12 */ 8, /* #13 */ 14, /* #14 */ 20, /* #15 */ 0}; + + // Mark which steps don't need to read new data + // Based on original code, expansions #5, #10, #15 don't need to read new data + static const uint8_t READ_FLAG_26[16] = { + /* #0 */ 1, /* #1 */ 1, /* #2 */ 1, /* #3 */ 1, + /* #4 */ 1, /* #5 */ 0, /* #6 */ 1, /* #7 */ 1, + /* #8 */ 1, /* #9 */ 1, /* #10 */ 0, /* #11 */ 1, + /* #12 */ 1, /* #13 */ 1, /* #14 */ 1, /* #15 */ 0}; + + if (pPEX != NULL && pBB != NULL) { + for (int i = 0; i < 2; i++) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_26, + SHIFT_HI_26, SHIFT_LO_26, READ_FLAG_26, mask26, + nb26, isZigZag); + } + } else { + for (int i = 0; i < 2; i++) { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_26, SHIFT_HI_26, SHIFT_LO_26, + READ_FLAG_26, mask26, nb26, isZigZag); + } + } +} + +static void bitunpack256v32_27_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + // mask & base bits + const uint32_t mask27 = (1u << 27) - 1; // 0x7FFFFFF + const int nb27 = 27; + + // 32 expansions total + const int expansions_count_27 = 32; + + // Extract high and low shift amounts from original implementation + static const uint8_t SHIFT_HI_27[32] = { + /* #0 */ 0, /* #1 */ 27, /* #2 */ 22, /* #3 */ 17, + /* #4 */ 12, /* #5 */ 7, /* #6 */ 2, /* #7 */ 29, + /* #8 */ 24, /* #9 */ 19, /* #10 */ 14, /* #11 */ 9, + /* #12 */ 4, /* #13 */ 31, /* #14 */ 26, /* #15 */ 21, + /* #16 */ 16, /* #17 */ 11, /* #18 
*/ 6, /* #19 */ 1, + /* #20 */ 28, /* #21 */ 23, /* #22 */ 18, /* #23 */ 13, + /* #24 */ 8, /* #25 */ 3, /* #26 */ 30, /* #27 */ 25, + /* #28 */ 20, /* #29 */ 15, /* #30 */ 10, /* #31 */ 5}; + + static const uint8_t SHIFT_LO_27[32] = { + /* #0 */ 0, /* #1 */ 5, /* #2 */ 10, /* #3 */ 15, + /* #4 */ 20, /* #5 */ 25, /* #6 */ 0, /* #7 */ 3, + /* #8 */ 8, /* #9 */ 13, /* #10 */ 18, /* #11 */ 23, + /* #12 */ 0, /* #13 */ 1, /* #14 */ 6, /* #15 */ 11, + /* #16 */ 16, /* #17 */ 21, /* #18 */ 26, /* #19 */ 0, + /* #20 */ 4, /* #21 */ 9, /* #22 */ 14, /* #23 */ 19, + /* #24 */ 24, /* #25 */ 0, /* #26 */ 2, /* #27 */ 7, + /* #28 */ 12, /* #29 */ 17, /* #30 */ 22, /* #31 */ 0}; + + // Mark which steps don't need to read new data + // From original code, steps #6, #12, #19, #25, #31 don't have CPY8(iv, *pIn) + static const uint8_t READ_FLAG_27[32] = { + /* #0 */ 1, /* #1 */ 1, /* #2 */ 1, /* #3 */ 1, + /* #4 */ 1, /* #5 */ 1, /* #6 */ 0, /* #7 */ 1, + /* #8 */ 1, /* #9 */ 1, /* #10 */ 1, /* #11 */ 1, + /* #12 */ 0, /* #13 */ 1, /* #14 */ 1, /* #15 */ 1, + /* #16 */ 1, /* #17 */ 1, /* #18 */ 1, /* #19 */ 0, + /* #20 */ 1, /* #21 */ 1, /* #22 */ 1, /* #23 */ 1, + /* #24 */ 1, /* #25 */ 0, /* #26 */ 1, /* #27 */ 1, + /* #28 */ 1, /* #29 */ 1, /* #30 */ 1, /* #31 */ 0}; + if (pPEX != NULL && pBB != NULL) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_27, SHIFT_HI_27, + SHIFT_LO_27, READ_FLAG_27, mask27, nb27, isZigZag); + } else { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_27, SHIFT_HI_27, SHIFT_LO_27, + READ_FLAG_27, mask27, nb27, isZigZag); + } +} +static void bitunpack256v32_28_scalar( + uint32_t** pIn, uint32_t** pOut, + uint32_t** pPEX, // Optional parameter, non-NULL for extended version + unsigned char** pBB, bool isZigZag) // Optional parameter, non-NULL for extended version +{ + // Common constant definitions + const uint32_t mask28 = (1u << 28) - 1; // 0xFFFFFFF + const int nb28 = 28; + const int expansions_count_28 = 
8; + static const uint8_t SHIFT_HI_28[8] = {0, 28, 24, 20, 16, 12, 8, 4}; + static const uint8_t SHIFT_LO_28[8] = {0, 4, 8, 12, 16, 20, 24, 0}; + static const uint8_t READ_FLAG_28[8] = {1, 1, 1, 1, 1, 1, 1, 0}; + + // Choose template based on whether extension parameters are provided + if (pPEX != NULL && pBB != NULL) { + // Call extended template, each call outputs 64 values, loop 4 times to get 256 + for (int i = 0; i < 4; i++) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_28, + SHIFT_HI_28, SHIFT_LO_28, READ_FLAG_28, mask28, + nb28, isZigZag); + } + } else { + // Call non-extended template, also each call outputs 64 values, loop 4 times to get 256 + for (int i = 0; i < 4; i++) { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count_28, SHIFT_HI_28, SHIFT_LO_28, + READ_FLAG_28, mask28, nb28, isZigZag); + } + } +} + +static void bitunpack256v32_29_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const uint32_t mask29 = (1U << 29) - 1; // 0x1FFFFFFF + const int expansions_count = 32; + static const uint8_t SHIFT_HI_29[32] = {0, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2, + 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1, + 30, 27, 24, 21, 18, 15, 12, 9, 6, 3}; + static const uint8_t SHIFT_LO_29[32] = {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 0, + 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 0, + 2, 5, 8, 11, 14, 17, 20, 23, 26, 0}; + static const uint8_t READ_FLAG_29[32] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + if (pPEX != NULL && pBB != NULL) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count, SHIFT_HI_29, + SHIFT_LO_29, READ_FLAG_29, mask29, 29, isZigZag); + } else { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count, SHIFT_HI_29, SHIFT_LO_29, + READ_FLAG_29, mask29, 29, isZigZag); + } +} + +static void bitunpack256v32_30_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** 
pBB, bool isZigZag) { + const uint32_t mask30 = (1U << 30) - 1; // 0x3FFFFFFF + const int expansions_count = 16; + static const uint8_t SHIFT_HI_30[16] = {0, 30, 28, 26, 24, 22, 20, 18, + 16, 14, 12, 10, 8, 6, 4, 2}; + static const uint8_t SHIFT_LO_30[16] = {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 0}; + static const uint8_t READ_FLAG_30[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + if (pPEX != NULL && pBB != NULL) { + for (int i = 0; i < 2; i++) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count, + SHIFT_HI_30, SHIFT_LO_30, READ_FLAG_30, mask30, 30, + isZigZag); + } + } else { + for (int i = 0; i < 2; i++) { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count, SHIFT_HI_30, SHIFT_LO_30, + READ_FLAG_30, mask30, 30, isZigZag); + } + } +} + +// Unpack 256 values packed at 31 bits each (scalar, non-SIMD path). +// Uses the exception-aware template when both pPEX and pBB are non-NULL, the plain one otherwise. +static void bitunpack256v32_31_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX, + unsigned char** pBB, bool isZigZag) { + const uint32_t mask31 = (1U << 31) - 1; // 0x7FFFFFFF + const int expansions_count = 32; + // Construct parameter arrays: + // For k==0: SHIFT_HI = 0, SHIFT_LO = 0, READ_FLAG = 1 + // For k = 1 .. 
30: SHIFT_HI = 32 - k, SHIFT_LO = k, READ_FLAG = 1 + // For k==31: SHIFT_HI = 1, SHIFT_LO = 0, READ_FLAG = 0 + // Each table holds exactly 32 entries (one per step); the last step consumes no new input words. + static const uint8_t SHIFT_HI[32] = {0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, + 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}; + static const uint8_t SHIFT_LO[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 0}; + static const uint8_t READ_FLAG[32] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + if (pPEX != NULL && pBB != NULL) { + bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count, SHIFT_HI, + SHIFT_LO, READ_FLAG, mask31, 31, isZigZag); + } else { + bitunblk256v32_scalar_template(pIn, pOut, expansions_count, SHIFT_HI, SHIFT_LO, READ_FLAG, + mask31, 31, isZigZag); + } +} + +// Unpack 256 values stored at the full 32 bits each: plain copy, then optional exceptions and zigzag decoding. +static void bitunpack256v32_32_scalar( + uint32_t** pIn, uint32_t** pOut, + uint32_t** pPEX, // Optional parameter, non-NULL for extended version + unsigned char** pBB, bool isZigZag) // Optional parameter, non-NULL for extended version +{ + uint32_t* ip = *pIn; + uint32_t* op = *pOut; + const int nb = 32; // When b=32, each 32-bit integer stores a value directly + + // There are 32 groups, each group has 8 numbers, totaling 256 numbers + for (int i = 0; i < 32; i++) { + // Copy 8 input values directly to output (avoid calling CPY8) + for (int j = 0; j < 8; j++) { + op[j] = ip[j]; + } + ip += 8; + + if (pPEX != NULL && pBB != NULL) { + uint8_t xm8 = **pBB; + (*pBB)++; + if (xm8 != 0) { + applyException_8bits(xm8, pPEX, nb, op); + } + } + if (isZigZag) { + for (int j = 0; j < 8; j++) { + op[j] = zigzagDecode_scalar(op[j]); + } + } + op += 8; + } + *pIn = ip; + *pOut = op; +} + +// Define function pointer type for unpacking functions (parameter types mirror the unpack routines above) +typedef void (*unpack_func_t)(uint32_t**, uint32_t**, uint32_t**, unsigned char**, bool); + +// Array of function pointers for each bit width (0 to 32) +static 
unpack_func_t unpack_funcs[33] = { + bitunpack256v32_0_scalar, bitunpack256v32_1_scalar, bitunpack256v32_2_scalar, + bitunpack256v32_3_scalar, bitunpack256v32_4_scalar, bitunpack256v32_5_scalar, + bitunpack256v32_6_scalar, bitunpack256v32_7_scalar, bitunpack256v32_8_scalar, + bitunpack256v32_9_scalar, bitunpack256v32_10_scalar, bitunpack256v32_11_scalar, + bitunpack256v32_12_scalar, bitunpack256v32_13_scalar, bitunpack256v32_14_scalar, + bitunpack256v32_15_scalar, bitunpack256v32_16_scalar, bitunpack256v32_17_scalar, + bitunpack256v32_18_scalar, bitunpack256v32_19_scalar, bitunpack256v32_20_scalar, + bitunpack256v32_21_scalar, bitunpack256v32_22_scalar, bitunpack256v32_23_scalar, + bitunpack256v32_24_scalar, bitunpack256v32_25_scalar, bitunpack256v32_26_scalar, + bitunpack256v32_27_scalar, bitunpack256v32_28_scalar, bitunpack256v32_29_scalar, + bitunpack256v32_30_scalar, bitunpack256v32_31_scalar, bitunpack256v32_32_scalar}; +/** + * + * @param in Compressed data input stream + * @param n Currently unused, can be processed according to actual needs + * @param out Output buffer for decompressed 32-bit integers (must accommodate at least 256 32-bit integers) + * @param b Bit width for each integer, this example only demonstrates the b=8 branch + * @return Returns the next readable input position after decompression (consistent with original logic) + */ +unsigned char* bitunpack256scalarv32_withzigzag(const unsigned char* __restrict in, unsigned n, + unsigned* __restrict out, unsigned b, + bool isZigZag) { + // Debug output (optional, can be removed in production) + //printf("bitunpack256scalarv32_withzigzag b=%d bits=%d isZigZag=%d\n", b, b & 0x3f, isZigZag); + + // Calculate input pointer offset + unsigned char* ip = (unsigned char*)(in + PAD8(256 * b)); + + // Initialize pointers + uint32_t* pIn32 = (uint32_t*)in; + uint32_t* pOut32 = (uint32_t*)out; + + unsigned bits = b & 0x3f; + // Execute unpacking if b is in valid range + if (bits <= 32) { + 
unpack_funcs[bits](&pIn32, &pOut32, NULL, NULL, isZigZag); + } + + return ip; +} +unsigned char* bitunpack256scalarv32(const unsigned char* __restrict in, unsigned n, + unsigned* __restrict out, unsigned b) { + // Debug output (optional, can be removed in production) + //printf("bitunpack256scalarv32 b=%d bits=%d\n", b, b & 0x3f); + + // Calculate input pointer offset + unsigned char* ip = (unsigned char*)(in + PAD8(256 * b)); + + bitunpack256scalarv32_withzigzag(in, n, out, b, false); + + return ip; +} +unsigned char* _bitd1unpack256scalarv32(const unsigned char* __restrict in, unsigned n, + unsigned* __restrict out, unsigned start, unsigned b, + unsigned* __restrict pex, unsigned char* bb) { + //printf("_bitd1unpack256scalarv32, b=%d\n", b & 0x3f); + unsigned* deltas = (unsigned*)malloc(n * sizeof(unsigned)); + if (!deltas) return NULL; + + const unsigned char* orig_in = in; + in = _bitunpack256scalarv32(in, n, deltas, b, pex, bb, false); + + unsigned running_sum = start; + for (unsigned i = 0; i < n; ++i) { + running_sum += deltas[i] + 1; + out[i] = running_sum; + } + + free(deltas); + return (unsigned char*)in; +} + +// Add this after the definition of _bitunpack256w32 in the SSE2/SSSE3 section + +// Delta1 unpacking for 256 32-bit integers (no exceptions) +unsigned char* bitd1unpack256scalarv32(const unsigned char* __restrict in, unsigned n, + unsigned* __restrict out, unsigned start, unsigned b) { + //printf("bitd1unpack256scalarv32, b=%d\n", b & 0x3f); + const unsigned char* _in = in; + unsigned deltas[n]; + + in = bitunpack256scalarv32(in, n, deltas, b); + + unsigned running_sum = start; + for (unsigned i = 0; i < n; ++i) { + running_sum += deltas[i] + 1; + out[i] = running_sum; + } + + return (unsigned char*)in; +} + +unsigned char* _bitunpack256scalarv32(const unsigned char* __restrict in, unsigned n, + unsigned* __restrict out, unsigned b, + unsigned* __restrict pex, unsigned char* bb, bool isZigZag) { + // Debug output (optional, can be removed in 
production) + //printf("_bitunpack256scalarv32 bits=%d isZigZag=%d\n", b & 0x3f, isZigZag); + + // Calculate input pointer offset + unsigned char* ip = (unsigned char*)(in + PAD8(256 * b)); + + // Initialize pointers + unsigned* pPEX = pex; + unsigned char* pBB = bb; + uint32_t* pIn32 = (uint32_t*)in; + uint32_t* pOut32 = (uint32_t*)out; + + unsigned bits = b & 0x3f; + // Execute unpacking if b is in valid range + if (bits <= 32) { + unpack_funcs[bits](&pIn32, &pOut32, &pPEX, &pBB, isZigZag); + } + + return ip; +} + +unsigned char* bitzunpack256scalarv32(const unsigned char* __restrict in, unsigned n, + unsigned* __restrict out, unsigned start, unsigned b) { + // Debug output (optional, can be removed in production) + //printf("bitzunpack256scalarv32 b=%d bits=%d\n", b, b & 0x3f); + const unsigned char* _in = in; + unsigned deltas[n]; + + in = bitunpack256scalarv32_withzigzag(in, n, deltas, b, true); + + unsigned running_sum = start; + for (unsigned i = 0; i < n; ++i) { + running_sum += deltas[i]; + out[i] = running_sum; + } + + return (unsigned char*)in; +} +unsigned char* _bitzunpack256scalarv32(const unsigned char* __restrict in, unsigned n, + unsigned* __restrict out, unsigned start, unsigned b, + unsigned* __restrict pex, unsigned char* bb) { + // Debug output (optional, can be removed in production) + //printf("_bitzunpack256scalarv32 bits=%d\n", b & 0x3f); + + unsigned* deltas = (unsigned*)malloc(n * sizeof(unsigned)); + if (!deltas) return NULL; + + const unsigned char* orig_in = in; + in = _bitunpack256scalarv32(in, n, deltas, b, pex, bb, true); + + unsigned running_sum = start; + for (unsigned i = 0; i < n; ++i) { + running_sum += deltas[i]; + out[i] = running_sum; + } + + free(deltas); + return (unsigned char*)in; +} #define STOZ64(_op_, _ov_) _mm_storeu_si128(_op_++, _ov_); _mm_storeu_si128(_op_++, _ov_) #define STO64( _op_, _ov_, _zv_) _mm_storeu_si128(_op_++, _mm_unpacklo_epi32(_ov_,_zv_));_mm_storeu_si128(_op_++, 
_mm_unpacklo_epi32(_mm_srli_si128(_ov_,8),_zv_)) diff --git a/src/ext/for/test_bitd1unpack.cpp b/src/ext/for/test_bitd1unpack.cpp new file mode 100644 index 00000000000..23484a7f50e --- /dev/null +++ b/src/ext/for/test_bitd1unpack.cpp @@ -0,0 +1,399 @@ +#include <assert.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <vp4.h> +#include <vint.h> +#include "conf.h" + +// 定义PAD8宏 +#ifndef PAD8 +#define PAD8(_x_) (((_x_) + 7) / 8) +#endif + +const unsigned TEST_SIZE = 512; + +#ifdef __AVX2__ +void generate_test_data(unsigned* raw_values, unsigned n, unsigned char* encoded_data, + unsigned* out_size) { + // 使用p4nd1enc256v32编码原始数据 + size_t end_ptr = p4nd1enc256v32(raw_values, n, encoded_data); + + // 计算编码后数据大小 + *out_size = end_ptr; +} +#endif +#define _1vbxget32(_ip_, _x_, _act_) do { _x_ = (unsigned)(*_ip_++);\ + if(!(_x_ & 0x80u)) { _act_;}\ + else if(!(_x_ & 0x40u)) { _x_ = bswap16(ctou16(_ip_ - 1) & 0xff3fu); _ip_++; _act_;}\ + else if(!(_x_ & 0x20u)) { _x_ = (_x_ & 0x1f)<<16 | ctou16(_ip_); _ip_ += 2; _act_;}\ + else if(!(_x_ & 0x10u)) { _x_ = bswap32(ctou32(_ip_-1) & 0xffffff0fu); _ip_ += 3; _act_;}\ + else { _x_ = (unsigned long long)((_x_) & 0x07)<<32 | ctou32(_ip_); _ip_ += 4; _act_;}\ +} while(0) +#define xvbxget32(_ip_, _x_) _1vbxget32(_ip_, _x_, ;) + +// 用于快速得到 10^k 的一个表,避免多次调用 pow +// 注意 10^10=10000000000 需要 64 位才能存 +static const uint64_t g_pow10[] = { + 1ULL, // 10^0 + 10ULL, // 10^1 + 100ULL, // 10^2 + 1000ULL, // 10^3 + 10000ULL, // 10^4 + 100000ULL, // 10^5 + 1000000ULL, // 10^6 + 10000000ULL, // 10^7 + 100000000ULL, // 10^8 + 1000000000ULL,// 10^9 + 10000000000ULL// 10^10 +}; + +// 计算 10^(floor(b/3)),若超出 g_pow10 范围可再加判断 +static inline uint64_t get_pow10_for_b(unsigned b) { + // floor(b/3) + unsigned idx = b / 3; + if (idx >= sizeof(g_pow10)/sizeof(g_pow10[0])) { + // 超过预置表最大 10^10,就固定返回 10^10 或自行处理 + return g_pow10[10]; + } + return g_pow10[idx]; +} + +// 计算 2^b 的函数 +static inline 
unsigned power2(unsigned b) { + // (1U << b) 当 b=32 时也可能溢出,你可自行判断 + return (1U << b); +} + + +/** + * @param values 输出数组 + * @param n 要生成的数据个数 + * @param b 当前位宽 + * @param with_exception 0=无异常,1=有异常 + */ +void generate_raw_data_for_bitwidth(unsigned* values, unsigned n, + unsigned b, int with_exception) +{ + if (n == 0) return; + + if (!with_exception) { + // ===================================== + // 无异常模式:递增序列 + // ===================================== + // 1) 先给一个随机初始值 base (你也可随意决定) + unsigned base = rand() % 1000; + values[0] = base; + + // 2) 根据 b 分段决定“增量最大范围” + unsigned inc_range; + if (b < 4) { + // b=0 => 2^0=1, b=1 =>2, b=2 =>4, b=3=>8 + inc_range = power2(b); + } else { + // b>=4 => 用10^(floor(b/3)) => 10,100,1000,... + uint64_t r = get_pow10_for_b(b); + // 这里最好判断 r 是否超出 unsigned 范围 + // 若测试场景不会特别大,可以直接转为 unsigned + if (r > 0xFFFFFFFFULL) { + r = 0xFFFFFFFFULL; // 避免溢出 + } + inc_range = (unsigned)r; + } + + // 3) 生成递增序列 + for (unsigned i = 1; i < n; i++) { + // +1 是为了避免 0 增量的情况 + unsigned inc = 1 + rand() % inc_range; + base += inc; + values[i] = base; + } + + } else { + // ===================================== + // 有异常模式:直接随机 + // ===================================== + + // 观察示例得知: + // - b=0 => rand()%2 + // - b=2 => rand()%4 + // - b=3 => rand()%10 + // - b=7 => rand()%100 + // - b=10 => rand()%1000 + // - b=13 => rand()%10000 + // => 规律:当 b >= 3 用 10^(floor(b/3));当 b < 3 用特殊处理 + + uint64_t val_range = 0; // 用 64 位临时存,最后再转回 unsigned + + if (b == 0) { + val_range = 2; // 0..1 + } + else if (b == 1) { + // 你没给 b=1 的具体例子,这里假设跟 b=0 一样 => range=2 + val_range = 2; // 0..1 + } + else if (b == 2) { + val_range = 4; // 0..3 + } + else { + // b>=3 => 用 10^(floor(b/3)) + val_range = get_pow10_for_b(b); + // 同样检查一下是否超过 unsigned + if (val_range > 0xFFFFFFFFULL) { + val_range = 0xFFFFFFFFULL; + } + } + + // 直接随机 + for (unsigned i = 0; i < n; i++) { + unsigned x = (unsigned)(rand() % (unsigned)val_range); + values[i] = x; + } + } +} + +/** + * 生成 n 个有符号数: + * - 
b<3: 范围很小(±(1<<b) 之类) + * - b>=3: 直接从 ±(10^(floor(b/3))) 随机, 并包含一定的负值 + * + * with_exception=0 => 生成一个“有序/有限范围” + * with_exception=1 => 生成一个“更大随机范围” (你可自定义) + */ +static void generate_raw_signed_data_for_zigzag(unsigned *values, + unsigned n, + unsigned b, + int with_exception) +{ + if (n == 0) return; + + // srand(...) 在外部一次初始化 + uint64_t val_range = 1; + if (b < 3) { + // 例如 b=0 =>±1, b=1=>±2, b=2=>±4 + val_range = (1ULL << b); + } else { + // b>=3 => use get_pow10_for_b(b) => 10^(floor(b/3)) + val_range = get_pow10_for_b(b); // 参考你贴的 delta pfor + if(val_range > 0x7fffffffULL) { + val_range = 0x7fffffffULL; // 避免溢出 32-bit + } + } + + for(unsigned i=0; i<n; i++){ + // 先产生 0..val_range-1 + int32_t x = (int32_t)(rand() % (unsigned)val_range); + // 随机决定正负 + if(with_exception) { + // 例如 50% 概率取反 + if((rand() & 1) == 1) x = -x; + } else { + // 不带异常 => 大部分正, 也可以小概率负 + if((rand()%10)==0) x = -x; + } + values[i] = x; + } +} +#ifdef __AVX2__ +void run_testZigzag(unsigned b, + int with_exception, + unsigned TEST_SIZE, + unsigned *raw_values, + unsigned char *encoded_data, + unsigned *decoded1, + unsigned *decoded2) +{ + printf("Zigzag 测试: 位宽 b=%u, with_exception=%d\n", b, with_exception); + + // 1) 生成带正负 raw data + generate_raw_signed_data_for_zigzag(raw_values, TEST_SIZE, b, with_exception); + unsigned encoded_size = p4nzenc256v32(raw_values, TEST_SIZE, encoded_data); + + // 获取编码头部信息(例如起始值等) + unsigned start; + unsigned char* copy = encoded_data; + xvbxget32(copy, start); + unsigned char encoded_b = copy[0]; // 编码后的第一个字节为位宽 + if((encoded_b & 0x40)) { + encoded_b &= 0x3f; + } else { + if(encoded_b & 0x80) { + encoded_b &= 0x7f; + } + } + printf(" 编码参数: 位宽 b=%u, 起始值 start=%u, 编码大小=%u字节\n", encoded_b, start, encoded_size); + + // 3) decode => two versions for cross-check + // (here we define "decoded1" from "bitzunpack256v32...??" and "decoded2" from "bitzunpack256scalarv32Zigzag"??) 
+ memset(decoded1,0,TEST_SIZE*sizeof(unsigned)); + memset(decoded2,0,TEST_SIZE*sizeof(unsigned)); + + // "decoded1" => maybe vector version if you have it? e.g. "bitzunpack256v32(in,b, out,??)" + // "decoded2" => scalar version ? + + // for demonstration, we do the same decode to compare: + p4nzdec256v32(encoded_data, TEST_SIZE, decoded1); + p4nzdec256scalarv32(encoded_data, TEST_SIZE, decoded2); + + // 4) compare mismatch + int mismatch=0; + for(unsigned i=0;i<TEST_SIZE;i++){ + if(decoded1[i] != decoded2[i]){ + if(mismatch<10) + printf(" mismatch at i=%u: dec1=%d, dec2=%d\n", i, decoded1[i], decoded2[i]); + mismatch++; + } + } + if(mismatch==0){ + printf(" decode1 & decode2 match!\n"); + // verify with original + int error=0; + for(unsigned i=0;i<TEST_SIZE;i++){ + if(decoded1[i] != raw_values[i]){ + if(error<10) + printf(" raw mismatch at i=%u: raw=%d, dec=%d\n", i,raw_values[i], decoded1[i]); + error++; + } + } + if(error==0) printf(" and match raw data!\n"); + else printf(" total %d raw mismatch\n", error); + } else { + printf(" total mismatch=%d\n", mismatch); + } + printf("\n"); +} + +void run_test(unsigned b, int with_exception, unsigned TEST_SIZE, + unsigned* raw_values, unsigned char* encoded_data, + unsigned* decoded1, unsigned* decoded2) { + printf("测试: 位宽 b=%u, 异常%s\n", b, (with_exception ? 
"有" : "无")); + + // 生成符合当前 b 与异常模式的原始数据 + generate_raw_data_for_bitwidth(raw_values, TEST_SIZE, b, with_exception); + + unsigned encoded_size; + generate_test_data(raw_values, TEST_SIZE, encoded_data, &encoded_size); + + // 获取编码头部信息(例如起始值等) + unsigned start; + unsigned char* copy = encoded_data; + xvbxget32(copy, start); + unsigned char encoded_b = copy[0]; // 编码后的第一个字节为位宽 + if((encoded_b & 0x40)) { + encoded_b &= 0x3f; + } else { + if(encoded_b & 0x80) { + encoded_b &= 0x7f; + } + } + printf(" 编码参数: 位宽 b=%u, 起始值 start=%u, 编码大小=%u字节\n", encoded_b, start, encoded_size); + + // 清空解码缓冲区 + memset(decoded1, 0, TEST_SIZE * sizeof(unsigned)); + memset(decoded2, 0, TEST_SIZE * sizeof(unsigned)); + + // 调用两种解码方式 + p4nd1dec256v32(encoded_data, TEST_SIZE, decoded1); + p4nd1dec256scalarv32(encoded_data, TEST_SIZE, decoded2); + + // 比较两个解码结果是否匹配 + int mismatch = 0; + for (unsigned i = 0; i < TEST_SIZE; i++) { + if (decoded1[i] != decoded2[i]) { + if (mismatch < 10) + printf(" 不匹配: 索引 %u, 原始值=%u, 原始解码=%u, 标量解码=%u\n", + i, raw_values[i], decoded1[i], decoded2[i]); + mismatch++; + } + } + if (mismatch == 0) { + printf(" 通过: 所有解码值匹配!\n"); + // 验证解码值与原始数据是否一致 + int error = 0; + for (unsigned i = 0; i < TEST_SIZE && error < 10; i++) { + if (decoded1[i] != raw_values[i]) { + printf(" 编码/解码错误: 索引 %u, 原始值=%u, 解码值=%u\n", + i, raw_values[i], decoded1[i]); + error++; + } + } + if (error == 0) + printf(" 验证通过: 解码结果与原始数据一致\n"); + } else { + printf(" 失败: 有 %d 个值不匹配\n", mismatch); + printf(" 原始数据 (前16个): "); + for (unsigned i = 0; i < 16 && i < TEST_SIZE; i++) + printf("%u ", raw_values[i]); + printf("...\n"); + printf(" 原始解码 (前16个): "); + for (unsigned i = 0; i < 16 && i < TEST_SIZE; i++) + printf("%u ", decoded1[i]); + printf("...\n"); + printf(" 标量解码 (前16个): "); + for (unsigned i = 0; i < 16 && i < TEST_SIZE; i++) + printf("%u ", decoded2[i]); + printf("...\n"); + } + printf("\n"); +} + +void testZigZag() +{ + const unsigned TEST_SIZE=512; //or512 + unsigned *raw_values= (unsigned*) 
malloc(TEST_SIZE*sizeof(unsigned)); + unsigned *decoded1= (unsigned*) malloc(TEST_SIZE*sizeof(unsigned)); + unsigned *decoded2= (unsigned*) malloc(TEST_SIZE*sizeof(unsigned)); + unsigned char* encoded_data= (unsigned char*) malloc(TEST_SIZE*4+ 10); //maybe + + srand((unsigned)time(NULL)); + printf("开始测试 p4nzdec256v32...\n"); + + for(unsigned b=0; b<=32; b++){ + run_testZigzag(b,0, TEST_SIZE, raw_values, encoded_data, decoded1, decoded2); + run_testZigzag(b,1, TEST_SIZE, raw_values, encoded_data, decoded1, decoded2); + } + + free(raw_values); + free(decoded1); + free(decoded2); + free(encoded_data); +} + +void test_p4nd1dec256v32() { + const unsigned TEST_SIZE = 512; + + // 分配缓冲区 + unsigned* raw_values = (unsigned*)malloc(TEST_SIZE * sizeof(unsigned)); + unsigned char* encoded_data = (unsigned char*)malloc(TEST_SIZE * sizeof(unsigned) * 2); + unsigned* decoded1 = (unsigned*)malloc(TEST_SIZE * sizeof(unsigned)); + unsigned* decoded2 = (unsigned*)malloc(TEST_SIZE * sizeof(unsigned)); + + srand((unsigned)time(NULL)); + printf("开始测试 p4nd1dec256v32...\n"); + + // 对 b = 0 到 31 测试两种模式:无异常和有异常 + for (unsigned b = 0; b < 32; b++) { + run_test(b, 0, TEST_SIZE, raw_values, encoded_data, decoded1, decoded2); + run_test(b, 1, TEST_SIZE, raw_values, encoded_data, decoded1, decoded2); + } + // 对 b == 32 只测试无异常情况 + run_test(32, 0, TEST_SIZE, raw_values, encoded_data, decoded1, decoded2); + + free(raw_values); + free(encoded_data); + free(decoded1); + free(decoded2); + + printf("测试完成!\n"); +} +#endif +int main() { +#ifdef __AVX2__ + test_p4nd1dec256v32(); + testZigZag(); + //test_until_b1_achieved_improved(); +#endif + return 0; +} diff --git a/src/ext/for/vp4.h b/src/ext/for/vp4.h index fae28df8d45..39460b614c4 100644 --- a/src/ext/for/vp4.h +++ b/src/ext/for/vp4.h @@ -99,6 +99,7 @@ size_t p4nd1dec32( unsigned char *__restrict in, size_t n, uint32_t *__restri size_t p4nd1dec128v16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out); size_t p4nd1dec128v32(unsigned char 
*__restrict in, size_t n, uint32_t *__restrict out); size_t p4nd1dec256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out); +size_t p4nd1dec256scalarv32(unsigned char* __restrict in, size_t n, uint32_t* __restrict out); size_t p4nd1dec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); //Zigzag size_t p4nzdec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out); @@ -107,6 +108,7 @@ size_t p4nzdec32( unsigned char *__restrict in, size_t n, uint32_t *__restri size_t p4nzdec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); size_t p4nzdec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); size_t p4nzdec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); +size_t p4nzdec256scalarv32(unsigned char* __restrict in, size_t n, uint32_t* __restrict out); size_t p4nzdec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); //************** Low level API - n limited to 128/256 *************************************** diff --git a/src/ext/for/vp4d.c b/src/ext/for/vp4d.c index a255fc2a2f0..e1bbacde3b6 100644 --- a/src/ext/for/vp4d.c +++ b/src/ext/for/vp4d.c @@ -253,6 +253,36 @@ extern char _shuffle_16[256][16]; #define BITUNPACK bitunpack256w #define BITUNPACKD bitunpack256w #define _BITUNPACKD _bitunpack256w +#include "vp4d.c" +#define P4DELTA(a) ,a +#define P4DELTA_(a) a +#define DELTA + +#undef _P4DEC +#undef P4DEC +#undef P4NDEC +#undef BITUNPACKD +#undef _BITUNPACKD + + +#define _P4DEC _p4d1dec256scalarv +#define P4DEC p4d1dec256scalarv +#define P4NDEC p4nd1dec256scalarv +#define P4NDECS p4d1dec +#define BITUNPACK bitunpack256scalarv +#define BITUNPACKD bitd1unpack256scalarv +#define _BITUNPACKD _bitd1unpack256scalarv +#define BITUNDD bitd1dec +#include "vp4d.c" + +#define _P4DEC _p4zdec256scalarv +#define P4DEC p4zdec256scalarv +#define P4NDEC p4nzdec256scalarv +#define P4NDECS p4zdec +#define BITUNPACKD bitzunpack256scalarv +#define 
_BITUNPACKD _bitzunpack256scalarv +#define BITUNDD bitzdec +#define USIZE 32 #include "vp4d.c" #endif #undef DELTA diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 5392cc79188..4b12bb6f9c3 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -16,7 +16,7 @@ SOURCE_GROUP("search" ./search/*) SOURCE_GROUP("search-spans" ./search/spans/*) SOURCE_GROUP("store" ./store/*) SOURCE_GROUP("util" ./util/*) - +INCLUDE_DIRECTORIES( ${clucene_SOURCE_DIR}/src/ext/* ) IF (BUILD_CONTRIBS_LIB) SET(test_contribs_lib_files ./contribs-lib/analysis/testChinese.cpp) SET(EXTRA_LIBS ${EXTRA_LIBS} clucene-contribs-lib) @@ -106,6 +106,7 @@ SET(test_files ./tests.cpp ./util/TestStrConvert.cpp ./query/TestMultiPhraseQuery.cpp ./store/TestUTF8Chars.cpp + ./store/testPFOR.cpp ${test_HEADERS}) IF (USE_SHARED_OBJECT_FILES) GET_SHARED_FILES(clucene_shared_Files) diff --git a/src/test/data/pfor_p4ndx_compat_gen_by_old_version_arm.dat b/src/test/data/pfor_p4ndx_compat_gen_by_old_version_arm.dat new file mode 100644 index 00000000000..e823aef9968 Binary files /dev/null and b/src/test/data/pfor_p4ndx_compat_gen_by_old_version_arm.dat differ diff --git a/src/test/data/pfor_p4ndx_compat_gen_by_old_version_x86_64.dat b/src/test/data/pfor_p4ndx_compat_gen_by_old_version_x86_64.dat new file mode 100644 index 00000000000..a27177e3090 Binary files /dev/null and b/src/test/data/pfor_p4ndx_compat_gen_by_old_version_x86_64.dat differ diff --git a/src/test/store/testPFOR.cpp b/src/test/store/testPFOR.cpp new file mode 100644 index 00000000000..a8ddbc871b5 --- /dev/null +++ b/src/test/store/testPFOR.cpp @@ -0,0 +1,546 @@ +#include "test.h" + +#include <memory.h> +#include <stdlib.h> +#include <time.h> + +#include <cstdint> +#include <cstdio> +#include <vector> + +#include "CLucene/index/CodeMode.h" +#include "CLucene/store/FSDirectory.h" +#include "CLucene/store/IndexInput.h" +#include "CLucene/store/IndexOutput.h" +#include "CLucene/util/PFORUtil.h" +#include "CuTest.h" 
+#include "for/vp4.h" + +using namespace lucene::store; +// Add a helper macro for printing more detailed error messages when assertions fail +#define CuAssertTrueWithMessage(tc, message, condition) \ + do { \ + if (!(condition)) { \ + printf("Assertion failed: %s\n", message); \ + } \ + CuAssertTrue(tc, condition); \ + } while (0) + +static const uint64_t g_pow10[] = { + 1ULL, // 10^0 + 10ULL, // 10^1 + 100ULL, // 10^2 + 1000ULL, // 10^3 + 10000ULL, // 10^4 + 100000ULL, // 10^5 + 1000000ULL, // 10^6 + 10000000ULL, // 10^7 + 100000000ULL, // 10^8 + 1000000000ULL, // 10^9 + 10000000000ULL // 10^10 +}; + +// 计算 10^(floor(b/3)),若超出 g_pow10 范围可再加判断 +static inline uint64_t get_pow10_for_b(unsigned b) { + // floor(b/3) + unsigned idx = b / 3; + if (idx >= sizeof(g_pow10) / sizeof(g_pow10[0])) { + // 超过预置表最大 10^10,就固定返回 10^10 或自行处理 + return g_pow10[10]; + } + return g_pow10[idx]; +} + +// 计算 2^b 的函数 +static inline unsigned power2(unsigned b) { + // (1U << b) 当 b=32 时也可能溢出,你可自行判断 + return (1U << b); +} + +/** + * @param values 输出数组 + * @param n 要生成的数据个数 + * @param b 当前位宽 + * @param with_exception 0=无异常,1=有异常 + */ +void generate_raw_data_for_bitwidth(unsigned* values, unsigned n, unsigned b, int with_exception) { + if (n == 0) return; + + if (!with_exception) { + // ===================================== + // 无异常模式:递增序列 + // ===================================== + // 1) 先给一个随机初始值 base (你也可随意决定) + unsigned base = rand() % 1000; + values[0] = base; + + // 2) 根据 b 分段决定"增量最大范围" + unsigned inc_range; + if (b < 4) { + // b=0 => 2^0=1, b=1 =>2, b=2 =>4, b=3=>8 + inc_range = power2(b); + } else { + // b>=4 => 用10^(floor(b/3)) => 10,100,1000,... 
+ uint64_t r = get_pow10_for_b(b); + // 这里最好判断 r 是否超出 unsigned 范围 + // 若测试场景不会特别大,可以直接转为 unsigned + if (r > 0xFFFFFFFFULL) { + r = 0xFFFFFFFFULL; // 避免溢出 + } + inc_range = (unsigned)r; + } + + // 3) 生成递增序列 + for (unsigned i = 1; i < n; i++) { + // +1 是为了避免 0 增量的情况 + unsigned inc = 1 + rand() % inc_range; + base += inc; + values[i] = base; + } + + } else { + // ===================================== + // 有异常模式:直接随机 + // ===================================== + + // 观察示例得知: + // - b=0 => rand()%2 + // - b=2 => rand()%4 + // - b=3 => rand()%10 + // - b=7 => rand()%100 + // - b=10 => rand()%1000 + // - b=13 => rand()%10000 + // => 规律:当 b >= 3 用 10^(floor(b/3));当 b < 3 用特殊处理 + + uint64_t val_range = 0; // 用 64 位临时存,最后再转回 unsigned + + if (b == 0) { + val_range = 2; // 0..1 + } else if (b == 1) { + // 你没给 b=1 的具体例子,这里假设跟 b=0 一样 => range=2 + val_range = 2; // 0..1 + } else if (b == 2) { + val_range = 4; // 0..3 + } else { + // b>=3 => 用 10^(floor(b/3)) + val_range = get_pow10_for_b(b); + // 同样检查一下是否超过 unsigned + if (val_range > 0xFFFFFFFFULL) { + val_range = 0xFFFFFFFFULL; + } + } + + // 直接随机 + for (unsigned i = 0; i < n; i++) { + unsigned x = (unsigned)(rand() % (unsigned)val_range); + values[i] = x; + } + } +} + +void test_pfor_has_prox(CuTest* tc) { + const unsigned TEST_SIZE = 512; + const char* testFileName = "pfor.dat"; + + // 分配缓冲区 + std::vector<unsigned> docDeltaBuffer(TEST_SIZE); + std::vector<unsigned> freqBuffer(TEST_SIZE); + std::vector<unsigned> encoded_data(TEST_SIZE * 2); + std::vector<unsigned> decoded1(TEST_SIZE); + std::vector<unsigned> decoded2(TEST_SIZE); + + srand((unsigned)time(NULL)); + printf("开始测试 p4nd1dec256v32...\n"); + + { + generate_raw_data_for_bitwidth(docDeltaBuffer.data(), TEST_SIZE, 32, 0); + auto* dir = lucene::store::FSDirectory::getDirectory("./"); + + auto* output = dir->createOutput(testFileName); + + lucene::util::pfor_encode(output, docDeltaBuffer, freqBuffer, true); + output->close(); + _CLDELETE(output); + dir->close(); + 
_CLDELETE(dir); + } + { + IndexInput* input = nullptr; + CLuceneError error; + auto* dir = lucene::store::FSDirectory::getDirectory("./"); + bool result = dir->openInput(testFileName, input, error); + lucene::util::pfor_decode(input, decoded1, decoded2, true, false); + for (size_t i = 0; i < TEST_SIZE; i++) { + CuAssertIntEquals(tc, _T("docDeltaBuffer[%zu] != decoded1[%zu]"), docDeltaBuffer[i], + decoded1[i]); + CuAssertIntEquals(tc, _T("freqBuffer[%zu] != decoded2[%zu]"), freqBuffer[i], + decoded2[i]); + } + input->close(); + _CLDELETE(input); + dir->close(); + _CLDELETE(dir); + } + printf("测试完成!\n"); +} + +void test_pfor_no_prox(CuTest* tc) { + const unsigned TEST_SIZE = 512; + const char* testFileName = "pfor.dat"; + + // 分配缓冲区 + std::vector<unsigned> docDeltaBuffer(TEST_SIZE); + std::vector<unsigned> freqBuffer(TEST_SIZE); + std::vector<unsigned> encoded_data(TEST_SIZE * 2); + std::vector<unsigned> decoded1(TEST_SIZE); + std::vector<unsigned> decoded2(TEST_SIZE); + + srand((unsigned)time(NULL)); + printf("开始测试 p4nd1dec256v32...\n"); + + { + generate_raw_data_for_bitwidth(docDeltaBuffer.data(), TEST_SIZE, 32, 0); + auto* dir = lucene::store::FSDirectory::getDirectory("./"); + + auto* output = dir->createOutput(testFileName); + + lucene::util::pfor_encode(output, docDeltaBuffer, freqBuffer, false); + output->close(); + _CLDELETE(output); + dir->close(); + _CLDELETE(dir); + } + { + IndexInput* input = nullptr; + CLuceneError error; + auto* dir = lucene::store::FSDirectory::getDirectory("./"); + bool result = dir->openInput(testFileName, input, error); + lucene::util::pfor_decode(input, decoded1, decoded2, false, false); + for (size_t i = 0; i < TEST_SIZE; i++) { + CuAssertIntEquals(tc, _T("docDeltaBuffer[%zu] != decoded1[%zu]"), docDeltaBuffer[i], + decoded1[i]); + CuAssertIntEquals(tc, _T("freqBuffer[%zu] != decoded2[%zu]"), freqBuffer[i], + decoded2[i]); + } + input->close(); + _CLDELETE(input); + dir->close(); + _CLDELETE(dir); + } + printf("测试完成!\n"); +} + +// 
Test the compatibility of P4DEC and P4ENC +void test_p4dec_p4enc_compat(CuTest* tc) { + const unsigned TEST_SIZE = 512; + const char* testFileName = "pfor_p4enc.dat"; + + // Allocate buffers + std::vector<uint32_t> originalData(TEST_SIZE); + std::vector<uint32_t> decodedData(TEST_SIZE); + std::vector<uint32_t> freqs(TEST_SIZE); + std::vector<uint32_t> decodedFreqs(TEST_SIZE); + + srand((unsigned)time(NULL)); + printf("Testing P4ENC and pfor_decode compatibility...\n"); + + // Generate test data with delta encoding pattern (increasing values) + generate_raw_data_for_bitwidth(originalData.data(), TEST_SIZE, 32, 0); + generate_raw_data_for_bitwidth(freqs.data(), TEST_SIZE, 32, 1); + + auto encode = [](IndexOutput* out, std::vector<uint32_t>& buffer, bool isDoc) { + std::vector<uint8_t> compress(4 * buffer.size() + PFOR_BLOCK_SIZE); + size_t size = 0; + if (isDoc) { + size = P4ENC(buffer.data(), buffer.size(), compress.data()); + } else { + size = P4NZENC(buffer.data(), buffer.size(), compress.data()); + } + out->writeVInt(size); + out->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()), size); + }; + // 第一步:使用P4ENC编码数据并写入文件 + { + auto* dir = lucene::store::FSDirectory::getDirectory("./"); + auto* output = dir->createOutput(testFileName); + + // 写入编码模式和大小 + output->writeByte((char)lucene::index::CodeMode::kPfor); + output->writeVInt(TEST_SIZE); + + // 编码并写入数据 + encode(output, originalData, true); + encode(output, freqs, false); + + output->close(); + _CLDELETE(output); + dir->close(); + _CLDELETE(dir); + } + + // 第二步:使用pfor_decode解码数据 + { + IndexInput* input = nullptr; + CLuceneError error; + auto* dir = lucene::store::FSDirectory::getDirectory("./"); + bool result = dir->openInput(testFileName, input, error); + + // 使用pfor_decode解码数据 (不使用代理 has_prox=false, compatibleRead=false) + uint32_t decoded_size = + lucene::util::pfor_decode(input, decodedData, decodedFreqs, true, false); + + // 验证解码大小 + CuAssertIntEquals(tc, _T("Decoded size mismatch"), TEST_SIZE, 
decoded_size); + + // 验证解码数据与原始数据匹配 + for (size_t i = 0; i < TEST_SIZE; i++) { + //printf("freqs[%zu] = %u, decodedFreqs[%zu] = %u\n", i, freqs[i], i, decodedFreqs[i]); + //printf("originalData[%zu] = %u, decodedData[%zu] = %u\n", i, originalData[i], i, decodedData[i]); + CuAssertTrueWithMessage(tc, "Decoded doc doesn't match original", + originalData[i] == decodedData[i]); + CuAssertTrueWithMessage(tc, "Decoded freq doesn't match original", + freqs[i] == decodedFreqs[i]); + } + + input->close(); + _CLDELETE(input); + dir->close(); + _CLDELETE(dir); + } + + printf("P4ENC/pfor_decode compatibility test completed successfully!\n"); +} + +// Test cross-platform compatibility for P4DEC/P4ENC +void test_cross_platform_compat(CuTest* tc) { + const unsigned TEST_SIZE = 512; + const char* testFileName = "pfor_cross_platform.dat"; + + // Allocate buffers + std::vector<uint32_t> originalData(TEST_SIZE); + std::vector<uint32_t> decodedData(TEST_SIZE); + + srand((unsigned)time(NULL)); + printf("Testing cross-platform compatibility...\n"); + + // Generate test data with different patterns + for (unsigned i = 0; i < TEST_SIZE; i++) { + // Mix of small and large values to test different bit widths + if (i % 10 == 0) { + originalData[i] = rand() % 1000000; // Occasional large value + } else { + originalData[i] = rand() % 100; // Mostly small values + } + } + + // Part 1: Write encoded data to file using PFOR encoding + { + auto* dir = lucene::store::FSDirectory::getDirectory("./"); + auto* output = dir->createOutput(testFileName); + + // Write encoding mode and size + output->writeByte((char)lucene::index::CodeMode::kPfor); + output->writeVInt(TEST_SIZE); + + // Encode and write the data + std::vector<uint8_t> compress(4 * TEST_SIZE + PFOR_BLOCK_SIZE); + size_t size = lucene::util::P4ENC(originalData.data(), TEST_SIZE, compress.data()); + output->writeVInt(size); + output->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()), size); + + output->close(); + 
_CLDELETE(output); + dir->close(); + _CLDELETE(dir); + } + + // Part 2: Read encoded data from file and decode it with compatibleRead=true + { + IndexInput* input = nullptr; + CLuceneError error; + auto* dir = lucene::store::FSDirectory::getDirectory("./"); + bool result = dir->openInput(testFileName, input, error); + + // Verify the encoded format + char mode = input->readByte(); + uint32_t arraySize = input->readVInt(); + CuAssertIntEquals(tc, _T("Array size mismatch"), TEST_SIZE, arraySize); + + // Read, decode and verify + uint32_t SerializedSize = input->readVInt(); + std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); + input->readBytes(buf.data(), SerializedSize); + + // Use P4DEC for decoding, simulating cross-platform read + lucene::util::P4DEC(buf.data(), arraySize, decodedData.data()); + + // Verify decoded data matches original + for (size_t i = 0; i < TEST_SIZE; i++) { + CuAssertIntEquals(tc, _T("Cross-platform decoded data mismatch at %zu"), + originalData[i], decodedData[i]); + } + + input->close(); + _CLDELETE(input); + dir->close(); + _CLDELETE(dir); + } + + printf("Cross-platform compatibility test completed successfully!\n"); +} + +// Test compatibility between encoded by ARM old version data and decoded by x86 new version with compatible mode +void test_p4ndx_compatibility(CuTest* tc) { + const unsigned TEST_SIZE = 512; + const char* testFileName1 = "pfor_p4ndx_compat_gen_by_old_version_x86_64.dat"; + const char* testFileName2 = "pfor_p4ndx_compat_gen_by_old_version_arm.dat"; + + // Allocate buffers + std::vector<uint32_t> docDeltaBuffer = { + 635, 1188, 1795, 2109, 2694, 3612, 3714, 4511, 5072, 5352, 5526, + 5894, 6706, 6891, 6979, 7080, 7586, 7789, 8530, 9065, 9704, 9949, + 10377, 10678, 11516, 11921, 12226, 13133, 13417, 13854, 14215, 14486, 15476, + 16444, 17380, 18306, 19191, 19580, 20302, 21099, 21119, 22014, 22178, 22361, + 22440, 23043, 23326, 24262, 25067, 25443, 26265, 27061, 27681, 27931, 28027, + 28837, 28843, 29595, 29663, 
29953, 30494, 30922, 31834, 32364, 33111, 33311, + 33766, 34749, 35689, 36217, 36348, 36660, 37083, 37378, 37872, 38725, 39622, + 40399, 40540, 40594, 41098, 42060, 42909, 43032, 43243, 43539, 43823, 44040, + 44439, 44790, 45648, 46587, 46718, 46839, 47307, 47536, 48208, 48482, 49046, + 49658, 50460, 51154, 51429, 52005, 52993, 53761, 54541, 54778, 55674, 56594, + 57236, 57635, 58517, 59007, 59881, 60325, 60462, 60971, 60983, 61518, 62378, + 63247, 64073, 64415, 64757, 65050, 65620, 65633, 66552, 66685, 67661, 67733, + 67912, 68514, 69161, 69327, 69697, 70475, 71229, 71846, 71896, 72291, 72659, + 72942, 73178, 73419, 74145, 74517, 75266, 75356, 75615, 76575, 77533, 77617, + 77918, 78569, 79297, 79520, 79536, 80534, 81241, 81584, 81653, 82538, 83483, + 83550, 83601, 84267, 85112, 85268, 85550, 86444, 87347, 87996, 88172, 88310, + 88551, 88804, 89666, 90008, 90350, 90822, 91475, 92127, 93034, 93340, 93994, + 94628, 95156, 95825, 96457, 96691, 96703, 96755, 96874, 97182, 97301, 97822, + 98795, 99758, 100434, 101040, 101248, 101826, 102081, 102816, 102884, 103731, 104070, + 104999, 105539, 106220, 106972, 107165, 107849, 108507, 109005, 109342, 109633, 109658, + 110016, 110290, 110900, 111621, 111947, 112675, 112703, 113499, 114099, 114451, 115209, + 115837, 116794, 117111, 117668, 118231, 118634, 119258, 119668, 120409, 121313, 122262, + 123035, 123690, 124183, 124991, 125303, 126293, 126790, 127745, 128111, 128965, 129545, + 129873, 130447, 130704, 130759, 131712, 131764, 131771, 132075, 132236, 132870, 133482, + 134311, 134501, 134676, 134907, 135073, 136009, 136333, 136402, 137286, 137734, 137810, + 138539, 138795, 139534, 139604, 140356, 140401, 141189, 142146, 142771, 142886, 143416, + 143649, 144170, 145004, 145289, 145816, 146305, 146750, 146910, 147010, 147636, 147986, + 148612, 149468, 150335, 150896, 151427, 152362, 153159, 154138, 154500, 155025, 155259, + 155360, 155954, 156291, 156436, 157169, 157462, 157583, 158430, 158604, 158958, 159326, + 159333, 
159971, 160865, 161712, 162146, 162552, 162850, 162909, 163016, 163940, 164207, + 165180, 165664, 166461, 167368, 167648, 168423, 168692, 168848, 169208, 169929, 170679, + 171375, 172240, 172722, 173710, 174696, 175377, 175890, 176581, 176629, 177148, 177476, + 177769, 178486, 178599, 179297, 179312, 179836, 180640, 180930, 181072, 181848, 182621, + 183559, 183594, 183999, 184064, 184719, 185279, 186055, 186430, 187091, 187915, 188154, + 188649, 188812, 189388, 189563, 190239, 190505, 190727, 190921, 191866, 192732, 192995, + 193405, 193969, 194246, 195179, 195546, 196464, 196890, 197385, 198075, 198790, 199319, + 199765, 199896, 200079, 200085, 200992, 201549, 202215, 202945, 203092, 203252, 204144, + 204219, 204905, 205472, 205812, 206071, 206184, 206821, 206946, 207673, 207719, 208407, + 208762, 209092, 209498, 209770, 209877, 210129, 210442, 211263, 212043, 212802, 213754, + 214068, 214832, 215690, 215912, 216693, 217632, 218001, 218942, 219124, 219567, 220545, + 220646, 220780, 221017, 221582, 222352, 223065, 223356, 223523, 224275, 224920, 225768, + 225925, 226841, 227795, 228556, 228784, 229559, 230099, 231085, 231163, 231369, 231470, + 231757, 232184, 233066, 233291, 234086, 234260, 235018, 235607, 235758, 236616, 237339, + 238078, 238500, 239344, 239795, 239859, 240222, 240424, 241132, 241694, 242405, 243380, + 243896, 244367, 244922, 245564, 245926, 246818, 247537, 248104, 249097, 249102, 249448, + 249674, 250255, 250395, 250794, 251132, 251213, 252114, 252662, 252817, 253457, 253778, + 254128, 254570, 254955, 255667, 256311, 256755}; + std::vector<uint32_t> freqBuffer = { + 73, 5, 18, 40, 27, 24, 33, 88, 15, 51, 7, 59, 7, 4, 84, 39, 43, 34, 28, 75, 35, 75, + 29, 26, 48, 79, 67, 32, 42, 10, 75, 67, 67, 45, 7, 94, 21, 40, 35, 37, 43, 94, 48, 2, + 98, 85, 41, 93, 19, 22, 69, 54, 49, 50, 32, 97, 81, 0, 29, 24, 62, 57, 91, 30, 54, 51, + 76, 76, 91, 63, 65, 87, 57, 13, 89, 7, 98, 31, 1, 70, 5, 22, 76, 54, 24, 9, 52, 6, + 9, 33, 82, 71, 4, 2, 25, 53, 97, 76, 
30, 25, 20, 93, 90, 7, 3, 55, 96, 10, 6, 79, + 63, 76, 84, 85, 52, 39, 10, 13, 91, 68, 22, 76, 50, 46, 19, 75, 99, 16, 4, 81, 41, 24, + 27, 83, 31, 30, 38, 27, 92, 44, 59, 56, 20, 43, 93, 25, 82, 55, 38, 25, 23, 13, 2, 25, + 59, 21, 53, 10, 89, 57, 44, 82, 81, 71, 17, 64, 53, 55, 43, 97, 0, 2, 53, 72, 46, 99, + 49, 28, 54, 40, 6, 30, 53, 8, 5, 5, 64, 81, 60, 74, 22, 17, 18, 4, 50, 41, 21, 14, + 94, 28, 58, 92, 80, 60, 97, 5, 58, 96, 54, 87, 3, 46, 45, 33, 99, 53, 40, 15, 86, 1, + 90, 8, 18, 60, 64, 21, 54, 37, 87, 48, 65, 45, 92, 98, 58, 42, 3, 16, 90, 9, 55, 93, + 56, 0, 26, 7, 5, 67, 23, 91, 20, 65, 99, 38, 77, 15, 11, 31, 52, 99, 32, 18, 96, 76, + 68, 54, 18, 71, 23, 9, 32, 78, 2, 88, 31, 81, 48, 88, 0, 23, 80, 20, 40, 31, 10, 17, + 47, 22, 49, 99, 21, 81, 69, 17, 57, 37, 24, 28, 60, 47, 37, 93, 77, 91, 33, 8, 72, 33, + 97, 72, 56, 29, 92, 96, 60, 55, 14, 59, 77, 15, 11, 98, 48, 32, 67, 57, 70, 91, 85, 82, + 90, 74, 75, 68, 66, 61, 28, 90, 94, 77, 15, 3, 6, 59, 99, 19, 14, 65, 30, 91, 32, 41, + 41, 80, 74, 9, 38, 96, 52, 75, 78, 43, 2, 6, 63, 68, 19, 91, 10, 13, 69, 25, 16, 27, + 85, 68, 98, 99, 33, 81, 91, 66, 74, 84, 98, 48, 93, 88, 96, 98, 16, 27, 93, 18, 85, 56, + 38, 4, 47, 48, 69, 68, 74, 38, 48, 11, 6, 98, 10, 91, 31, 53, 9, 6, 38, 60, 54, 83, + 48, 3, 33, 64, 30, 26, 34, 67, 82, 72, 71, 82, 21, 92, 2, 47, 30, 50, 58, 88, 1, 20, + 32, 32, 74, 93, 38, 64, 53, 45, 99, 54, 48, 33, 70, 30, 59, 5, 97, 94, 29, 20, 76, 50, + 12, 78, 49, 95, 81, 7, 83, 34, 80, 67, 18, 6, 13, 57, 70, 18, 54, 69, 72, 54, 2, 95, + 36, 14, 52, 33, 8, 81, 5, 36, 84, 17, 14, 33, 12, 47, 93, 48, 81, 25, 67, 52, 31, 80, + 9, 1, 99, 15, 22, 23, 69, 25}; + std::vector<uint32_t> decodedDocs(TEST_SIZE); + std::vector<uint32_t> decodedFreqs(TEST_SIZE); + + srand((unsigned)time(NULL)); + printf("Testing pfor_decode compatibility...\n"); + +#if defined(__AVX2__) + // Part 2: Decode data using pfor_decode with compatible mode (compatibleRead=true) + { + IndexInput* input = nullptr; + 
CLuceneError error; + auto* dir = lucene::store::FSDirectory::getDirectory(clucene_data_location); + bool result = dir->openInput(testFileName2, input, error); + + // Use pfor_decode with compatibleRead=true + uint32_t decoded_size = + lucene::util::pfor_decode(input, decodedDocs, decodedFreqs, true, true); + + // Verify decoded size + CuAssertIntEquals(tc, _T("Decoded size mismatch"), TEST_SIZE, decoded_size); + + // Verify decoded data matches original + for (size_t i = 0; i < TEST_SIZE; i++) { + CuAssertTrueWithMessage(tc, "Decoded doc doesn't match original", + docDeltaBuffer[i] == decodedDocs[i]); + CuAssertTrueWithMessage(tc, "Decoded freq doesn't match original", + freqBuffer[i] == decodedFreqs[i]); + } + + input->close(); + _CLDELETE(input); + dir->close(); + _CLDELETE(dir); + } +#elif defined(__ARM_NEON) || defined(__SSSE3__) + { + IndexInput* input = nullptr; + CLuceneError error; + auto* dir = lucene::store::FSDirectory::getDirectory(clucene_data_location); + bool result = dir->openInput(testFileName1, input, error); + + // Use pfor_decode with compatibleRead=true + uint32_t decoded_size = + lucene::util::pfor_decode(input, decodedDocs, decodedFreqs, true, true); + + // Verify decoded size + CuAssertIntEquals(tc, _T("Decoded size mismatch"), TEST_SIZE, decoded_size); + + // Verify decoded data matches original + for (size_t i = 0; i < TEST_SIZE; i++) { + CuAssertTrueWithMessage(tc, "Decoded doc doesn't match original", + docDeltaBuffer[i] == decodedDocs[i]); + CuAssertTrueWithMessage(tc, "Decoded freq doesn't match original", + freqBuffer[i] == decodedFreqs[i]); + } + + input->close(); + _CLDELETE(input); + dir->close(); + _CLDELETE(dir); + } +#endif + printf("compatibility test completed successfully!\n"); +} +CuSuite* testPFORSuite() { + CuSuite* suite = CuSuiteNew(_T("PFOR Test Suite")); + + SUITE_ADD_TEST(suite, test_pfor_has_prox); + SUITE_ADD_TEST(suite, test_pfor_no_prox); + SUITE_ADD_TEST(suite, test_p4dec_p4enc_compat); + SUITE_ADD_TEST(suite, 
test_cross_platform_compat); + SUITE_ADD_TEST(suite, test_p4ndx_compatibility); + + return suite; +} \ No newline at end of file diff --git a/src/test/test.h b/src/test/test.h index 19f37e81243..7f9bd908ee8 100644 --- a/src/test/test.h +++ b/src/test/test.h @@ -86,7 +86,7 @@ CuSuite *testMultiPhraseQuery(void); CuSuite *testIndexCompaction(void); CuSuite *testStringReader(void); CuSuite *testUTF8CharsSuite(void); - +CuSuite *testPFORSuite(void); #ifdef TEST_CONTRIB_LIBS //CuSuite *testGermanAnalyzer(void); CuSuite *testchinese(void); diff --git a/src/test/tests.cpp b/src/test/tests.cpp index 7cd9f657385..e0ee6055f32 100644 --- a/src/test/tests.cpp +++ b/src/test/tests.cpp @@ -20,7 +20,8 @@ unittest tests[] = { {"IndexCompaction", testIndexCompaction}, {"testStringReader", testStringReader}, {"TestUTF8Chars", testUTF8CharsSuite}, + {"testPFOR", testPFORSuite}, #ifdef TEST_CONTRIB_LIBS - {"chinese", testchinese}, + //{"chinese", testchinese}, #endif {"LastTest", NULL}}; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org