(doris-thirdparty) branch clucene-2.0 updated: [improve](pfor) add non-simd implementation for PFOR 256 (#297)

airborne Sun, 23 Mar 2025 07:17:39 -0700

This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch clucene-2.0
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git



The following commit(s) were added to refs/heads/clucene-2.0 by this push:
     new d3a628663ad [improve](pfor) add non-simd implementation for PFOR 256 
(#297)
d3a628663ad is described below

commit d3a628663ad07a38568f07deb04f5bc07fcc5869
Author: airborne12 <jiang...@selectdb.com>
AuthorDate: Sun Mar 23 22:17:26 2025 +0800

    [improve](pfor) add non-simd implementation for PFOR 256 (#297)
    
    * [improve](pfor) add non-simd implementation for PFOR 256
    
    * [improve](pfor) add non-simd implementation for PFOR 256
    
    * [improve](pfor) add non-simd implementation for PFOR 256
    
    * add zigzag scalar function
    
    * add unit test for pfor encode/decode
    
    * add unit test for pfor encode/decode
    
    * add unit test for pfor encode/decode
    
    * add unit test for pfor encode/decode
    
    * add unit test for pfor encode/decode
---
 CMakeLists.txt                                     |   27 +-
 cmake/Toolchain-aarch64.cmake                      |    5 +
 src/core/CLucene/index/CodeMode.h                  |    4 +-
 src/core/CLucene/index/FieldInfos.cpp              |   56 +-
 src/core/CLucene/index/SDocumentWriter.cpp         |   20 +-
 src/core/CLucene/index/SegmentTermDocs.cpp         |   43 +-
 src/core/CLucene/index/_FieldInfos.h               |    9 +-
 src/core/CLucene/index/_SegmentHeader.h            |    6 +-
 src/core/CLucene/util/PFORUtil.cpp                 |  190 +++-
 src/core/CLucene/util/PFORUtil.h                   |   10 +-
 src/ext/for/CMakeLists.txt                         |   16 +
 src/ext/for/bitpack.h                              |   17 +
 src/ext/for/bitunpack.c                            | 1202 ++++++++++++++++++++
 src/ext/for/test_bitd1unpack.cpp                   |  399 +++++++
 src/ext/for/vp4.h                                  |    2 +
 src/ext/for/vp4d.c                                 |   30 +
 src/test/CMakeLists.txt                            |    3 +-
 .../pfor_p4ndx_compat_gen_by_old_version_arm.dat   |  Bin 0 -> 1168 bytes
 ...pfor_p4ndx_compat_gen_by_old_version_x86_64.dat |  Bin 0 -> 1164 bytes
 src/test/store/testPFOR.cpp                        |  546 +++++++++
 src/test/test.h                                    |    2 +-
 src/test/tests.cpp                                 |    3 +-
 22 files changed, 2494 insertions(+), 96 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c3b77861857..44a673a9e4a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,8 +19,6 @@ SET(CLUCENE_VERSION 
"${CLUCENE_VERSION_MAJOR}.${CLUCENE_VERSION_MINOR}.${CLUCENE
 #CMake 2.6+ is recommended to an improved Boost module
 CMAKE_MINIMUM_REQUIRED(VERSION 2.4.0 FATAL_ERROR)
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
-
 if(COMMAND cmake_policy)
   cmake_policy(SET CMP0003 NEW)
   cmake_policy(SET CMP0043 NEW)
@@ -135,9 +133,30 @@ elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "LSAN")
     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_LSAN}")
 endif()
 
-if (USE_AVX2)
-       SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_AVX2")
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64")
+    set (ARCH_AMD64 1)
+endif ()
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*)")
+    set (ARCH_AARCH64 1)
+endif ()
+if (ARCH_AARCH64 OR CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
+    set (ARCH_ARM 1)
+endif ()
+if (ARCH_AMD64)
+    if (USE_SSE4_2)
+        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2")
+    endif()
+    message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+    if (USE_AVX2)
+        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -DUSE_AVX2")
+    endif()
 endif()
+
+if (ARCH_ARM)
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+crc")
+endif()
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
+
 if (__COMPILER_CLANG)
        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-c++11-narrowing -g 
-fno-omit-frame-pointer")
 else ()
diff --git a/cmake/Toolchain-aarch64.cmake b/cmake/Toolchain-aarch64.cmake
new file mode 100644
index 00000000000..948164d513f
--- /dev/null
+++ b/cmake/Toolchain-aarch64.cmake
@@ -0,0 +1,5 @@
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
+
+set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
+set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
diff --git a/src/core/CLucene/index/CodeMode.h 
b/src/core/CLucene/index/CodeMode.h
index 3c39e94ecb6..05dd8b82649 100644
--- a/src/core/CLucene/index/CodeMode.h
+++ b/src/core/CLucene/index/CodeMode.h
@@ -5,7 +5,9 @@ CL_NS_DEF(index)
 enum class CodeMode { 
   kDefault = 0,
   kPfor = 1,
-  kRange = 2
+  kRange = 2,
+  kPfor256 = 3,
+  kPfor128 = 4
 };
 
 CL_NS_END
\ No newline at end of file
diff --git a/src/core/CLucene/index/FieldInfos.cpp 
b/src/core/CLucene/index/FieldInfos.cpp
index 00e0c4275a5..155c14e945f 100644
--- a/src/core/CLucene/index/FieldInfos.cpp
+++ b/src/core/CLucene/index/FieldInfos.cpp
@@ -21,24 +21,20 @@ CL_NS_USE(document)
 CL_NS_USE(util)
 CL_NS_DEF(index)
 
-
-FieldInfo::FieldInfo(const TCHAR *_fieldName,
-                     const bool _isIndexed,
-                     const int32_t _fieldNumber,
-                     const bool _storeTermVector,
-                     const bool _storeOffsetWithTermVector,
-                     const bool _storePositionWithTermVector,
-                     const bool _omitNorms,
-                                                                               
 const bool _hasProx,
-                     const bool _storePayloads) : 
name(CLStringIntern::intern(_fieldName )),
-                                                  isIndexed(_isIndexed),
-                                                  number(_fieldNumber),
-                                                  
storeTermVector(_storeTermVector),
-                                                  
storeOffsetWithTermVector(_storeOffsetWithTermVector),
-                                                  
storePositionWithTermVector(_storePositionWithTermVector),
-                                                  omitNorms(_omitNorms), 
hasProx(_hasProx),
-                                                                               
                                                                                
                                        storePayloads(_storePayloads) {
-}
+FieldInfo::FieldInfo(const TCHAR* _fieldName, const bool _isIndexed, const 
int32_t _fieldNumber,
+                     const bool _storeTermVector, const bool 
_storeOffsetWithTermVector,
+                     const bool _storePositionWithTermVector, const bool 
_omitNorms,
+                     const bool _hasProx, const bool _storePayloads, const 
bool _compatibleRead)
+        : name(CLStringIntern::intern(_fieldName)),
+          isIndexed(_isIndexed),
+          number(_fieldNumber),
+          storeTermVector(_storeTermVector),
+          storeOffsetWithTermVector(_storeOffsetWithTermVector),
+          storePositionWithTermVector(_storePositionWithTermVector),
+          omitNorms(_omitNorms),
+          hasProx(_hasProx),
+          storePayloads(_storePayloads),
+          compatibleRead(_compatibleRead) {}
 
 FieldInfo::~FieldInfo(){
        CL_NS(util)::CLStringIntern::unintern(name);
@@ -46,7 +42,7 @@ FieldInfo::~FieldInfo(){
 
 FieldInfo* FieldInfo::clone() {
        return _CLNEW FieldInfo(name, isIndexed, number, storeTermVector, 
storePositionWithTermVector,
-               storeOffsetWithTermVector, omitNorms, hasProx, storePayloads);
+               storeOffsetWithTermVector, omitNorms, hasProx, storePayloads, 
compatibleRead);
 }
 
 FieldInfos::FieldInfos():
@@ -103,6 +99,17 @@ bool FieldInfos::hasProx() {
        return false;
 }
 
+bool FieldInfos::compatibleRead() {
+       int numFields = byNumber.size();
+       for (int i = 0; i < numFields; i++) {
+               FieldInfo* fi = fieldInfo(i);
+               if (fi->compatibleRead) {
+                       return true;
+               }
+       }
+       return false;
+}
+
 IndexVersion FieldInfos::getIndexVersion() {
        int numFields = byNumber.size();
        for (int i = 0; i < numFields; i++) {
@@ -137,11 +144,11 @@ void FieldInfos::add(const TCHAR** names, const bool 
isIndexed, const bool store
 FieldInfo* FieldInfos::add(const TCHAR* name, const bool isIndexed, const bool 
storeTermVector,
                            const bool storePositionWithTermVector,
                            const bool storeOffsetWithTermVector, const bool 
omitNorms,
-                           const bool hasProx, const bool storePayloads) {
+                           const bool hasProx, const bool storePayloads, const 
bool compatibleRead) {
   FieldInfo* fi = fieldInfo(name);
        if (fi == NULL) {
                return addInternal(name, isIndexed, storeTermVector, 
storePositionWithTermVector,
-                                                                               
                storeOffsetWithTermVector, omitNorms, hasProx, storePayloads);
+                                                                               
                storeOffsetWithTermVector, omitNorms, hasProx, storePayloads, 
compatibleRead);
   } else {
                if (fi->isIndexed != isIndexed) {
                        fi->isIndexed = true;                      // once 
indexed, always index
@@ -164,6 +171,9 @@ FieldInfo* FieldInfos::add(const TCHAR* name, const bool 
isIndexed, const bool s
                if (fi->storePayloads != storePayloads) {
                        fi->storePayloads = true;
                }
+               if (fi->compatibleRead != compatibleRead) {
+                       fi->compatibleRead = compatibleRead;
+               }
        }
        return fi;
 }
@@ -172,10 +182,10 @@ FieldInfo* FieldInfos::addInternal(const TCHAR* name, 
const bool isIndexed,
                                    const bool storeTermVector,
                                    const bool storePositionWithTermVector,
                                    const bool storeOffsetWithTermVector, const 
bool omitNorms,
-                                   const bool hasProx, const bool 
storePayloads) {
+                                   const bool hasProx, const bool 
storePayloads, const bool compatibleRead) {
        FieldInfo* fi = _CLNEW FieldInfo(name, isIndexed, byNumber.size(), 
storeTermVector,
                                                                                
                                                                
storePositionWithTermVector, storeOffsetWithTermVector,
-                                                                               
                                                                omitNorms, 
hasProx, storePayloads);
+                                                                               
                                                                omitNorms, 
hasProx, storePayloads, compatibleRead);
   byNumber.push_back(fi);
        byName.put( fi->name, fi);
        return fi;
diff --git a/src/core/CLucene/index/SDocumentWriter.cpp 
b/src/core/CLucene/index/SDocumentWriter.cpp
index 2b85fe5bbca..8d5df79a447 100644
--- a/src/core/CLucene/index/SDocumentWriter.cpp
+++ b/src/core/CLucene/index/SDocumentWriter.cpp
@@ -1198,31 +1198,13 @@ void 
SDocumentsWriter<T>::appendPostings(ArrayBase<typename ThreadState::FieldDa
 
         skipListWriter->resetSkip();
 
-        auto encode = [](IndexOutput* out, std::vector<uint32_t>& buffer, bool 
isDoc) {
-            std::vector<uint8_t> compress(4 * buffer.size() + PFOR_BLOCK_SIZE);
-            size_t size = 0;
-            if (isDoc) {
-                size = P4ENC(buffer.data(), buffer.size(), compress.data());
-            } else {
-                size = P4NZENC(buffer.data(), buffer.size(), compress.data());
-            }
-            out->writeVInt(size);
-            out->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()), 
size);
-            buffer.resize(0);
-        };
-
         // Now termStates has numToMerge FieldMergeStates
         // which all share the same term.  Now we must
         // interleave the docID streams.
         while (numToMerge > 0) {
 
             if ((++df % skipInterval) == 0) {
-                freqOut->writeByte((char)CodeMode::kPfor);
-                freqOut->writeVInt(docDeltaBuffer.size());
-                encode(freqOut, docDeltaBuffer, true);
-                if (hasProx_) {
-                    encode(freqOut, freqBuffer, false);
-                }
+                pfor_encode(freqOut, docDeltaBuffer, freqBuffer, hasProx_);
 
                 skipListWriter->setSkipData(lastDoc, 
currentFieldStorePayloads, lastPayloadLength);
                 skipListWriter->bufferSkip(df);
diff --git a/src/core/CLucene/index/SegmentTermDocs.cpp 
b/src/core/CLucene/index/SegmentTermDocs.cpp
index e346dc0ca24..ae9e3a4508f 100644
--- a/src/core/CLucene/index/SegmentTermDocs.cpp
+++ b/src/core/CLucene/index/SegmentTermDocs.cpp
@@ -22,7 +22,7 @@ SegmentTermDocs::SegmentTermDocs(const SegmentReader 
*_parent) : parent(_parent)
                                                                  count(0), 
df(0), deletedDocs(_parent->deletedDocs), _doc(-1), _freq(0), 
skipInterval(_parent->tis->getSkipInterval()),
                                                                  
maxSkipLevels(_parent->tis->getMaxSkipLevels()), skipListReader(NULL), 
freqBasePointer(0), proxBasePointer(0),
                                                                  
skipPointer(0), haveSkipped(false), pointer(0), pointerMax(0), 
indexVersion_(_parent->_fieldInfos->getIndexVersion()),
-                                                                 
hasProx(_parent->_fieldInfos->hasProx()), buffer_(freqStream, hasProx, 
indexVersion_) {
+                                                                 
hasProx(_parent->_fieldInfos->hasProx()), buffer_(freqStream, hasProx, 
indexVersion_, _parent->_fieldInfos->compatibleRead()) {
     CND_CONDITION(_parent != NULL, "Parent is NULL");
     memset(docs,0,PFOR_BLOCK_SIZE*sizeof(int32_t));
     memset(freqs,0,PFOR_BLOCK_SIZE*sizeof(int32_t));
@@ -222,13 +222,13 @@ int32_t TermDocsBuffer::refillV0() {
                 uint32_t SerializedSize = freqStream_->readVInt();
                 std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
                 freqStream_->readBytes(buf.data(), SerializedSize);
-                P4DEC(buf.data(), arraySize, docs_.data());
+                util::P4DEC(buf.data(), arraySize, docs_.data());
             }
             {
                 uint32_t SerializedSize = freqStream_->readVInt();
                 std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
                 freqStream_->readBytes(buf.data(), SerializedSize);
-                P4NZDEC(buf.data(), arraySize, freqs_.data());
+                util::P4NZDEC(buf.data(), arraySize, freqs_.data());
             }
         } else if (mode == (char)CodeMode::kDefault) {
             uint32_t docDelta = 0;
@@ -258,7 +258,7 @@ int32_t TermDocsBuffer::refillV0() {
                 uint32_t serializedSize = freqStream_->readVInt();
                 std::vector<uint8_t> buf(serializedSize + PFOR_BLOCK_SIZE);
                 freqStream_->readBytes(buf.data(), serializedSize);
-                P4DEC(buf.data(), arraySize, docs_.data());
+                util::P4DEC(buf.data(), arraySize, docs_.data());
             }
         }
         return arraySize;
@@ -266,40 +266,7 @@ int32_t TermDocsBuffer::refillV0() {
 }
 
 int32_t TermDocsBuffer::refillV1() {
-    char mode = freqStream_->readByte();
-    uint32_t arraySize = freqStream_->readVInt();
-    if (mode == (char)CodeMode::kPfor) {
-        {
-            uint32_t SerializedSize = freqStream_->readVInt();
-            std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
-            freqStream_->readBytes(buf.data(), SerializedSize);
-            P4DEC(buf.data(), arraySize, docs_.data());
-        }
-        if (hasProx_) {
-            uint32_t SerializedSize = freqStream_->readVInt();
-            std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
-            freqStream_->readBytes(buf.data(), SerializedSize);
-            P4NZDEC(buf.data(), arraySize, freqs_.data());
-        }
-    } else if (mode == (char)CodeMode::kDefault) {
-        uint32_t docDelta = 0;
-        for (uint32_t i = 0; i < arraySize; i++) {
-            uint32_t docCode = freqStream_->readVInt();
-            if (hasProx_) {
-                docDelta += (docCode >> 1);
-                docs_[i] = docDelta;
-                if ((docCode & 1) != 0) {
-                    freqs_[i] = 1;
-                } else {
-                    freqs_[i] = freqStream_->readVInt();
-                }
-            } else {
-                docDelta += docCode;
-                docs_[i] = docDelta;
-            }            
-        }
-    }
-    return arraySize;
+    return pfor_decode(freqStream_, docs_, freqs_, hasProx_, compatibleRead_);
 }
 
 CL_NS_END
diff --git a/src/core/CLucene/index/_FieldInfos.h 
b/src/core/CLucene/index/_FieldInfos.h
index ed142c4435c..f80388bb73d 100644
--- a/src/core/CLucene/index/_FieldInfos.h
+++ b/src/core/CLucene/index/_FieldInfos.h
@@ -38,6 +38,7 @@ class FieldInfo :LUCENE_BASE{
        IndexVersion indexVersion_ = IndexVersion::kV1;
 
        bool storePayloads; // whether this field stores payloads together with 
term positions
+       bool compatibleRead; // whether the index docid list is read across 
platforms (e.g. x86 and arm64)
 
        //Func - Constructor
        //       Initialises FieldInfo.
@@ -59,7 +60,8 @@ class FieldInfo :LUCENE_BASE{
                const bool storePositionWithTermVector,
                const bool omitNorms,
                const bool hasProx,
-               const bool storePayloads);
+               const bool storePayloads,
+               const bool compatibleRead);
 
     //Func - Destructor
        //Pre  - true
@@ -133,6 +135,7 @@ public:
        void addIndexed(const TCHAR** names, const bool storeTermVectors, const 
bool storePositionWithTermVector, const bool storeOffsetWithTermVector);
 
        bool hasProx();
+       bool compatibleRead();
        IndexVersion getIndexVersion();
 
        /**
@@ -167,13 +170,13 @@ public:
        FieldInfo* add(const TCHAR* name, const bool isIndexed, const bool 
storeTermVector = false,
                                                                        const 
bool storePositionWithTermVector = false,
                                                                        const 
bool storeOffsetWithTermVector = false, const bool omitNorms = false,
-                                                                       const 
bool hasProx = false, const bool storePayloads = false);
+                                                                       const 
bool hasProx = false, const bool storePayloads = false, const bool 
compatibleRead = false);
 
   // was void
        FieldInfo* addInternal(const TCHAR* name, const bool isIndexed, const 
bool storeTermVector,
                                                                                
                        const bool storePositionWithTermVector,
                                                                                
                        const bool storeOffsetWithTermVector, const bool 
omitNorms,
-                                                                               
                        const bool hasProx, const bool storePayloads);
+                                                                               
                        const bool hasProx, const bool storePayloads, const 
bool compatibleRead = false);
 
        int32_t fieldNumber(const TCHAR* fieldName)const;
        
diff --git a/src/core/CLucene/index/_SegmentHeader.h 
b/src/core/CLucene/index/_SegmentHeader.h
index c1f01e7cecb..8087423953b 100644
--- a/src/core/CLucene/index/_SegmentHeader.h
+++ b/src/core/CLucene/index/_SegmentHeader.h
@@ -32,12 +32,13 @@ class SegmentReader;
 
 class TermDocsBuffer {
 public:
-  TermDocsBuffer(CL_NS(store)::IndexInput* freqStream, bool hasProx, 
IndexVersion indexVersion)
+  TermDocsBuffer(CL_NS(store)::IndexInput* freqStream, bool hasProx, 
IndexVersion indexVersion, bool compatibleRead)
       : docs_(PFOR_BLOCK_SIZE + 3),
         freqs_(PFOR_BLOCK_SIZE + 3),
         freqStream_(freqStream),
         hasProx_(hasProx),
-        indexVersion_(indexVersion) {
+        indexVersion_(indexVersion),
+        compatibleRead_(compatibleRead) {
   }
 
   ~TermDocsBuffer() {
@@ -83,6 +84,7 @@ private:
   CL_NS(store)::IndexInput* freqStream_ = nullptr;
 
   bool hasProx_ = false;
+  bool compatibleRead_ = false;
   IndexVersion indexVersion_ = IndexVersion::kV0; 
 };
 
diff --git a/src/core/CLucene/util/PFORUtil.cpp 
b/src/core/CLucene/util/PFORUtil.cpp
index ae27f521553..d241a305acf 100644
--- a/src/core/CLucene/util/PFORUtil.cpp
+++ b/src/core/CLucene/util/PFORUtil.cpp
@@ -15,19 +15,20 @@
 // specific language governing permissions and limitations
 // under the License.
 #include "PFORUtil.h"
+#include "CLucene/debug/error.h"
+#include "CLucene/index/CodeMode.h"
 #include "vp4.h"
 #if (defined(__i386) || defined(__x86_64__))
 #include <cpuid.h>
 #endif
 
-namespace {
+CL_NS_DEF(util)
 using DEC_FUNC = size_t (*)(unsigned char *__restrict, size_t, uint32_t 
*__restrict);
 using ENC_FUNC = size_t (*)(uint32_t *__restrict in, size_t n, unsigned char 
*__restrict out);
 DEC_FUNC g_p4nd1dec;
 DEC_FUNC g_p4nzdec;
 ENC_FUNC g_p4nd1enc;
 ENC_FUNC g_p4nzenc;
-} // anonymous namespace
 
 size_t DefaultDEC(unsigned char *__restrict in, size_t n, uint32_t *__restrict 
out) {
     size_t bufferSize = 0;
@@ -129,3 +130,188 @@ size_t P4ENC(uint32_t *__restrict in, size_t n, unsigned 
char *__restrict out) {
 size_t P4NZENC(uint32_t *__restrict in, size_t n, unsigned char *__restrict 
out) {
     return g_p4nzenc(in, n, out);
 }
+void pfor_encode(store::IndexOutput* out, std::vector<uint32_t>& 
docDeltaBuffer, std::vector<uint32_t>& freqBuffer, bool has_prox) {
+#ifdef __AVX2__
+    out->writeByte((char)index::CodeMode::kPfor256);
+    out->writeVInt(docDeltaBuffer.size());
+    std::vector<uint8_t> compress(4 * docDeltaBuffer.size() + PFOR_BLOCK_SIZE);
+    size_t size = 0;
+    size = p4nd1enc256v32(docDeltaBuffer.data(), docDeltaBuffer.size(), 
compress.data());
+    out->writeVInt(size);
+    out->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()), size);
+    if (has_prox) {
+        size = p4nzenc256v32(freqBuffer.data(), freqBuffer.size(), 
compress.data());
+        out->writeVInt(size);
+        out->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()), 
size);
+    }
+#elif (defined(__SSSE3__) || defined(__ARM_NEON))
+    out->writeByte((char)index::CodeMode::kPfor128);
+    out->writeVInt(docDeltaBuffer.size());
+    std::vector<uint8_t> compress(4 * docDeltaBuffer.size() + PFOR_BLOCK_SIZE);
+    size_t size = 0;
+    size = p4nd1enc32(docDeltaBuffer.data(), docDeltaBuffer.size(), 
compress.data());
+    out->writeVInt(size);
+    out->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()), size);
+    if (has_prox) {
+        size = p4nzenc32(freqBuffer.data(), freqBuffer.size(), 
compress.data());
+        out->writeVInt(size);
+        out->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()), 
size);
+    }
+#else
+    out->writeByte((char)index::CodeMode::kDefault);
+    out->writeVInt(docDeltaBuffer.size());
+    uint32_t lastDoc = 0;
+    for (int32_t i = 0; i < docDeltaBuffer.size(); i++) {
+        uint32_t curDoc = docDeltaBuffer[i];
+        if (has_prox) {
+            uint32_t newDocCode = (curDoc - lastDoc) << 1;
+            lastDoc = curDoc;
+            uint32_t freq = freqBuffer[i];
+            if (1 == freq) {
+                out->writeVInt(newDocCode | 1);
+            } else {
+                out->writeVInt(newDocCode);
+                out->writeVInt(freq);
+            }
+        } else {
+            out->writeVInt(curDoc - lastDoc);
+            lastDoc = curDoc;
+        }
+    }
+#endif
+    docDeltaBuffer.resize(0);
+    freqBuffer.resize(0);
+}
+
+uint32_t pfor_decode(store::IndexInput* in, std::vector<uint32_t>& docs, 
std::vector<uint32_t>& freqs, bool has_prox, bool compatibleRead) {
+    char mode = in->readByte();
+    uint32_t arraySize = in->readVInt();
+    // old version: the decoder must be selected based on compatibleRead
+    if (mode == (char)index::CodeMode::kPfor) {
+        {
+            uint32_t SerializedSize = in->readVInt();
+            std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
+            in->readBytes(buf.data(), SerializedSize);
+#if defined(USE_AVX2) && defined(__AVX2__)
+            // if compatibleRead is true, we are reading an old-version 
arm64 index on an x86_64 platform.
+            if (compatibleRead) {
+                p4nd1dec32(buf.data(), arraySize, docs.data());
+            } else {
+                p4nd1dec256v32(buf.data(), arraySize, docs.data());
+            }
+#elif (defined(__ARM_NEON))
+            // if compatibleRead is true, we are reading an old-version 
x86_64 index on an arm64 platform.
+            if (compatibleRead) {
+                p4nd1dec256scalarv32(buf.data(), arraySize, docs.data());
+            } else {
+                p4nd1dec32(buf.data(), arraySize, docs.data());
+            }
+#elif (defined(__SSSE3__))
+            // if compatibleRead is true, we are reading an old-version 
x86_64 index on an x86_64 platform that does not support AVX2.
+            if (compatibleRead) {
+                p4nd1dec256scalarv32(buf.data(), arraySize, docs.data());
+            } else {
+                DefaultDDEC(buf.data(), arraySize, docs.data());
+            }
+#else
+            DefaultDDEC(buf.data(), arraySize, docs.data());
+#endif
+        }
+        if (has_prox) {
+            uint32_t SerializedSize = in->readVInt();
+            std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
+            in->readBytes(buf.data(), SerializedSize);
+#if defined(USE_AVX2) && defined(__AVX2__)
+            // if compatibleRead is true, we are reading an old-version 
arm64 index on an x86_64 platform.
+            if (compatibleRead) {
+                p4nzdec32(buf.data(), arraySize, freqs.data());
+            } else {
+                p4nzdec256v32(buf.data(), arraySize, freqs.data());
+            }
+#elif (defined(__ARM_NEON))
+            // if compatibleRead is true, we are reading an old-version 
x86_64 index on an arm64 platform.
+            if (compatibleRead) {
+                p4nzdec256scalarv32(buf.data(), arraySize, freqs.data());
+            } else {
+                p4nzdec32(buf.data(), arraySize, freqs.data());
+            }
+#elif (defined(__SSSE3__))
+            // if compatibleRead is true, we are reading an old-version 
x86_64 index on an x86_64 platform that does not support AVX2.
+            if (compatibleRead) {
+                p4nzdec256scalarv32(buf.data(), arraySize, freqs.data());
+            } else {
+                DefaultDEC(buf.data(), arraySize, freqs.data());
+            }
+#else
+            DefaultDEC(buf.data(), arraySize, freqs.data());
+#endif
+        }
+    } else if (mode == (char)index::CodeMode::kDefault) {
+        uint32_t docDelta = 0;
+        for (uint32_t i = 0; i < arraySize; i++) {
+            uint32_t docCode = in->readVInt();
+            if (has_prox) {
+                docDelta += (docCode >> 1);
+                docs[i] = docDelta;
+                if ((docCode & 1) != 0) {
+                    freqs[i] = 1;
+                } else {
+                    freqs[i] = in->readVInt();
+                }
+            } else {
+                docDelta += docCode;
+                docs[i] = docDelta;
+            }            
+        }
+    } else if (mode == (char)index::CodeMode::kPfor256) {
+        // new version: the code mode itself selects the decoder; compatibleRead is not consulted here
+        {
+            uint32_t SerializedSize = in->readVInt();
+            std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
+            in->readBytes(buf.data(), SerializedSize);
+#if defined(USE_AVX2) && defined(__AVX2__)
+            p4nd1dec256v32(buf.data(), arraySize, docs.data());
+#else
+            _CLTHROWA(CL_ERR_CorruptIndex, "PFOR256 is not supported on this 
platform");
+#endif
+        }
+        if (has_prox) {
+            uint32_t SerializedSize = in->readVInt();
+            std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
+            in->readBytes(buf.data(), SerializedSize);
+#if defined(USE_AVX2) && defined(__AVX2__)
+            p4nzdec256v32(buf.data(), arraySize, freqs.data());
+#else
+            _CLTHROWA(CL_ERR_CorruptIndex, "PFOR256 is not supported on this 
platform");
+#endif
+        }
+    } else if (mode == (char)index::CodeMode::kPfor128) {
+        // new version: the code mode itself selects the decoder; compatibleRead is not consulted here
+         {
+            uint32_t SerializedSize = in->readVInt();
+            std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
+            in->readBytes(buf.data(), SerializedSize);
+#if defined(USE_AVX2) && defined(__AVX2__)
+            p4nd1dec32(buf.data(), arraySize, docs.data());
+#elif (defined(__SSSE3__) || defined(__ARM_NEON))
+            p4nd1dec32(buf.data(), arraySize, docs.data());
+#else
+            _CLTHROWA(CL_ERR_CorruptIndex, "PFOR128 is not supported on this 
platform");
+#endif
+        }
+        if (has_prox) {
+            uint32_t SerializedSize = in->readVInt();
+            std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
+            in->readBytes(buf.data(), SerializedSize);
+#if defined(USE_AVX2) && defined(__AVX2__)
+            p4nzdec32(buf.data(), arraySize, freqs.data());
+#elif (defined(__SSSE3__) || defined(__ARM_NEON))
+            p4nzdec32(buf.data(), arraySize, freqs.data());
+#else
+            _CLTHROWA(CL_ERR_CorruptIndex, "PFOR128 is not supported on this 
platform");
+#endif
+        }
+    }
+    return arraySize;
+}
+CL_NS_END
diff --git a/src/core/CLucene/util/PFORUtil.h b/src/core/CLucene/util/PFORUtil.h
index 29acb7fe7a6..bf44cb1bc23 100644
--- a/src/core/CLucene/util/PFORUtil.h
+++ b/src/core/CLucene/util/PFORUtil.h
@@ -18,9 +18,17 @@
 
 #include <cstddef>
 #include <cstdint>
+#include "CLucene/SharedHeader.h"
+#include "CLucene/CLConfig.h"
+#include "CLucene/store/IndexOutput.h"
+#include "CLucene/store/IndexInput.h"
+#include <vector>
+CL_NS_DEF(util)
 
 size_t P4DEC(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
 size_t P4NZDEC(unsigned char *__restrict in, size_t n, uint32_t *__restrict 
out);
 size_t P4ENC(uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
 size_t P4NZENC(uint32_t *__restrict in, size_t n, unsigned char *__restrict 
out);
-
+void pfor_encode(store::IndexOutput* out, std::vector<uint32_t>& 
docDeltaBuffer, std::vector<uint32_t>& freqBuffer, bool has_prox);
+uint32_t pfor_decode(store::IndexInput* in, std::vector<uint32_t>& docs, 
std::vector<uint32_t>& freqs, bool has_prox, bool compatibleRead);
+CL_NS_END
diff --git a/src/ext/for/CMakeLists.txt b/src/ext/for/CMakeLists.txt
index 3b14781f69e..9c139a6f2a6 100644
--- a/src/ext/for/CMakeLists.txt
+++ b/src/ext/for/CMakeLists.txt
@@ -84,8 +84,24 @@ foreach(SRC_FILE ${SRC_FILES})
     endif()
 endforeach()
 
+# Stand-alone test program built from test_bitd1unpack.cpp.
+add_executable(test_bitd1unpack test_bitd1unpack.cpp)
+
+# Links against the `ic` library target (the same target installed further down).
+target_link_libraries(test_bitd1unpack ic)
+
+# Per-architecture compile flags. ${AVX2} and ${DEBUG} are set outside this hunk.
+# NOTE(review): no branch matches other CMAKE_SYSTEM_PROCESSOR values (e.g. "AMD64"),
+# in which case the test is built without any of these options - confirm that is intended.
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
+    target_compile_options(test_bitd1unpack PRIVATE ${AVX2} -DAVX2_ON ${DEBUG})
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+    target_compile_options(test_bitd1unpack PRIVATE -march=armv8-a ${DEBUG})
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le")
+    # __SSSE3__ is defined by hand here; presumably to enable the SSSE3-guarded code paths on POWER - TODO confirm.
+    target_compile_options(test_bitd1unpack PRIVATE -mcpu=power9 -mtune=power9 
-D__SSSE3__ ${DEBUG})
+endif()
+
 set(LIB_DESTINATION ../)
 
 install(TARGETS ic
       DESTINATION ${LIB_DESTINATION}
       COMPONENT ext)
+
+# Installs the test executable under <prefix>/bin as part of the `tests` component.
+install(TARGETS test_bitd1unpack
+      DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
+      COMPONENT tests)
diff --git a/src/ext/for/bitpack.h b/src/ext/for/bitpack.h
index b0b9e02275a..b8f91ad5690 100644
--- a/src/ext/for/bitpack.h
+++ b/src/ext/for/bitpack.h
@@ -30,6 +30,7 @@
 #include <stdint.h>
 #endif
 #include <stddef.h>
+#include <stdbool.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -271,6 +272,7 @@ unsigned char *bitunpack128v64(  const unsigned char 
*__restrict in, unsigned n,
 
 unsigned char *bitunpack256v32(  const unsigned char *__restrict in, unsigned 
n, unsigned *__restrict out,                 unsigned b);
 unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned 
n, unsigned *__restrict out, unsigned start, unsigned b);
+unsigned char *bitzunpack256scalarv32( const unsigned char *__restrict in, 
unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
 unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned 
n, unsigned *__restrict out, unsigned start, unsigned b);
 unsigned char *bitd1unpack256v32(const unsigned char *__restrict in, unsigned 
n, unsigned *__restrict out, unsigned start, unsigned b);
 unsigned char *bitfunpack256v32( const unsigned char *__restrict in, unsigned 
n, unsigned *__restrict out, unsigned start, unsigned b);
@@ -299,10 +301,25 @@ unsigned char *_bitd1unpack128h32(const unsigned char 
*__restrict in, unsigned n
 unsigned char *_bitunpack256w32(  const unsigned char *__restrict in, unsigned 
n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned 
char *bb);
 unsigned char *_bitunpack128v64(  const unsigned char *__restrict in, unsigned 
n, uint64_t *__restrict out, unsigned b, uint32_t *__restrict pex, unsigned 
char *bb);
 
+unsigned char* bitd1unpack256scalarv32(const unsigned char* __restrict in, 
unsigned n,
+                                 unsigned* __restrict out, unsigned start, 
unsigned b);
+unsigned char* _bitd1unpack256scalarv32(const unsigned char* __restrict in, 
unsigned n,
+                                  unsigned* __restrict out, unsigned start, 
unsigned b,
+                                  unsigned* __restrict pex, unsigned char* bb);
+unsigned char *bitunpack256scalarv32(const unsigned char *__restrict in, 
unsigned n, 
+                                     unsigned *__restrict out, unsigned b);
+unsigned char *_bitunpack256scalarv32(const unsigned char *__restrict in,
+                                unsigned n,
+                                unsigned *__restrict out,
+                                unsigned b,
+                                unsigned *__restrict pex,
+                                unsigned char *bb,
+                                bool isZigZag);
 unsigned char *_bitunpack256v32(  const unsigned char *__restrict in, unsigned 
n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned 
char *bb);
 unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned 
n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict 
pex, unsigned char *bb);
 unsigned char *_bitd1unpack256v32(const unsigned char *__restrict in, unsigned 
n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict 
pex, unsigned char *bb);
 unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned 
n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict 
pex, unsigned char *bb);
+unsigned char *_bitzunpack256scalarv32( const unsigned char *__restrict in, 
unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned 
*__restrict pex, unsigned char *bb);
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/ext/for/bitunpack.c b/src/ext/for/bitunpack.c
index 1dd78003ada..26817ac55bc 100644
--- a/src/ext/for/bitunpack.c
+++ b/src/ext/for/bitunpack.c
@@ -28,6 +28,9 @@
 #include "bitutil.h"
 #include "bitpack.h"
 #include "vint.h"
+#include <string.h>
+#include <stdlib.h>
+#include <stdbool.h>
 
 #define PAD8(_x_) (((_x_)+7)/8)
 
@@ -690,6 +693,1205 @@ unsigned char *bitunpack256w32( const unsigned char 
*__restrict in, unsigned n,
   BITUNPACK128V32(in, b, out, sv);
   return (unsigned char *)_in+PAD8(256*b);
 }
+// Patches one group of 8 decoded values with their exception bits.
+//  - xm8:  bitmap for the group; bit j set means ov[j] has an exception
+//  - pPEX: cursor into the exception list; advanced by one entry per set bit
+//  - nb:   left shift applied to each exception before it is added in
+//  - ov:   the 8 values, patched in place
+static void applyException_8bits(uint8_t xm8, uint32_t** pPEX, int nb, uint32_t ov[8]) {
+  uint32_t* cursor = *pPEX;
+  for (int lane = 0; lane < 8; ++lane) {
+      const int flagged = (xm8 >> lane) & 1;
+      if (!flagged) {
+          continue;
+      }
+      ov[lane] += (*cursor << nb);
+      ++cursor;
+  }
+  // Hand the advanced cursor back so the next group continues where this one stopped.
+  *pPEX = cursor;
+}
+// Undoes zigzag encoding: even inputs decode to x/2, odd inputs to the
+// bitwise complement of x/2.
+static inline uint32_t zigzagDecode_scalar(uint32_t x) {
+  const uint32_t half = x >> 1;
+  // All ones when the low bit of x is set, zero otherwise.
+  const uint32_t fill = 0u - (x & 1u);
+  return half ^ fill;
+}
+/**
+ * Unpacks `expansions_count` groups of 8 values from an 8-lane interleaved
+ * bit-packed stream, without exception patching.
+ *
+ * Parameters:
+ *  - pIn: input cursor; advanced by 8 words each time a block is consumed
+ *  - pOut: output cursor; advanced by 8 words per group written
+ *  - expansions_count: number of 8-value groups to produce
+ *  - SHIFT_HI[k]: right shift applied to the current block for group k (k > 0)
+ *  - SHIFT_LO[k]: left shift applied to a newly read block when group k spans two blocks
+ *  - READ_FLAG[k]: non-zero if group k must read a new block; entry 0 is not
+ *    consulted because group 0 always reads one
+ *  - mask: selects the low bits kept for each value
+ *  - nb: bit width; unused in this variant
+ *  - isZigZag: when true, each value is zigzag-decoded before being stored
+ */
+static void bitunblk256v32_scalar_template(uint32_t** pIn, uint32_t** pOut, int expansions_count,
+                                           const uint8_t* SHIFT_HI, const uint8_t* SHIFT_LO,
+                                           const uint8_t* READ_FLAG, uint32_t mask, int nb,
+                                           bool isZigZag) {
+  (void)nb; // not needed here; silences the unused-parameter warning
+  const uint32_t* oldp = NULL; // block whose remaining bits are being consumed
+  uint32_t ov[8];
+
+  for (int k = 0; k < expansions_count; k++) {
+      if (k == 0) {
+          // Group 0: load the first block and keep the low bits of each lane.
+          oldp = *pIn;
+          *pIn += 8;
+          for (int j = 0; j < 8; j++) {
+              ov[j] = oldp[j] & mask;
+          }
+      } else if (READ_FLAG[k]) {
+          // The value spans two blocks: the leftover high bits of the current
+          // block form the low part, the next block supplies the remaining bits.
+          const uint32_t* newp = *pIn;
+          *pIn += 8;
+          for (int j = 0; j < 8; j++) {
+              ov[j] = (oldp[j] >> SHIFT_HI[k]) | ((newp[j] << SHIFT_LO[k]) & mask);
+          }
+          // The freshly read block becomes the current one.
+          oldp = newp;
+      } else {
+          // The value lies entirely inside the current block.
+          for (int j = 0; j < 8; j++) {
+              ov[j] = (oldp[j] >> SHIFT_HI[k]) & mask;
+          }
+      }
+      // Write out the current 8 results.
+      uint32_t* outp = *pOut;
+      for (int j = 0; j < 8; j++) {
+          outp[j] = isZigZag ? zigzagDecode_scalar(ov[j]) : ov[j];
+      }
+      *pOut += 8;
+  }
+}
+/**
+ * Table-driven unpacker for an 8-lane interleaved bit-packed stream that also
+ * patches exceptions into every group of 8 values.
+ *
+ * Parameters:
+ *  - pIn: input cursor; advanced by 8 words whenever a block is consumed
+ *  - pOut: output cursor; advanced by 8 words per group
+ *  - pPEX: cursor into the exception list; advanced by applyException_8bits
+ *  - pBB: cursor into the exception bitmap; one byte is consumed per group
+ *  - expansions_count: number of 8-value groups to produce
+ *  - SHIFT_HI[k]: right shift applied to the current block for group k
+ *  - SHIFT_LO[k]: left shift applied to a newly read block for group k
+ *  - READ_FLAG[k]: non-zero when group k needs a new block (group 0 always reads one)
+ *  - mask: selects the low bits kept for each value
+ *  - nb: base bit width; exceptions are shifted left by this amount before being added
+ *  - isZigZag: when true, values are zigzag-decoded after patching
+ */
+static void bitunblk256v32_scalarBlock_ex_template(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                                   unsigned char** pBB, int expansions_count,
+                                                   const uint8_t* SHIFT_HI, const uint8_t* SHIFT_LO,
+                                                   const uint8_t* READ_FLAG, uint32_t mask, int nb,
+                                                   bool isZigZag) {
+  const uint32_t* carry = NULL; // block still holding unread bits
+
+  for (int step = 0; step < expansions_count; ++step) {
+      uint32_t vals[8];
+      const bool first = (step == 0);
+      const bool fetch = first || READ_FLAG[step];
+      const uint32_t* fresh = NULL;
+
+      if (fetch) {
+          fresh = *pIn;
+          *pIn += 8;
+      }
+
+      for (int lane = 0; lane < 8; ++lane) {
+          uint32_t v;
+          if (first) {
+              // First group: low bits of the block just read.
+              v = fresh[lane] & mask;
+          } else {
+              v = carry[lane] >> SHIFT_HI[step];
+              if (fetch) {
+                  // Complete the value with bits from the new block.
+                  v |= (fresh[lane] << SHIFT_LO[step]) & mask;
+              } else {
+                  v &= mask;
+              }
+          }
+          vals[lane] = v;
+      }
+
+      if (fetch) {
+          // The new block supplies the leftover bits for the following groups.
+          carry = fresh;
+      }
+
+      // One bitmap byte per group tells which lanes receive an exception.
+      const uint8_t bitmap = **pBB;
+      ++(*pBB);
+      applyException_8bits(bitmap, pPEX, nb, vals);
+
+      uint32_t* dst = *pOut;
+      for (int lane = 0; lane < 8; ++lane) {
+          dst[lane] = isZigZag ? zigzagDecode_scalar(vals[lane]) : vals[lane];
+      }
+      *pOut += 8;
+  }
+}
+// Zero-bit-width case: writes 256 values whose base is 0, so each value is
+// just its exception (if any). pIn is not read. A bitmap byte is consumed per
+// group whenever pBB is given; exceptions are applied only if pPEX is given too.
+static void bitunpack256v32_0_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+  (void)pIn;
+  uint32_t* dst = *pOut;
+  uint32_t* const stop = dst + 32 * 8;
+
+  while (dst != stop) {
+      uint8_t bitmap = 0;
+      if (pBB != NULL) {
+          bitmap = **pBB;
+          ++(*pBB);
+      }
+
+      // Every lane starts at zero; exceptions are added without any shift.
+      uint32_t group[8] = {0};
+      if (pPEX != NULL && bitmap != 0) {
+          applyException_8bits(bitmap, pPEX, 0, group);
+      }
+
+      for (int lane = 0; lane < 8; ++lane) {
+          dst[lane] = isZigZag ? zigzagDecode_scalar(group[lane]) : group[lane];
+      }
+      dst += 8;
+  }
+  *pOut = dst;
+}
+
+// Scalar unpack of one 256-value block stored at 1 bit per value:
+// a single template call producing 32 groups of 8 values.
+static void bitunpack256v32_1_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+  enum { kBits = 1, kGroups = 32 };
+  // Bit offset inside the current block at which each group starts.
+  static const uint8_t kHiShift[kGroups] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10,
+                                            11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+                                            22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+  // No group straddles two blocks, so every left shift is zero.
+  static const uint8_t kLoShift[kGroups] = {0};
+  // Only entry 0 is set; the remaining entries are zero.
+  static const uint8_t kReadNext[kGroups] = {1};
+  const uint32_t lowMask = 1; // 0x1
+
+  // Without both an exception list and its bitmap there is nothing to patch.
+  if (pPEX == NULL || pBB == NULL) {
+      bitunblk256v32_scalar_template(pIn, pOut, kGroups, kHiShift, kLoShift, kReadNext, lowMask,
+                                     kBits, isZigZag);
+      return;
+  }
+  bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kGroups, kHiShift, kLoShift,
+                                         kReadNext, lowMask, kBits, isZigZag);
+}
+
+// Scalar unpack of one 256-value block stored at 2 bits per value:
+// 2 passes, each producing 16 groups of 8 values.
+static void bitunpack256v32_2_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+  enum { kBits = 2, kGroups = 16, kPasses = 2 };
+  // Bit offset inside the current block at which each group starts.
+  static const uint8_t kHiShift[kGroups] = {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
+  // No group straddles two blocks, so every left shift is zero.
+  static const uint8_t kLoShift[kGroups] = {0};
+  // Only entry 0 is set; the remaining entries are zero.
+  static const uint8_t kReadNext[kGroups] = {1};
+  const uint32_t lowMask = (1u << kBits) - 1; // 0x3
+  // Exceptions are patched only when both the list and its bitmap are supplied.
+  const bool patch = (pPEX != NULL && pBB != NULL);
+
+  for (int pass = 0; pass < kPasses; ++pass) {
+      if (patch) {
+          bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kGroups, kHiShift,
+                                                 kLoShift, kReadNext, lowMask, kBits, isZigZag);
+      } else {
+          bitunblk256v32_scalar_template(pIn, pOut, kGroups, kHiShift, kLoShift, kReadNext,
+                                         lowMask, kBits, isZigZag);
+      }
+  }
+}
+
+// Scalar unpack of one 256-value block stored at 3 bits per value:
+// a single template call producing 32 groups of 8 values.
+static void bitunpack256v32_3_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+  enum { kBits = 3, kGroups = 32 };
+  // Bit offset inside the current block at which each group starts.
+  static const uint8_t kHiShift[kGroups] = {0, 3, 6, 9,  12, 15, 18, 21, 24, 27, 30,
+                                            1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31,
+                                            2, 5, 8, 11, 14, 17, 20, 23, 26, 29};
+  // Left shift for the newly read block when a group straddles two blocks.
+  static const uint8_t kLoShift[kGroups] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
+                                            0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  // 1 where the group has to pull in the next block.
+  static const uint8_t kReadNext[kGroups] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
+                                             0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  const uint32_t lowMask = (1u << kBits) - 1; // 0x7
+
+  // Without both an exception list and its bitmap there is nothing to patch.
+  if (pPEX == NULL || pBB == NULL) {
+      bitunblk256v32_scalar_template(pIn, pOut, kGroups, kHiShift, kLoShift, kReadNext, lowMask,
+                                     kBits, isZigZag);
+      return;
+  }
+  bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kGroups, kHiShift, kLoShift,
+                                         kReadNext, lowMask, kBits, isZigZag);
+}
+
+// Scalar unpack of one 256-value block stored at 4 bits per value:
+// 4 passes, each producing 8 groups of 8 values.
+static void bitunpack256v32_4_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+  enum { kBits = 4, kGroups = 8, kPasses = 4 };
+  // Bit offset inside the current block at which each group starts.
+  static const uint8_t kHiShift[kGroups] = {0, 4, 8, 12, 16, 20, 24, 28};
+  // No group straddles two blocks, so every left shift is zero.
+  static const uint8_t kLoShift[kGroups] = {0};
+  // Only entry 0 is set; the remaining entries are zero.
+  static const uint8_t kReadNext[kGroups] = {1};
+  const uint32_t lowMask = (1u << kBits) - 1; // 0xF
+  // Exceptions are patched only when both the list and its bitmap are supplied.
+  const bool patch = (pPEX != NULL && pBB != NULL);
+
+  for (int pass = 0; pass < kPasses; ++pass) {
+      if (patch) {
+          bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kGroups, kHiShift,
+                                                 kLoShift, kReadNext, lowMask, kBits, isZigZag);
+      } else {
+          bitunblk256v32_scalar_template(pIn, pOut, kGroups, kHiShift, kLoShift, kReadNext,
+                                         lowMask, kBits, isZigZag);
+      }
+  }
+}
+
+// Scalar unpack of one 256-value block stored at 5 bits per value:
+// a single template call producing 32 groups of 8 values.
+static void bitunpack256v32_5_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+  enum { kBits = 5, kGroups = 32 };
+  // Bit offset inside the current block at which each group starts.
+  static const uint8_t kHiShift[kGroups] = {0,  5,  10, 15, 20, 25, 30, 3,  8,  13, 18,
+                                            23, 28, 1,  6,  11, 16, 21, 26, 31, 4,  9,
+                                            14, 19, 24, 29, 2,  7,  12, 17, 22, 27};
+  // Left shift for the newly read block when a group straddles two blocks.
+  static const uint8_t kLoShift[kGroups] = {0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 4, 0, 0, 0,
+                                            0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0};
+  // 1 where the group has to pull in the next block.
+  static const uint8_t kReadNext[kGroups] = {1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
+                                             0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0};
+  const uint32_t lowMask = (1u << kBits) - 1; // 0x1F
+
+  // Without both an exception list and its bitmap there is nothing to patch.
+  if (pPEX == NULL || pBB == NULL) {
+      bitunblk256v32_scalar_template(pIn, pOut, kGroups, kHiShift, kLoShift, kReadNext, lowMask,
+                                     kBits, isZigZag);
+      return;
+  }
+  bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kGroups, kHiShift, kLoShift,
+                                         kReadNext, lowMask, kBits, isZigZag);
+}
+// Scalar unpack of one 256-value block stored at 6 bits per value:
+// 2 passes, each producing 16 groups of 8 values.
+static void bitunpack256v32_6_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+  enum { kBits = 6, kGroups = 16, kPasses = 2 };
+  // Bit offset inside the current block at which each group starts.
+  static const uint8_t kHiShift[kGroups] = {0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26};
+  // Left shift for the newly read block when a group straddles two blocks.
+  static const uint8_t kLoShift[kGroups] = {0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0};
+  // 1 where the group has to pull in the next block.
+  static const uint8_t kReadNext[kGroups] = {1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0};
+  const uint32_t lowMask = (1u << kBits) - 1; // 0x3F
+  // Exceptions are patched only when both the list and its bitmap are supplied.
+  const bool patch = (pPEX != NULL && pBB != NULL);
+
+  for (int pass = 0; pass < kPasses; ++pass) {
+      if (patch) {
+          bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kGroups, kHiShift,
+                                                 kLoShift, kReadNext, lowMask, kBits, isZigZag);
+      } else {
+          bitunblk256v32_scalar_template(pIn, pOut, kGroups, kHiShift, kLoShift, kReadNext,
+                                         lowMask, kBits, isZigZag);
+      }
+  }
+}
+// Scalar unpack of one 256-value block stored at 7 bits per value:
+// a single template call producing 32 groups of 8 values.
+static void bitunpack256v32_7_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+  enum { kBits = 7, kGroups = 32 };
+  // Bit offset inside the current block at which each group starts.
+  static const uint8_t kHiShift[kGroups] = {0,  7,  14, 21, 28, 3,  10, 17, 24, 31, 6,
+                                            13, 20, 27, 2,  9,  16, 23, 30, 5,  12, 19,
+                                            26, 1,  8,  15, 22, 29, 4,  11, 18, 25};
+  // Left shift for the newly read block when a group straddles two blocks.
+  static const uint8_t kLoShift[kGroups] = {0, 0, 0, 0, 4, 0, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0,
+                                            0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 0, 3, 0, 0, 0, 0};
+  // 1 where the group has to pull in the next block.
+  static const uint8_t kReadNext[kGroups] = {1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
+                                             0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0};
+  const uint32_t lowMask = (1u << kBits) - 1; // 0x7F
+
+  // Without both an exception list and its bitmap there is nothing to patch.
+  if (pPEX == NULL || pBB == NULL) {
+      bitunblk256v32_scalar_template(pIn, pOut, kGroups, kHiShift, kLoShift, kReadNext, lowMask,
+                                     kBits, isZigZag);
+      return;
+  }
+  bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kGroups, kHiShift, kLoShift,
+                                         kReadNext, lowMask, kBits, isZigZag);
+}
+// Scalar unpack of one 256-value block stored at 8 bits per value:
+// 8 passes, each producing 4 groups of 8 values.
+static void bitunpack256v32_8_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+  enum { kBits = 8, kGroups = 4, kPasses = 8 };
+  // Bit offset inside the current block at which each group starts.
+  static const uint8_t kHiShift[kGroups] = {0, 8, 16, 24};
+  // No group straddles two blocks, so every left shift is zero.
+  static const uint8_t kLoShift[kGroups] = {0};
+  // Only entry 0 is set; the remaining entries are zero.
+  static const uint8_t kReadNext[kGroups] = {1};
+  const uint32_t lowMask = (1u << kBits) - 1; // 0xFF
+  // Exceptions are patched only when both the list and its bitmap are supplied.
+  const bool patch = (pPEX != NULL && pBB != NULL);
+
+  for (int pass = 0; pass < kPasses; ++pass) {
+      if (patch) {
+          bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kGroups, kHiShift,
+                                                 kLoShift, kReadNext, lowMask, kBits, isZigZag);
+      } else {
+          bitunblk256v32_scalar_template(pIn, pOut, kGroups, kHiShift, kLoShift, kReadNext,
+                                         lowMask, kBits, isZigZag);
+      }
+  }
+}
+
+// Scalar unpack of one 256-value block stored at 9 bits per value:
+// a single template call producing 32 groups of 8 values.
+static void bitunpack256v32_9_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                     unsigned char** pBB, bool isZigZag) {
+  enum { kBits = 9, kGroups = 32 };
+  // Bit offset inside the current block at which each group starts.
+  static const uint8_t kHiShift[kGroups] = {0, 9,  18, 27, 4,  13, 22, 31, 8,  17, 26,
+                                            3, 12, 21, 30, 7,  16, 25, 2,  11, 20, 29,
+                                            6, 15, 24, 1,  10, 19, 28, 5,  14, 23};
+  // Left shift for the newly read block when a group straddles two blocks.
+  static const uint8_t kLoShift[kGroups] = {0, 0, 0, 5, 0, 0, 0, 1, 0, 0, 6, 0, 0, 0, 2, 0,
+                                            0, 7, 0, 0, 0, 3, 0, 0, 8, 0, 0, 0, 4, 0, 0, 0};
+  // 1 where the group has to pull in the next block.
+  static const uint8_t kReadNext[kGroups] = {1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
+                                             0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
+  const uint32_t lowMask = (1u << kBits) - 1; // 0x1FF
+
+  // Without both an exception list and its bitmap there is nothing to patch.
+  if (pPEX == NULL || pBB == NULL) {
+      bitunblk256v32_scalar_template(pIn, pOut, kGroups, kHiShift, kLoShift, kReadNext, lowMask,
+                                     kBits, isZigZag);
+      return;
+  }
+  bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kGroups, kHiShift, kLoShift,
+                                         kReadNext, lowMask, kBits, isZigZag);
+}
+// Scalar unpack of one 256-value block stored at 10 bits per value:
+// 2 passes, each producing 16 groups of 8 values.
+static void bitunpack256v32_10_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  enum { kBits = 10, kGroups = 16, kPasses = 2 };
+  // Bit offset inside the current block at which each group starts.
+  static const uint8_t kHiShift[kGroups] = {0,  10, 20, 30, 8,  18, 28, 6,
+                                            16, 26, 4,  14, 24, 2,  12, 22};
+  // Left shift for the newly read block when a group straddles two blocks.
+  static const uint8_t kLoShift[kGroups] = {0, 0, 0, 2, 0, 0, 4, 0, 0, 6, 0, 0, 8, 0, 0, 0};
+  // 1 where the group has to pull in the next block.
+  static const uint8_t kReadNext[kGroups] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0};
+  const uint32_t lowMask = (1u << kBits) - 1; // 0x3FF
+  // Exceptions are patched only when both the list and its bitmap are supplied.
+  const bool patch = (pPEX != NULL && pBB != NULL);
+
+  for (int pass = 0; pass < kPasses; ++pass) {
+      if (patch) {
+          bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kGroups, kHiShift,
+                                                 kLoShift, kReadNext, lowMask, kBits, isZigZag);
+      } else {
+          bitunblk256v32_scalar_template(pIn, pOut, kGroups, kHiShift, kLoShift, kReadNext,
+                                         lowMask, kBits, isZigZag);
+      }
+  }
+}
+
+// Scalar unpack of one 256-value block stored at 11 bits per value:
+// a single template call producing 32 groups of 8 values.
+static void bitunpack256v32_11_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  enum { kBits = 11, kGroups = 32 };
+  // Bit offset inside the current block at which each group starts.
+  static const uint8_t kHiShift[kGroups] = {0,  11, 22, 1,  12, 23, 2,  13, 24, 3,  14,
+                                            25, 4,  15, 26, 5,  16, 27, 6,  17, 28, 7,
+                                            18, 29, 8,  19, 30, 9,  20, 31, 10, 21};
+  // Left shift for the newly read block when a group straddles two blocks.
+  static const uint8_t kLoShift[kGroups] = {0, 0, 10, 0, 0, 9, 0, 0, 8, 0, 0, 7, 0, 0, 6, 0,
+                                            0, 5, 0,  0, 4, 0, 0, 3, 0, 0, 2, 0, 0, 1, 0, 0};
+  // 1 where the group has to pull in the next block.
+  static const uint8_t kReadNext[kGroups] = {1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
+                                             0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
+  const uint32_t lowMask = (1u << kBits) - 1; // 0x7FF
+
+  // Without both an exception list and its bitmap there is nothing to patch.
+  if (pPEX == NULL || pBB == NULL) {
+      bitunblk256v32_scalar_template(pIn, pOut, kGroups, kHiShift, kLoShift, kReadNext, lowMask,
+                                     kBits, isZigZag);
+      return;
+  }
+  bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kGroups, kHiShift, kLoShift,
+                                         kReadNext, lowMask, kBits, isZigZag);
+}
+
+// Scalar unpack of one 256-value block stored at 12 bits per value:
+// 4 passes, each producing 8 groups of 8 values.
+static void bitunpack256v32_12_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  enum { kBits = 12, kGroups = 8, kPasses = 4 };
+  // Bit offset inside the current block at which each group starts.
+  static const uint8_t kHiShift[kGroups] = {0, 12, 24, 4, 16, 28, 8, 20};
+  // Left shift for the newly read block when a group straddles two blocks.
+  static const uint8_t kLoShift[kGroups] = {0, 0, 8, 0, 0, 4, 0, 0};
+  // 1 where the group has to pull in the next block.
+  static const uint8_t kReadNext[kGroups] = {1, 0, 1, 0, 0, 1, 0, 0};
+  const uint32_t lowMask = (1u << kBits) - 1; // 0xFFF
+  // Exceptions are patched only when both the list and its bitmap are supplied.
+  const bool patch = (pPEX != NULL && pBB != NULL);
+
+  for (int pass = 0; pass < kPasses; ++pass) {
+      if (patch) {
+          bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kGroups, kHiShift,
+                                                 kLoShift, kReadNext, lowMask, kBits, isZigZag);
+      } else {
+          bitunblk256v32_scalar_template(pIn, pOut, kGroups, kHiShift, kLoShift, kReadNext,
+                                         lowMask, kBits, isZigZag);
+      }
+  }
+}
+
+// Scalar unpack of one 256-value block stored at 13 bits per value:
+// a single template call producing 32 groups of 8 values.
+static void bitunpack256v32_13_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  enum { kBits = 13, kGroups = 32 };
+  // Bit offset inside the current block at which each group starts.
+  static const uint8_t kHiShift[kGroups] = {0,  13, 26, 7,  20, 1,  14, 27, 8,  21, 2,
+                                            15, 28, 9,  22, 3,  16, 29, 10, 23, 4,  17,
+                                            30, 11, 24, 5,  18, 31, 12, 25, 6,  19};
+  // Left shift for the newly read block when a group straddles two blocks.
+  static const uint8_t kLoShift[kGroups] = {0, 0, 6, 0, 12, 0, 0, 5, 0, 11, 0, 0, 4, 0, 10, 0,
+                                            0, 3, 0, 9, 0,  0, 2, 0, 8, 0,  0, 1, 0, 7, 0,  0};
+  // 1 where the group has to pull in the next block.
+  static const uint8_t kReadNext[kGroups] = {1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
+                                             0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0};
+  const uint32_t lowMask = (1u << kBits) - 1; // 0x1FFF
+
+  // Without both an exception list and its bitmap there is nothing to patch.
+  if (pPEX == NULL || pBB == NULL) {
+      bitunblk256v32_scalar_template(pIn, pOut, kGroups, kHiShift, kLoShift, kReadNext, lowMask,
+                                     kBits, isZigZag);
+      return;
+  }
+  bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kGroups, kHiShift, kLoShift,
+                                         kReadNext, lowMask, kBits, isZigZag);
+}
+
+// Scalar unpack of one 256-value block stored at 14 bits per value:
+// 2 passes, each producing 16 groups of 8 values.
+static void bitunpack256v32_14_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  enum { kBits = 14, kGroups = 16, kPasses = 2 };
+  // Bit offset inside the current block at which each group starts.
+  static const uint8_t kHiShift[kGroups] = {0,  14, 28, 10, 24, 6,  20, 2,
+                                            16, 30, 12, 26, 8,  22, 4,  18};
+  // Left shift for the newly read block when a group straddles two blocks.
+  static const uint8_t kLoShift[kGroups] = {0, 0, 4, 0, 8, 0, 12, 0, 0, 2, 0, 6, 0, 10, 0, 0};
+  // 1 where the group has to pull in the next block.
+  static const uint8_t kReadNext[kGroups] = {1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0};
+  const uint32_t lowMask = (1u << kBits) - 1; // 0x3FFF
+  // Exceptions are patched only when both the list and its bitmap are supplied.
+  const bool patch = (pPEX != NULL && pBB != NULL);
+
+  for (int pass = 0; pass < kPasses; ++pass) {
+      if (patch) {
+          bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kGroups, kHiShift,
+                                                 kLoShift, kReadNext, lowMask, kBits, isZigZag);
+      } else {
+          bitunblk256v32_scalar_template(pIn, pOut, kGroups, kHiShift, kLoShift, kReadNext,
+                                         lowMask, kBits, isZigZag);
+      }
+  }
+}
+
+// Scalar unpack of one 256-value block stored at 15 bits per value:
+// a single template call producing 32 groups of 8 values.
+static void bitunpack256v32_15_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  enum { kBits = 15, kGroups = 32 };
+  // Bit offset inside the current block at which each group starts.
+  static const uint8_t kHiShift[kGroups] = {0,  15, 30, 13, 28, 11, 26, 9,  24, 7,  22,
+                                            5,  20, 3,  18, 1,  16, 31, 14, 29, 12, 27,
+                                            10, 25, 8,  23, 6,  21, 4,  19, 2,  17};
+  // Left shift for the newly read block when a group straddles two blocks.
+  static const uint8_t kLoShift[kGroups] = {0, 0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0,  12, 0,  14, 0,
+                                            0, 1, 0, 3, 0, 5, 0, 7, 0, 9, 0,  11, 0,  13, 0,  0};
+  // 1 where the group has to pull in the next block.
+  static const uint8_t kReadNext[kGroups] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+                                             0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0};
+  const uint32_t lowMask = (1u << kBits) - 1; // 0x7FFF
+
+  // Without both an exception list and its bitmap there is nothing to patch.
+  if (pPEX == NULL || pBB == NULL) {
+      bitunblk256v32_scalar_template(pIn, pOut, kGroups, kHiShift, kLoShift, kReadNext, lowMask,
+                                     kBits, isZigZag);
+      return;
+  }
+  bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kGroups, kHiShift, kLoShift,
+                                         kReadNext, lowMask, kBits, isZigZag);
+}
+
+// Scalar unpack of one 256-value block stored at 16 bits per value:
+// 16 passes, each producing 2 groups of 8 values.
+static void bitunpack256v32_16_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  enum { kBits = 16, kGroups = 2, kPasses = 16 };
+  // Group 0 takes the low half of each word, group 1 the high half of the same word.
+  static const uint8_t kHiShift[kGroups] = {0, 16};
+  // No group straddles two blocks, so every left shift is zero.
+  static const uint8_t kLoShift[kGroups] = {0, 0};
+  // Only group 0 reads a block.
+  static const uint8_t kReadNext[kGroups] = {1, 0};
+  const uint32_t lowMask = (1u << kBits) - 1; // 0xFFFF
+  // Exceptions are patched only when both the list and its bitmap are supplied.
+  const bool patch = (pPEX != NULL && pBB != NULL);
+
+  for (int pass = 0; pass < kPasses; ++pass) {
+      if (patch) {
+          bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kGroups, kHiShift,
+                                                 kLoShift, kReadNext, lowMask, kBits, isZigZag);
+      } else {
+          bitunblk256v32_scalar_template(pIn, pOut, kGroups, kHiShift, kLoShift, kReadNext,
+                                         lowMask, kBits, isZigZag);
+      }
+  }
+}
+
+// Scalar unpack of one 256-value block stored at 17 bits per value:
+// a single template call producing 32 groups of 8 values.
+static void bitunpack256v32_17_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  enum { kBits = 17, kGroups = 32 };
+  // Bit offset inside the current block at which each group starts.
+  static const uint8_t kHiShift[kGroups] = {0,  17, 2,  19, 4,  21, 6,  23, 8,  25, 10,
+                                            27, 12, 29, 14, 31, 16, 1,  18, 3,  20, 5,
+                                            22, 7,  24, 9,  26, 11, 28, 13, 30, 15};
+  // Left shift for the newly read block when a group straddles two blocks.
+  static const uint8_t kLoShift[kGroups] = {0,  15, 0,  13, 0,  11, 0,  9, 0, 7, 0, 5, 0, 3, 0, 1,
+                                            16, 0,  14, 0,  12, 0,  10, 0, 8, 0, 6, 0, 4, 0, 2, 0};
+  // 1 where the group has to pull in the next block.
+  static const uint8_t kReadNext[kGroups] = {1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+                                             1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
+  const uint32_t lowMask = (1u << kBits) - 1; // 0x1FFFF
+
+  // Without both an exception list and its bitmap there is nothing to patch.
+  if (pPEX == NULL || pBB == NULL) {
+      bitunblk256v32_scalar_template(pIn, pOut, kGroups, kHiShift, kLoShift, kReadNext, lowMask,
+                                     kBits, isZigZag);
+      return;
+  }
+  bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kGroups, kHiShift, kLoShift,
+                                         kReadNext, lowMask, kBits, isZigZag);
+}
+
+// Scalar unpack of one 256-value block stored at 18 bits per value:
+// 2 passes, each producing 16 groups of 8 values (128 values per pass).
+static void bitunpack256v32_18_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  enum { kBits = 18, kGroups = 16, kPasses = 2 };
+  // Bit offset inside the current block at which each group starts.
+  static const uint8_t kHiShift[kGroups] = {0,  18, 4,  22, 8,  26, 12, 30,
+                                            16, 2,  20, 6,  24, 10, 28, 14};
+  // Left shift for the newly read block when a group straddles two blocks.
+  static const uint8_t kLoShift[kGroups] = {0, 14, 0, 10, 0, 6, 0, 2, 16, 0, 12, 0, 8, 0, 4, 0};
+  // 1 where the group has to pull in the next block.
+  static const uint8_t kReadNext[kGroups] = {1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
+  const uint32_t lowMask = (1u << kBits) - 1; // 0x3FFFF
+  // Exceptions are patched only when both the list and its bitmap are supplied.
+  const bool patch = (pPEX != NULL && pBB != NULL);
+
+  for (int pass = 0; pass < kPasses; ++pass) {
+      if (patch) {
+          bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, kGroups, kHiShift,
+                                                 kLoShift, kReadNext, lowMask, kBits, isZigZag);
+      } else {
+          bitunblk256v32_scalar_template(pIn, pOut, kGroups, kHiShift, kLoShift, kReadNext,
+                                         lowMask, kBits, isZigZag);
+      }
+  }
+}
+
+// Scalar unpack of one 256-value block stored at 19 bits per value.
+// A single 32-step template call covers the whole block. Both pPEX and pBB
+// non-NULL selects the exception-aware template; isZigZag is forwarded unchanged.
+static void bitunpack256v32_19_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  const int width = 19;
+  const uint32_t valueMask = (1u << 19) - 1; // 0x7FFFF
+  const int steps = 32;                      // 32 steps => all 256 values in one call
+
+  // Per-step parameters consumed by the templates (index = step number).
+  static const uint8_t hiShift[32] = {0,  19, 6,  25, 12, 31, 18, 5,
+                                      24, 11, 30, 17, 4,  23, 10, 29,
+                                      16, 3,  22, 9,  28, 15, 2,  21,
+                                      8,  27, 14, 1,  20, 7,  26, 13};
+  static const uint8_t loShift[32] = {0,  13, 0,  7,  0,  1,  14, 0,
+                                      8,  0,  2,  15, 0,  9,  0,  3,
+                                      16, 0,  10, 0,  4,  17, 0,  11,
+                                      0,  5,  18, 0,  12, 0,  6,  0};
+  static const uint8_t readNew[32] = {1, 1, 0, 1, 0, 1, 1, 0,
+                                      1, 0, 1, 1, 0, 1, 0, 1,
+                                      1, 0, 1, 0, 1, 1, 0, 1,
+                                      0, 1, 1, 0, 1, 0, 1, 0};
+
+  if (pPEX == NULL || pBB == NULL) {
+      bitunblk256v32_scalar_template(pIn, pOut, steps, hiShift, loShift, readNew, valueMask,
+                                     width, isZigZag);
+      return;
+  }
+  bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, steps, hiShift, loShift,
+                                         readNew, valueMask, width, isZigZag);
+}
+
+// Scalar unpack of one 256-value block stored at 20 bits per value.
+// The 8-step shift pattern yields 64 values per template call, so the template
+// is invoked four times. Both pPEX and pBB non-NULL selects the
+// exception-aware template; isZigZag is forwarded unchanged.
+static void bitunpack256v32_20_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  const int width = 20;
+  const uint32_t valueMask = (1u << 20) - 1; // 0xFFFFF
+  const int steps = 8;                       // 8 steps => 64 values per call
+
+  // Per-step parameters consumed by the templates (index = step number).
+  static const uint8_t hiShift[8] = {0, 20, 8, 28, 16, 4, 24, 12};
+  static const uint8_t loShift[8] = {0, 12, 0, 4, 16, 0, 8, 0};
+  static const uint8_t readNew[8] = {1, 1, 0, 1, 1, 0, 1, 0};
+
+  const bool withExceptions = (pPEX != NULL) && (pBB != NULL);
+  for (int pass = 0; pass < 4; ++pass) {
+      if (withExceptions) {
+          bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, steps, hiShift, loShift,
+                                                 readNew, valueMask, width, isZigZag);
+      } else {
+          bitunblk256v32_scalar_template(pIn, pOut, steps, hiShift, loShift, readNew,
+                                         valueMask, width, isZigZag);
+      }
+  }
+}
+
+// Scalar unpack of one 256-value block stored at 21 bits per value.
+// A single 32-step template call covers the whole block. Both pPEX and pBB
+// non-NULL selects the exception-aware template; isZigZag is forwarded unchanged.
+static void bitunpack256v32_21_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  const int width = 21;
+  const uint32_t valueMask = (1u << 21) - 1; // 0x1FFFFF
+  const int steps = 32;                      // 32 steps => all 256 values in one call
+
+  // Per-step parameters consumed by the templates (index = step number).
+  static const uint8_t hiShift[32] = {0,  21, 10, 31, 20, 9,  30, 19,
+                                      8,  29, 18, 7,  28, 17, 6,  27,
+                                      16, 5,  26, 15, 4,  25, 14, 3,
+                                      24, 13, 2,  23, 12, 1,  22, 11};
+  static const uint8_t loShift[32] = {0,  11, 0,  1,  12, 0,  2,  13,
+                                      0,  3,  14, 0,  4,  15, 0,  5,
+                                      16, 0,  6,  17, 0,  7,  18, 0,
+                                      8,  19, 0,  9,  20, 0,  10, 0};
+  // 1 where the step loads new input, 0 where it does not.
+  static const uint8_t readNew[32] = {1, 1, 0, 1, 1, 0, 1, 1,
+                                      0, 1, 1, 0, 1, 1, 0, 1,
+                                      1, 0, 1, 1, 0, 1, 1, 0,
+                                      1, 1, 0, 1, 1, 0, 1, 0};
+
+  if (pPEX == NULL || pBB == NULL) {
+      bitunblk256v32_scalar_template(pIn, pOut, steps, hiShift, loShift, readNew, valueMask,
+                                     width, isZigZag);
+      return;
+  }
+  bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, steps, hiShift, loShift,
+                                         readNew, valueMask, width, isZigZag);
+}
+
+// Scalar unpack of one 256-value block stored at 22 bits per value.
+// The 16-step shift pattern yields 128 values per template call, so the
+// template is invoked twice. Both pPEX and pBB non-NULL selects the
+// exception-aware template; isZigZag is forwarded unchanged.
+static void bitunpack256v32_22_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  const int width = 22;
+  const uint32_t valueMask = (1u << 22) - 1; // 0x3FFFFF
+  const int steps = 16;                      // 16 steps => 128 values per call
+
+  // Per-step parameters consumed by the templates (index = step number).
+  static const uint8_t hiShift[16] = {0, 22, 12, 2, 24, 14, 4, 26, 16, 6, 28, 18, 8, 30, 20, 10};
+  static const uint8_t loShift[16] = {0, 10, 20, 0, 8, 18, 0, 6, 16, 0, 4, 14, 0, 2, 12, 0};
+  // Steps 3, 6, 9, 12 and 15 do not load new input; all others do.
+  static const uint8_t readNew[16] = {1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0};
+
+  const bool withExceptions = (pPEX != NULL) && (pBB != NULL);
+  for (int pass = 0; pass < 2; ++pass) {
+      if (withExceptions) {
+          bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, steps, hiShift, loShift,
+                                                 readNew, valueMask, width, isZigZag);
+      } else {
+          bitunblk256v32_scalar_template(pIn, pOut, steps, hiShift, loShift, readNew,
+                                         valueMask, width, isZigZag);
+      }
+  }
+}
+
+// Scalar unpack of one 256-value block stored at 23 bits per value.
+// A single 32-step template call covers the whole block. Both pPEX and pBB
+// non-NULL selects the exception-aware template; isZigZag is forwarded unchanged.
+static void bitunpack256v32_23_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  const int width = 23;
+  const uint32_t valueMask = (1u << 23) - 1; // 0x7FFFFF
+  const int steps = 32;                      // 32 steps => all 256 values in one call
+
+  // Per-step parameters consumed by the templates (index = step number).
+  static const uint8_t hiShift[32] = {0,  23, 14, 5,  28, 19, 10, 1,
+                                      24, 15, 6,  29, 20, 11, 2,  25,
+                                      16, 7,  30, 21, 12, 3,  26, 17,
+                                      8,  31, 22, 13, 4,  27, 18, 9};
+  static const uint8_t loShift[32] = {0,  9,  18, 0,  4,  13, 22, 0,
+                                      8,  17, 0,  3,  12, 21, 0,  7,
+                                      16, 0,  2,  11, 20, 0,  6,  15,
+                                      0,  1,  10, 19, 0,  5,  14, 0};
+  static const uint8_t readNew[32] = {1, 1, 1, 0, 1, 1, 1, 0,
+                                      1, 1, 0, 1, 1, 1, 0, 1,
+                                      1, 0, 1, 1, 1, 0, 1, 1,
+                                      0, 1, 1, 1, 0, 1, 1, 0};
+
+  if (pPEX == NULL || pBB == NULL) {
+      bitunblk256v32_scalar_template(pIn, pOut, steps, hiShift, loShift, readNew, valueMask,
+                                     width, isZigZag);
+      return;
+  }
+  bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, steps, hiShift, loShift,
+                                         readNew, valueMask, width, isZigZag);
+}
+// Scalar unpack of one 256-value block stored at 24 bits per value.
+// The 4-step shift pattern yields 32 values per template call, so the template
+// is invoked eight times. Both pPEX and pBB non-NULL selects the
+// exception-aware template; isZigZag is forwarded unchanged.
+static void bitunpack256v32_24_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  const int width = 24;
+  const uint32_t valueMask = (1u << 24) - 1; // 0xFFFFFF
+  const int steps = 4;                       // 4 steps => 32 values per call
+
+  // Step k combines (leftover >> hiShift[k]) with (new << loShift[k]):
+  //   k=0: leftover>>0,  new<<0
+  //   k=1: leftover>>24, new<<8
+  //   k=2: leftover>>16, new<<16
+  //   k=3: leftover>>8,  nothing new is read
+  static const uint8_t hiShift[4] = {0, 24, 16, 8};
+  static const uint8_t loShift[4] = {0, 8, 16, 0};
+  static const uint8_t readNew[4] = {1, 1, 1, 0};
+
+  const bool withExceptions = (pPEX != NULL) && (pBB != NULL);
+  for (int pass = 0; pass < 8; ++pass) {
+      if (withExceptions) {
+          bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, steps, hiShift, loShift,
+                                                 readNew, valueMask, width, isZigZag);
+      } else {
+          bitunblk256v32_scalar_template(pIn, pOut, steps, hiShift, loShift, readNew,
+                                         valueMask, width, isZigZag);
+      }
+  }
+}
+
+// Scalar unpack of one 256-value block stored at 25 bits per value.
+// A single 32-step template call covers the whole block (32 steps of 8 values).
+// Both pPEX and pBB non-NULL selects the exception-aware template; isZigZag is
+// forwarded unchanged.
+static void bitunpack256v32_25_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  const uint32_t mask25 = (1u << 25) - 1; // 0x1FFFFFF
+  const int nb25 = 25;
+
+  // 32 steps => all 256 values in one template call
+  const int expansions_count_25 = 32;
+
+  // Per-step parameters consumed by the templates (index = step number).
+  static const uint8_t SHIFT_HI_25[32] = {0,  25, 18, 11, 4,  29, 22, 15,
+                                          8,  1,  26, 19, 12, 5,  30, 23,
+                                          16, 9,  2,  27, 20, 13, 6,  31,
+                                          24, 17, 10, 3,  28, 21, 14, 7};
+  static const uint8_t SHIFT_LO_25[32] = {0,  7,  14, 21, 0,  3,  10, 17,
+                                          24, 0,  6,  13, 20, 0,  2,  9,
+                                          16, 23, 0,  5,  12, 19, 0,  1,
+                                          8,  15, 22, 0,  4,  11, 18, 0};
+  // Steps 4, 9, 13, 18, 22, 27 and 31 do not read new data; all others do.
+  static const uint8_t READ_FLAG_25[32] = {1, 1, 1, 1, 0, 1, 1, 1,
+                                           1, 0, 1, 1, 1, 0, 1, 1,
+                                           1, 1, 0, 1, 1, 1, 0, 1,
+                                           1, 1, 1, 0, 1, 1, 1, 0};
+
+  if (pPEX != NULL && pBB != NULL) {
+      // Exactly one call: the previous code looped twice here, which decoded
+      // 512 values and ran past the input, output and exception streams.
+      bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count_25,
+                                             SHIFT_HI_25, SHIFT_LO_25, READ_FLAG_25, mask25,
+                                             nb25, isZigZag);
+  } else {
+      bitunblk256v32_scalar_template(pIn, pOut, expansions_count_25, SHIFT_HI_25, SHIFT_LO_25,
+                                     READ_FLAG_25, mask25, nb25, isZigZag);
+  }
+}
+
+// Scalar unpack of one 256-value block stored at 26 bits per value.
+// The 16-step shift pattern yields 128 values per template call, so the
+// template is invoked twice. Both pPEX and pBB non-NULL selects the
+// exception-aware template; isZigZag is forwarded unchanged.
+static void bitunpack256v32_26_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  const int width = 26;
+  const uint32_t valueMask = (1u << 26) - 1; // 0x3FFFFFF
+  const int steps = 16;                      // 16 steps => 128 values per call
+
+  // Per-step parameters consumed by the templates (index = step number).
+  static const uint8_t hiShift[16] = {0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6};
+  static const uint8_t loShift[16] = {0, 6, 12, 18, 24, 0, 4, 10, 16, 22, 0, 2, 8, 14, 20, 0};
+  // Steps 5, 10 and 15 do not read new data; all others do.
+  static const uint8_t readNew[16] = {1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0};
+
+  const bool withExceptions = (pPEX != NULL) && (pBB != NULL);
+  for (int pass = 0; pass < 2; ++pass) {
+      if (withExceptions) {
+          bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, steps, hiShift, loShift,
+                                                 readNew, valueMask, width, isZigZag);
+      } else {
+          bitunblk256v32_scalar_template(pIn, pOut, steps, hiShift, loShift, readNew,
+                                         valueMask, width, isZigZag);
+      }
+  }
+}
+
+// Scalar unpack of one 256-value block stored at 27 bits per value.
+// A single 32-step template call covers the whole block. Both pPEX and pBB
+// non-NULL selects the exception-aware template; isZigZag is forwarded unchanged.
+static void bitunpack256v32_27_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  const int width = 27;
+  const uint32_t valueMask = (1u << 27) - 1; // 0x7FFFFFF
+  const int steps = 32;                      // 32 steps => all 256 values in one call
+
+  // Per-step parameters consumed by the templates (index = step number).
+  static const uint8_t hiShift[32] = {0,  27, 22, 17, 12, 7,  2,  29,
+                                      24, 19, 14, 9,  4,  31, 26, 21,
+                                      16, 11, 6,  1,  28, 23, 18, 13,
+                                      8,  3,  30, 25, 20, 15, 10, 5};
+  static const uint8_t loShift[32] = {0,  5,  10, 15, 20, 25, 0,  3,
+                                      8,  13, 18, 23, 0,  1,  6,  11,
+                                      16, 21, 26, 0,  4,  9,  14, 19,
+                                      24, 0,  2,  7,  12, 17, 22, 0};
+  // Steps 6, 12, 19, 25 and 31 do not read new data; all others do.
+  static const uint8_t readNew[32] = {1, 1, 1, 1, 1, 1, 0, 1,
+                                      1, 1, 1, 1, 0, 1, 1, 1,
+                                      1, 1, 1, 0, 1, 1, 1, 1,
+                                      1, 0, 1, 1, 1, 1, 1, 0};
+
+  if (pPEX == NULL || pBB == NULL) {
+      bitunblk256v32_scalar_template(pIn, pOut, steps, hiShift, loShift, readNew, valueMask,
+                                     width, isZigZag);
+      return;
+  }
+  bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, steps, hiShift, loShift,
+                                         readNew, valueMask, width, isZigZag);
+}
+// Scalar unpack of one 256-value block stored at 28 bits per value.
+// The 8-step shift pattern yields 64 values per template call, so the template
+// is invoked four times. pPEX and pBB are optional: when both are non-NULL the
+// exception-aware template is used. isZigZag is forwarded unchanged.
+static void bitunpack256v32_28_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  const int width = 28;
+  const uint32_t valueMask = (1u << 28) - 1; // 0xFFFFFFF
+  const int steps = 8;                       // 8 steps => 64 values per call
+
+  // Per-step parameters consumed by the templates (index = step number).
+  static const uint8_t hiShift[8] = {0, 28, 24, 20, 16, 12, 8, 4};
+  static const uint8_t loShift[8] = {0, 4, 8, 12, 16, 20, 24, 0};
+  static const uint8_t readNew[8] = {1, 1, 1, 1, 1, 1, 1, 0};
+
+  const bool withExceptions = (pPEX != NULL) && (pBB != NULL);
+  for (int pass = 0; pass < 4; ++pass) {
+      if (withExceptions) {
+          bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, steps, hiShift, loShift,
+                                                 readNew, valueMask, width, isZigZag);
+      } else {
+          bitunblk256v32_scalar_template(pIn, pOut, steps, hiShift, loShift, readNew,
+                                         valueMask, width, isZigZag);
+      }
+  }
+}
+
+// Scalar unpack of one 256-value block stored at 29 bits per value.
+// A single 32-step template call covers the whole block. Both pPEX and pBB
+// non-NULL selects the exception-aware template; isZigZag is forwarded unchanged.
+static void bitunpack256v32_29_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  const uint32_t valueMask = (1U << 29) - 1; // 0x1FFFFFFF
+  const int steps = 32;                      // 32 steps => all 256 values in one call
+
+  // Per-step parameters consumed by the templates (index = step number).
+  static const uint8_t hiShift[32] = {0,  29, 26, 23, 20, 17, 14, 11,
+                                      8,  5,  2,  31, 28, 25, 22, 19,
+                                      16, 13, 10, 7,  4,  1,  30, 27,
+                                      24, 21, 18, 15, 12, 9,  6,  3};
+  static const uint8_t loShift[32] = {0,  3,  6,  9,  12, 15, 18, 21,
+                                      24, 27, 0,  1,  4,  7,  10, 13,
+                                      16, 19, 22, 25, 28, 0,  2,  5,
+                                      8,  11, 14, 17, 20, 23, 26, 0};
+  static const uint8_t readNew[32] = {1, 1, 1, 1, 1, 1, 1, 1,
+                                      1, 1, 0, 1, 1, 1, 1, 1,
+                                      1, 1, 1, 1, 1, 0, 1, 1,
+                                      1, 1, 1, 1, 1, 1, 1, 0};
+
+  if (pPEX == NULL || pBB == NULL) {
+      bitunblk256v32_scalar_template(pIn, pOut, steps, hiShift, loShift, readNew, valueMask, 29,
+                                     isZigZag);
+      return;
+  }
+  bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, steps, hiShift, loShift,
+                                         readNew, valueMask, 29, isZigZag);
+}
+
+// Scalar unpack of one 256-value block stored at 30 bits per value.
+// The 16-step shift pattern is run twice through the block template. Both pPEX
+// and pBB non-NULL selects the exception-aware template; isZigZag is forwarded
+// unchanged.
+static void bitunpack256v32_30_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  const uint32_t valueMask = (1U << 30) - 1; // 0x3FFFFFFF
+  const int steps = 16;
+
+  // Per-step parameters consumed by the templates (index = step number).
+  static const uint8_t hiShift[16] = {0, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2};
+  static const uint8_t loShift[16] = {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 0};
+  static const uint8_t readNew[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0};
+
+  const bool withExceptions = (pPEX != NULL) && (pBB != NULL);
+  for (int pass = 0; pass < 2; ++pass) {
+      if (withExceptions) {
+          bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, steps, hiShift, loShift,
+                                                 readNew, valueMask, 30, isZigZag);
+      } else {
+          bitunblk256v32_scalar_template(pIn, pOut, steps, hiShift, loShift, readNew,
+                                         valueMask, 30, isZigZag);
+      }
+  }
+}
+
+// Scalar unpack of one 256-value block stored at 31 bits per value.
+// A single 32-step template call covers the whole block. Both pPEX and pBB
+// non-NULL selects the exception-aware template; isZigZag is forwarded unchanged.
+static void bitunpack256v32_31_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  const uint32_t mask31 = (1U << 31) - 1; // 0x7FFFFFFF
+  const int expansions_count = 32;
+
+  // Per-step parameters (index k = step number):
+  //   k == 0     : SHIFT_HI = 0,      SHIFT_LO = 0, READ_FLAG = 1
+  //   k = 1 .. 30: SHIFT_HI = 32 - k, SHIFT_LO = k, READ_FLAG = 1
+  //   k == 31    : SHIFT_HI = 1,      SHIFT_LO = 0, READ_FLAG = 0
+  // Each table holds exactly 32 entries. The earlier SHIFT_LO and READ_FLAG
+  // initializers listed 33 values (a stray 31 and an extra 1), which overflowed
+  // the declared size and left the final step flagged as reading new input.
+  static const uint8_t SHIFT_HI[32] = {0,  31, 30, 29, 28, 27, 26, 25,
+                                       24, 23, 22, 21, 20, 19, 18, 17,
+                                       16, 15, 14, 13, 12, 11, 10, 9,
+                                       8,  7,  6,  5,  4,  3,  2,  1};
+  static const uint8_t SHIFT_LO[32] = {0,  1,  2,  3,  4,  5,  6,  7,
+                                       8,  9,  10, 11, 12, 13, 14, 15,
+                                       16, 17, 18, 19, 20, 21, 22, 23,
+                                       24, 25, 26, 27, 28, 29, 30, 0};
+  static const uint8_t READ_FLAG[32] = {1, 1, 1, 1, 1, 1, 1, 1,
+                                        1, 1, 1, 1, 1, 1, 1, 1,
+                                        1, 1, 1, 1, 1, 1, 1, 1,
+                                        1, 1, 1, 1, 1, 1, 1, 0};
+
+  if (pPEX != NULL && pBB != NULL) {
+      bitunblk256v32_scalarBlock_ex_template(pIn, pOut, pPEX, pBB, expansions_count, SHIFT_HI,
+                                             SHIFT_LO, READ_FLAG, mask31, 31, isZigZag);
+  } else {
+      bitunblk256v32_scalar_template(pIn, pOut, expansions_count, SHIFT_HI, SHIFT_LO, READ_FLAG,
+                                     mask31, 31, isZigZag);
+  }
+}
+
+// Scalar "unpack" of one 256-value block stored at 32 bits per value: every
+// input word already is a value, so the block is copied in 32 groups of 8.
+// pPEX and pBB are optional; when both are non-NULL one bitmap byte is consumed
+// from *pBB per group and, if it is non-zero, applyException_8bits patches that
+// group. When isZigZag is set each value of the group is zigzag-decoded after
+// patching. *pIn and *pOut are advanced past the block on return.
+static void bitunpack256v32_32_scalar(uint32_t** pIn, uint32_t** pOut, uint32_t** pPEX,
+                                      unsigned char** pBB, bool isZigZag) {
+  const int width = 32;
+  const bool withExceptions = (pPEX != NULL) && (pBB != NULL);
+
+  uint32_t* src = *pIn;
+  uint32_t* dst = *pOut;
+  uint32_t* const srcEnd = src + 32 * 8; // 32 groups of 8 values
+
+  while (src != srcEnd) {
+      // Straight copy of the group.
+      for (int lane = 0; lane < 8; lane++) {
+          dst[lane] = src[lane];
+      }
+      src += 8;
+
+      if (withExceptions) {
+          const uint8_t groupMask = **pBB;
+          (*pBB)++;
+          if (groupMask != 0) {
+              applyException_8bits(groupMask, pPEX, width, dst);
+          }
+      }
+
+      if (isZigZag) {
+          for (int lane = 0; lane < 8; lane++) {
+              dst[lane] = zigzagDecode_scalar(dst[lane]);
+          }
+      }
+      dst += 8;
+  }
+
+  *pIn = src;
+  *pOut = dst;
+}
+
+// Signature shared by the per-width unpackers: (input cursor, output cursor, exception-value cursor, exception-bitmap cursor, zigzag flag)
+typedef void (*unpack_func_t)(uint32_t**, uint32_t**, unsigned**, unsigned 
char**, bool);
+
+// Dispatch table indexed by bit width: entry b (0..32) is bitunpack256v32_<b>_scalar
+static unpack_func_t unpack_funcs[33] = {
+        bitunpack256v32_0_scalar,  bitunpack256v32_1_scalar,  
bitunpack256v32_2_scalar,
+        bitunpack256v32_3_scalar,  bitunpack256v32_4_scalar,  
bitunpack256v32_5_scalar,
+        bitunpack256v32_6_scalar,  bitunpack256v32_7_scalar,  
bitunpack256v32_8_scalar,
+        bitunpack256v32_9_scalar,  bitunpack256v32_10_scalar, 
bitunpack256v32_11_scalar,
+        bitunpack256v32_12_scalar, bitunpack256v32_13_scalar, 
bitunpack256v32_14_scalar,
+        bitunpack256v32_15_scalar, bitunpack256v32_16_scalar, 
bitunpack256v32_17_scalar,
+        bitunpack256v32_18_scalar, bitunpack256v32_19_scalar, 
bitunpack256v32_20_scalar,
+        bitunpack256v32_21_scalar, bitunpack256v32_22_scalar, 
bitunpack256v32_23_scalar,
+        bitunpack256v32_24_scalar, bitunpack256v32_25_scalar, 
bitunpack256v32_26_scalar,
+        bitunpack256v32_27_scalar, bitunpack256v32_28_scalar, 
bitunpack256v32_29_scalar,
+        bitunpack256v32_30_scalar, bitunpack256v32_31_scalar, 
bitunpack256v32_32_scalar};
+/**
+ * Scalar unpack of one block of bit-packed 32-bit values, optionally
+ * zigzag-decoding each value. No exception patching is done: the per-width
+ * routine is called with NULL exception cursors.
+ *
+ * @param in       Packed input for the block
+ * @param n        Not used by this function
+ * @param out      Output buffer for the unpacked values (must hold at least
+ *                 256 32-bit integers)
+ * @param b        Bit width; its low 6 bits select the per-width routine, and
+ *                 nothing is unpacked when that value is greater than 32
+ * @param isZigZag Forwarded to the per-width routine to request zigzag decoding
+ * @return         in + PAD8(256 * b); depends on b only, not on what was unpacked
+ */
+unsigned char* bitunpack256scalarv32_withzigzag(const unsigned char* __restrict in, unsigned n,
+                                                unsigned* __restrict out, unsigned b,
+                                                bool isZigZag) {
+  // Cursors passed by address to the per-width routine.
+  uint32_t* src = (uint32_t*)in;
+  uint32_t* dst = (uint32_t*)out;
+
+  const unsigned width = b & 0x3f;
+  if (width <= 32) {
+      unpack_funcs[width](&src, &dst, NULL, NULL, isZigZag);
+  }
+
+  return (unsigned char*)(in + PAD8(256 * b));
+}
+// Scalar unpack of one block without zigzag decoding and without exception
+// patching. Thin wrapper over bitunpack256scalarv32_withzigzag(..., false),
+// whose return value (in + PAD8(256 * b)) is passed straight through.
+unsigned char* bitunpack256scalarv32(const unsigned char* __restrict in, unsigned n,
+                                     unsigned* __restrict out, unsigned b) {
+  return bitunpack256scalarv32_withzigzag(in, n, out, b, false);
+}
+// Delta-1 decode of one block that carries patched exceptions.
+// The block is first unpacked (with exception patching, no zigzag) into a
+// scratch buffer, then turned into absolute values:
+//   out[i] = start + sum over j <= i of (delta[j] + 1)
+//
+// in    packed input for the block
+// n     number of values to write to out (at most one block, i.e. 256)
+// out   destination for the decoded values
+// start value preceding the first element
+// b     bit width, forwarded to _bitunpack256scalarv32
+// pex   exception values, forwarded to _bitunpack256scalarv32
+// bb    exception bitmap, forwarded to _bitunpack256scalarv32
+// Returns the pointer produced by _bitunpack256scalarv32.
+unsigned char* _bitd1unpack256scalarv32(const unsigned char* __restrict in, unsigned n,
+                                        unsigned* __restrict out, unsigned start, unsigned b,
+                                        unsigned* __restrict pex, unsigned char* bb) {
+  // _bitunpack256scalarv32 never looks at n and always emits a full block, so
+  // the scratch buffer is sized for a whole block instead of n. A fixed stack
+  // array also removes the per-block malloc/free and its NULL failure path.
+  enum { kBlockValues = 256 };
+  unsigned deltas[kBlockValues];
+
+  unsigned char* next = _bitunpack256scalarv32(in, n, deltas, b, pex, bb, false);
+
+  // Never read past what the unpacker produced.
+  const unsigned count = n < (unsigned)kBlockValues ? n : (unsigned)kBlockValues;
+  unsigned running_sum = start;
+  for (unsigned i = 0; i < count; ++i) {
+      running_sum += deltas[i] + 1;
+      out[i] = running_sum;
+  }
+
+  return next;
+}
+
+// Delta-1 decode of one block without exceptions.
+// The block is first unpacked (no zigzag) into a scratch buffer, then turned
+// into absolute values:
+//   out[i] = start + sum over j <= i of (delta[j] + 1)
+//
+// in    packed input for the block
+// n     number of values to write to out (at most one block, i.e. 256)
+// out   destination for the decoded values
+// start value preceding the first element
+// b     bit width, forwarded to bitunpack256scalarv32
+// Returns the pointer produced by bitunpack256scalarv32.
+unsigned char* bitd1unpack256scalarv32(const unsigned char* __restrict in, unsigned n,
+                                       unsigned* __restrict out, unsigned start, unsigned b) {
+  // The unpacker always emits a full block regardless of n, so the scratch
+  // buffer is a fixed block-sized array rather than a variable-length array
+  // of n elements (which was too small for n < 256 and invalid for n == 0).
+  enum { kBlockValues = 256 };
+  unsigned deltas[kBlockValues];
+
+  unsigned char* next = bitunpack256scalarv32(in, n, deltas, b);
+
+  // Never read past what the unpacker produced.
+  const unsigned count = n < (unsigned)kBlockValues ? n : (unsigned)kBlockValues;
+  unsigned running_sum = start;
+  for (unsigned i = 0; i < count; ++i) {
+      running_sum += deltas[i] + 1;
+      out[i] = running_sum;
+  }
+
+  return next;
+}
+
+// Scalar unpack of one block with exception patching.
+// The low 6 bits of b select the per-width routine from unpack_funcs; nothing
+// is unpacked when that value is greater than 32. pex and bb are copied into
+// local cursors whose addresses are handed to the routine, and isZigZag is
+// forwarded unchanged. n is not used.
+// Returns in + PAD8(256 * b), which depends on b only.
+unsigned char* _bitunpack256scalarv32(const unsigned char* __restrict in, unsigned n,
+                                      unsigned* __restrict out, unsigned b,
+                                      unsigned* __restrict pex, unsigned char* bb,
+                                      bool isZigZag) {
+  // Cursors passed by address to the per-width routine.
+  uint32_t* src = (uint32_t*)in;
+  uint32_t* dst = (uint32_t*)out;
+  unsigned* exceptionValues = pex;
+  unsigned char* exceptionBitmap = bb;
+
+  const unsigned width = b & 0x3f;
+  if (width <= 32) {
+      unpack_funcs[width](&src, &dst, &exceptionValues, &exceptionBitmap, isZigZag);
+  }
+
+  return (unsigned char*)(in + PAD8(256 * b));
+}
+
+// Zigzag-delta decode of one block without exceptions.
+// The block is first unpacked with zigzag decoding into a scratch buffer, then
+// turned into absolute values:
+//   out[i] = start + sum over j <= i of delta[j]
+//
+// in    packed input for the block
+// n     number of values to write to out (at most one block, i.e. 256)
+// out   destination for the decoded values
+// start value preceding the first element
+// b     bit width, forwarded to bitunpack256scalarv32_withzigzag
+// Returns the pointer produced by bitunpack256scalarv32_withzigzag.
+unsigned char* bitzunpack256scalarv32(const unsigned char* __restrict in, unsigned n,
+                                      unsigned* __restrict out, unsigned start, unsigned b) {
+  // The unpacker always emits a full block regardless of n, so the scratch
+  // buffer is a fixed block-sized array rather than a variable-length array
+  // of n elements (which was too small for n < 256 and invalid for n == 0).
+  enum { kBlockValues = 256 };
+  unsigned deltas[kBlockValues];
+
+  unsigned char* next = bitunpack256scalarv32_withzigzag(in, n, deltas, b, true);
+
+  // Never read past what the unpacker produced.
+  const unsigned count = n < (unsigned)kBlockValues ? n : (unsigned)kBlockValues;
+  unsigned running_sum = start;
+  for (unsigned i = 0; i < count; ++i) {
+      running_sum += deltas[i];
+      out[i] = running_sum;
+  }
+
+  return next;
+}
+// Zigzag-delta decode of one block that carries patched exceptions.
+// The block is first unpacked (with exception patching and zigzag decoding)
+// into a scratch buffer, then turned into absolute values:
+//   out[i] = start + sum over j <= i of delta[j]
+//
+// in    packed input for the block
+// n     number of values to write to out (at most one block, i.e. 256)
+// out   destination for the decoded values
+// start value preceding the first element
+// b     bit width, forwarded to _bitunpack256scalarv32
+// pex   exception values, forwarded to _bitunpack256scalarv32
+// bb    exception bitmap, forwarded to _bitunpack256scalarv32
+// Returns the pointer produced by _bitunpack256scalarv32.
+unsigned char* _bitzunpack256scalarv32(const unsigned char* __restrict in, unsigned n,
+                                       unsigned* __restrict out, unsigned start, unsigned b,
+                                       unsigned* __restrict pex, unsigned char* bb) {
+  // _bitunpack256scalarv32 never looks at n and always emits a full block, so
+  // the scratch buffer is sized for a whole block instead of n. A fixed stack
+  // array also removes the per-block malloc/free and its NULL failure path.
+  enum { kBlockValues = 256 };
+  unsigned deltas[kBlockValues];
+
+  unsigned char* next = _bitunpack256scalarv32(in, n, deltas, b, pex, bb, true);
+
+  // Never read past what the unpacker produced.
+  const unsigned count = n < (unsigned)kBlockValues ? n : (unsigned)kBlockValues;
+  unsigned running_sum = start;
+  for (unsigned i = 0; i < count; ++i) {
+      running_sum += deltas[i];
+      out[i] = running_sum;
+  }
+
+  return next;
+}
 
 #define STOZ64(_op_, _ov_) _mm_storeu_si128(_op_++, _ov_); 
_mm_storeu_si128(_op_++, _ov_)
 #define STO64( _op_, _ov_, _zv_) _mm_storeu_si128(_op_++, 
_mm_unpacklo_epi32(_ov_,_zv_));_mm_storeu_si128(_op_++, 
_mm_unpacklo_epi32(_mm_srli_si128(_ov_,8),_zv_))
diff --git a/src/ext/for/test_bitd1unpack.cpp b/src/ext/for/test_bitd1unpack.cpp
new file mode 100644
index 00000000000..23484a7f50e
--- /dev/null
+++ b/src/ext/for/test_bitd1unpack.cpp
@@ -0,0 +1,399 @@
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <vp4.h>
+#include <vint.h>
+#include "conf.h"
+
+// Define PAD8 (ceil(_x_ / 8)) unless one of the included headers already provides it
+#ifndef PAD8
+#define PAD8(_x_) (((_x_) + 7) / 8)
+#endif
+
+const unsigned TEST_SIZE = 512; // file-scope value; the test drivers below use their own TEST_SIZE parameter/local, which shadows it
+
+#ifdef __AVX2__
+/**
+ * Encodes n raw values with p4nd1enc256v32 and reports the encoder's result.
+ *
+ * @param raw_values    values handed to the encoder
+ * @param n             number of values
+ * @param encoded_data  destination buffer for the encoded bytes
+ * @param out_size      receives the encoder's return value, narrowed to unsigned
+ */
+void generate_test_data(unsigned* raw_values, unsigned n, unsigned char* encoded_data,
+                        unsigned* out_size) {
+    const size_t encoded_len = p4nd1enc256v32(raw_values, n, encoded_data);
+    *out_size = (unsigned)encoded_len;
+}
+#endif
+#define _1vbxget32(_ip_, _x_, _act_) do { _x_ = (unsigned)(*_ip_++);\
+       if(!(_x_ & 0x80u)) {                                                    
                  _act_;}\
+  else if(!(_x_ & 0x40u)) { _x_ = bswap16(ctou16(_ip_ - 1) & 0xff3fu); _ip_++; 
                      _act_;}\
+  else if(!(_x_ & 0x20u)) { _x_ = (_x_ & 0x1f)<<16 | ctou16(_ip_);             
       _ip_ += 2; _act_;}\
+  else if(!(_x_ & 0x10u)) { _x_ = bswap32(ctou32(_ip_-1) & 0xffffff0fu);       
           _ip_ += 3; _act_;}\
+  else                    { _x_ = (unsigned long long)((_x_) & 0x07)<<32 | 
ctou32(_ip_); _ip_ += 4; _act_;}\
+} while(0) /* variable-length read: loads the byte at _ip_ into _x_; its 0x80/0x40/0x20/0x10 bits select how many further bytes (0 to 4) are folded in; _ip_ is advanced past them, then _act_ runs */
+#define xvbxget32(_ip_, _x_) _1vbxget32(_ip_, _x_, ;) // same read with an empty action statement
+
+// Lookup table for 10^k, so pow() does not have to be called repeatedly
+// Note: 10^10 = 10000000000 needs 64 bits of storage
+static const uint64_t g_pow10[] = {
+    1ULL,         // 10^0
+    10ULL,        // 10^1
+    100ULL,       // 10^2
+    1000ULL,      // 10^3
+    10000ULL,     // 10^4
+    100000ULL,    // 10^5
+    1000000ULL,   // 10^6
+    10000000ULL,  // 10^7
+    100000000ULL, // 10^8
+    1000000000ULL,// 10^9
+    10000000000ULL// 10^10
+};
+
+// Returns 10^(b / 3) (integer division) looked up in g_pow10. Exponents that
+// fall outside the table yield the fixed entry g_pow10[10], i.e. 10^10.
+static inline uint64_t get_pow10_for_b(unsigned b) {
+    const unsigned exponent = b / 3;
+    const size_t table_len = sizeof(g_pow10) / sizeof(g_pow10[0]);
+    return (exponent < table_len) ? g_pow10[exponent] : g_pow10[10];
+}
+
+// Returns 2^b.
+// Shifting by the full width of the type (or more) is undefined behaviour, and
+// the mathematical result would not fit in `unsigned` anyway, so for such b the
+// function saturates to the largest unsigned value instead of shifting.
+static inline unsigned power2(unsigned b) {
+    if (b >= sizeof(unsigned) * 8) {
+        return ~0U;
+    }
+    return (1U << b);
+}
+
+
+/**
+ * Fills values[0..n) with rand()-based test data whose magnitude depends on b.
+ *
+ * @param values         output array
+ * @param n              number of values to generate
+ * @param b              bit width the data is generated for
+ * @param with_exception 0 = increasing sequence, non-zero = independent random values
+ */
+void generate_raw_data_for_bitwidth(unsigned* values, unsigned n,
+                                    unsigned b, int with_exception)
+{
+    if (n == 0) return;
+
+    if (with_exception) {
+        // Exception mode: each value is drawn independently from [0, limit).
+        //   b = 0 or 1 -> limit 2
+        //   b = 2      -> limit 4
+        //   b >= 3     -> limit 10^(b / 3), capped at 0xFFFFFFFF
+        uint64_t limit = 0; // held in 64 bits first, narrowed below
+        switch (b) {
+        case 0:
+        case 1:
+            limit = 2;
+            break;
+        case 2:
+            limit = 4;
+            break;
+        default:
+            limit = get_pow10_for_b(b);
+            if (limit > 0xFFFFFFFFULL) {
+                limit = 0xFFFFFFFFULL;
+            }
+            break;
+        }
+
+        for (unsigned k = 0; k < n; k++) {
+            values[k] = (unsigned)(rand() % (unsigned)limit);
+        }
+        return;
+    }
+
+    // No-exception mode: an increasing sequence starting at a random value
+    // below 1000.
+    unsigned current = rand() % 1000;
+    values[0] = current;
+
+    // Upper bound for the random part of each step:
+    //   b < 4  -> 2^b
+    //   b >= 4 -> 10^(b / 3), capped at 0xFFFFFFFF
+    unsigned step_limit;
+    if (b >= 4) {
+        uint64_t capped = get_pow10_for_b(b);
+        if (capped > 0xFFFFFFFFULL) {
+            capped = 0xFFFFFFFFULL; // keep the narrowing below lossless
+        }
+        step_limit = (unsigned)capped;
+    } else {
+        step_limit = power2(b);
+    }
+
+    for (unsigned k = 1; k < n; k++) {
+        // The leading 1 rules out a zero step.
+        unsigned step = 1 + rand() % step_limit;
+        current += step;
+        values[k] = current;
+    }
+}
+
+/**
+ * Fills values[0..n) with signed test samples (stored into the unsigned array).
+ *
+ * Magnitudes are drawn from [0, limit) where
+ *   - b < 3  -> limit = 2^b
+ *   - b >= 3 -> limit = 10^(b / 3), capped at 0x7fffffff
+ *
+ * The sign is chosen per sample: with_exception != 0 negates when the low bit of
+ * a fresh rand() is set; with_exception == 0 negates when rand() % 10 == 0.
+ * srand() is expected to be called by the caller.
+ */
+static void generate_raw_signed_data_for_zigzag(unsigned *values,
+                                                unsigned n,
+                                                unsigned b,
+                                                int with_exception)
+{
+    if (n == 0) return;
+
+    uint64_t magnitude_limit = 1;
+    if (b >= 3) {
+        magnitude_limit = get_pow10_for_b(b);
+        if (magnitude_limit > 0x7fffffffULL) {
+            magnitude_limit = 0x7fffffffULL; // stay inside the positive int32 range
+        }
+    } else {
+        magnitude_limit = (1ULL << b);
+    }
+
+    for (unsigned k = 0; k < n; k++) {
+        // Magnitude first, sign second: two rand() calls per sample.
+        int32_t sample = (int32_t)(rand() % (unsigned)magnitude_limit);
+        const bool negate = with_exception ? ((rand() & 1) == 1)
+                                           : ((rand() % 10) == 0);
+        if (negate) {
+            sample = -sample;
+        }
+        values[k] = sample;
+    }
+}
+#ifdef __AVX2__
+/**
+ * Runs one zigzag round trip for bit width b and prints the outcome.
+ *
+ * Generates TEST_SIZE signed samples, encodes them with p4nzenc256v32, decodes
+ * the result with both p4nzdec256v32 (into decoded1) and p4nzdec256scalarv32
+ * (into decoded2), and compares the two outputs with each other; if they agree,
+ * decoded1 is also compared with the raw input. All buffers are caller-owned.
+ */
+void run_testZigzag(unsigned b,
+                    int with_exception,
+                    unsigned TEST_SIZE,
+                    unsigned *raw_values,
+                    unsigned char *encoded_data,
+                    unsigned *decoded1,
+                    unsigned *decoded2)
+{
+    printf("Zigzag 测试: 位宽 b=%u, with_exception=%d\n", b, with_exception);
+
+    // Generate signed input and encode it.
+    generate_raw_signed_data_for_zigzag(raw_values, TEST_SIZE, b, with_exception);
+    unsigned encoded_size = p4nzenc256v32(raw_values, TEST_SIZE, encoded_data);
+
+    // Read the leading variable-length value, then treat the next byte as the
+    // bit-width header: keep the low 6 bits when 0x40 is set, otherwise keep
+    // the low 7 bits when 0x80 is set.
+    unsigned start;
+    unsigned char* cursor = encoded_data;
+    xvbxget32(cursor, start);
+    unsigned char encoded_b = cursor[0];
+    if (encoded_b & 0x40) {
+        encoded_b &= 0x3f;
+    } else if (encoded_b & 0x80) {
+        encoded_b &= 0x7f;
+    }
+    printf("  编码参数: 位宽 b=%u, 起始值 start=%u, 编码大小=%u字节\n", encoded_b, start, encoded_size);
+
+    // Decode with both implementations into zeroed buffers.
+    memset(decoded1, 0, TEST_SIZE * sizeof(unsigned));
+    memset(decoded2, 0, TEST_SIZE * sizeof(unsigned));
+    p4nzdec256v32(encoded_data, TEST_SIZE, decoded1);
+    p4nzdec256scalarv32(encoded_data, TEST_SIZE, decoded2);
+
+    // Cross-check the two decoders; only the first 10 differences are printed.
+    int mismatch = 0;
+    for (unsigned i = 0; i < TEST_SIZE; i++) {
+        if (decoded1[i] == decoded2[i]) continue;
+        if (mismatch < 10)
+            printf(" mismatch at i=%u: dec1=%d, dec2=%d\n", i, decoded1[i], decoded2[i]);
+        mismatch++;
+    }
+
+    if (mismatch != 0) {
+        printf(" total mismatch=%d\n", mismatch);
+    } else {
+        printf(" decode1 & decode2 match!\n");
+
+        // Both decoders agree: verify against the original input.
+        int error = 0;
+        for (unsigned i = 0; i < TEST_SIZE; i++) {
+            if (decoded1[i] == raw_values[i]) continue;
+            if (error < 10)
+                printf(" raw mismatch at i=%u: raw=%d, dec=%d\n", i, raw_values[i], decoded1[i]);
+            error++;
+        }
+        if (error == 0) {
+            printf(" and match raw data!\n");
+        } else {
+            printf(" total %d raw mismatch\n", error);
+        }
+    }
+    printf("\n");
+}
+
+/**
+ * Runs one delta round trip for bit width b and prints the outcome.
+ *
+ * Generates TEST_SIZE values, encodes them through generate_test_data, decodes
+ * the result with both p4nd1dec256v32 (into decoded1) and p4nd1dec256scalarv32
+ * (into decoded2), and compares the two outputs; if they agree, decoded1 is
+ * also compared with the raw input. All buffers are caller-owned.
+ */
+void run_test(unsigned b, int with_exception, unsigned TEST_SIZE,
+              unsigned* raw_values, unsigned char* encoded_data,
+              unsigned* decoded1, unsigned* decoded2) {
+    printf("测试: 位宽 b=%u, 异常%s\n", b, (with_exception ? "有" : "无"));
+
+    // Produce input matching the requested bit width / exception mode, then encode it.
+    generate_raw_data_for_bitwidth(raw_values, TEST_SIZE, b, with_exception);
+    unsigned encoded_size;
+    generate_test_data(raw_values, TEST_SIZE, encoded_data, &encoded_size);
+
+    // Read the leading variable-length value, then treat the next byte as the
+    // bit-width header: keep the low 6 bits when 0x40 is set, otherwise keep
+    // the low 7 bits when 0x80 is set.
+    unsigned start;
+    unsigned char* cursor = encoded_data;
+    xvbxget32(cursor, start);
+    unsigned char encoded_b = cursor[0];
+    if (encoded_b & 0x40) {
+        encoded_b &= 0x3f;
+    } else if (encoded_b & 0x80) {
+        encoded_b &= 0x7f;
+    }
+    printf("  编码参数: 位宽 b=%u, 起始值 start=%u, 编码大小=%u字节\n", encoded_b, start, encoded_size);
+
+    // Decode with both implementations into zeroed buffers.
+    memset(decoded1, 0, TEST_SIZE * sizeof(unsigned));
+    memset(decoded2, 0, TEST_SIZE * sizeof(unsigned));
+    p4nd1dec256v32(encoded_data, TEST_SIZE, decoded1);
+    p4nd1dec256scalarv32(encoded_data, TEST_SIZE, decoded2);
+
+    // Cross-check the two decoders; only the first 10 differences are printed.
+    int mismatch = 0;
+    for (unsigned i = 0; i < TEST_SIZE; i++) {
+        if (decoded1[i] == decoded2[i]) continue;
+        if (mismatch < 10)
+            printf("  不匹配: 索引 %u, 原始值=%u, 原始解码=%u, 标量解码=%u\n",
+                   i, raw_values[i], decoded1[i], decoded2[i]);
+        mismatch++;
+    }
+
+    if (mismatch == 0) {
+        printf("  通过: 所有解码值匹配!\n");
+
+        // Verify against the original input, stopping after 10 reported errors.
+        int error = 0;
+        for (unsigned i = 0; i < TEST_SIZE; i++) {
+            if (error >= 10) break;
+            if (decoded1[i] == raw_values[i]) continue;
+            printf("  编码/解码错误: 索引 %u, 原始值=%u, 解码值=%u\n",
+                   i, raw_values[i], decoded1[i]);
+            error++;
+        }
+        if (error == 0)
+            printf("  验证通过: 解码结果与原始数据一致\n");
+    } else {
+        printf("  失败: 有 %d 个值不匹配\n", mismatch);
+
+        // Dump the first 16 entries of the input and of both decoder outputs.
+        const char* const titles[3] = {"  原始数据 (前16个): ",
+                                       "  原始解码 (前16个): ",
+                                       "  标量解码 (前16个): "};
+        const unsigned* const rows[3] = {raw_values, decoded1, decoded2};
+        for (int r = 0; r < 3; r++) {
+            printf("%s", titles[r]);
+            for (unsigned i = 0; i < 16 && i < TEST_SIZE; i++)
+                printf("%u ", rows[r][i]);
+            printf("...\n");
+        }
+    }
+    printf("\n");
+}
+
+/**
+ * Drives run_testZigzag for every bit width 0..32, once with with_exception = 0
+ * and once with with_exception = 1.
+ *
+ * The encode buffer reserves 2 * sizeof(unsigned) bytes per value, the same
+ * headroom test_p4nd1dec256v32 uses; "TEST_SIZE*4 + 10" left only ten bytes
+ * beyond the raw payload for headers and exception data. Allocation failures
+ * are reported and end the test instead of passing NULL buffers on.
+ */
+void testZigZag()
+{
+    const unsigned TEST_SIZE=512;
+    unsigned *raw_values= (unsigned*) malloc(TEST_SIZE*sizeof(unsigned));
+    unsigned *decoded1=  (unsigned*) malloc(TEST_SIZE*sizeof(unsigned));
+    unsigned *decoded2=  (unsigned*) malloc(TEST_SIZE*sizeof(unsigned));
+    unsigned char* encoded_data= (unsigned char*) malloc(TEST_SIZE*sizeof(unsigned)*2);
+
+    if (!raw_values || !decoded1 || !decoded2 || !encoded_data) {
+        fprintf(stderr, "testZigZag: out of memory\n");
+        // free(NULL) is a no-op, so releasing all four is safe.
+        free(raw_values);
+        free(decoded1);
+        free(decoded2);
+        free(encoded_data);
+        return;
+    }
+
+    srand((unsigned)time(NULL));
+    printf("开始测试 p4nzdec256v32...\n");
+
+    for(unsigned b=0; b<=32; b++){
+        run_testZigzag(b,0, TEST_SIZE, raw_values, encoded_data, decoded1, decoded2);
+        run_testZigzag(b,1, TEST_SIZE, raw_values, encoded_data, decoded1, decoded2);
+    }
+
+    free(raw_values);
+    free(decoded1);
+    free(decoded2);
+    free(encoded_data);
+}
+
+/**
+ * Drives run_test for bit widths 0..32: widths 0..31 run in both modes
+ * (with_exception 0 then 1), width 32 runs with with_exception = 0 only.
+ */
+void test_p4nd1dec256v32() {
+    const unsigned TEST_SIZE = 512;
+    const size_t value_bytes = TEST_SIZE * sizeof(unsigned);
+
+    // Working buffers; the encode buffer gets twice the raw payload size.
+    unsigned* raw_values        = (unsigned*)malloc(value_bytes);
+    unsigned char* encoded_data = (unsigned char*)malloc(value_bytes * 2);
+    unsigned* decoded1          = (unsigned*)malloc(value_bytes);
+    unsigned* decoded2          = (unsigned*)malloc(value_bytes);
+
+    srand((unsigned)time(NULL));
+    printf("开始测试 p4nd1dec256v32...\n");
+
+    for (unsigned b = 0; b <= 32; b++) {
+        run_test(b, 0, TEST_SIZE, raw_values, encoded_data, decoded1, decoded2);
+        if (b < 32) {
+            // b == 32 is exercised without exceptions only.
+            run_test(b, 1, TEST_SIZE, raw_values, encoded_data, decoded1, decoded2);
+        }
+    }
+
+    free(raw_values);
+    free(encoded_data);
+    free(decoded1);
+    free(decoded2);
+
+    printf("测试完成!\n");
+}
+#endif
+// Entry point: runs the delta and zigzag decoder comparisons when the build
+// defines __AVX2__; otherwise it does nothing. Always returns 0.
+int main() {
+#ifdef __AVX2__
+    test_p4nd1dec256v32();
+    testZigZag();
+#endif
+    return 0;
+}
diff --git a/src/ext/for/vp4.h b/src/ext/for/vp4.h
index fae28df8d45..39460b614c4 100644
--- a/src/ext/for/vp4.h
+++ b/src/ext/for/vp4.h
@@ -99,6 +99,7 @@ size_t p4nd1dec32(    unsigned char *__restrict in, size_t n, 
uint32_t *__restri
 size_t p4nd1dec128v16(unsigned char *__restrict in, size_t n, uint16_t 
*__restrict out);
 size_t p4nd1dec128v32(unsigned char *__restrict in, size_t n, uint32_t 
*__restrict out);
 size_t p4nd1dec256v32(unsigned char *__restrict in, size_t n, uint32_t 
*__restrict out);
+size_t p4nd1dec256scalarv32(unsigned char* __restrict in, size_t n, uint32_t* 
__restrict out);
 size_t p4nd1dec64(    unsigned char *__restrict in, size_t n, uint64_t 
*__restrict out);
 //Zigzag
 size_t p4nzdec8(      unsigned char *__restrict in, size_t n, uint8_t  
*__restrict out);
@@ -107,6 +108,7 @@ size_t p4nzdec32(     unsigned char *__restrict in, size_t 
n, uint32_t *__restri
 size_t p4nzdec128v16( unsigned char *__restrict in, size_t n, uint16_t 
*__restrict out);
 size_t p4nzdec128v32( unsigned char *__restrict in, size_t n, uint32_t 
*__restrict out);
 size_t p4nzdec256v32( unsigned char *__restrict in, size_t n, uint32_t 
*__restrict out);
+size_t p4nzdec256scalarv32(unsigned char* __restrict in, size_t n, uint32_t* 
__restrict out);
 size_t p4nzdec64(     unsigned char *__restrict in, size_t n, uint64_t 
*__restrict out);
 
 //************** Low level API - n limited to 128/256 
***************************************
diff --git a/src/ext/for/vp4d.c b/src/ext/for/vp4d.c
index a255fc2a2f0..e1bbacde3b6 100644
--- a/src/ext/for/vp4d.c
+++ b/src/ext/for/vp4d.c
@@ -253,6 +253,36 @@ extern char _shuffle_16[256][16];
 #define  BITUNPACK     bitunpack256w
 #define  BITUNPACKD    bitunpack256w
 #define  _BITUNPACKD  _bitunpack256w
+#include "vp4d.c"
+#define P4DELTA(a) ,a
+#define P4DELTA_(a) a
+#define DELTA
+
+#undef _P4DEC
+#undef  P4DEC
+#undef  P4NDEC
+#undef  BITUNPACKD
+#undef _BITUNPACKD
+
+
+#define _P4DEC        _p4d1dec256scalarv
+#define  P4DEC         p4d1dec256scalarv
+#define  P4NDEC        p4nd1dec256scalarv
+#define  P4NDECS       p4d1dec
+#define  BITUNPACK     bitunpack256scalarv
+#define  BITUNPACKD    bitd1unpack256scalarv
+#define  _BITUNPACKD  _bitd1unpack256scalarv
+#define  BITUNDD       bitd1dec
+#include "vp4d.c"
+
+#define _P4DEC        _p4zdec256scalarv
+#define  P4DEC         p4zdec256scalarv
+#define  P4NDEC        p4nzdec256scalarv
+#define  P4NDECS       p4zdec
+#define  BITUNPACKD    bitzunpack256scalarv
+#define  _BITUNPACKD  _bitzunpack256scalarv
+#define  BITUNDD       bitzdec
+#define USIZE 32
 #include "vp4d.c"
   #endif
 #undef  DELTA
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index 5392cc79188..4b12bb6f9c3 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -16,7 +16,7 @@ SOURCE_GROUP("search" ./search/*)
 SOURCE_GROUP("search-spans" ./search/spans/*)
 SOURCE_GROUP("store" ./store/*)
 SOURCE_GROUP("util" ./util/*)
-
+INCLUDE_DIRECTORIES( ${clucene_SOURCE_DIR}/src/ext ) # no wildcard: include paths are not globbed; testPFOR.cpp includes "for/vp4.h" relative to src/ext
 IF (BUILD_CONTRIBS_LIB)
     SET(test_contribs_lib_files ./contribs-lib/analysis/testChinese.cpp)
     SET(EXTRA_LIBS ${EXTRA_LIBS} clucene-contribs-lib)
@@ -106,6 +106,7 @@ SET(test_files ./tests.cpp
         ./util/TestStrConvert.cpp
         ./query/TestMultiPhraseQuery.cpp
         ./store/TestUTF8Chars.cpp
+        ./store/testPFOR.cpp
         ${test_HEADERS})
 IF (USE_SHARED_OBJECT_FILES)
     GET_SHARED_FILES(clucene_shared_Files)
diff --git a/src/test/data/pfor_p4ndx_compat_gen_by_old_version_arm.dat 
b/src/test/data/pfor_p4ndx_compat_gen_by_old_version_arm.dat
new file mode 100644
index 00000000000..e823aef9968
Binary files /dev/null and 
b/src/test/data/pfor_p4ndx_compat_gen_by_old_version_arm.dat differ
diff --git a/src/test/data/pfor_p4ndx_compat_gen_by_old_version_x86_64.dat 
b/src/test/data/pfor_p4ndx_compat_gen_by_old_version_x86_64.dat
new file mode 100644
index 00000000000..a27177e3090
Binary files /dev/null and 
b/src/test/data/pfor_p4ndx_compat_gen_by_old_version_x86_64.dat differ
diff --git a/src/test/store/testPFOR.cpp b/src/test/store/testPFOR.cpp
new file mode 100644
index 00000000000..a8ddbc871b5
--- /dev/null
+++ b/src/test/store/testPFOR.cpp
@@ -0,0 +1,546 @@
+#include "test.h"
+
+#include <memory.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include <cstdint>
+#include <cstdio>
+#include <vector>
+
+#include "CLucene/index/CodeMode.h"
+#include "CLucene/store/FSDirectory.h"
+#include "CLucene/store/IndexInput.h"
+#include "CLucene/store/IndexOutput.h"
+#include "CLucene/util/PFORUtil.h"
+#include "CuTest.h"
+#include "for/vp4.h"
+
+using namespace lucene::store;
+// Helper macro: prints `message` when `condition` is false, then forwards the
+// outcome to CuAssertTrue. The condition is evaluated exactly once and its
+// result reused, so expressions with side effects are safe to pass.
+#define CuAssertTrueWithMessage(tc, message, condition)               \
+    do {                                                              \
+        const bool cu_assert_ok_ = static_cast<bool>(condition);      \
+        if (!cu_assert_ok_) {                                         \
+            printf("Assertion failed: %s\n", message);                \
+        }                                                             \
+        CuAssertTrue(tc, cu_assert_ok_);                              \
+    } while (0)
+
+static const uint64_t g_pow10[] = { // powers of ten, indexed by exponent (10^0 .. 10^10)
+        1ULL,          // 10^0
+        10ULL,         // 10^1
+        100ULL,        // 10^2
+        1000ULL,       // 10^3
+        10000ULL,      // 10^4
+        100000ULL,     // 10^5
+        1000000ULL,    // 10^6
+        10000000ULL,   // 10^7
+        100000000ULL,  // 10^8
+        1000000000ULL, // 10^9
+        10000000000ULL // 10^10
+};
+
+// Returns 10^(b / 3) (integer division) looked up in g_pow10. Exponents that
+// fall outside the table yield the fixed entry g_pow10[10], i.e. 10^10.
+static inline uint64_t get_pow10_for_b(unsigned b) {
+    const unsigned exponent = b / 3;
+    const size_t table_len = sizeof(g_pow10) / sizeof(g_pow10[0]);
+    return (exponent < table_len) ? g_pow10[exponent] : g_pow10[10];
+}
+
+// Returns 2^b.
+// Shifting by the full width of the type (or more) is undefined behaviour, and
+// the mathematical result would not fit in `unsigned` anyway, so for such b the
+// function saturates to the largest unsigned value instead of shifting.
+static inline unsigned power2(unsigned b) {
+    if (b >= sizeof(unsigned) * 8) {
+        return ~0U;
+    }
+    return (1U << b);
+}
+
+/**
+ * Fills values[0..n) with rand()-based test data whose magnitude depends on b.
+ *
+ * @param values         output array
+ * @param n              number of values to generate
+ * @param b              bit width the data is generated for
+ * @param with_exception 0 = increasing sequence, non-zero = independent random values
+ */
+void generate_raw_data_for_bitwidth(unsigned* values, unsigned n, unsigned b, int with_exception) {
+    if (n == 0) return;
+
+    if (with_exception) {
+        // Exception mode: each value is drawn independently from [0, limit).
+        //   b = 0 or 1 -> limit 2
+        //   b = 2      -> limit 4
+        //   b >= 3     -> limit 10^(b / 3), capped at 0xFFFFFFFF
+        uint64_t limit = 0; // held in 64 bits first, narrowed below
+        switch (b) {
+        case 0:
+        case 1:
+            limit = 2;
+            break;
+        case 2:
+            limit = 4;
+            break;
+        default:
+            limit = get_pow10_for_b(b);
+            if (limit > 0xFFFFFFFFULL) {
+                limit = 0xFFFFFFFFULL;
+            }
+            break;
+        }
+
+        for (unsigned k = 0; k < n; k++) {
+            values[k] = (unsigned)(rand() % (unsigned)limit);
+        }
+        return;
+    }
+
+    // No-exception mode: an increasing sequence starting at a random value
+    // below 1000.
+    unsigned current = rand() % 1000;
+    values[0] = current;
+
+    // Upper bound for the random part of each step:
+    //   b < 4  -> 2^b
+    //   b >= 4 -> 10^(b / 3), capped at 0xFFFFFFFF
+    unsigned step_limit;
+    if (b >= 4) {
+        uint64_t capped = get_pow10_for_b(b);
+        if (capped > 0xFFFFFFFFULL) {
+            capped = 0xFFFFFFFFULL; // keep the narrowing below lossless
+        }
+        step_limit = (unsigned)capped;
+    } else {
+        step_limit = power2(b);
+    }
+
+    for (unsigned k = 1; k < n; k++) {
+        // The leading 1 rules out a zero step.
+        unsigned step = 1 + rand() % step_limit;
+        current += step;
+        values[k] = current;
+    }
+}
+
+// Round trip through pfor_encode / pfor_decode with has_prox = true: writes the
+// doc-delta and freq buffers to "pfor.dat" in the current directory, reads them
+// back and compares every element with what was written.
+void test_pfor_has_prox(CuTest* tc) {
+    const unsigned TEST_SIZE = 512;
+    const char* testFileName = "pfor.dat";
+
+    // Working buffers (freqBuffer stays value-initialised, i.e. all zeros).
+    std::vector<unsigned> docDeltaBuffer(TEST_SIZE);
+    std::vector<unsigned> freqBuffer(TEST_SIZE);
+    std::vector<unsigned> decoded1(TEST_SIZE);
+    std::vector<unsigned> decoded2(TEST_SIZE);
+
+    srand((unsigned)time(NULL));
+    printf("开始测试 p4nd1dec256v32...\n");
+
+    // Step 1: encode and write.
+    {
+        generate_raw_data_for_bitwidth(docDeltaBuffer.data(), TEST_SIZE, 32, 0);
+        auto* dir = lucene::store::FSDirectory::getDirectory("./");
+
+        auto* output = dir->createOutput(testFileName);
+
+        lucene::util::pfor_encode(output, docDeltaBuffer, freqBuffer, true);
+        output->close();
+        _CLDELETE(output);
+        dir->close();
+        _CLDELETE(dir);
+    }
+    // Step 2: read back, decode and compare.
+    {
+        IndexInput* input = nullptr;
+        CLuceneError error;
+        auto* dir = lucene::store::FSDirectory::getDirectory("./");
+        const bool opened = dir->openInput(testFileName, input, error);
+        if (!opened || input == nullptr) {
+            // Without an input there is nothing to decode; fail the test
+            // instead of dereferencing a null pointer below.
+            printf("Failed to open %s for reading\n", testFileName);
+            dir->close();
+            _CLDELETE(dir);
+            CuAssertTrue(tc, false);
+            return;
+        }
+        lucene::util::pfor_decode(input, decoded1, decoded2, true, false);
+        for (size_t i = 0; i < TEST_SIZE; i++) {
+            CuAssertIntEquals(tc, _T("docDeltaBuffer[%zu] != decoded1[%zu]"), docDeltaBuffer[i],
+                              decoded1[i]);
+            CuAssertIntEquals(tc, _T("freqBuffer[%zu] != decoded2[%zu]"), freqBuffer[i],
+                              decoded2[i]);
+        }
+        input->close();
+        _CLDELETE(input);
+        dir->close();
+        _CLDELETE(dir);
+    }
+    printf("测试完成!\n");
+}
+
+// Round trip through pfor_encode / pfor_decode with has_prox = false: writes
+// the buffers to "pfor.dat" in the current directory, reads them back and
+// compares every element with the source buffers.
+void test_pfor_no_prox(CuTest* tc) {
+    const unsigned TEST_SIZE = 512;
+    const char* testFileName = "pfor.dat";
+
+    // Working buffers (freqBuffer stays value-initialised, i.e. all zeros).
+    std::vector<unsigned> docDeltaBuffer(TEST_SIZE);
+    std::vector<unsigned> freqBuffer(TEST_SIZE);
+    std::vector<unsigned> decoded1(TEST_SIZE);
+    std::vector<unsigned> decoded2(TEST_SIZE);
+
+    srand((unsigned)time(NULL));
+    printf("开始测试 p4nd1dec256v32...\n");
+
+    // Step 1: encode and write.
+    {
+        generate_raw_data_for_bitwidth(docDeltaBuffer.data(), TEST_SIZE, 32, 0);
+        auto* dir = lucene::store::FSDirectory::getDirectory("./");
+
+        auto* output = dir->createOutput(testFileName);
+
+        lucene::util::pfor_encode(output, docDeltaBuffer, freqBuffer, false);
+        output->close();
+        _CLDELETE(output);
+        dir->close();
+        _CLDELETE(dir);
+    }
+    // Step 2: read back, decode and compare.
+    {
+        IndexInput* input = nullptr;
+        CLuceneError error;
+        auto* dir = lucene::store::FSDirectory::getDirectory("./");
+        const bool opened = dir->openInput(testFileName, input, error);
+        if (!opened || input == nullptr) {
+            // Without an input there is nothing to decode; fail the test
+            // instead of dereferencing a null pointer below.
+            printf("Failed to open %s for reading\n", testFileName);
+            dir->close();
+            _CLDELETE(dir);
+            CuAssertTrue(tc, false);
+            return;
+        }
+        lucene::util::pfor_decode(input, decoded1, decoded2, false, false);
+        for (size_t i = 0; i < TEST_SIZE; i++) {
+            CuAssertIntEquals(tc, _T("docDeltaBuffer[%zu] != decoded1[%zu]"), docDeltaBuffer[i],
+                              decoded1[i]);
+            CuAssertIntEquals(tc, _T("freqBuffer[%zu] != decoded2[%zu]"), freqBuffer[i],
+                              decoded2[i]);
+        }
+        input->close();
+        _CLDELETE(input);
+        dir->close();
+        _CLDELETE(dir);
+    }
+    printf("测试完成!\n");
+}
+
+// Tests that data encoded directly with P4ENC (docs) and P4NZENC (freqs) and
+// written in the layout "mode byte, element count, then per stream: byte length
+// + payload" is read back unchanged by pfor_decode.
+void test_p4dec_p4enc_compat(CuTest* tc) {
+    const unsigned TEST_SIZE = 512;
+    const char* testFileName = "pfor_p4enc.dat";
+
+    // Allocate buffers
+    std::vector<uint32_t> originalData(TEST_SIZE);
+    std::vector<uint32_t> decodedData(TEST_SIZE);
+    std::vector<uint32_t> freqs(TEST_SIZE);
+    std::vector<uint32_t> decodedFreqs(TEST_SIZE);
+
+    srand((unsigned)time(NULL));
+    printf("Testing P4ENC and pfor_decode compatibility...\n");
+
+    // Docs: increasing sequence; freqs: independent random values
+    generate_raw_data_for_bitwidth(originalData.data(), TEST_SIZE, 32, 0);
+    generate_raw_data_for_bitwidth(freqs.data(), TEST_SIZE, 32, 1);
+
+    // Encodes one buffer (P4ENC when isDoc, P4NZENC otherwise) and writes its
+    // byte length followed by the encoded bytes.
+    auto encode = [](IndexOutput* out, std::vector<uint32_t>& buffer, bool isDoc) {
+        std::vector<uint8_t> compress(4 * buffer.size() + PFOR_BLOCK_SIZE);
+        size_t size = 0;
+        if (isDoc) {
+            size = P4ENC(buffer.data(), buffer.size(), compress.data());
+        } else {
+            size = P4NZENC(buffer.data(), buffer.size(), compress.data());
+        }
+        out->writeVInt(size);
+        out->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()), size);
+    };
+
+    // Step 1: encode the data and write it to the file
+    {
+        auto* dir = lucene::store::FSDirectory::getDirectory("./");
+        auto* output = dir->createOutput(testFileName);
+
+        // Write the code mode and the element count
+        output->writeByte((char)lucene::index::CodeMode::kPfor);
+        output->writeVInt(TEST_SIZE);
+
+        // Encode and write both streams
+        encode(output, originalData, true);
+        encode(output, freqs, false);
+
+        output->close();
+        _CLDELETE(output);
+        dir->close();
+        _CLDELETE(dir);
+    }
+
+    // Step 2: decode the file with pfor_decode
+    {
+        IndexInput* input = nullptr;
+        CLuceneError error;
+        auto* dir = lucene::store::FSDirectory::getDirectory("./");
+        const bool opened = dir->openInput(testFileName, input, error);
+        if (!opened || input == nullptr) {
+            // Without an input there is nothing to decode; fail the test
+            // instead of dereferencing a null pointer below.
+            printf("Failed to open %s for reading\n", testFileName);
+            dir->close();
+            _CLDELETE(dir);
+            CuAssertTrue(tc, false);
+            return;
+        }
+
+        // Decode both streams; the last two arguments are passed as (true, false)
+        uint32_t decoded_size =
+                lucene::util::pfor_decode(input, decodedData, decodedFreqs, true, false);
+
+        // Verify the decoded element count
+        CuAssertIntEquals(tc, _T("Decoded size mismatch"), TEST_SIZE, decoded_size);
+
+        // Verify the decoded data against the originals
+        for (size_t i = 0; i < TEST_SIZE; i++) {
+            CuAssertTrueWithMessage(tc, "Decoded doc doesn't match original",
+                                    originalData[i] == decodedData[i]);
+            CuAssertTrueWithMessage(tc, "Decoded freq doesn't match original",
+                                    freqs[i] == decodedFreqs[i]);
+        }
+
+        input->close();
+        _CLDELETE(input);
+        dir->close();
+        _CLDELETE(dir);
+    }
+
+    printf("P4ENC/pfor_decode compatibility test completed successfully!\n");
+}
+
+// Writes a P4ENC-encoded block (mode byte, element count, byte length, payload)
+// to a file, then reads the fields back one by one, decodes the payload with
+// P4DEC and checks every field and every decoded value against what was written.
+void test_cross_platform_compat(CuTest* tc) {
+    const unsigned TEST_SIZE = 512;
+    const char* testFileName = "pfor_cross_platform.dat";
+
+    // Allocate buffers
+    std::vector<uint32_t> originalData(TEST_SIZE);
+    std::vector<uint32_t> decodedData(TEST_SIZE);
+
+    srand((unsigned)time(NULL));
+    printf("Testing cross-platform compatibility...\n");
+
+    // Generate test data: every tenth value is drawn from [0, 1000000), the
+    // rest from [0, 100), so small and large values are mixed.
+    for (unsigned i = 0; i < TEST_SIZE; i++) {
+        if (i % 10 == 0) {
+            originalData[i] = rand() % 1000000; // Occasional large value
+        } else {
+            originalData[i] = rand() % 100; // Mostly small values
+        }
+    }
+
+    // Part 1: Write encoded data to file using PFOR encoding
+    {
+        auto* dir = lucene::store::FSDirectory::getDirectory("./");
+        auto* output = dir->createOutput(testFileName);
+
+        // Write encoding mode and size
+        output->writeByte((char)lucene::index::CodeMode::kPfor);
+        output->writeVInt(TEST_SIZE);
+
+        // Encode and write the data
+        std::vector<uint8_t> compress(4 * TEST_SIZE + PFOR_BLOCK_SIZE);
+        size_t size = lucene::util::P4ENC(originalData.data(), TEST_SIZE, compress.data());
+        output->writeVInt(size);
+        output->writeBytes(reinterpret_cast<const uint8_t*>(compress.data()), size);
+
+        output->close();
+        _CLDELETE(output);
+        dir->close();
+        _CLDELETE(dir);
+    }
+
+    // Part 2: Read the encoded data back from the file and decode it with P4DEC
+    {
+        IndexInput* input = nullptr;
+        CLuceneError error;
+        auto* dir = lucene::store::FSDirectory::getDirectory("./");
+        const bool opened = dir->openInput(testFileName, input, error);
+        if (!opened || input == nullptr) {
+            // Without an input there is nothing to decode; fail the test
+            // instead of dereferencing a null pointer below.
+            printf("Failed to open %s for reading\n", testFileName);
+            dir->close();
+            _CLDELETE(dir);
+            CuAssertTrue(tc, false);
+            return;
+        }
+
+        // Verify the encoded format: the mode byte must be the one written in
+        // Part 1, and the element count must match.
+        char mode = input->readByte();
+        CuAssertTrueWithMessage(tc, "Code mode mismatch",
+                                mode == (char)lucene::index::CodeMode::kPfor);
+        uint32_t arraySize = input->readVInt();
+        CuAssertIntEquals(tc, _T("Array size mismatch"), TEST_SIZE, arraySize);
+
+        // Read the payload into a buffer padded by PFOR_BLOCK_SIZE bytes
+        uint32_t SerializedSize = input->readVInt();
+        std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE);
+        input->readBytes(buf.data(), SerializedSize);
+
+        // Use P4DEC for decoding, simulating cross-platform read
+        lucene::util::P4DEC(buf.data(), arraySize, decodedData.data());
+
+        // Verify decoded data matches original
+        for (size_t i = 0; i < TEST_SIZE; i++) {
+            CuAssertIntEquals(tc, _T("Cross-platform decoded data mismatch at %zu"),
+                              originalData[i], decodedData[i]);
+        }
+
+        input->close();
+        _CLDELETE(input);
+        dir->close();
+        _CLDELETE(dir);
+    }
+
+    printf("Cross-platform compatibility test completed successfully!\n");
+}
+
+/**
+ * Backward-compatibility check for pfor_decode() in compatible-read mode.
+ *
+ * Opens a pre-generated fixture file from clucene_data_location, decodes it
+ * with pfor_decode(input, docs, freqs, true, true) and compares the decoded
+ * size and the first TEST_SIZE doc/freq values with the expected arrays below.
+ *
+ * The fixture is selected at compile time:
+ *   - __AVX2__ builds read "pfor_p4ndx_compat_gen_by_old_version_arm.dat";
+ *   - __ARM_NEON or __SSSE3__ builds read
+ *     "pfor_p4ndx_compat_gen_by_old_version_x86_64.dat";
+ *   - other builds have no fixture selected and the test reports a skip
+ *     instead of a success.
+ *
+ * @param tc CuTest context that receives the assertion results.
+ */
+void test_p4ndx_compatibility(CuTest* tc) {
+    const unsigned TEST_SIZE = 512;
+
+    // __AVX2__ is tested first, exactly as in the original #if/#elif chain, so a
+    // build that defines both __AVX2__ and __SSSE3__ keeps reading the ARM fixture.
+#if defined(__AVX2__)
+    const char* fixtureName = "pfor_p4ndx_compat_gen_by_old_version_arm.dat";
+#elif defined(__ARM_NEON) || defined(__SSSE3__)
+    const char* fixtureName = "pfor_p4ndx_compat_gen_by_old_version_x86_64.dat";
+#else
+    const char* fixtureName = nullptr;
+#endif
+
+    // Expected doc values stored in the fixture files.
+    const std::vector<uint32_t> docDeltaBuffer = {
+            635,    1188,   1795,   2109,   2694,   3612,   3714,   4511,   5072,   5352,   5526,
+            5894,   6706,   6891,   6979,   7080,   7586,   7789,   8530,   9065,   9704,   9949,
+            10377,  10678,  11516,  11921,  12226,  13133,  13417,  13854,  14215,  14486,  15476,
+            16444,  17380,  18306,  19191,  19580,  20302,  21099,  21119,  22014,  22178,  22361,
+            22440,  23043,  23326,  24262,  25067,  25443,  26265,  27061,  27681,  27931,  28027,
+            28837,  28843,  29595,  29663,  29953,  30494,  30922,  31834,  32364,  33111,  33311,
+            33766,  34749,  35689,  36217,  36348,  36660,  37083,  37378,  37872,  38725,  39622,
+            40399,  40540,  40594,  41098,  42060,  42909,  43032,  43243,  43539,  43823,  44040,
+            44439,  44790,  45648,  46587,  46718,  46839,  47307,  47536,  48208,  48482,  49046,
+            49658,  50460,  51154,  51429,  52005,  52993,  53761,  54541,  54778,  55674,  56594,
+            57236,  57635,  58517,  59007,  59881,  60325,  60462,  60971,  60983,  61518,  62378,
+            63247,  64073,  64415,  64757,  65050,  65620,  65633,  66552,  66685,  67661,  67733,
+            67912,  68514,  69161,  69327,  69697,  70475,  71229,  71846,  71896,  72291,  72659,
+            72942,  73178,  73419,  74145,  74517,  75266,  75356,  75615,  76575,  77533,  77617,
+            77918,  78569,  79297,  79520,  79536,  80534,  81241,  81584,  81653,  82538,  83483,
+            83550,  83601,  84267,  85112,  85268,  85550,  86444,  87347,  87996,  88172,  88310,
+            88551,  88804,  89666,  90008,  90350,  90822,  91475,  92127,  93034,  93340,  93994,
+            94628,  95156,  95825,  96457,  96691,  96703,  96755,  96874,  97182,  97301,  97822,
+            98795,  99758,  100434, 101040, 101248, 101826, 102081, 102816, 102884, 103731, 104070,
+            104999, 105539, 106220, 106972, 107165, 107849, 108507, 109005, 109342, 109633, 109658,
+            110016, 110290, 110900, 111621, 111947, 112675, 112703, 113499, 114099, 114451, 115209,
+            115837, 116794, 117111, 117668, 118231, 118634, 119258, 119668, 120409, 121313, 122262,
+            123035, 123690, 124183, 124991, 125303, 126293, 126790, 127745, 128111, 128965, 129545,
+            129873, 130447, 130704, 130759, 131712, 131764, 131771, 132075, 132236, 132870, 133482,
+            134311, 134501, 134676, 134907, 135073, 136009, 136333, 136402, 137286, 137734, 137810,
+            138539, 138795, 139534, 139604, 140356, 140401, 141189, 142146, 142771, 142886, 143416,
+            143649, 144170, 145004, 145289, 145816, 146305, 146750, 146910, 147010, 147636, 147986,
+            148612, 149468, 150335, 150896, 151427, 152362, 153159, 154138, 154500, 155025, 155259,
+            155360, 155954, 156291, 156436, 157169, 157462, 157583, 158430, 158604, 158958, 159326,
+            159333, 159971, 160865, 161712, 162146, 162552, 162850, 162909, 163016, 163940, 164207,
+            165180, 165664, 166461, 167368, 167648, 168423, 168692, 168848, 169208, 169929, 170679,
+            171375, 172240, 172722, 173710, 174696, 175377, 175890, 176581, 176629, 177148, 177476,
+            177769, 178486, 178599, 179297, 179312, 179836, 180640, 180930, 181072, 181848, 182621,
+            183559, 183594, 183999, 184064, 184719, 185279, 186055, 186430, 187091, 187915, 188154,
+            188649, 188812, 189388, 189563, 190239, 190505, 190727, 190921, 191866, 192732, 192995,
+            193405, 193969, 194246, 195179, 195546, 196464, 196890, 197385, 198075, 198790, 199319,
+            199765, 199896, 200079, 200085, 200992, 201549, 202215, 202945, 203092, 203252, 204144,
+            204219, 204905, 205472, 205812, 206071, 206184, 206821, 206946, 207673, 207719, 208407,
+            208762, 209092, 209498, 209770, 209877, 210129, 210442, 211263, 212043, 212802, 213754,
+            214068, 214832, 215690, 215912, 216693, 217632, 218001, 218942, 219124, 219567, 220545,
+            220646, 220780, 221017, 221582, 222352, 223065, 223356, 223523, 224275, 224920, 225768,
+            225925, 226841, 227795, 228556, 228784, 229559, 230099, 231085, 231163, 231369, 231470,
+            231757, 232184, 233066, 233291, 234086, 234260, 235018, 235607, 235758, 236616, 237339,
+            238078, 238500, 239344, 239795, 239859, 240222, 240424, 241132, 241694, 242405, 243380,
+            243896, 244367, 244922, 245564, 245926, 246818, 247537, 248104, 249097, 249102, 249448,
+            249674, 250255, 250395, 250794, 251132, 251213, 252114, 252662, 252817, 253457, 253778,
+            254128, 254570, 254955, 255667, 256311, 256755};
+    // Expected freq values stored in the fixture files; only the first
+    // TEST_SIZE entries are compared.
+    const std::vector<uint32_t> freqBuffer = {
+            73, 5,  18, 40, 27, 24, 33, 88, 15, 51, 7,  59, 7,  4,  84, 39, 43, 34, 28, 75, 35, 75,
+            29, 26, 48, 79, 67, 32, 42, 10, 75, 67, 67, 45, 7,  94, 21, 40, 35, 37, 43, 94, 48, 2,
+            98, 85, 41, 93, 19, 22, 69, 54, 49, 50, 32, 97, 81, 0,  29, 24, 62, 57, 91, 30, 54, 51,
+            76, 76, 91, 63, 65, 87, 57, 13, 89, 7,  98, 31, 1,  70, 5,  22, 76, 54, 24, 9,  52, 6,
+            9,  33, 82, 71, 4,  2,  25, 53, 97, 76, 30, 25, 20, 93, 90, 7,  3,  55, 96, 10, 6,  79,
+            63, 76, 84, 85, 52, 39, 10, 13, 91, 68, 22, 76, 50, 46, 19, 75, 99, 16, 4,  81, 41, 24,
+            27, 83, 31, 30, 38, 27, 92, 44, 59, 56, 20, 43, 93, 25, 82, 55, 38, 25, 23, 13, 2,  25,
+            59, 21, 53, 10, 89, 57, 44, 82, 81, 71, 17, 64, 53, 55, 43, 97, 0,  2,  53, 72, 46, 99,
+            49, 28, 54, 40, 6,  30, 53, 8,  5,  5,  64, 81, 60, 74, 22, 17, 18, 4,  50, 41, 21, 14,
+            94, 28, 58, 92, 80, 60, 97, 5,  58, 96, 54, 87, 3,  46, 45, 33, 99, 53, 40, 15, 86, 1,
+            90, 8,  18, 60, 64, 21, 54, 37, 87, 48, 65, 45, 92, 98, 58, 42, 3,  16, 90, 9,  55, 93,
+            56, 0,  26, 7,  5,  67, 23, 91, 20, 65, 99, 38, 77, 15, 11, 31, 52, 99, 32, 18, 96, 76,
+            68, 54, 18, 71, 23, 9,  32, 78, 2,  88, 31, 81, 48, 88, 0,  23, 80, 20, 40, 31, 10, 17,
+            47, 22, 49, 99, 21, 81, 69, 17, 57, 37, 24, 28, 60, 47, 37, 93, 77, 91, 33, 8,  72, 33,
+            97, 72, 56, 29, 92, 96, 60, 55, 14, 59, 77, 15, 11, 98, 48, 32, 67, 57, 70, 91, 85, 82,
+            90, 74, 75, 68, 66, 61, 28, 90, 94, 77, 15, 3,  6,  59, 99, 19, 14, 65, 30, 91, 32, 41,
+            41, 80, 74, 9,  38, 96, 52, 75, 78, 43, 2,  6,  63, 68, 19, 91, 10, 13, 69, 25, 16, 27,
+            85, 68, 98, 99, 33, 81, 91, 66, 74, 84, 98, 48, 93, 88, 96, 98, 16, 27, 93, 18, 85, 56,
+            38, 4,  47, 48, 69, 68, 74, 38, 48, 11, 6,  98, 10, 91, 31, 53, 9,  6,  38, 60, 54, 83,
+            48, 3,  33, 64, 30, 26, 34, 67, 82, 72, 71, 82, 21, 92, 2,  47, 30, 50, 58, 88, 1,  20,
+            32, 32, 74, 93, 38, 64, 53, 45, 99, 54, 48, 33, 70, 30, 59, 5,  97, 94, 29, 20, 76, 50,
+            12, 78, 49, 95, 81, 7,  83, 34, 80, 67, 18, 6,  13, 57, 70, 18, 54, 69, 72, 54, 2,  95,
+            36, 14, 52, 33, 8,  81, 5,  36, 84, 17, 14, 33, 12, 47, 93, 48, 81, 25, 67, 52, 31, 80,
+            9,  1,  99, 15, 22, 23, 69, 25};
+    std::vector<uint32_t> decodedDocs(TEST_SIZE);
+    std::vector<uint32_t> decodedFreqs(TEST_SIZE);
+
+    printf("Testing pfor_decode compatibility...\n");
+
+    if (fixtureName == nullptr) {
+        // Neither SIMD macro is defined, so nothing would be decoded; say so
+        // instead of printing a success message for a check that never ran.
+        printf("compatibility test skipped: no fixture selected for this build\n");
+        return;
+    }
+
+    IndexInput* input = nullptr;
+    CLuceneError error;
+    auto* dir = lucene::store::FSDirectory::getDirectory(clucene_data_location);
+    const bool opened = dir->openInput(fixtureName, input, error);
+    if (!opened || input == nullptr) {
+        // Release the directory before reporting, and return explicitly so a
+        // null input is never handed to pfor_decode() or dereferenced.
+        dir->close();
+        _CLDELETE(dir);
+        CuAssertTrueWithMessage(tc, "Failed to open PFOR compatibility fixture", false);
+        return;
+    }
+
+    // Use pfor_decode with compatibleRead=true
+    const uint32_t decoded_size =
+            lucene::util::pfor_decode(input, decodedDocs, decodedFreqs, true, true);
+
+    // Release the file handles before asserting, in case a failing assertion
+    // leaves this function early.
+    input->close();
+    _CLDELETE(input);
+    dir->close();
+    _CLDELETE(dir);
+
+    // Verify decoded size
+    CuAssertIntEquals(tc, _T("Decoded size mismatch"), TEST_SIZE, decoded_size);
+    if (decoded_size != TEST_SIZE) {
+        // Do not index the decoded buffers when the element count is wrong.
+        return;
+    }
+
+    // Verify decoded data matches original
+    for (size_t i = 0; i < TEST_SIZE; i++) {
+        CuAssertTrueWithMessage(tc, "Decoded doc doesn't match original",
+                                docDeltaBuffer[i] == decodedDocs[i]);
+        CuAssertTrueWithMessage(tc, "Decoded freq doesn't match original",
+                                freqBuffer[i] == decodedFreqs[i]);
+    }
+
+    printf("compatibility test completed successfully!\n");
+}
+/**
+ * Creates the "PFOR Test Suite" and registers the five PFOR test functions
+ * with it, in the order listed below.
+ *
+ * @return the newly created suite with all tests added.
+ */
+CuSuite* testPFORSuite() {
+    CuSuite* pforSuite = CuSuiteNew(_T("PFOR Test Suite"));
+
+    // Encode/decode tests first, then the compatibility tests.
+    SUITE_ADD_TEST(pforSuite, test_pfor_has_prox);
+    SUITE_ADD_TEST(pforSuite, test_pfor_no_prox);
+    SUITE_ADD_TEST(pforSuite, test_p4dec_p4enc_compat);
+    SUITE_ADD_TEST(pforSuite, test_cross_platform_compat);
+    SUITE_ADD_TEST(pforSuite, test_p4ndx_compatibility);
+
+    return pforSuite;
+}
\ No newline at end of file
diff --git a/src/test/test.h b/src/test/test.h
index 19f37e81243..7f9bd908ee8 100644
--- a/src/test/test.h
+++ b/src/test/test.h
@@ -86,7 +86,7 @@ CuSuite *testMultiPhraseQuery(void);
 CuSuite *testIndexCompaction(void);
 CuSuite *testStringReader(void);
 CuSuite *testUTF8CharsSuite(void);
-
+CuSuite *testPFORSuite(void);
 #ifdef TEST_CONTRIB_LIBS
 //CuSuite *testGermanAnalyzer(void);
 CuSuite *testchinese(void);
diff --git a/src/test/tests.cpp b/src/test/tests.cpp
index 7cd9f657385..e0ee6055f32 100644
--- a/src/test/tests.cpp
+++ b/src/test/tests.cpp
@@ -20,7 +20,8 @@ unittest tests[] = {
         {"IndexCompaction", testIndexCompaction},
         {"testStringReader", testStringReader},
         {"TestUTF8Chars", testUTF8CharsSuite},
+        {"testPFOR", testPFORSuite},
 #ifdef TEST_CONTRIB_LIBS
-        {"chinese", testchinese},
+        //{"chinese", testchinese},
 #endif
         {"LastTest", NULL}};


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

(doris-thirdparty) branch clucene-2.0 updated: [improve](pfor) add non-simd implementation for PFOR 256 (#297)

Reply via email to