Re: [PR] [opt](inverted index) Inverted Index Dictionary Compression [doris-thirdparty]

via GitHub Mon, 04 Nov 2024 04:10:38 -0800


xiaokang commented on code in PR #245:
URL: https://github.com/apache/doris-thirdparty/pull/245#discussion_r1827601054



##########
src/core/CLucene/store/v2/GrowableByteArrayDataOutput.h:
##########
@@ -0,0 +1,108 @@
+#pragma once
+
+#include <cstdint>
+#include <string_view>
+#include <vector>
+#include <zstd.h>
+#include <iostream>
+
+#include "CLucene.h"
+#include "CLucene/store/IndexOutput.h"
+
+namespace v2 {
+
+class GrowableByteArrayDataOutput : public CL_NS(store)::IndexOutput {
+public:
+    GrowableByteArrayDataOutput() : bytes_(INITIAL_SIZE) {}
+    ~GrowableByteArrayDataOutput() override = default;
+
+    void writeByte(uint8_t b) override {
+        ensureCapacity(1);
+        bytes_[nextWrite_++] = b;
+    }
+
+    void writeBytes(const uint8_t* b, const int32_t len) override { 
writeBytes(b, len, 0); }
+
+    void writeBytes(const uint8_t* b, const int32_t len, const int32_t offset) 
override {

Review Comment:
   Follow the normal buffer argument order: b, offset, len.



##########
src/core/CLucene/index/SegmentTermEnum.cpp:
##########
@@ -287,7 +305,12 @@ void SegmentTermEnum::seek(const int64_t pointer, const 
int32_t p, Term* t, Term
     //Post - term and terminfo have been repositioned within the enumeration
 
     //Reset the IndexInput input to pointer
-    input->seek(pointer);
+    if (isDictCompress_ && !isIndex) {
+        input->seek(pointer);
+        byteArrayDataInput_.readCompressedFrom(input);
+    } else {
+        input->seek(pointer);
+    }

Review Comment:
   Suggested simplification:
   
   ```
   input->seek(pointer);
   if (isDictCompress_ && !isIndex) {
       byteArrayDataInput_.readCompressedFrom(input);
   }
   ```



##########
src/core/CLucene/store/v2/ByteArrayDataInput.h:
##########
@@ -0,0 +1,126 @@
+#pragma once
+
+#include <zstd.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <vector>
+#include <iostream>
+
+#include "CLucene.h"
+#include "CLucene/store/IndexInput.h"
+
+namespace v2 {
+
+class ByteArrayDataInput : public CL_NS(store)::IndexInput {
+public:
+    ByteArrayDataInput() : owns_(true), bytes_(new std::vector<uint8_t>()) {}
+
+    ByteArrayDataInput(std::vector<uint8_t>* bytes) { reset(bytes); }
+
+    ~ByteArrayDataInput() override {
+        if (owns_) {
+            if (bytes_ != nullptr) {
+                delete bytes_;
+                bytes_ = nullptr;
+            }
+        }
+    }
+
+    void reset(std::vector<uint8_t>* bytes) { reset(bytes, 0, bytes->size()); }
+
+    void reset(std::vector<uint8_t>* bytes, int32_t offset, int32_t len) {
+        bytes_ = bytes;
+        pos_ = offset;
+        limit_ = offset + len;
+    }
+
+    uint8_t readByte() override { return (*bytes_)[pos_++]; }
+
+    void readBytes(uint8_t* b, const int32_t len) override { readBytes(b, 0, 
len); }
+
+    void readBytes(uint8_t* b, const int32_t len, int32_t offset) override {

Review Comment:
   limit_ is not checked



##########
src/test/index/TestIndexCompressV3.cpp:
##########
@@ -0,0 +1,325 @@
+#include <CLucene.h> // IWYU pragma: keep
+#include <CLucene/index/IndexReader.h>
+#include <CLucene/search/query/TermPositionIterator.h>
+#include <CLucene/util/stringUtil.h>
+
+#include <ctime>
+#include <exception>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "CLucene/analysis/Analyzers.h"
+#include "CLucene/index/FieldConfig.h"
+#include "CLucene/index/IndexVersion.h"
+#include "CLucene/index/Term.h"
+#include "CLucene/store/FSDirectory.h"
+#include "CLucene/store/_RAMDirectory.h"
+#include "CLucene/store/v2/ByteArrayDataInput.h"
+#include "CLucene/store/v2/GrowableByteArrayDataOutput.h"
+#include "CuTest.h"
+#include "test.h"
+
+CL_NS_USE(search)
+CL_NS_USE(store)
+CL_NS_USE(index)
+CL_NS_USE(util)
+
+static constexpr int32_t doc_count = 10000;
+
+#define FINALLY(eptr, finallyBlock)       \
+    {                                     \
+        finallyBlock;                     \
+        if (eptr) {                       \
+            std::rethrow_exception(eptr); \
+        }                                 \
+    }
+
+static int32_t getDaySeed() {
+    std::time_t now = std::time(nullptr);
+    std::tm* localTime = std::localtime(&now);
+    localTime->tm_sec = 0;
+    localTime->tm_min = 0;
+    localTime->tm_hour = 0;
+    return static_cast<int32_t>(std::mktime(localTime) / (60 * 60 * 24));
+}
+
+static std::string generateRandomIP() {
+    std::string ip_v4;
+    ip_v4.append(std::to_string(rand() % 256));
+    ip_v4.append(".");
+    ip_v4.append(std::to_string(rand() % 256));
+    ip_v4.append(".");
+    ip_v4.append(std::to_string(rand() % 256));
+    ip_v4.append(".");
+    ip_v4.append(std::to_string(rand() % 256));
+    return ip_v4;
+}
+
+static void write_index(const std::string& name, RAMDirectory* dir, 
IndexVersion index_version,
+                        bool isDictCompress, const std::vector<std::string>& 
datas) {
+    auto* analyzer = _CLNEW lucene::analysis::SimpleAnalyzer<char>;
+    analyzer->set_stopwords(nullptr);
+    auto* indexwriter = _CLNEW lucene::index::IndexWriter(dir, analyzer, true);
+    indexwriter->setRAMBufferSizeMB(512);
+    indexwriter->setMaxBufferedDocs(-1);
+    indexwriter->setMaxFieldLength(0x7FFFFFFFL);
+    indexwriter->setMergeFactor(1000000000);
+    indexwriter->setUseCompoundFile(false);
+
+    auto* char_string_reader = _CLNEW lucene::util::SStringReader<char>;
+
+    auto* doc = _CLNEW lucene::document::Document();
+    int32_t field_config = lucene::document::Field::STORE_NO;
+    field_config |= lucene::document::Field::INDEX_NONORMS;
+    field_config |= lucene::document::Field::INDEX_TOKENIZED;
+    auto field_name = std::wstring(name.begin(), name.end());
+    auto* field = _CLNEW lucene::document::Field(field_name.c_str(), 
field_config);
+    field->setOmitTermFreqAndPositions(false);
+    field->setIndexVersion(index_version);
+    if (isDictCompress) {
+        field->updateFlag(FlagBits::DICT_COMPRESS);
+    }
+    doc->add(*field);
+
+    for (const auto& data : datas) {
+        char_string_reader->init(data.data(), data.size(), false);
+        auto* stream = analyzer->reusableTokenStream(field->name(), 
char_string_reader);
+        field->setValue(stream);
+        indexwriter->addDocument(doc);
+    }
+
+    indexwriter->close();
+
+    _CLLDELETE(indexwriter);
+    _CLLDELETE(doc);
+    _CLLDELETE(analyzer);
+    _CLLDELETE(char_string_reader);
+}
+
+static void read_index(RAMDirectory* dir, int32_t doc_count) {
+    auto* reader = IndexReader::open(dir);
+
+    std::exception_ptr eptr;
+    try {
+        if (doc_count != reader->numDocs()) {
+            std::string msg = "doc_count: " + std::to_string(doc_count) +
+                              ", numDocs: " + 
std::to_string(reader->numDocs());
+            _CLTHROWA(CL_ERR_IllegalArgument, msg.c_str());
+        }
+
+        Term* term = nullptr;
+        TermEnum* enumerator = nullptr;
+        try {
+            enumerator = reader->terms();
+            while (enumerator->next()) {
+                term = enumerator->term();
+
+                auto* term_pos = reader->termPositions(term);
+
+                std::exception_ptr eptr;
+                try {
+                    TermPositionIterator iter(term_pos);
+                    int32_t doc = 0;
+                    while ((doc = iter.nextDoc()) != INT32_MAX) {
+                        for (int32_t i = 0; i < iter.freq(); i++) {
+                            int32_t pos = iter.nextPosition();
+                            if (pos < 0 || pos > 3) {
+                                std::string msg = "pos: " + 
std::to_string(pos);
+                                _CLTHROWA(CL_ERR_IllegalArgument, msg.c_str());
+                            }
+                        }
+                    }
+                } catch (...) {
+                    eptr = std::current_exception();
+                }
+                FINALLY(eptr, { _CLDELETE(term_pos); })
+
+                _CLDECDELETE(term);
+            }
+        }
+        _CLFINALLY({
+            _CLDECDELETE(term);
+            enumerator->close();
+            _CLDELETE(enumerator);
+        })
+
+    } catch (...) {
+        eptr = std::current_exception();
+    }
+    FINALLY(eptr, {
+        reader->close();
+        _CLLDELETE(reader);
+    })
+}
+
+static void index_compaction(RAMDirectory* tmp_dir, 
std::vector<lucene::store::Directory*> srcDirs,
+                             std::vector<lucene::store::Directory*> destDirs, 
int32_t count) {
+    auto* analyzer = _CLNEW lucene::analysis::SimpleAnalyzer<char>;
+    auto* indexwriter = _CLNEW lucene::index::IndexWriter(tmp_dir, analyzer, 
true);
+
+    std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec(
+            srcDirs.size(), std::vector<std::pair<uint32_t, uint32_t>>(count));
+    int32_t idx = 0;
+    int32_t id = 0;
+    for (int32_t i = 0; i < count; i++) {
+        for (int32_t j = 0; j < srcDirs.size(); j++) {
+            if (id == count * destDirs.size()) {
+                idx++;
+                id = 0;
+            }
+            trans_vec[j][i] = std::make_pair(idx, id++);
+        }
+    }
+
+    std::vector<uint32_t> dest_index_docs(destDirs.size());
+    for (int32_t i = 0; i < destDirs.size(); i++) {
+        dest_index_docs[i] = count * destDirs.size();
+    }
+
+    std::exception_ptr eptr;
+    try {
+        indexwriter->indexCompaction(srcDirs, destDirs, trans_vec, 
dest_index_docs);
+    } catch (...) {
+        eptr = std::current_exception();
+    }
+    FINALLY(eptr, {
+        indexwriter->close();
+        _CLDELETE(indexwriter);
+        _CLDELETE(analyzer);
+    })
+}
+
+void TestIndexByteArray(CuTest* tc) {
+    RAMDirectory dir;
+    auto ram_out = dir.createOutput("TestIndexByteArray");
+
+    v2::GrowableByteArrayDataOutput out;
+    for (int32_t i = 0; i < doc_count; i++) {
+        out.writeVInt(i);
+    }
+    out.writeCompressedTo(ram_out);
+    ram_out->close();
+
+    IndexInput* ram_in = nullptr;
+    CLuceneError error;
+    bool ret = dir.openInput("TestIndexByteArray", ram_in, error);
+    if (!ret) {
+        std::cout << error.what() << std::endl;
+    }
+    assertTrue(ret);
+
+    v2::ByteArrayDataInput in;
+    in.readCompressedFrom(ram_in);
+    for (int32_t i = 0; i < doc_count; i++) {
+        assertEquals(in.readVInt(), i);
+    }
+
+    _CLDELETE(ram_out);
+    _CLDELETE(ram_in);
+
+    std::cout << "\nTestIndexByteArray sucess" << std::endl;
+}
+
+void TestIndexCompressV3(CuTest* tc) {
+    std::srand(getDaySeed());
+
+    std::string name = "v2_field_name";
+    std::vector<std::string> datas;
+    for (int32_t i = 0; i < doc_count; i++) {
+        std::string ip_v4 = generateRandomIP();
+        datas.emplace_back(ip_v4);
+    }
+
+    RAMDirectory dir;
+    write_index(name, &dir, IndexVersion::kV3, false, datas);
+
+    try {
+        read_index(&dir, doc_count);

Review Comment:
   read does not check any data.



##########
src/core/CLucene/store/v2/ByteArrayDataInput.h:
##########
@@ -0,0 +1,126 @@
+#pragma once
+
+#include <zstd.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <vector>
+#include <iostream>
+
+#include "CLucene.h"
+#include "CLucene/store/IndexInput.h"
+
+namespace v2 {
+
+class ByteArrayDataInput : public CL_NS(store)::IndexInput {
+public:
+    ByteArrayDataInput() : owns_(true), bytes_(new std::vector<uint8_t>()) {}
+
+    ByteArrayDataInput(std::vector<uint8_t>* bytes) { reset(bytes); }
+
+    ~ByteArrayDataInput() override {
+        if (owns_) {
+            if (bytes_ != nullptr) {
+                delete bytes_;
+                bytes_ = nullptr;
+            }
+        }
+    }
+
+    void reset(std::vector<uint8_t>* bytes) { reset(bytes, 0, bytes->size()); }
+
+    void reset(std::vector<uint8_t>* bytes, int32_t offset, int32_t len) {
+        bytes_ = bytes;
+        pos_ = offset;
+        limit_ = offset + len;
+    }
+
+    uint8_t readByte() override { return (*bytes_)[pos_++]; }
+
+    void readBytes(uint8_t* b, const int32_t len) override { readBytes(b, 0, 
len); }
+
+    void readBytes(uint8_t* b, const int32_t len, int32_t offset) override {

Review Comment:
   Follow the normal buffer argument order: b, offset, len.



##########
src/core/CLucene/store/v2/ByteArrayDataInput.h:
##########
@@ -0,0 +1,126 @@
+#pragma once
+
+#include <zstd.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <vector>
+#include <iostream>
+
+#include "CLucene.h"
+#include "CLucene/store/IndexInput.h"
+
+namespace v2 {
+
+class ByteArrayDataInput : public CL_NS(store)::IndexInput {
+public:
+    ByteArrayDataInput() : owns_(true), bytes_(new std::vector<uint8_t>()) {}
+
+    ByteArrayDataInput(std::vector<uint8_t>* bytes) { reset(bytes); }
+
+    ~ByteArrayDataInput() override {
+        if (owns_) {
+            if (bytes_ != nullptr) {
+                delete bytes_;
+                bytes_ = nullptr;
+            }
+        }
+    }
+
+    void reset(std::vector<uint8_t>* bytes) { reset(bytes, 0, bytes->size()); }
+
+    void reset(std::vector<uint8_t>* bytes, int32_t offset, int32_t len) {
+        bytes_ = bytes;
+        pos_ = offset;
+        limit_ = offset + len;
+    }
+
+    uint8_t readByte() override { return (*bytes_)[pos_++]; }

Review Comment:
   `limit_` is not checked: `readByte()` indexes `bytes_` at `pos_` without comparing it against `limit_`.



##########
src/test/index/TestIndexCompressV3.cpp:
##########
@@ -0,0 +1,325 @@
+#include <CLucene.h> // IWYU pragma: keep
+#include <CLucene/index/IndexReader.h>
+#include <CLucene/search/query/TermPositionIterator.h>
+#include <CLucene/util/stringUtil.h>
+
+#include <ctime>
+#include <exception>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "CLucene/analysis/Analyzers.h"
+#include "CLucene/index/FieldConfig.h"
+#include "CLucene/index/IndexVersion.h"
+#include "CLucene/index/Term.h"
+#include "CLucene/store/FSDirectory.h"
+#include "CLucene/store/_RAMDirectory.h"
+#include "CLucene/store/v2/ByteArrayDataInput.h"
+#include "CLucene/store/v2/GrowableByteArrayDataOutput.h"
+#include "CuTest.h"
+#include "test.h"
+
+CL_NS_USE(search)
+CL_NS_USE(store)
+CL_NS_USE(index)
+CL_NS_USE(util)
+
+static constexpr int32_t doc_count = 10000;
+
+#define FINALLY(eptr, finallyBlock)       \
+    {                                     \
+        finallyBlock;                     \
+        if (eptr) {                       \
+            std::rethrow_exception(eptr); \
+        }                                 \
+    }
+
+static int32_t getDaySeed() {
+    std::time_t now = std::time(nullptr);
+    std::tm* localTime = std::localtime(&now);
+    localTime->tm_sec = 0;
+    localTime->tm_min = 0;
+    localTime->tm_hour = 0;
+    return static_cast<int32_t>(std::mktime(localTime) / (60 * 60 * 24));
+}
+
+static std::string generateRandomIP() {
+    std::string ip_v4;
+    ip_v4.append(std::to_string(rand() % 256));
+    ip_v4.append(".");
+    ip_v4.append(std::to_string(rand() % 256));
+    ip_v4.append(".");
+    ip_v4.append(std::to_string(rand() % 256));
+    ip_v4.append(".");
+    ip_v4.append(std::to_string(rand() % 256));
+    return ip_v4;
+}
+
+static void write_index(const std::string& name, RAMDirectory* dir, 
IndexVersion index_version,
+                        bool isDictCompress, const std::vector<std::string>& 
datas) {
+    auto* analyzer = _CLNEW lucene::analysis::SimpleAnalyzer<char>;
+    analyzer->set_stopwords(nullptr);
+    auto* indexwriter = _CLNEW lucene::index::IndexWriter(dir, analyzer, true);
+    indexwriter->setRAMBufferSizeMB(512);
+    indexwriter->setMaxBufferedDocs(-1);
+    indexwriter->setMaxFieldLength(0x7FFFFFFFL);
+    indexwriter->setMergeFactor(1000000000);
+    indexwriter->setUseCompoundFile(false);
+
+    auto* char_string_reader = _CLNEW lucene::util::SStringReader<char>;
+
+    auto* doc = _CLNEW lucene::document::Document();
+    int32_t field_config = lucene::document::Field::STORE_NO;
+    field_config |= lucene::document::Field::INDEX_NONORMS;
+    field_config |= lucene::document::Field::INDEX_TOKENIZED;
+    auto field_name = std::wstring(name.begin(), name.end());
+    auto* field = _CLNEW lucene::document::Field(field_name.c_str(), 
field_config);
+    field->setOmitTermFreqAndPositions(false);
+    field->setIndexVersion(index_version);
+    if (isDictCompress) {
+        field->updateFlag(FlagBits::DICT_COMPRESS);
+    }
+    doc->add(*field);
+
+    for (const auto& data : datas) {
+        char_string_reader->init(data.data(), data.size(), false);
+        auto* stream = analyzer->reusableTokenStream(field->name(), 
char_string_reader);
+        field->setValue(stream);
+        indexwriter->addDocument(doc);
+    }
+
+    indexwriter->close();
+
+    _CLLDELETE(indexwriter);
+    _CLLDELETE(doc);
+    _CLLDELETE(analyzer);
+    _CLLDELETE(char_string_reader);
+}
+
+static void read_index(RAMDirectory* dir, int32_t doc_count) {
+    auto* reader = IndexReader::open(dir);
+
+    std::exception_ptr eptr;
+    try {
+        if (doc_count != reader->numDocs()) {
+            std::string msg = "doc_count: " + std::to_string(doc_count) +
+                              ", numDocs: " + 
std::to_string(reader->numDocs());
+            _CLTHROWA(CL_ERR_IllegalArgument, msg.c_str());
+        }
+
+        Term* term = nullptr;
+        TermEnum* enumerator = nullptr;
+        try {
+            enumerator = reader->terms();
+            while (enumerator->next()) {
+                term = enumerator->term();
+
+                auto* term_pos = reader->termPositions(term);
+
+                std::exception_ptr eptr;
+                try {
+                    TermPositionIterator iter(term_pos);
+                    int32_t doc = 0;
+                    while ((doc = iter.nextDoc()) != INT32_MAX) {
+                        for (int32_t i = 0; i < iter.freq(); i++) {
+                            int32_t pos = iter.nextPosition();
+                            if (pos < 0 || pos > 3) {
+                                std::string msg = "pos: " + 
std::to_string(pos);
+                                _CLTHROWA(CL_ERR_IllegalArgument, msg.c_str());
+                            }
+                        }
+                    }
+                } catch (...) {
+                    eptr = std::current_exception();
+                }
+                FINALLY(eptr, { _CLDELETE(term_pos); })
+
+                _CLDECDELETE(term);
+            }
+        }
+        _CLFINALLY({
+            _CLDECDELETE(term);
+            enumerator->close();
+            _CLDELETE(enumerator);
+        })
+
+    } catch (...) {
+        eptr = std::current_exception();
+    }
+    FINALLY(eptr, {
+        reader->close();
+        _CLLDELETE(reader);
+    })
+}
+
+static void index_compaction(RAMDirectory* tmp_dir, 
std::vector<lucene::store::Directory*> srcDirs,
+                             std::vector<lucene::store::Directory*> destDirs, 
int32_t count) {
+    auto* analyzer = _CLNEW lucene::analysis::SimpleAnalyzer<char>;
+    auto* indexwriter = _CLNEW lucene::index::IndexWriter(tmp_dir, analyzer, 
true);
+
+    std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec(
+            srcDirs.size(), std::vector<std::pair<uint32_t, uint32_t>>(count));
+    int32_t idx = 0;
+    int32_t id = 0;
+    for (int32_t i = 0; i < count; i++) {
+        for (int32_t j = 0; j < srcDirs.size(); j++) {
+            if (id == count * destDirs.size()) {
+                idx++;
+                id = 0;
+            }
+            trans_vec[j][i] = std::make_pair(idx, id++);
+        }
+    }
+
+    std::vector<uint32_t> dest_index_docs(destDirs.size());
+    for (int32_t i = 0; i < destDirs.size(); i++) {
+        dest_index_docs[i] = count * destDirs.size();
+    }
+
+    std::exception_ptr eptr;
+    try {
+        indexwriter->indexCompaction(srcDirs, destDirs, trans_vec, 
dest_index_docs);
+    } catch (...) {
+        eptr = std::current_exception();
+    }
+    FINALLY(eptr, {
+        indexwriter->close();
+        _CLDELETE(indexwriter);
+        _CLDELETE(analyzer);
+    })
+}
+
+void TestIndexByteArray(CuTest* tc) {

Review Comment:
   Add more boundary tests, e.g. 0 bytes, 1 byte, capacity bytes, capacity - 1 
bytes, capacity + 1 bytes, ...



##########
src/core/CLucene/store/v2/GrowableByteArrayDataOutput.h:
##########
@@ -0,0 +1,108 @@
+#pragma once
+
+#include <cstdint>
+#include <string_view>
+#include <vector>
+#include <zstd.h>
+#include <iostream>
+
+#include "CLucene.h"
+#include "CLucene/store/IndexOutput.h"
+
+namespace v2 {
+
+class GrowableByteArrayDataOutput : public CL_NS(store)::IndexOutput {
+public:
+    GrowableByteArrayDataOutput() : bytes_(INITIAL_SIZE) {}
+    ~GrowableByteArrayDataOutput() override = default;
+
+    void writeByte(uint8_t b) override {
+        ensureCapacity(1);
+        bytes_[nextWrite_++] = b;
+    }
+
+    void writeBytes(const uint8_t* b, const int32_t len) override { 
writeBytes(b, len, 0); }
+
+    void writeBytes(const uint8_t* b, const int32_t len, const int32_t offset) 
override {
+        if (len == 0) {
+            return;
+        }
+        ensureCapacity(len);
+        std::copy(b + offset, b + offset + len, bytes_.data() + nextWrite_);
+        nextWrite_ += len;
+    }
+
+    void close() override {
+        _CLTHROWA(CL_ERR_UnsupportedOperation,
+                  "UnsupportedOperationException 
GrowableByteArrayDataOutput::close");
+    }
+
+    int64_t getFilePointer() const override {
+        _CLTHROWA(CL_ERR_UnsupportedOperation,
+                  "UnsupportedOperationException 
GrowableByteArrayDataOutput::getFilePointer");
+    }
+
+    void seek(const int64_t pos) override {
+        _CLTHROWA(CL_ERR_UnsupportedOperation,
+                  "UnsupportedOperationException 
GrowableByteArrayDataOutput::seek");
+    }
+
+    int64_t length() const override {
+        _CLTHROWA(CL_ERR_UnsupportedOperation,
+                  "UnsupportedOperationException 
GrowableByteArrayDataOutput::length");
+    }
+
+    void flush() override {
+        _CLTHROWA(CL_ERR_UnsupportedOperation,
+                  "UnsupportedOperationException 
GrowableByteArrayDataOutput::flush");
+    }
+
+    void writeTo(CL_NS(store)::IndexOutput* out) { 
out->writeBytes(bytes_.data(), nextWrite_); }
+
+    void writeCompressedTo(CL_NS(store)::IndexOutput* out) {
+        if (nextWrite_ == 0) {
+            return;
+        }
+
+        auto compress = [](const std::string_view& source) {
+            size_t compressBound = ZSTD_compressBound(source.size());
+            std::string compressed(compressBound, 0);
+
+            size_t compressedSize = ZSTD_compress(compressed.data(), 
compressBound, source.data(),
+                                                  source.size(), 3);
+
+            if (ZSTD_isError(compressedSize)) {
+                _CLTHROWA(CL_ERR_Runtime, "Compression failed");
+            }
+
+            compressed.resize(compressedSize);
+            return compressed;
+        };
+
+        auto compress_data = compress(std::string_view((const 
char*)bytes_.data(), nextWrite_));
+        out->writeVInt(compress_data.size());
+        out->writeBytes((const uint8_t*)compress_data.data(), 
compress_data.size());
+
+        nextWrite_ = 0;
+    }
+
+private:
+    void ensureCapacity(int capacityToWrite) {
+        assert(capacityToWrite > 0);
+        if (nextWrite_ + capacityToWrite > bytes_.capacity()) {
+            size_t newCapacity = std::max(bytes_.capacity() * 2, nextWrite_ + 
capacityToWrite);
+            bytes_.reserve(newCapacity);
+        }
+        if (nextWrite_ + capacityToWrite > bytes_.size()) {
+            bytes_.resize(nextWrite_ + capacityToWrite);
+        }
+    }
+
+private:
+    static constexpr int32_t INITIAL_SIZE = 1 << 8;
+
+    size_t nextWrite_ = 0;

Review Comment:
   We can use `vector.capacity()/size()/reserve()/resize()` to avoid storing a 
separate `nextWrite_` state.



##########
CMakeLists.txt:
##########
@@ -17,7 +17,7 @@ MATH(EXPR CLUCENE_INT_VERSION "(${CLUCENE_VERSION_MAJOR} * 
1000000) + (${CLUCENE
 SET(CLUCENE_VERSION 
"${CLUCENE_VERSION_MAJOR}.${CLUCENE_VERSION_MINOR}.${CLUCENE_VERSION_REVISION}.${CLUCENE_VERSION_PATCH}")
 
 #CMake 2.6+ is recommended to an improved Boost module
-CMAKE_MINIMUM_REQUIRED(VERSION 2.4.0 FATAL_ERROR)
+CMAKE_MINIMUM_REQUIRED(VERSION 3.12 FATAL_ERROR)

Review Comment:
   Why bump the CMake version? Doris BE does not set CMAKE_MINIMUM_REQUIRED.



##########
src/core/CLucene/store/v2/ByteArrayDataInput.h:
##########
@@ -0,0 +1,126 @@
+#pragma once
+
+#include <zstd.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <vector>
+#include <iostream>
+
+#include "CLucene.h"
+#include "CLucene/store/IndexInput.h"
+
+namespace v2 {
+
+class ByteArrayDataInput : public CL_NS(store)::IndexInput {

Review Comment:
   Unit tests are necessary for ByteArrayDataInput and ByteArrayDataOutput.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: dev-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscr...@doris.apache.org
For additional commands, e-mail: dev-h...@doris.apache.org

Re: [PR] [opt](inverted index) Inverted Index Dictionary Compression [doris-thirdparty]

Reply via email to