xiaokang commented on code in PR #245:
URL: https://github.com/apache/doris-thirdparty/pull/245#discussion_r1827601054

@@ -0,0 +1,108 @@
+#pragma once
+#include <cstdint>
+#include <string_view>
+#include <vector>
+#include <zstd.h>
+#include <iostream>
+#include "CLucene.h"
+#include "CLucene/store/IndexOutput.h"
+namespace v2 {
+class GrowableByteArrayDataOutput : public CL_NS(store)::IndexOutput {
+    GrowableByteArrayDataOutput() : bytes_(INITIAL_SIZE) {}
+    ~GrowableByteArrayDataOutput() override = default;
+    void writeByte(uint8_t b) override {
+        ensureCapacity(1);
+        bytes_[nextWrite_++] = b;
+    }
+    void writeBytes(const uint8_t* b, const int32_t len) override { 
writeBytes(b, len, 0); }
+    void writeBytes(const uint8_t* b, const int32_t len, const int32_t offset) 
override {

Review Comment:
   Follow the normal buffer argument order: b, offset, len.

@@ -287,7 +305,12 @@ void SegmentTermEnum::seek(const int64_t pointer, const 
int32_t p, Term* t, Term
     //Post - term and terminfo have been repositioned within the enumeration
     //Reset the IndexInput input to pointer
-    input->seek(pointer);
+    if (isDictCompress_ && !isIndex) {
+        input->seek(pointer);
+        byteArrayDataInput_.readCompressedFrom(input);
+    } else {
+        input->seek(pointer);
+    }

Review Comment:
   if (isDictCompress_ && !isIndex) {

@@ -0,0 +1,126 @@
+#pragma once
+#include <zstd.h>
+#include <algorithm>
+#include <cstdint>
+#include <vector>
+#include <iostream>
+#include "CLucene.h"
+#include "CLucene/store/IndexInput.h"
+namespace v2 {
+class ByteArrayDataInput : public CL_NS(store)::IndexInput {
+    ByteArrayDataInput() : owns_(true), bytes_(new std::vector<uint8_t>()) {}
+    ByteArrayDataInput(std::vector<uint8_t>* bytes) { reset(bytes); }
+    ~ByteArrayDataInput() override {
+        if (owns_) {
+            if (bytes_ != nullptr) {
+                delete bytes_;
+                bytes_ = nullptr;
+            }
+        }
+    }
+    void reset(std::vector<uint8_t>* bytes) { reset(bytes, 0, bytes->size()); }
+    void reset(std::vector<uint8_t>* bytes, int32_t offset, int32_t len) {
+        bytes_ = bytes;
+        pos_ = offset;
+        limit_ = offset + len;
+    }
+    uint8_t readByte() override { return (*bytes_)[pos_++]; }
+    void readBytes(uint8_t* b, const int32_t len) override { readBytes(b, 0, 
len); }
+    void readBytes(uint8_t* b, const int32_t len, int32_t offset) override {

Review Comment:
   limit_ is not checked before reading.

@@ -0,0 +1,325 @@
+#include <CLucene.h> // IWYU pragma: keep
+#include <CLucene/index/IndexReader.h>
+#include <CLucene/search/query/TermPositionIterator.h>
+#include <CLucene/util/stringUtil.h>
+#include <ctime>
+#include <exception>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include "CLucene/analysis/Analyzers.h"
+#include "CLucene/index/FieldConfig.h"
+#include "CLucene/index/IndexVersion.h"
+#include "CLucene/index/Term.h"
+#include "CLucene/store/FSDirectory.h"
+#include "CLucene/store/_RAMDirectory.h"
+#include "CLucene/store/v2/ByteArrayDataInput.h"
+#include "CLucene/store/v2/GrowableByteArrayDataOutput.h"
+#include "CuTest.h"
+#include "test.h"
+static constexpr int32_t doc_count = 10000;
+#define FINALLY(eptr, finallyBlock)       \
+    {                                     \
+        finallyBlock;                     \
+        if (eptr) {                       \
+            std::rethrow_exception(eptr); \
+        }                                 \
+    }
+static int32_t getDaySeed() {
+    std::time_t now = std::time(nullptr);
+    std::tm* localTime = std::localtime(&now);
+    localTime->tm_sec = 0;
+    localTime->tm_min = 0;
+    localTime->tm_hour = 0;
+    return static_cast<int32_t>(std::mktime(localTime) / (60 * 60 * 24));
+static std::string generateRandomIP() {
+    std::string ip_v4;
+    ip_v4.append(std::to_string(rand() % 256));
+    ip_v4.append(".");
+    ip_v4.append(std::to_string(rand() % 256));
+    ip_v4.append(".");
+    ip_v4.append(std::to_string(rand() % 256));
+    ip_v4.append(".");
+    ip_v4.append(std::to_string(rand() % 256));
+    return ip_v4;
+static void write_index(const std::string& name, RAMDirectory* dir, 
IndexVersion index_version,
+                        bool isDictCompress, const std::vector<std::string>& 
datas) {
+    auto* analyzer = _CLNEW lucene::analysis::SimpleAnalyzer<char>;
+    analyzer->set_stopwords(nullptr);
+    auto* indexwriter = _CLNEW lucene::index::IndexWriter(dir, analyzer, true);
+    indexwriter->setRAMBufferSizeMB(512);
+    indexwriter->setMaxBufferedDocs(-1);
+    indexwriter->setMaxFieldLength(0x7FFFFFFFL);
+    indexwriter->setMergeFactor(1000000000);
+    indexwriter->setUseCompoundFile(false);
+    auto* char_string_reader = _CLNEW lucene::util::SStringReader<char>;
+    auto* doc = _CLNEW lucene::document::Document();
+    int32_t field_config = lucene::document::Field::STORE_NO;
+    field_config |= lucene::document::Field::INDEX_NONORMS;
+    field_config |= lucene::document::Field::INDEX_TOKENIZED;
+    auto field_name = std::wstring(name.begin(), name.end());
+    auto* field = _CLNEW lucene::document::Field(field_name.c_str(), 
+    field->setOmitTermFreqAndPositions(false);
+    field->setIndexVersion(index_version);
+    if (isDictCompress) {
+        field->updateFlag(FlagBits::DICT_COMPRESS);
+    }
+    doc->add(*field);
+    for (const auto& data : datas) {
+        char_string_reader->init(data.data(), data.size(), false);
+        auto* stream = analyzer->reusableTokenStream(field->name(), 
+        field->setValue(stream);
+        indexwriter->addDocument(doc);
+    }
+    indexwriter->close();
+    _CLLDELETE(indexwriter);
+    _CLLDELETE(doc);
+    _CLLDELETE(analyzer);
+    _CLLDELETE(char_string_reader);
+static void read_index(RAMDirectory* dir, int32_t doc_count) {
+    auto* reader = IndexReader::open(dir);
+    std::exception_ptr eptr;
+    try {
+        if (doc_count != reader->numDocs()) {
+            std::string msg = "doc_count: " + std::to_string(doc_count) +
+                              ", numDocs: " + 
+            _CLTHROWA(CL_ERR_IllegalArgument, msg.c_str());
+        }
+        Term* term = nullptr;
+        TermEnum* enumerator = nullptr;
+        try {
+            enumerator = reader->terms();
+            while (enumerator->next()) {
+                term = enumerator->term();
+                auto* term_pos = reader->termPositions(term);
+                std::exception_ptr eptr;
+                try {
+                    TermPositionIterator iter(term_pos);
+                    int32_t doc = 0;
+                    while ((doc = iter.nextDoc()) != INT32_MAX) {
+                        for (int32_t i = 0; i < iter.freq(); i++) {
+                            int32_t pos = iter.nextPosition();
+                            if (pos < 0 || pos > 3) {
+                                std::string msg = "pos: " + 
+                                _CLTHROWA(CL_ERR_IllegalArgument, msg.c_str());
+                            }
+                        }
+                    }
+                } catch (...) {
+                    eptr = std::current_exception();
+                }
+                FINALLY(eptr, { _CLDELETE(term_pos); })
+                _CLDECDELETE(term);
+            }
+        }
+        _CLFINALLY({
+            _CLDECDELETE(term);
+            enumerator->close();
+            _CLDELETE(enumerator);
+        })
+    } catch (...) {
+        eptr = std::current_exception();
+    }
+    FINALLY(eptr, {
+        reader->close();
+        _CLLDELETE(reader);
+    })
+static void index_compaction(RAMDirectory* tmp_dir, 
std::vector<lucene::store::Directory*> srcDirs,
+                             std::vector<lucene::store::Directory*> destDirs, 
int32_t count) {
+    auto* analyzer = _CLNEW lucene::analysis::SimpleAnalyzer<char>;
+    auto* indexwriter = _CLNEW lucene::index::IndexWriter(tmp_dir, analyzer, 
+    std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec(
+            srcDirs.size(), std::vector<std::pair<uint32_t, uint32_t>>(count));
+    int32_t idx = 0;
+    int32_t id = 0;
+    for (int32_t i = 0; i < count; i++) {
+        for (int32_t j = 0; j < srcDirs.size(); j++) {
+            if (id == count * destDirs.size()) {
+                idx++;
+                id = 0;
+            }
+            trans_vec[j][i] = std::make_pair(idx, id++);
+        }
+    }
+    std::vector<uint32_t> dest_index_docs(destDirs.size());
+    for (int32_t i = 0; i < destDirs.size(); i++) {
+        dest_index_docs[i] = count * destDirs.size();
+    }
+    std::exception_ptr eptr;
+    try {
+        indexwriter->indexCompaction(srcDirs, destDirs, trans_vec, 
+    } catch (...) {
+        eptr = std::current_exception();
+    }
+    FINALLY(eptr, {
+        indexwriter->close();
+        _CLDELETE(indexwriter);
+        _CLDELETE(analyzer);
+    })
+void TestIndexByteArray(CuTest* tc) {
+    RAMDirectory dir;
+    auto ram_out = dir.createOutput("TestIndexByteArray");
+    v2::GrowableByteArrayDataOutput out;
+    for (int32_t i = 0; i < doc_count; i++) {
+        out.writeVInt(i);
+    }
+    out.writeCompressedTo(ram_out);
+    ram_out->close();
+    IndexInput* ram_in = nullptr;
+    CLuceneError error;
+    bool ret = dir.openInput("TestIndexByteArray", ram_in, error);
+    if (!ret) {
+        std::cout << error.what() << std::endl;
+    }
+    assertTrue(ret);
+    v2::ByteArrayDataInput in;
+    in.readCompressedFrom(ram_in);
+    for (int32_t i = 0; i < doc_count; i++) {
+        assertEquals(in.readVInt(), i);
+    }
+    _CLDELETE(ram_out);
+    _CLDELETE(ram_in);
+    std::cout << "\nTestIndexByteArray sucess" << std::endl;
+void TestIndexCompressV3(CuTest* tc) {
+    std::srand(getDaySeed());
+    std::string name = "v2_field_name";
+    std::vector<std::string> datas;
+    for (int32_t i = 0; i < doc_count; i++) {
+        std::string ip_v4 = generateRandomIP();
+        datas.emplace_back(ip_v4);
+    }
+    RAMDirectory dir;
+    write_index(name, &dir, IndexVersion::kV3, false, datas);
+    try {
+        read_index(&dir, doc_count);

Review Comment:
   read_index does not check any of the data it reads.

Reply via email to