This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch vector-index-dev
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/vector-index-dev by this push:
     new 2a915e4b2ca enable faiss hnsw (#49745)
2a915e4b2ca is described below

commit 2a915e4b2ca613c8dda5ce55f0e5c79f935e1dbb
Author: zhiqiang <hezhiqi...@selectdb.com>
AuthorDate: Wed Apr 2 10:00:40 2025 +0800

    enable faiss hnsw (#49745)
    
    ```
    CREATE TABLE `vector_table` (
      `siteid` int(11) NULL DEFAULT "10" COMMENT "",
      `embedding` array<float>  NOT NULL  COMMENT "",
      `comment` text NULL,
      INDEX idx_test_ann (`embedding`) USING ANN PROPERTIES(
        "index_type"="hnsw",
        "metric_type"="l2",
        "dim"="8",
        "max_degree"="100") COMMENT 'test diskann index',
      INDEX idx_comment (`comment`) USING INVERTED PROPERTIES("support_phrase" = "true", "parser" = "english", "lower_case" = "true") COMMENT 'inverted index for comment' )
      ENGINE=OLAP duplicate KEY(`siteid`) COMMENT "OLAP" DISTRIBUTED BY HASH(`siteid`) BUCKETS 1 PROPERTIES ( "replication_num" = "1" );
    
    INSERT INTO `vector_table` (`siteid`, `embedding`,`comment`) VALUES
    (10, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,20],"emb1"),
    (20, [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,30],"emb2")
    --------------
    
    Query OK, 2 rows affected (0.07 sec)
    {'label':'label_858347013b14baf_b9db5d59b5e30322', 'status':'VISIBLE', 'txnId':'18029'}
    ```
    
    ```
    I20250401 19:18:17.977408 3765348 faiss_vector_index.cpp:86] Faiss index saved to faiss.idx, rows 2
    ```
---
 be/CMakeLists.txt                                  |  4 +
 be/src/olap/rowset/segment_v2/ann_index_writer.cpp | 44 ++++++----
 be/src/olap/rowset/segment_v2/ann_index_writer.h   |  4 +-
 be/src/vector/CMakeLists.txt                       | 16 +++-
 be/src/vector/faiss_vector_index.cpp               | 93 ++++++++++++++++++++++
 be/src/vector/faiss_vector_index.h                 | 84 +++++++++++++++++++
 be/src/vector/vector_index.h                       |  4 +-
 build.sh                                           |  1 -
 8 files changed, 224 insertions(+), 26 deletions(-)

diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index 72dbf5cf185..f8a976a6e12 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -395,6 +395,10 @@ if (USE_DWARF)
     add_compile_options(-gdwarf-5)
 endif()
 
+if (BUILD_FAISS)
+    add_definitions(-DBUILD_FAISS)
+endif()
+
 # For CMAKE_BUILD_TYPE=Debug
 if (OS_MACOSX AND ARCH_ARM)
     # Using -O0 may meet ARM64 branch out of range errors when linking with 
tcmalloc.
diff --git a/be/src/olap/rowset/segment_v2/ann_index_writer.cpp b/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
index e08aafdcb69..5d7b70430f5 100644
--- a/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
@@ -1,5 +1,9 @@
 #include "olap/rowset/segment_v2/ann_index_writer.h"
 
+#ifdef BUILD_FAISS
+#include "vector/faiss_vector_index.h"
+#endif
+
 namespace doris::segment_v2 {
 
 AnnIndexColumnWriter::AnnIndexColumnWriter(const std::string& field_name,
@@ -27,22 +31,30 @@ std::string get_or_default(const std::map<std::string, std::string>& properties,
 }
 
 Status AnnIndexColumnWriter::init_ann_index() {
-    // if(get_or_default(_index_meta->properties(), INDEX_TYPE, 
"")=="diskann"){
-    //     _vector_index_writer = std::make_shared<DiskannVectorIndex>(_dir);
-    //     std::shared_ptr<DiskannBuilderParameter>  builderParameterPtr = 
std::make_shared<DiskannBuilderParameter>();
-    //     
builderParameterPtr->with_dim(std::stoi(get_or_default(_index_meta->properties(),
 DIM,"")))
-    //                         
.with_L(std::stoi(get_or_default(_index_meta->properties(), 
DISKANN_SEARCH_LIST,"")))
-    //                         
.with_R(std::stoi(get_or_default(_index_meta->properties(), 
DISKANN_MAX_DEGREE,"")))
-    //                         .with_build_num_threads(8)
-    //                         .with_sample_rate(1)
-    //                         .with_indexing_ram_budget_mb(10*1024)
-    //                         .with_search_ram_budget_mb(30)
-    //                         
.with_mertic_type(VectorIndex::string_to_metric(get_or_default(_index_meta->properties(),
 METRIC_TYPE,"")));
-    //     
_vector_index_writer->set_build_params(std::static_pointer_cast<BuilderParameter>(builderParameterPtr));
-    //     return Status::OK();
-    // }else{
-    return Status::NotSupported("ANN index is not support for now.");
-    // }
+    _vector_index_writer = nullptr;
+    std::string index_type = get_or_default(_index_meta->properties(), 
INDEX_TYPE, "");
+    if (index_type == "hnsw") {
+#ifdef BUILD_FAISS
+        std::shared_ptr<FaissVectorIndex> faiss_index_writer =
+                std::make_shared<FaissVectorIndex>(_dir);
+
+        FaissBuildParameter builderParameter;
+        builderParameter.index_type = 
FaissBuildParameter::string_to_index_type("hnsw");
+        builderParameter.d = 
std::stoi(get_or_default(_index_meta->properties(), DIM, "512"));
+        builderParameter.m = 
std::stoi(get_or_default(_index_meta->properties(), MAX_DEGREE, "32"));
+        builderParameter.quantilizer = 
FaissBuildParameter::string_to_quantilizer(
+                get_or_default(_index_meta->properties(), QUANTILIZER, 
"flat"));
+        faiss_index_writer->set_build_params(builderParameter);
+        _vector_index_writer = faiss_index_writer;
+#else
+        return Status::NotSupported("Faiss index is not supported, please 
build doris with faiss");
+#endif
+    }
+    if (_vector_index_writer == nullptr) {
+        return Status::NotSupported("Unsupported index type: " + index_type);
+    } else {
+        return Status::OK();
+    }
 }
 
 Status AnnIndexColumnWriter::open_index_directory() {
diff --git a/be/src/olap/rowset/segment_v2/ann_index_writer.h b/be/src/olap/rowset/segment_v2/ann_index_writer.h
index ce6e7c93a5d..2425fecc153 100644
--- a/be/src/olap/rowset/segment_v2/ann_index_writer.h
+++ b/be/src/olap/rowset/segment_v2/ann_index_writer.h
@@ -49,9 +49,9 @@ class AnnIndexColumnWriter : public IndexColumnWriter {
 public:
     static constexpr const char* INDEX_TYPE = "index_type";
     static constexpr const char* METRIC_TYPE = "metric_type";
+    static constexpr const char* QUANTILIZER = "quantilizer";
     static constexpr const char* DIM = "dim";
-    static constexpr const char* DISKANN_MAX_DEGREE = "max_degree";
-    static constexpr const char* DISKANN_SEARCH_LIST = "search_list";
+    static constexpr const char* MAX_DEGREE = "max_degree";
 
     explicit AnnIndexColumnWriter(const std::string& field_name,
                                   XIndexFileWriter* index_file_writer,
diff --git a/be/src/vector/CMakeLists.txt b/be/src/vector/CMakeLists.txt
index 8aa56be531d..816d7da34c8 100644
--- a/be/src/vector/CMakeLists.txt
+++ b/be/src/vector/CMakeLists.txt
@@ -18,17 +18,25 @@
 # where to put generated libraries
 set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/src/vector")
 
-set(HEADER_FILES
+set(VECTOR_LIB_SRC
         vector_index.h
         stream_wrapper.h
 )
 
-# Use INTERFACE library type for header-only libraries
-add_library(vector INTERFACE)
+set(VECTOR_LIB_DEPENDENCIES)
+
 if (BUILD_FAISS)
-    target_link_libraries(vector INTERFACE faiss)
+    # append faiss src to VECTOR_LIB_SRC
+    list(APPEND VECTOR_LIB_SRC
+        faiss_vector_index.h
+        faiss_vector_index.cpp
+    )
+    list(APPEND VECTOR_LIB_DEPENDENCIES faiss)
 endif()
 
+add_library(vector OBJECT ${VECTOR_LIB_SRC})
+target_link_libraries(vector PRIVATE ${VECTOR_LIB_DEPENDENCIES})
+
 # Make the headers available to targets that link against the vector library
 target_include_directories(vector INTERFACE
     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
diff --git a/be/src/vector/faiss_vector_index.cpp b/be/src/vector/faiss_vector_index.cpp
new file mode 100644
index 00000000000..d05fd920f60
--- /dev/null
+++ b/be/src/vector/faiss_vector_index.cpp
@@ -0,0 +1,93 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "faiss_vector_index.h"
+
+#include <faiss/index_io.h>
+
+#include <memory>
+
+#include "CLucene/store/IndexOutput.h"
+#include "common/exception.h"
+#include "common/logging.h"
+#include "common/status.h"
+#include "faiss/IndexHNSW.h"
+#include "faiss/impl/io.h"
+
+struct FaissIndexWriter : faiss::IOWriter {
+public:
+    FaissIndexWriter() = default;
+    FaissIndexWriter(lucene::store::IndexOutput* output) : _output(output) {}
+    ~FaissIndexWriter() override {
+        if (_output != nullptr) {
+            _output->close();
+            delete _output;
+        }
+    }
+
+    size_t operator()(const void* ptr, size_t size, size_t nitems) override {
+        size_t bytes = size * nitems;
+        if (bytes > 0) {
+            try {
+                _output->writeBytes(reinterpret_cast<const uint8_t*>(ptr), 
bytes);
+            } catch (const std::exception& e) {
+                throw doris::Exception(doris::ErrorCode::IO_ERROR,
+                                       "Failed to write vector index {}", 
e.what());
+            }
+        }
+        return nitems;
+    };
+
+    lucene::store::IndexOutput* _output = nullptr;
+};
+
+doris::Status FaissVectorIndex::add(int n, const float* vec) {
+    DCHECK(vec != nullptr);
+
+    _index->add(n, vec);
+
+    return doris::Status::OK();
+}
+
+void FaissVectorIndex::set_build_params(const FaissBuildParameter& params) {
+    if (params.index_type == FaissBuildParameter::IndexType::BruteForce) {
+        _index = std::make_shared<faiss::IndexFlatL2>(params.d);
+    } else if (params.index_type == FaissBuildParameter::IndexType::HNSW) {
+        _index = std::make_shared<faiss::IndexHNSWFlat>(params.d, params.m);
+    } else {
+        throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT, 
"Unsupported index type: {}",
+                               static_cast<int>(params.index_type));
+    }
+}
+
+doris::Status FaissVectorIndex::search(const float* query_vec, int k, 
SearchResult* result,
+                                       const SearchParameters* params) {
+    return doris::Status::OK();
+}
+
+doris::Status FaissVectorIndex::save() {
+    lucene::store::IndexOutput* idx_output = _dir->createOutput("faiss.idx");
+    auto writer = std::make_unique<FaissIndexWriter>(idx_output);
+    faiss::write_index(_index.get(), writer.get());
+    VLOG_DEBUG << fmt::format("Faiss index saved to faiss.idx, rows {}", 
_index->ntotal);
+    return doris::Status::OK();
+}
+doris::Status FaissVectorIndex::load(Metric type) {
+    // Load the index from the directory
+    // This is a placeholder for actual implementation
+    return doris::Status::OK();
+}
\ No newline at end of file
diff --git a/be/src/vector/faiss_vector_index.h b/be/src/vector/faiss_vector_index.h
new file mode 100644
index 00000000000..b99359efee7
--- /dev/null
+++ b/be/src/vector/faiss_vector_index.h
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <CLucene.h>
+#include <CLucene/store/IndexInput.h>
+#include <CLucene/store/IndexOutput.h>
+#include <faiss/Index.h>
+
+#include <memory>
+#include <string>
+
+#include "common/status.h"
+#include "vector_index.h"
+
+struct FaissBuildParameter {
+    enum class IndexType { BruteForce, IVF, HNSW };
+
+    enum class Quantilizer { FLAT, SQ, PQ };
+
+    static IndexType string_to_index_type(const std::string& type) {
+        if (type == "brute_force") {
+            return IndexType::BruteForce;
+        } else if (type == "ivf") {
+            return IndexType::IVF;
+        } else if (type == "hnsw") {
+            return IndexType::HNSW;
+        }
+        return IndexType::HNSW; // default
+    }
+
+    static Quantilizer string_to_quantilizer(const std::string& type) {
+        if (type == "flat") {
+            return Quantilizer::FLAT;
+        } else if (type == "sq") {
+            return Quantilizer::SQ;
+        } else if (type == "pq") {
+            return Quantilizer::PQ;
+        }
+        return Quantilizer::FLAT; // default
+    }
+
+    // HNSW
+    int d = 0;
+    int m = 0;
+    IndexType index_type;
+    Quantilizer quantilizer;
+};
+
+class FaissVectorIndex : public VectorIndex {
+public:
+    FaissVectorIndex(std::shared_ptr<lucene::store::Directory> dir) : 
_index(nullptr), _dir(dir) {}
+
+    doris::Status add(int n, const float* vec) override;
+
+    void set_build_params(const FaissBuildParameter& params);
+
+    doris::Status search(const float* query_vec, int k, SearchResult* result,
+                         const SearchParameters* params = nullptr) override;
+
+    doris::Status save() override;
+
+    doris::Status load(Metric type) override;
+
+private:
+    std::shared_ptr<faiss::Index> _index;
+
+    std::shared_ptr<lucene::store::Directory> _dir;
+};
diff --git a/be/src/vector/vector_index.h b/be/src/vector/vector_index.h
index 717c1a79a3a..66442dcecf4 100644
--- a/be/src/vector/vector_index.h
+++ b/be/src/vector/vector_index.h
@@ -54,14 +54,12 @@ struct SearchParameters {
     virtual ~SearchParameters() {}
 };
 
-struct BuilderParameter {};
-
 class VectorIndex {
 public:
     enum class Metric { L2, COSINE, INNER_PRODUCT, UNKNOWN };
 
     virtual doris::Status add(int n, const float* vec) = 0;
-    virtual void set_build_params(std::shared_ptr<BuilderParameter> params) = 
0;
+
     virtual doris::Status search(const float* query_vec, int k, SearchResult* 
result,
                                  const SearchParameters* params = nullptr) = 0;
     //virtual Status save(FileWriter* writer);
diff --git a/build.sh b/build.sh
index 8f057b5b5c8..8f87c08ea10 100755
--- a/build.sh
+++ b/build.sh
@@ -646,7 +646,6 @@ if [[ "${BUILD_BE}" -eq 1 ]]; then
         -DENABLE_CLANG_COVERAGE="${DENABLE_CLANG_COVERAGE}" \
         -DDORIS_JAVA_HOME="${JAVA_HOME}" \
         -DBUILD_AZURE="${BUILD_AZURE}" \
-        -DBUILD_FAISS="${BUILD_FAISS}" \
         "${DORIS_HOME}/be"
 
     if [[ "${OUTPUT_BE_BINARY}" -eq 1 ]]; then


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to