This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch vector-index-dev in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/vector-index-dev by this push: new 2a915e4b2ca enable faiss hnsw (#49745) 2a915e4b2ca is described below commit 2a915e4b2ca613c8dda5ce55f0e5c79f935e1dbb Author: zhiqiang <hezhiqi...@selectdb.com> AuthorDate: Wed Apr 2 10:00:40 2025 +0800 enable faiss hnsw (#49745) ``` CREATE TABLE `vector_table` ( `siteid` int(11) NULL DEFAULT "10" COMMENT "", `embedding` array<float> NOT NULL COMMENT "", `comment` text NULL, INDEX idx_test_ann (`embedding`) USING ANN PROPERTIES( "index_type"="hnsw", "metric_type"="l2", "dim"="8", "max_degree"="100") COMMENT 'test diskann index', INDEX idx_comment (`comment`) USING INVERTED PROPERTIES("support_phrase" = "true", "parser" = "english", "lower_case" = "true") COMMENT 'inverted index for comment' ) ENGINE=OLAP duplicate KEY(`siteid`) COMMENT "OLAP" DISTRIBUTED BY HASH(`siteid`) BUCKETS 1 PROPERTIES ( "replication_num" = "1" ); INSERT INTO `vector_table` (`siteid`, `embedding`,`comment`) VALUES (10, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,20],"emb1"), (20, [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,30],"emb2") -------------- Query OK, 2 rows affected (0.07 sec) {'label':'label_858347013b14baf_b9db5d59b5e30322', 'status':'VISIBLE', 'txnId':'18029'} ``` ``` I20250401 19:18:17.977408 3765348 faiss_vector_index.cpp:86] Faiss index saved to faiss.idx, rows 2 ``` --- be/CMakeLists.txt | 4 + be/src/olap/rowset/segment_v2/ann_index_writer.cpp | 44 ++++++---- be/src/olap/rowset/segment_v2/ann_index_writer.h | 4 +- be/src/vector/CMakeLists.txt | 16 +++- be/src/vector/faiss_vector_index.cpp | 93 ++++++++++++++++++++++ be/src/vector/faiss_vector_index.h | 84 +++++++++++++++++++ be/src/vector/vector_index.h | 4 +- build.sh | 1 - 8 files changed, 224 insertions(+), 26 deletions(-) diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index 72dbf5cf185..f8a976a6e12 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -395,6 +395,10 @@ if (USE_DWARF) add_compile_options(-gdwarf-5) endif() +if (BUILD_FAISS) + 
add_definitions(-DBUILD_FAISS)
+endif()
+
 # For CMAKE_BUILD_TYPE=Debug
 if (OS_MACOSX AND ARCH_ARM)
     # Using -O0 may meet ARM64 branch out of range errors when linking with tcmalloc.
diff --git a/be/src/olap/rowset/segment_v2/ann_index_writer.cpp b/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
index e08aafdcb69..5d7b70430f5 100644
--- a/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
@@ -1,5 +1,9 @@
 #include "olap/rowset/segment_v2/ann_index_writer.h"
 
+#ifdef BUILD_FAISS
+#include "vector/faiss_vector_index.h"
+#endif
+
 namespace doris::segment_v2 {
 
 AnnIndexColumnWriter::AnnIndexColumnWriter(const std::string& field_name,
@@ -27,22 +31,30 @@ std::string get_or_default(const std::map<std::string, std::string>& properties,
 }
 
 Status AnnIndexColumnWriter::init_ann_index() {
-    // if(get_or_default(_index_meta->properties(), INDEX_TYPE, "")=="diskann"){
-    //     _vector_index_writer = std::make_shared<DiskannVectorIndex>(_dir);
-    //     std::shared_ptr<DiskannBuilderParameter> builderParameterPtr = std::make_shared<DiskannBuilderParameter>();
-    //     builderParameterPtr->with_dim(std::stoi(get_or_default(_index_meta->properties(), DIM,"")))
-    //             .with_L(std::stoi(get_or_default(_index_meta->properties(), DISKANN_SEARCH_LIST,"")))
-    //             .with_R(std::stoi(get_or_default(_index_meta->properties(), DISKANN_MAX_DEGREE,"")))
-    //             .with_build_num_threads(8)
-    //             .with_sample_rate(1)
-    //             .with_indexing_ram_budget_mb(10*1024)
-    //             .with_search_ram_budget_mb(30)
-    //             .with_mertic_type(VectorIndex::string_to_metric(get_or_default(_index_meta->properties(), METRIC_TYPE,"")));
-    //     _vector_index_writer->set_build_params(std::static_pointer_cast<BuilderParameter>(builderParameterPtr));
-    //     return Status::OK();
-    // }else{
-    return Status::NotSupported("ANN index is not support for now.");
-    // }
+    _vector_index_writer = nullptr;
+    std::string index_type = get_or_default(_index_meta->properties(), INDEX_TYPE, "");
+    if (index_type != "hnsw") {
+        return Status::NotSupported("Unsupported index type: " + index_type);
+    }
+#ifdef BUILD_FAISS
+    FaissBuildParameter builderParameter;
+    builderParameter.index_type = FaissBuildParameter::string_to_index_type(index_type);
+    try {
+        // std::stoi throws on non-numeric or out-of-range text; report it as a Status instead.
+        builderParameter.d = std::stoi(get_or_default(_index_meta->properties(), DIM, "512"));
+        builderParameter.m = std::stoi(get_or_default(_index_meta->properties(), MAX_DEGREE, "32"));
+    } catch (const std::exception& e) {
+        return Status::InvalidArgument(std::string("Invalid dim or max_degree: ") + e.what());
+    }
+    builderParameter.quantilizer = FaissBuildParameter::string_to_quantilizer(
+            get_or_default(_index_meta->properties(), QUANTILIZER, "flat"));
+    auto faiss_index_writer = std::make_shared<FaissVectorIndex>(_dir);
+    faiss_index_writer->set_build_params(builderParameter);
+    _vector_index_writer = faiss_index_writer;
+    return Status::OK();
+#else
+    return Status::NotSupported("Faiss index is not supported, please build doris with faiss");
+#endif
 }
 
 Status AnnIndexColumnWriter::open_index_directory() {
diff --git a/be/src/olap/rowset/segment_v2/ann_index_writer.h b/be/src/olap/rowset/segment_v2/ann_index_writer.h
index ce6e7c93a5d..2425fecc153 100644
--- a/be/src/olap/rowset/segment_v2/ann_index_writer.h
+++ b/be/src/olap/rowset/segment_v2/ann_index_writer.h
@@ -49,9 +49,9 @@ class AnnIndexColumnWriter : public IndexColumnWriter {
 public:
     static constexpr const char* INDEX_TYPE = "index_type";
     static constexpr const char* METRIC_TYPE = "metric_type";
+    static constexpr const char* QUANTILIZER = "quantilizer";
     static constexpr const char* DIM = "dim";
-    static constexpr const char* DISKANN_MAX_DEGREE = "max_degree";
-    static constexpr const char* DISKANN_SEARCH_LIST = "search_list";
+    static constexpr const char* MAX_DEGREE = "max_degree";
 
     explicit AnnIndexColumnWriter(const std::string& field_name,
                                   XIndexFileWriter* index_file_writer,
diff --git a/be/src/vector/CMakeLists.txt b/be/src/vector/CMakeLists.txt
index 8aa56be531d..816d7da34c8 100644
---
a/be/src/vector/CMakeLists.txt +++ b/be/src/vector/CMakeLists.txt @@ -18,17 +18,25 @@ # where to put generated libraries set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/src/vector") -set(HEADER_FILES +set(VECTOR_LIB_SRC vector_index.h stream_wrapper.h ) -# Use INTERFACE library type for header-only libraries -add_library(vector INTERFACE) +set(VECTOR_LIB_DEPENDENCIES) + if (BUILD_FAISS) - target_link_libraries(vector INTERFACE faiss) + # append faiss src to VECTOR_LIB_SRC + list(APPEND VECTOR_LIB_SRC + faiss_vector_index.h + faiss_vector_index.cpp + ) + list(APPEND VECTOR_LIB_DEPENDENCIES faiss) endif() +add_library(vector OBJECT ${VECTOR_LIB_SRC}) +target_link_libraries(vector PRIVATE ${VECTOR_LIB_DEPENDENCIES}) + # Make the headers available to targets that link against the vector library target_include_directories(vector INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}> diff --git a/be/src/vector/faiss_vector_index.cpp b/be/src/vector/faiss_vector_index.cpp new file mode 100644 index 00000000000..d05fd920f60 --- /dev/null +++ b/be/src/vector/faiss_vector_index.cpp @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#include "faiss_vector_index.h"
+
+#include <faiss/index_io.h>
+
+#include <memory>
+
+#include "CLucene/store/IndexOutput.h"
+#include "common/exception.h"
+#include "common/logging.h"
+#include "common/status.h"
+#include "faiss/IndexHNSW.h"
+#include "faiss/impl/io.h"
+
+struct FaissIndexWriter : faiss::IOWriter { // CLucene-backed sink for faiss::write_index
+    FaissIndexWriter() = default;
+    explicit FaissIndexWriter(lucene::store::IndexOutput* output) : _output(output) {}
+    ~FaissIndexWriter() override {
+        if (_output != nullptr) {
+            _output->close();
+            delete _output;
+        }
+    }
+
+    size_t operator()(const void* ptr, size_t size, size_t nitems) override {
+        size_t bytes = size * nitems;
+        if (bytes > 0 && _output != nullptr) {
+            try {
+                _output->writeBytes(reinterpret_cast<const uint8_t*>(ptr), bytes);
+            } catch (const std::exception& e) { // NOTE(review): confirm CLuceneError lands here
+                throw doris::Exception(doris::ErrorCode::IO_ERROR,
+                                       "Failed to write vector index {}", e.what());
+            }
+        }
+        return _output != nullptr ? nitems : 0; // nothing written when no output is attached
+    }
+
+    lucene::store::IndexOutput* _output = nullptr; // owned: closed and deleted by the destructor
+};
+
+doris::Status FaissVectorIndex::add(int n, const float* vec) {
+    DCHECK(vec != nullptr);
+    if (_index == nullptr) {
+        return doris::Status::InternalError("set_build_params() must be called before add()");
+    }
+    _index->add(n, vec);
+    return doris::Status::OK();
+}
+
+void FaissVectorIndex::set_build_params(const FaissBuildParameter& params) {
+    if (params.index_type == FaissBuildParameter::IndexType::BruteForce) {
+        _index = std::make_shared<faiss::IndexFlatL2>(params.d);
+    } else if (params.index_type == FaissBuildParameter::IndexType::HNSW) {
+        _index = std::make_shared<faiss::IndexHNSWFlat>(params.d, params.m);
+    } else {
+        throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT, "Unsupported index type: {}",
+                               static_cast<int>(params.index_type));
+    }
+}
+
+doris::Status FaissVectorIndex::search(const float* query_vec, int k, SearchResult* result,
+                                       const SearchParameters* params) {
+    return doris::Status::OK(); // TODO: not implemented yet; result is left untouched
+}
+
+doris::Status FaissVectorIndex::save() {
+    lucene::store::IndexOutput* idx_output = _dir->createOutput("faiss.idx");
+    auto writer = std::make_unique<FaissIndexWriter>(idx_output); // writer now owns idx_output
+    faiss::write_index(_index.get(), writer.get());
+    VLOG_DEBUG << fmt::format("Faiss index saved to faiss.idx, rows {}", _index->ntotal);
+    return doris::Status::OK();
+}
+doris::Status FaissVectorIndex::load(Metric type) {
+    // Placeholder: reading the index back from _dir is not implemented yet,
+    // so this reports success without loading anything.
+    return doris::Status::OK();
+}
\ No newline at end of file
diff --git a/be/src/vector/faiss_vector_index.h b/be/src/vector/faiss_vector_index.h
new file mode 100644
index 00000000000..b99359efee7
--- /dev/null
+++ b/be/src/vector/faiss_vector_index.h
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <CLucene.h>
+#include <CLucene/store/IndexInput.h>
+#include <CLucene/store/IndexOutput.h>
+#include <faiss/Index.h>
+
+#include <memory>
+#include <string>
+
+#include "common/status.h"
+#include "vector_index.h"
+
+struct FaissBuildParameter {
+    enum class IndexType { BruteForce, IVF, HNSW };
+
+    enum class Quantilizer { FLAT, SQ, PQ };
+
+    // Maps an index type name to IndexType; unrecognized names fall back to HNSW.
+    static IndexType string_to_index_type(const std::string& type) {
+        if (type == "brute_force") {
+            return IndexType::BruteForce;
+        }
+        if (type == "ivf") {
+            return IndexType::IVF;
+        }
+        return IndexType::HNSW; // "hnsw" and the default for anything else
+    }
+
+    // Maps a quantilizer name to Quantilizer; unrecognized names fall back to FLAT.
+    static Quantilizer string_to_quantilizer(const std::string& type) {
+        if (type == "sq") {
+            return Quantilizer::SQ;
+        }
+        if (type == "pq") {
+            return Quantilizer::PQ;
+        }
+        return Quantilizer::FLAT; // "flat" and the default for anything else
+    }
+
+    // d: vector dimension; m: second argument passed to faiss::IndexHNSWFlat (HNSW only).
+    int d = 0;
+    int m = 0;
+    IndexType index_type = IndexType::HNSW;
+    Quantilizer quantilizer = Quantilizer::FLAT;
+};
+
+class FaissVectorIndex : public VectorIndex {
+public:
+    explicit FaissVectorIndex(std::shared_ptr<lucene::store::Directory> dir) : _dir(dir) {}
+
+    doris::Status add(int n, const float* vec) override;
+
+    void set_build_params(const FaissBuildParameter& params);
+
+    doris::Status search(const float* query_vec, int k, SearchResult* result,
+                         const SearchParameters* params = nullptr) override;
+
+    doris::Status save() override;
+
+    doris::Status load(Metric type) override;
+
+private:
+    std::shared_ptr<faiss::Index> _index; // null until set_build_params() creates it
+
+    std::shared_ptr<lucene::store::Directory> _dir; // directory save() writes "faiss.idx" into
+};
diff --git a/be/src/vector/vector_index.h b/be/src/vector/vector_index.h
index 717c1a79a3a..66442dcecf4 100644
--- a/be/src/vector/vector_index.h
+++ b/be/src/vector/vector_index.h
@@ -54,14 +54,12 @@ struct SearchParameters {
     virtual ~SearchParameters() {}
 };
 
-struct BuilderParameter {};
-
 class VectorIndex {
 public:
     enum class Metric { L2, COSINE, INNER_PRODUCT,
UNKNOWN }; virtual doris::Status add(int n, const float* vec) = 0; - virtual void set_build_params(std::shared_ptr<BuilderParameter> params) = 0; + virtual doris::Status search(const float* query_vec, int k, SearchResult* result, const SearchParameters* params = nullptr) = 0; //virtual Status save(FileWriter* writer); diff --git a/build.sh b/build.sh index 8f057b5b5c8..8f87c08ea10 100755 --- a/build.sh +++ b/build.sh @@ -646,7 +646,6 @@ if [[ "${BUILD_BE}" -eq 1 ]]; then -DENABLE_CLANG_COVERAGE="${DENABLE_CLANG_COVERAGE}" \ -DDORIS_JAVA_HOME="${JAVA_HOME}" \ -DBUILD_AZURE="${BUILD_AZURE}" \ - -DBUILD_FAISS="${BUILD_FAISS}" \ "${DORIS_HOME}/be" if [[ "${OUTPUT_BE_BINARY}" -eq 1 ]]; then --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org